From 0e35041dc762f25c970503df88d7dd5ed400f799 Mon Sep 17 00:00:00 2001
From: usamoi
Date: Tue, 28 Jan 2025 00:00:50 +0800
Subject: [PATCH] mark AVX512 & AVXNECONVERT intrinsics as safe

Mark all AVX512 & AVXNECONVERT SIMD-computing intrinsics as safe, except
for those involving memory operations.
---
 crates/core_arch/src/x86/avx512bf16.rs      |   311 +-
 crates/core_arch/src/x86/avx512bitalg.rs    |   216 +-
 crates/core_arch/src/x86/avx512bw.rs        |  7321 +++---
 crates/core_arch/src/x86/avx512cd.rs        |   252 +-
 crates/core_arch/src/x86/avx512dq.rs        |  2742 ++-
 crates/core_arch/src/x86/avx512f.rs         | 22206 ++++++++++--------
 crates/core_arch/src/x86/avx512fp16.rs      |  4886 ++--
 crates/core_arch/src/x86/avx512ifma.rs      |   128 +-
 crates/core_arch/src/x86/avx512vbmi.rs      |   224 +-
 crates/core_arch/src/x86/avx512vbmi2.rs     |   954 +-
 crates/core_arch/src/x86/avx512vnni.rs      |   458 +-
 crates/core_arch/src/x86/avx512vpopcntdq.rs |   192 +-
 crates/core_arch/src/x86/avxneconvert.rs    |    40 +-
 crates/core_arch/src/x86_64/avx512bw.rs     |     4 +-
 crates/core_arch/src/x86_64/avx512f.rs      |   252 +-
 crates/core_arch/src/x86_64/avx512fp16.rs   |    72 +-
 16 files changed, 21802 insertions(+), 18456 deletions(-)

diff --git a/crates/core_arch/src/x86/avx512bf16.rs b/crates/core_arch/src/x86/avx512bf16.rs
index 6789fb1c31..ca45761d08 100644
--- a/crates/core_arch/src/x86/avx512bf16.rs
+++ b/crates/core_arch/src/x86/avx512bf16.rs
@@ -37,8 +37,8 @@ unsafe extern "C" {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
-    transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4()))
+pub fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
+    unsafe { transmute(cvtne2ps2bf16(a.as_f32x4(), b.as_f32x4())) }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two vectors
@@ -50,9 +50,11 @@ pub unsafe fn _mm_cvtne2ps_pbh(a: __m128, b: __m128) -> __m128bh {
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
-    let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
-    transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
+pub fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __m128) -> __m128bh {
+    unsafe {
+        let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, cvt, src.as_u16x8()))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two vectors
@@ -64,9 +66,11 @@ pub unsafe fn _mm_mask_cvtne2ps_pbh(src: __m128bh, k: __mmask8, a: __m128, b: __
 #[target_feature(enable = "avx512bf16,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))]
-pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
-    let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
-    transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
+pub fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m128bh {
+    unsafe {
+        let cvt = _mm_cvtne2ps_pbh(a, b).as_u16x8();
+        transmute(simd_select_bitmask(k, cvt, u16x8::ZERO))
+    }
 }

 /// Convert packed single-precision (32-bit) floating-point elements in two 256-bit vectors
@@ -77,8 +81,8 @@ pub unsafe fn _mm_maskz_cvtne2ps_pbh(k: __mmask8, a: __m128, b: __m128) -> __m12
 #[target_feature(enable = "avx512bf16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { - transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) +pub fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { + unsafe { transmute(cvtne2ps2bf16_256(a.as_f32x8(), b.as_f32x8())) } } /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b @@ -89,14 +93,11 @@ pub unsafe fn _mm256_cvtne2ps_pbh(a: __m256, b: __m256) -> __m256bh { #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm256_mask_cvtne2ps_pbh( - src: __m256bh, - k: __mmask16, - a: __m256, - b: __m256, -) -> __m256bh { - let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) +pub fn _mm256_mask_cvtne2ps_pbh(src: __m256bh, k: __mmask16, a: __m256, b: __m256) -> __m256bh { + unsafe { + let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) + } } /// Convert packed single-precision (32-bit) floating-point elements in two vectors a and b @@ -107,9 +108,11 @@ pub unsafe fn _mm256_mask_cvtne2ps_pbh( #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh { - let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) +pub fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> __m256bh { + unsafe { + let cvt = _mm256_cvtne2ps_pbh(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) + } } /// Convert packed single-precision (32-bit) floating-point elements in two 512-bit vectors @@ -120,8 +123,8 @@ pub unsafe fn _mm256_maskz_cvtne2ps_pbh(k: __mmask16, a: __m256, b: __m256) -> _ #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { - transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) +pub fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { + unsafe { transmute(cvtne2ps2bf16_512(a.as_f32x16(), b.as_f32x16())) } } /// Convert packed single-precision (32-bit) floating-point elements in two vectors @@ -133,14 +136,11 @@ pub unsafe fn _mm512_cvtne2ps_pbh(a: __m512, b: __m512) -> __m512bh { #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm512_mask_cvtne2ps_pbh( - src: __m512bh, - k: __mmask32, - a: __m512, - b: __m512, -) -> __m512bh { - let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, cvt, src.as_u16x32())) +pub fn _mm512_mask_cvtne2ps_pbh(src: __m512bh, k: __mmask32, a: __m512, b: __m512) -> __m512bh { + unsafe { + let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x32())) + } } /// Convert packed single-precision (32-bit) floating-point elements in two vectors @@ -152,9 +152,11 @@ pub unsafe fn _mm512_mask_cvtne2ps_pbh( #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = 
"111137")] #[cfg_attr(test, assert_instr("vcvtne2ps2bf16"))] -pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh { - let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, cvt, u16x32::ZERO)) +pub fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> __m512bh { + unsafe { + let cvt = _mm512_cvtne2ps_pbh(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, cvt, u16x32::ZERO)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -164,8 +166,8 @@ pub unsafe fn _mm512_maskz_cvtne2ps_pbh(k: __mmask32, a: __m512, b: __m512) -> _ #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { - transmute(cvtneps2bf16_256(a.as_f32x8())) +pub fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { + unsafe { transmute(cvtneps2bf16_256(a.as_f32x8())) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -176,9 +178,11 @@ pub unsafe fn _mm256_cvtneps_pbh(a: __m256) -> __m128bh { #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { - let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); - transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) +pub fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> __m128bh { + unsafe { + let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x8())) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -189,9 +193,11 @@ pub unsafe fn _mm256_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m256) -> #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { - let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); - transmute(simd_select_bitmask(k, cvt, u16x8::ZERO)) +pub fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { + unsafe { + let cvt = _mm256_cvtneps_pbh(a).as_u16x8(); + transmute(simd_select_bitmask(k, cvt, u16x8::ZERO)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -201,8 +207,8 @@ pub unsafe fn _mm256_maskz_cvtneps_pbh(k: __mmask8, a: __m256) -> __m128bh { #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { - transmute(cvtneps2bf16_512(a.as_f32x16())) +pub fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { + unsafe { transmute(cvtneps2bf16_512(a.as_f32x16())) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -213,9 +219,11 @@ pub unsafe fn _mm512_cvtneps_pbh(a: __m512) -> __m256bh { #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { - let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); - 
transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) +pub fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> __m256bh { + unsafe { + let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, src.as_u16x16())) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -226,9 +234,11 @@ pub unsafe fn _mm512_mask_cvtneps_pbh(src: __m256bh, k: __mmask16, a: __m512) -> #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] -pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { - let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); - transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) +pub fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { + unsafe { + let cvt = _mm512_cvtneps_pbh(a).as_u16x16(); + transmute(simd_select_bitmask(k, cvt, u16x16::ZERO)) + } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -239,8 +249,8 @@ pub unsafe fn _mm512_maskz_cvtneps_pbh(k: __mmask16, a: __m512) -> __m256bh { #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { - transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { transmute(dpbf16ps(src.as_f32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -252,9 +262,11 @@ pub unsafe fn _mm_dpbf16_ps(src: __m128, a: __m128bh, b: __m128bh) -> __m128 { #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { - let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); - transmute(simd_select_bitmask(k, rst, src.as_f32x4())) +pub fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { + let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); + transmute(simd_select_bitmask(k, rst, src.as_f32x4())) + } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -266,10 +278,12 @@ pub unsafe fn _mm_mask_dpbf16_ps(src: __m128, k: __mmask8, a: __m128bh, b: __m12 #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { - let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); - let zero = _mm_set1_ps(0.0_f32).as_f32x4(); - transmute(simd_select_bitmask(k, rst, zero)) +pub fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m128bh) -> __m128 { + unsafe { + let rst = _mm_dpbf16_ps(src, a, b).as_f32x4(); + let zero = _mm_set1_ps(0.0_f32).as_f32x4(); + transmute(simd_select_bitmask(k, rst, zero)) + } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -280,8 +294,8 @@ pub unsafe fn _mm_maskz_dpbf16_ps(k: __mmask8, src: __m128, a: __m128bh, b: __m1 #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn 
_mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 { - transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 { + unsafe { transmute(dpbf16ps_256(src.as_f32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -293,9 +307,11 @@ pub unsafe fn _mm256_dpbf16_ps(src: __m256, a: __m256bh, b: __m256bh) -> __m256 #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { - let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); - transmute(simd_select_bitmask(k, rst, src.as_f32x8())) +pub fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __m256bh) -> __m256 { + unsafe { + let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); + transmute(simd_select_bitmask(k, rst, src.as_f32x8())) + } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -307,9 +323,11 @@ pub unsafe fn _mm256_mask_dpbf16_ps(src: __m256, k: __mmask8, a: __m256bh, b: __ #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { - let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); - transmute(simd_select_bitmask(k, rst, f32x8::ZERO)) +pub fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: __m256bh) -> __m256 { + unsafe { + let rst = _mm256_dpbf16_ps(src, a, b).as_f32x8(); + transmute(simd_select_bitmask(k, rst, f32x8::ZERO)) + } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -322,8 +340,8 @@ pub unsafe fn _mm256_maskz_dpbf16_ps(k: __mmask8, src: __m256, a: __m256bh, b: _ #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 { - transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 { + unsafe { transmute(dpbf16ps_512(src.as_f32x16(), a.as_i32x16(), b.as_i32x16())) } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -335,9 +353,11 @@ pub unsafe fn _mm512_dpbf16_ps(src: __m512, a: __m512bh, b: __m512bh) -> __m512 #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr("vdpbf16ps"))] -pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { - let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); - transmute(simd_select_bitmask(k, rst, src.as_f32x16())) +pub fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: __m512bh) -> __m512 { + unsafe { + let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); + transmute(simd_select_bitmask(k, rst, src.as_f32x16())) + } } /// Compute dot-product of BF16 (16-bit) floating-point pairs in a and b, @@ -349,14 +369,11 @@ pub unsafe fn _mm512_mask_dpbf16_ps(src: __m512, k: __mmask16, a: __m512bh, b: _ #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, 
assert_instr("vdpbf16ps"))] -pub unsafe fn _mm512_maskz_dpbf16_ps( - k: __mmask16, - src: __m512, - a: __m512bh, - b: __m512bh, -) -> __m512 { - let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); - transmute(simd_select_bitmask(k, rst, f32x16::ZERO)) +pub fn _mm512_maskz_dpbf16_ps(k: __mmask16, src: __m512, a: __m512bh, b: __m512bh) -> __m512 { + unsafe { + let rst = _mm512_dpbf16_ps(src, a, b).as_f32x16(); + transmute(simd_select_bitmask(k, rst, f32x16::ZERO)) + } } /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -366,8 +383,8 @@ pub unsafe fn _mm512_maskz_dpbf16_ps( #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 { - _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) +pub fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 { + unsafe { _mm512_castsi512_ps(_mm512_slli_epi32::<16>(_mm512_cvtepi16_epi32(transmute(a)))) } } /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -378,9 +395,11 @@ pub unsafe fn _mm512_cvtpbh_ps(a: __m256bh) -> __m512 { #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 { - let cvt = _mm512_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16())) +pub fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> __m512 { + unsafe { + let cvt = _mm512_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x16(), src.as_f32x16())) + } } /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -391,9 +410,11 @@ pub unsafe fn _mm512_mask_cvtpbh_ps(src: __m512, k: __mmask16, a: __m256bh) -> _ #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 { - let cvt = _mm512_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO)) +pub fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 { + unsafe { + let cvt = _mm512_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x16(), f32x16::ZERO)) + } } /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -403,8 +424,8 @@ pub unsafe fn _mm512_maskz_cvtpbh_ps(k: __mmask16, a: __m256bh) -> __m512 { #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 { - _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) +pub fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 { + unsafe { _mm256_castsi256_ps(_mm256_slli_epi32::<16>(_mm256_cvtepi16_epi32(transmute(a)))) } } /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -415,9 +436,11 @@ pub unsafe fn _mm256_cvtpbh_ps(a: __m128bh) -> __m256 { #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __m256 { - let cvt = _mm256_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8())) +pub fn _mm256_mask_cvtpbh_ps(src: __m256, k: 
__mmask8, a: __m128bh) -> __m256 { + unsafe { + let cvt = _mm256_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x8(), src.as_f32x8())) + } } /// Converts packed BF16 (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -428,9 +451,11 @@ pub unsafe fn _mm256_mask_cvtpbh_ps(src: __m256, k: __mmask8, a: __m128bh) -> __ #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 { - let cvt = _mm256_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO)) +pub fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 { + unsafe { + let cvt = _mm256_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x8(), f32x8::ZERO)) + } } /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point @@ -440,8 +465,8 @@ pub unsafe fn _mm256_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m256 { #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 { - _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) +pub fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 { + unsafe { _mm_castsi128_ps(_mm_slli_epi32::<16>(_mm_cvtepi16_epi32(transmute(a)))) } } /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point @@ -452,9 +477,11 @@ pub unsafe fn _mm_cvtpbh_ps(a: __m128bh) -> __m128 { #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 { - let cvt = _mm_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4())) +pub fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m128 { + unsafe { + let cvt = _mm_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x4(), src.as_f32x4())) + } } /// Converts packed BF16 (16-bit) floating-point elements in a to single-precision (32-bit) floating-point @@ -465,9 +492,11 @@ pub unsafe fn _mm_mask_cvtpbh_ps(src: __m128, k: __mmask8, a: __m128bh) -> __m12 #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { - let cvt = _mm_cvtpbh_ps(a); - transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO)) +pub fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { + unsafe { + let cvt = _mm_cvtpbh_ps(a); + transmute(simd_select_bitmask(k, cvt.as_f32x4(), f32x4::ZERO)) + } } /// Converts a single BF16 (16-bit) floating-point element in a to a single-precision (32-bit) floating-point @@ -477,7 +506,7 @@ pub unsafe fn _mm_maskz_cvtpbh_ps(k: __mmask8, a: __m128bh) -> __m128 { #[inline] #[target_feature(enable = "avx512bf16,avx512f")] #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] -pub unsafe fn _mm_cvtsbh_ss(a: bf16) -> f32 { +pub fn _mm_cvtsbh_ss(a: bf16) -> f32 { f32::from_bits((a.to_bits() as u32) << 16) } @@ -489,15 +518,17 @@ pub unsafe fn _mm_cvtsbh_ss(a: bf16) -> f32 { #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh { - let mut dst: __m128bh; - 
asm!( - "vcvtneps2bf16 {dst}, {src}", - dst = lateout(xmm_reg) dst, - src = in(xmm_reg) a, - options(pure, nomem, nostack, preserves_flags) - ); - dst +pub fn _mm_cvtneps_pbh(a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "vcvtneps2bf16 {dst}, {src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } } /// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -509,16 +540,18 @@ pub unsafe fn _mm_cvtneps_pbh(a: __m128) -> __m128bh { #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh { - let mut dst = src; - asm!( - "vcvtneps2bf16 {dst}{{{k}}},{src}", - dst = inlateout(xmm_reg) dst, - src = in(xmm_reg) a, - k = in(kreg) k, - options(pure, nomem, nostack, preserves_flags) - ); - dst +pub fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m128bh { + unsafe { + let mut dst = src; + asm!( + "vcvtneps2bf16 {dst}{{{k}}},{src}", + dst = inlateout(xmm_reg) dst, + src = in(xmm_reg) a, + k = in(kreg) k, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } } /// Converts packed single-precision (32-bit) floating-point elements in a to packed BF16 (16-bit) @@ -530,16 +563,18 @@ pub unsafe fn _mm_mask_cvtneps_pbh(src: __m128bh, k: __mmask8, a: __m128) -> __m #[target_feature(enable = "avx512bf16,avx512vl")] #[cfg_attr(test, assert_instr("vcvtneps2bf16"))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { - let mut dst: __m128bh; - asm!( - "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}", - dst = lateout(xmm_reg) dst, - src = in(xmm_reg) a, - k = in(kreg) k, - options(pure, nomem, nostack, preserves_flags) - ); - dst +pub fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "vcvtneps2bf16 {dst}{{{k}}}{{z}},{src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + k = in(kreg) k, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } } /// Converts a single-precision (32-bit) floating-point element in a to a BF16 (16-bit) floating-point @@ -549,9 +584,11 @@ pub unsafe fn _mm_maskz_cvtneps_pbh(k: __mmask8, a: __m128) -> __m128bh { #[inline] #[target_feature(enable = "avx512bf16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")] -pub unsafe fn _mm_cvtness_sbh(a: f32) -> bf16 { - let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0); - bf16::from_bits(value) +pub fn _mm_cvtness_sbh(a: f32) -> bf16 { + unsafe { + let value: u16 = simd_extract!(_mm_cvtneps_pbh(_mm_set_ss(a)), 0); + bf16::from_bits(value) + } } #[cfg(test)] diff --git a/crates/core_arch/src/x86/avx512bitalg.rs b/crates/core_arch/src/x86/avx512bitalg.rs index 5640ef8bf4..e27b737870 100644 --- a/crates/core_arch/src/x86/avx512bitalg.rs +++ b/crates/core_arch/src/x86/avx512bitalg.rs @@ -43,8 +43,8 @@ unsafe extern "C" { #[target_feature(enable = "avx512bitalg")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm512_popcnt_epi16(a: __m512i) -> __m512i { - transmute(simd_ctpop(a.as_i16x32())) +pub fn _mm512_popcnt_epi16(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i16x32())) } } /// For each packed 16-bit integer maps the 
value to the number of logical 1 bits. @@ -57,12 +57,14 @@ pub unsafe fn _mm512_popcnt_epi16(a: __m512i) -> __m512i { #[target_feature(enable = "avx512bitalg")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x32()), - i16x32::ZERO, - )) +pub fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x32()), + i16x32::ZERO, + )) + } } /// For each packed 16-bit integer maps the value to the number of logical 1 bits. @@ -75,12 +77,14 @@ pub unsafe fn _mm512_maskz_popcnt_epi16(k: __mmask32, a: __m512i) -> __m512i { #[target_feature(enable = "avx512bitalg")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x32()), - src.as_i16x32(), - )) +pub fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x32()), + src.as_i16x32(), + )) + } } /// For each packed 16-bit integer maps the value to the number of logical 1 bits. @@ -90,8 +94,8 @@ pub unsafe fn _mm512_mask_popcnt_epi16(src: __m512i, k: __mmask32, a: __m512i) - #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm256_popcnt_epi16(a: __m256i) -> __m256i { - transmute(simd_ctpop(a.as_i16x16())) +pub fn _mm256_popcnt_epi16(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i16x16())) } } /// For each packed 16-bit integer maps the value to the number of logical 1 bits. @@ -104,12 +108,14 @@ pub unsafe fn _mm256_popcnt_epi16(a: __m256i) -> __m256i { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x16()), - i16x16::ZERO, - )) +pub fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x16()), + i16x16::ZERO, + )) + } } /// For each packed 16-bit integer maps the value to the number of logical 1 bits. @@ -122,12 +128,14 @@ pub unsafe fn _mm256_maskz_popcnt_epi16(k: __mmask16, a: __m256i) -> __m256i { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x16()), - src.as_i16x16(), - )) +pub fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x16()), + src.as_i16x16(), + )) + } } /// For each packed 16-bit integer maps the value to the number of logical 1 bits. 
@@ -137,8 +145,8 @@ pub unsafe fn _mm256_mask_popcnt_epi16(src: __m256i, k: __mmask16, a: __m256i) - #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm_popcnt_epi16(a: __m128i) -> __m128i { - transmute(simd_ctpop(a.as_i16x8())) +pub fn _mm_popcnt_epi16(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i16x8())) } } /// For each packed 16-bit integer maps the value to the number of logical 1 bits. @@ -151,12 +159,14 @@ pub unsafe fn _mm_popcnt_epi16(a: __m128i) -> __m128i { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x8()), - i16x8::ZERO, - )) +pub fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x8()), + i16x8::ZERO, + )) + } } /// For each packed 16-bit integer maps the value to the number of logical 1 bits. @@ -169,12 +179,14 @@ pub unsafe fn _mm_maskz_popcnt_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntw))] -pub unsafe fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i16x8()), - src.as_i16x8(), - )) +pub fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i16x8()), + src.as_i16x8(), + )) + } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. @@ -184,8 +196,8 @@ pub unsafe fn _mm_mask_popcnt_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __ #[target_feature(enable = "avx512bitalg")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm512_popcnt_epi8(a: __m512i) -> __m512i { - transmute(simd_ctpop(a.as_i8x64())) +pub fn _mm512_popcnt_epi8(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i8x64())) } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. @@ -198,12 +210,14 @@ pub unsafe fn _mm512_popcnt_epi8(a: __m512i) -> __m512i { #[target_feature(enable = "avx512bitalg")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x64()), - i8x64::ZERO, - )) +pub fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x64()), + i8x64::ZERO, + )) + } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. 
@@ -216,12 +230,14 @@ pub unsafe fn _mm512_maskz_popcnt_epi8(k: __mmask64, a: __m512i) -> __m512i { #[target_feature(enable = "avx512bitalg")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x64()), - src.as_i8x64(), - )) +pub fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x64()), + src.as_i8x64(), + )) + } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. @@ -231,8 +247,8 @@ pub unsafe fn _mm512_mask_popcnt_epi8(src: __m512i, k: __mmask64, a: __m512i) -> #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm256_popcnt_epi8(a: __m256i) -> __m256i { - transmute(simd_ctpop(a.as_i8x32())) +pub fn _mm256_popcnt_epi8(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i8x32())) } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. @@ -245,12 +261,14 @@ pub unsafe fn _mm256_popcnt_epi8(a: __m256i) -> __m256i { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x32()), - i8x32::ZERO, - )) +pub fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x32()), + i8x32::ZERO, + )) + } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. @@ -263,12 +281,14 @@ pub unsafe fn _mm256_maskz_popcnt_epi8(k: __mmask32, a: __m256i) -> __m256i { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x32()), - src.as_i8x32(), - )) +pub fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x32()), + src.as_i8x32(), + )) + } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. @@ -278,8 +298,8 @@ pub unsafe fn _mm256_mask_popcnt_epi8(src: __m256i, k: __mmask32, a: __m256i) -> #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm_popcnt_epi8(a: __m128i) -> __m128i { - transmute(simd_ctpop(a.as_i8x16())) +pub fn _mm_popcnt_epi8(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i8x16())) } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. 
@@ -292,12 +312,14 @@ pub unsafe fn _mm_popcnt_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x16()), - i8x16::ZERO, - )) +pub fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x16()), + i8x16::ZERO, + )) + } } /// For each packed 8-bit integer maps the value to the number of logical 1 bits. @@ -310,12 +332,14 @@ pub unsafe fn _mm_maskz_popcnt_epi8(k: __mmask16, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntb))] -pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i8x16()), - src.as_i8x16(), - )) +pub fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i8x16()), + src.as_i8x16(), + )) + } } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -327,8 +351,8 @@ pub unsafe fn _mm_mask_popcnt_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __ #[target_feature(enable = "avx512bitalg")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 { - bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0) +pub fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 { + unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), !0) } } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -343,8 +367,8 @@ pub unsafe fn _mm512_bitshuffle_epi64_mask(b: __m512i, c: __m512i) -> __mmask64 #[target_feature(enable = "avx512bitalg")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 { - bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k) +pub fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m512i) -> __mmask64 { + unsafe { bitshuffle_512(b.as_i8x64(), c.as_i8x64(), k) } } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -356,8 +380,8 @@ pub unsafe fn _mm512_mask_bitshuffle_epi64_mask(k: __mmask64, b: __m512i, c: __m #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 { - bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0) +pub fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 { + unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), !0) } } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. 
@@ -372,8 +396,8 @@ pub unsafe fn _mm256_bitshuffle_epi64_mask(b: __m256i, c: __m256i) -> __mmask32 #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 { - bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k) +pub fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m256i) -> __mmask32 { + unsafe { bitshuffle_256(b.as_i8x32(), c.as_i8x32(), k) } } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -385,8 +409,8 @@ pub unsafe fn _mm256_mask_bitshuffle_epi64_mask(k: __mmask32, b: __m256i, c: __m #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { - bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0) +pub fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { + unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), !0) } } /// Considers the input `b` as packed 64-bit integers and `c` as packed 8-bit integers. @@ -401,8 +425,8 @@ pub unsafe fn _mm_bitshuffle_epi64_mask(b: __m128i, c: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bitalg,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufbitqmb))] -pub unsafe fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 { - bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k) +pub fn _mm_mask_bitshuffle_epi64_mask(k: __mmask16, b: __m128i, c: __m128i) -> __mmask16 { + unsafe { bitshuffle_128(b.as_i8x16(), c.as_i8x16(), k) } } #[cfg(test)] diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs index caac75b346..11d1f93f37 100644 --- a/crates/core_arch/src/x86/avx512bw.rs +++ b/crates/core_arch/src/x86/avx512bw.rs @@ -16,10 +16,12 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsw))] -pub unsafe fn _mm512_abs_epi16(a: __m512i) -> __m512i { - let a = a.as_i16x32(); - let cmp: i16x32 = simd_gt(a, i16x32::ZERO); - transmute(simd_select(cmp, a, simd_neg(a))) +pub fn _mm512_abs_epi16(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let cmp: i16x32 = simd_gt(a, i16x32::ZERO); + transmute(simd_select(cmp, a, simd_neg(a))) + } } /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -29,9 +31,11 @@ pub unsafe fn _mm512_abs_epi16(a: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsw))] -pub unsafe fn _mm512_mask_abs_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - let abs = _mm512_abs_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, abs, src.as_i16x32())) +pub fn _mm512_mask_abs_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, abs, src.as_i16x32())) + } } /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -41,9 +45,11 @@ pub unsafe fn _mm512_mask_abs_epi16(src: __m512i, k: __mmask32, a: __m512i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsw))] -pub unsafe fn _mm512_maskz_abs_epi16(k: __mmask32, a: __m512i) -> __m512i { - let abs = _mm512_abs_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, abs, i16x32::ZERO)) +pub fn _mm512_maskz_abs_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, abs, i16x32::ZERO)) + } } /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -53,9 +59,11 @@ pub unsafe fn _mm512_maskz_abs_epi16(k: __mmask32, a: __m512i) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsw))] -pub unsafe fn _mm256_mask_abs_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - let abs = _mm256_abs_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, abs, src.as_i16x16())) +pub fn _mm256_mask_abs_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, abs, src.as_i16x16())) + } } /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -65,9 +73,11 @@ pub unsafe fn _mm256_mask_abs_epi16(src: __m256i, k: __mmask16, a: __m256i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsw))] -pub unsafe fn _mm256_maskz_abs_epi16(k: __mmask16, a: __m256i) -> __m256i { - let abs = _mm256_abs_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, abs, i16x16::ZERO)) +pub fn _mm256_maskz_abs_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, abs, i16x16::ZERO)) + } } /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -77,9 +87,11 @@ pub unsafe fn _mm256_maskz_abs_epi16(k: __mmask16, a: __m256i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsw))] -pub unsafe fn _mm_mask_abs_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let abs = _mm_abs_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, abs, src.as_i16x8())) +pub fn _mm_mask_abs_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, abs, src.as_i16x8())) + } } /// Compute the absolute value of packed signed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -89,9 +101,11 @@ pub unsafe fn _mm_mask_abs_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsw))] -pub unsafe fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i { - let abs = _mm_abs_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, abs, i16x8::ZERO)) +pub fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, abs, i16x8::ZERO)) + } } /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst. @@ -101,10 +115,12 @@ pub unsafe fn _mm_maskz_abs_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsb))] -pub unsafe fn _mm512_abs_epi8(a: __m512i) -> __m512i { - let a = a.as_i8x64(); - let cmp: i8x64 = simd_gt(a, i8x64::ZERO); - transmute(simd_select(cmp, a, simd_neg(a))) +pub fn _mm512_abs_epi8(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let cmp: i8x64 = simd_gt(a, i8x64::ZERO); + transmute(simd_select(cmp, a, simd_neg(a))) + } } /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -114,9 +130,11 @@ pub unsafe fn _mm512_abs_epi8(a: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsb))] -pub unsafe fn _mm512_mask_abs_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - let abs = _mm512_abs_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, abs, src.as_i8x64())) +pub fn _mm512_mask_abs_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, abs, src.as_i8x64())) + } } /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -126,9 +144,11 @@ pub unsafe fn _mm512_mask_abs_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsb))] -pub unsafe fn _mm512_maskz_abs_epi8(k: __mmask64, a: __m512i) -> __m512i { - let abs = _mm512_abs_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, abs, i8x64::ZERO)) +pub fn _mm512_maskz_abs_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, abs, i8x64::ZERO)) + } } /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -138,9 +158,11 @@ pub unsafe fn _mm512_maskz_abs_epi8(k: __mmask64, a: __m512i) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsb))] -pub unsafe fn _mm256_mask_abs_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - let abs = _mm256_abs_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, abs, src.as_i8x32())) +pub fn _mm256_mask_abs_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, abs, src.as_i8x32())) + } } /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -150,9 +172,11 @@ pub unsafe fn _mm256_mask_abs_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsb))] -pub unsafe fn _mm256_maskz_abs_epi8(k: __mmask32, a: __m256i) -> __m256i { - let abs = _mm256_abs_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, abs, i8x32::ZERO)) +pub fn _mm256_maskz_abs_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, abs, i8x32::ZERO)) + } } /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set) @@ -162,9 +186,11 @@ pub unsafe fn _mm256_maskz_abs_epi8(k: __mmask32, a: __m256i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsb))] -pub unsafe fn _mm_mask_abs_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - let abs = _mm_abs_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, abs, src.as_i8x16())) +pub fn _mm_mask_abs_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, abs, src.as_i8x16())) + } } /// Compute the absolute value of packed signed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -174,9 +200,11 @@ pub unsafe fn _mm_mask_abs_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsb))] -pub unsafe fn _mm_maskz_abs_epi8(k: __mmask16, a: __m128i) -> __m128i { - let abs = _mm_abs_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, abs, i8x16::ZERO)) +pub fn _mm_maskz_abs_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, abs, i8x16::ZERO)) + } } /// Add packed 16-bit integers in a and b, and store the results in dst. @@ -186,8 +214,8 @@ pub unsafe fn _mm_maskz_abs_epi8(k: __mmask16, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddw))] -pub unsafe fn _mm512_add_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_add(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_add_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i16x32(), b.as_i16x32())) } } /// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -197,9 +225,11 @@ pub unsafe fn _mm512_add_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddw))] -pub unsafe fn _mm512_mask_add_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_add_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, add, src.as_i16x32())) +pub fn _mm512_mask_add_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, src.as_i16x32())) + } } /// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -209,9 +239,11 @@ pub unsafe fn _mm512_mask_add_epi16(src: __m512i, k: __mmask32, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddw))] -pub unsafe fn _mm512_maskz_add_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_add_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, add, i16x32::ZERO)) +pub fn _mm512_maskz_add_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, i16x32::ZERO)) + } } /// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -221,9 +253,11 @@ pub unsafe fn _mm512_maskz_add_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddw))] -pub unsafe fn _mm256_mask_add_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_add_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, add, src.as_i16x16())) +pub fn _mm256_mask_add_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, src.as_i16x16())) + } } /// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -233,9 +267,11 @@ pub unsafe fn _mm256_mask_add_epi16(src: __m256i, k: __mmask16, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddw))] -pub unsafe fn _mm256_maskz_add_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_add_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, add, i16x16::ZERO)) +pub fn _mm256_maskz_add_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, i16x16::ZERO)) + } } /// Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -245,9 +281,11 @@ pub unsafe fn _mm256_maskz_add_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddw))] -pub unsafe fn _mm_mask_add_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_add_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, add, src.as_i16x8())) +pub fn _mm_mask_add_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, src.as_i16x8())) + } } /// Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -257,9 +295,11 @@ pub unsafe fn _mm_mask_add_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddw))] -pub unsafe fn _mm_maskz_add_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_add_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, add, i16x8::ZERO)) +pub fn _mm_maskz_add_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, i16x8::ZERO)) + } } /// Add packed 8-bit integers in a and b, and store the results in dst. 
@@ -269,8 +309,8 @@ pub unsafe fn _mm_maskz_add_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddb))] -pub unsafe fn _mm512_add_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_add(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_add_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i8x64(), b.as_i8x64())) } } /// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -280,9 +320,11 @@ pub unsafe fn _mm512_add_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddb))] -pub unsafe fn _mm512_mask_add_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_add_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, add, src.as_i8x64())) +pub fn _mm512_mask_add_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, src.as_i8x64())) + } } /// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -292,9 +334,11 @@ pub unsafe fn _mm512_mask_add_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddb))] -pub unsafe fn _mm512_maskz_add_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_add_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, add, i8x64::ZERO)) +pub fn _mm512_maskz_add_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, i8x64::ZERO)) + } } /// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -304,9 +348,11 @@ pub unsafe fn _mm512_maskz_add_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddb))] -pub unsafe fn _mm256_mask_add_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_add_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, add, src.as_i8x32())) +pub fn _mm256_mask_add_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, src.as_i8x32())) + } } /// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -316,9 +362,11 @@ pub unsafe fn _mm256_mask_add_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddb))] -pub unsafe fn _mm256_maskz_add_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_add_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, add, i8x32::ZERO)) +pub fn _mm256_maskz_add_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, i8x32::ZERO)) + } } /// Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -328,9 +376,11 @@ pub unsafe fn _mm256_maskz_add_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddb))] -pub unsafe fn _mm_mask_add_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_add_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, add, src.as_i8x16())) +pub fn _mm_mask_add_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, src.as_i8x16())) + } } /// Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -340,9 +390,11 @@ pub unsafe fn _mm_mask_add_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddb))] -pub unsafe fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_add_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, add, i8x16::ZERO)) +pub fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, i8x16::ZERO)) + } } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst. @@ -352,8 +404,8 @@ pub unsafe fn _mm_maskz_add_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] -pub unsafe fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_saturating_add(a.as_u16x32(), b.as_u16x32())) +pub fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_u16x32(), b.as_u16x32())) } } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -363,14 +415,11 @@ pub unsafe fn _mm512_adds_epu16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] -pub unsafe fn _mm512_mask_adds_epu16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let add = _mm512_adds_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, add, src.as_u16x32())) +pub fn _mm512_mask_adds_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, add, src.as_u16x32())) + } } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -380,9 +429,11 @@ pub unsafe fn _mm512_mask_adds_epu16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] -pub unsafe fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_adds_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, add, u16x32::ZERO)) +pub fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, add, u16x32::ZERO)) + } } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -392,14 +443,11 @@ pub unsafe fn _mm512_maskz_adds_epu16(k: __mmask32, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] -pub unsafe fn _mm256_mask_adds_epu16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let add = _mm256_adds_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, add, src.as_u16x16())) +pub fn _mm256_mask_adds_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, add, src.as_u16x16())) + } } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -409,9 +457,11 @@ pub unsafe fn _mm256_mask_adds_epu16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] -pub unsafe fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_adds_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, add, u16x16::ZERO)) +pub fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, add, u16x16::ZERO)) + } } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
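A similar sketch for the saturating unsigned adds just converted (hypothetical wrapper, same crate setup as the first sketch): saturation clamps lanes at `u16::MAX` instead of wrapping, and the `maskz` form zeroes the lanes whose mask bit is clear.

use std::arch::x86_64::*; // same crate setup as the first sketch

#[target_feature(enable = "avx512f,avx512bw")]
fn saturating_add_words(k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
    // 0xFFFF + anything stays 0xFFFF (no wrap-around); lanes whose bit in `k`
    // is clear are zeroed rather than copied from a source vector.
    _mm512_maskz_adds_epu16(k, a, b)
}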
@@ -421,9 +471,11 @@ pub unsafe fn _mm256_maskz_adds_epu16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] -pub unsafe fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_adds_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, add, src.as_u16x8())) +pub fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, add, src.as_u16x8())) + } } /// Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -433,9 +485,11 @@ pub unsafe fn _mm_mask_adds_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusw))] -pub unsafe fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_adds_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, add, u16x8::ZERO)) +pub fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, add, u16x8::ZERO)) + } } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst. @@ -445,8 +499,8 @@ pub unsafe fn _mm_maskz_adds_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] -pub unsafe fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_saturating_add(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_u8x64(), b.as_u8x64())) } } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -456,9 +510,11 @@ pub unsafe fn _mm512_adds_epu8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] -pub unsafe fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_adds_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, add, src.as_u8x64())) +pub fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, add, src.as_u8x64())) + } } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -468,9 +524,11 @@ pub unsafe fn _mm512_mask_adds_epu8(src: __m512i, k: __mmask64, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] -pub unsafe fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_adds_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, add, u8x64::ZERO)) +pub fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, add, u8x64::ZERO)) + } } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -480,9 +538,11 @@ pub unsafe fn _mm512_maskz_adds_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] -pub unsafe fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_adds_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, add, src.as_u8x32())) +pub fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, add, src.as_u8x32())) + } } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -492,9 +552,11 @@ pub unsafe fn _mm256_mask_adds_epu8(src: __m256i, k: __mmask32, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] -pub unsafe fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_adds_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, add, u8x32::ZERO)) +pub fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, add, u8x32::ZERO)) + } } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -504,9 +566,11 @@ pub unsafe fn _mm256_maskz_adds_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] -pub unsafe fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_adds_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, add, src.as_u8x16())) +pub fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, add, src.as_u8x16())) + } } /// Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -516,9 +580,11 @@ pub unsafe fn _mm_mask_adds_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddusb))] -pub unsafe fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_adds_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, add, u8x16::ZERO)) +pub fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, add, u8x16::ZERO)) + } } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst. @@ -528,8 +594,8 @@ pub unsafe fn _mm_maskz_adds_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] -pub unsafe fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_saturating_add(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_i16x32(), b.as_i16x32())) } } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -539,14 +605,11 @@ pub unsafe fn _mm512_adds_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] -pub unsafe fn _mm512_mask_adds_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let add = _mm512_adds_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, add, src.as_i16x32())) +pub fn _mm512_mask_adds_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, src.as_i16x32())) + } } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -556,9 +619,11 @@ pub unsafe fn _mm512_mask_adds_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] -pub unsafe fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_adds_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, add, i16x32::ZERO)) +pub fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, add, i16x32::ZERO)) + } } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
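The signed saturating adds above follow the same shape; the detail worth calling out is that they clamp to the i16 range rather than the u16 range. A hypothetical wrapper, same setup as the first sketch:

use std::arch::x86_64::*; // same crate setup as the first sketch

#[target_feature(enable = "avx512f,avx512bw")]
fn clamped_add_words(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
    // e.g. 30_000 + 10_000 saturates to 32_767 (i16::MAX), and
    // -30_000 + -10_000 saturates to -32_768 (i16::MIN).
    _mm512_mask_adds_epi16(src, k, a, b)
}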
@@ -568,14 +633,11 @@ pub unsafe fn _mm512_maskz_adds_epi16(k: __mmask32, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] -pub unsafe fn _mm256_mask_adds_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let add = _mm256_adds_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, add, src.as_i16x16())) +pub fn _mm256_mask_adds_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, src.as_i16x16())) + } } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -585,9 +647,11 @@ pub unsafe fn _mm256_mask_adds_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] -pub unsafe fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_adds_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, add, i16x16::ZERO)) +pub fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, add, i16x16::ZERO)) + } } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -597,9 +661,11 @@ pub unsafe fn _mm256_maskz_adds_epi16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] -pub unsafe fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_adds_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, add, src.as_i16x8())) +pub fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, src.as_i16x8())) + } } /// Add packed signed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -609,9 +675,11 @@ pub unsafe fn _mm_mask_adds_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsw))] -pub unsafe fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_adds_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, add, i16x8::ZERO)) +pub fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, add, i16x8::ZERO)) + } } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst. 
@@ -621,8 +689,8 @@ pub unsafe fn _mm_maskz_adds_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] -pub unsafe fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_saturating_add(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_add(a.as_i8x64(), b.as_i8x64())) } } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -632,9 +700,11 @@ pub unsafe fn _mm512_adds_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] -pub unsafe fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_adds_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, add, src.as_i8x64())) +pub fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, src.as_i8x64())) + } } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -644,9 +714,11 @@ pub unsafe fn _mm512_mask_adds_epi8(src: __m512i, k: __mmask64, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] -pub unsafe fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_adds_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, add, i8x64::ZERO)) +pub fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_adds_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, add, i8x64::ZERO)) + } } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -656,9 +728,11 @@ pub unsafe fn _mm512_maskz_adds_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] -pub unsafe fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_adds_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, add, src.as_i8x32())) +pub fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, src.as_i8x32())) + } } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -668,9 +742,11 @@ pub unsafe fn _mm256_mask_adds_epi8(src: __m256i, k: __mmask32, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] -pub unsafe fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_adds_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, add, i8x32::ZERO)) +pub fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_adds_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, add, i8x32::ZERO)) + } } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -680,9 +756,11 @@ pub unsafe fn _mm256_maskz_adds_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] -pub unsafe fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_adds_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, add, src.as_i8x16())) +pub fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, src.as_i8x16())) + } } /// Add packed signed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -692,9 +770,11 @@ pub unsafe fn _mm_mask_adds_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddsb))] -pub unsafe fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_adds_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, add, i8x16::ZERO)) +pub fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_adds_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, add, i8x16::ZERO)) + } } /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst. @@ -704,8 +784,8 @@ pub unsafe fn _mm_maskz_adds_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubw))] -pub unsafe fn _mm512_sub_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_sub(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_sub_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i16x32(), b.as_i16x32())) } } /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -715,9 +795,11 @@ pub unsafe fn _mm512_sub_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubw))] -pub unsafe fn _mm512_mask_sub_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_sub_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, sub, src.as_i16x32())) +pub fn _mm512_mask_sub_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, src.as_i16x32())) + } } /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -727,9 +809,11 @@ pub unsafe fn _mm512_mask_sub_epi16(src: __m512i, k: __mmask32, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubw))] -pub unsafe fn _mm512_maskz_sub_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_sub_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) +pub fn _mm512_maskz_sub_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) + } } /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -739,9 +823,11 @@ pub unsafe fn _mm512_maskz_sub_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubw))] -pub unsafe fn _mm256_mask_sub_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_sub_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, sub, src.as_i16x16())) +pub fn _mm256_mask_sub_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, src.as_i16x16())) + } } /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -751,9 +837,11 @@ pub unsafe fn _mm256_mask_sub_epi16(src: __m256i, k: __mmask16, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubw))] -pub unsafe fn _mm256_maskz_sub_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_sub_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) +pub fn _mm256_maskz_sub_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) + } } /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
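The subtraction hunks reuse the same two mask conventions, so one sketch covers both: `mask_` variants fall back to an extra `src` vector, while `maskz_` variants zero the unselected lanes. Hypothetical helper, same setup as the first sketch:

use std::arch::x86_64::*; // same crate setup as the first sketch

#[target_feature(enable = "avx512f,avx512bw")]
fn diff_words(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> (__m512i, __m512i) {
    // Writemask: unselected lanes keep the value from `src`.
    let kept = _mm512_mask_sub_epi16(src, k, a, b);
    // Zeromask: unselected lanes become 0.
    let zeroed = _mm512_maskz_sub_epi16(k, a, b);
    (kept, zeroed)
}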
@@ -763,9 +851,11 @@ pub unsafe fn _mm256_maskz_sub_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubw))] -pub unsafe fn _mm_mask_sub_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_sub_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, sub, src.as_i16x8())) +pub fn _mm_mask_sub_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, src.as_i16x8())) + } } /// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -775,9 +865,11 @@ pub unsafe fn _mm_mask_sub_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubw))] -pub unsafe fn _mm_maskz_sub_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_sub_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) +pub fn _mm_maskz_sub_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) + } } /// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst. @@ -787,8 +879,8 @@ pub unsafe fn _mm_maskz_sub_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubb))] -pub unsafe fn _mm512_sub_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_sub(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_sub_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i8x64(), b.as_i8x64())) } } /// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -798,9 +890,11 @@ pub unsafe fn _mm512_sub_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubb))] -pub unsafe fn _mm512_mask_sub_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_sub_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, sub, src.as_i8x64())) +pub fn _mm512_mask_sub_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, src.as_i8x64())) + } } /// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -810,9 +904,11 @@ pub unsafe fn _mm512_mask_sub_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubb))] -pub unsafe fn _mm512_maskz_sub_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_sub_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) +pub fn _mm512_maskz_sub_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) + } } /// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -822,9 +918,11 @@ pub unsafe fn _mm512_maskz_sub_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubb))] -pub unsafe fn _mm256_mask_sub_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_sub_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, sub, src.as_i8x32())) +pub fn _mm256_mask_sub_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, src.as_i8x32())) + } } /// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -834,9 +932,11 @@ pub unsafe fn _mm256_mask_sub_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubb))] -pub unsafe fn _mm256_maskz_sub_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_sub_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, sub, i8x32::ZERO)) +pub fn _mm256_maskz_sub_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, i8x32::ZERO)) + } } /// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -846,9 +946,11 @@ pub unsafe fn _mm256_maskz_sub_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubb))] -pub unsafe fn _mm_mask_sub_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_sub_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, sub, src.as_i8x16())) +pub fn _mm_mask_sub_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, src.as_i8x16())) + } } /// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -858,9 +960,11 @@ pub unsafe fn _mm_mask_sub_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubb))] -pub unsafe fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_sub_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, sub, i8x16::ZERO)) +pub fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, i8x16::ZERO)) + } } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst. @@ -870,8 +974,8 @@ pub unsafe fn _mm_maskz_sub_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] -pub unsafe fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_saturating_sub(a.as_u16x32(), b.as_u16x32())) +pub fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_u16x32(), b.as_u16x32())) } } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -881,14 +985,11 @@ pub unsafe fn _mm512_subs_epu16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] -pub unsafe fn _mm512_mask_subs_epu16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let sub = _mm512_subs_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, sub, src.as_u16x32())) +pub fn _mm512_mask_subs_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, sub, src.as_u16x32())) + } } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -898,9 +999,11 @@ pub unsafe fn _mm512_mask_subs_epu16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] -pub unsafe fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_subs_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, sub, u16x32::ZERO)) +pub fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, sub, u16x32::ZERO)) + } } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -910,14 +1013,11 @@ pub unsafe fn _mm512_maskz_subs_epu16(k: __mmask32, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] -pub unsafe fn _mm256_mask_subs_epu16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let sub = _mm256_subs_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, sub, src.as_u16x16())) +pub fn _mm256_mask_subs_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, sub, src.as_u16x16())) + } } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -927,9 +1027,11 @@ pub unsafe fn _mm256_mask_subs_epu16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] -pub unsafe fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_subs_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, sub, u16x16::ZERO)) +pub fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, sub, u16x16::ZERO)) + } } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -939,9 +1041,11 @@ pub unsafe fn _mm256_maskz_subs_epu16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] -pub unsafe fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_subs_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, sub, src.as_u16x8())) +pub fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, sub, src.as_u16x8())) + } } /// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -951,9 +1055,11 @@ pub unsafe fn _mm_mask_subs_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusw))] -pub unsafe fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_subs_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, sub, u16x8::ZERO)) +pub fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, sub, u16x8::ZERO)) + } } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst. 
@@ -963,8 +1069,8 @@ pub unsafe fn _mm_maskz_subs_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] -pub unsafe fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_saturating_sub(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_u8x64(), b.as_u8x64())) } } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -974,9 +1080,11 @@ pub unsafe fn _mm512_subs_epu8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] -pub unsafe fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_subs_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, sub, src.as_u8x64())) +pub fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, sub, src.as_u8x64())) + } } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -986,9 +1094,11 @@ pub unsafe fn _mm512_mask_subs_epu8(src: __m512i, k: __mmask64, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] -pub unsafe fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_subs_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, sub, u8x64::ZERO)) +pub fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, sub, u8x64::ZERO)) + } } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -998,9 +1108,11 @@ pub unsafe fn _mm512_maskz_subs_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] -pub unsafe fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_subs_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, sub, src.as_u8x32())) +pub fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, sub, src.as_u8x32())) + } } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
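For the unsigned saturating subtracts above, lanes floor at zero rather than wrapping, which is the usual building block for per-lane absolute differences: each operand survives in exactly one of the two saturating subtractions, so OR-ing them gives |a - b|. Hypothetical helper, same setup as the first sketch:

use std::arch::x86_64::*; // same crate setup as the first sketch

#[target_feature(enable = "avx512f,avx512bw")]
fn absolute_difference_epu8(a: __m512i, b: __m512i) -> __m512i {
    // subs_epu8 floors at 0, so per lane one of the two results is 0 and the
    // other is the magnitude of the difference; OR merges them into |a - b|.
    let ab = _mm512_subs_epu8(a, b);
    let ba = _mm512_subs_epu8(b, a);
    _mm512_or_si512(ab, ba)
}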
@@ -1010,9 +1122,11 @@ pub unsafe fn _mm256_mask_subs_epu8(src: __m256i, k: __mmask32, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] -pub unsafe fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_subs_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, sub, u8x32::ZERO)) +pub fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, sub, u8x32::ZERO)) + } } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1022,9 +1136,11 @@ pub unsafe fn _mm256_maskz_subs_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] -pub unsafe fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_subs_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, sub, src.as_u8x16())) +pub fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, sub, src.as_u8x16())) + } } /// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1034,9 +1150,11 @@ pub unsafe fn _mm_mask_subs_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubusb))] -pub unsafe fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_subs_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, sub, u8x16::ZERO)) +pub fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, sub, u8x16::ZERO)) + } } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst. @@ -1046,8 +1164,8 @@ pub unsafe fn _mm_maskz_subs_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] -pub unsafe fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_saturating_sub(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_i16x32(), b.as_i16x32())) } } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1057,14 +1175,11 @@ pub unsafe fn _mm512_subs_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] -pub unsafe fn _mm512_mask_subs_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let sub = _mm512_subs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, sub, src.as_i16x32())) +pub fn _mm512_mask_subs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, src.as_i16x32())) + } } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1074,9 +1189,11 @@ pub unsafe fn _mm512_mask_subs_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] -pub unsafe fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_subs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) +pub fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, sub, i16x32::ZERO)) + } } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1086,14 +1203,11 @@ pub unsafe fn _mm512_maskz_subs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] -pub unsafe fn _mm256_mask_subs_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let sub = _mm256_subs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, sub, src.as_i16x16())) +pub fn _mm256_mask_subs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, src.as_i16x16())) + } } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1103,9 +1217,11 @@ pub unsafe fn _mm256_mask_subs_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] -pub unsafe fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_subs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) +pub fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, sub, i16x16::ZERO)) + } } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1115,9 +1231,11 @@ pub unsafe fn _mm256_maskz_subs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] -pub unsafe fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_subs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, sub, src.as_i16x8())) +pub fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, src.as_i16x8())) + } } /// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1127,9 +1245,11 @@ pub unsafe fn _mm_mask_subs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsw))] -pub unsafe fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_subs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) +pub fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, sub, i16x8::ZERO)) + } } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst. @@ -1139,8 +1259,8 @@ pub unsafe fn _mm_maskz_subs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] -pub unsafe fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_saturating_sub(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_saturating_sub(a.as_i8x64(), b.as_i8x64())) } } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1150,9 +1270,11 @@ pub unsafe fn _mm512_subs_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] -pub unsafe fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_subs_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, sub, src.as_i8x64())) +pub fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, src.as_i8x64())) + } } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1162,9 +1284,11 @@ pub unsafe fn _mm512_mask_subs_epi8(src: __m512i, k: __mmask64, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] -pub unsafe fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_subs_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) +pub fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_subs_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, sub, i8x64::ZERO)) + } } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1174,9 +1298,11 @@ pub unsafe fn _mm512_maskz_subs_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] -pub unsafe fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_subs_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, sub, src.as_i8x32())) +pub fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, src.as_i8x32())) + } } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1186,9 +1312,11 @@ pub unsafe fn _mm256_mask_subs_epi8(src: __m256i, k: __mmask32, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] -pub unsafe fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_subs_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, sub, i8x32::ZERO)) +pub fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_subs_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, sub, i8x32::ZERO)) + } } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1198,9 +1326,11 @@ pub unsafe fn _mm256_maskz_subs_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] -pub unsafe fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_subs_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, sub, src.as_i8x16())) +pub fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, src.as_i8x16())) + } } /// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1210,9 +1340,11 @@ pub unsafe fn _mm_mask_subs_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubsb))] -pub unsafe fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_subs_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, sub, i8x16::ZERO)) +pub fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_subs_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, sub, i8x16::ZERO)) + } } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst. @@ -1222,11 +1354,13 @@ pub unsafe fn _mm_maskz_subs_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub unsafe fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i { - let a = simd_cast::<_, u32x32>(a.as_u16x32()); - let b = simd_cast::<_, u32x32>(b.as_u16x32()); - let r = simd_shr(simd_mul(a, b), u32x32::splat(16)); - transmute(simd_cast::<u32x32, u16x32>(r)) +pub fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, u32x32>(a.as_u16x32()); + let b = simd_cast::<_, u32x32>(b.as_u16x32()); + let r = simd_shr(simd_mul(a, b), u32x32::splat(16)); + transmute(simd_cast::<u32x32, u16x32>(r)) + } } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1236,14 +1370,11 @@ pub unsafe fn _mm512_mulhi_epu16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub unsafe fn _mm512_mask_mulhi_epu16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, mul, src.as_u16x32())) +pub fn _mm512_mask_mulhi_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, mul, src.as_u16x32())) + } } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
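For reference, the widening-multiply-then-shift sequence above computes the same result as this scalar model of vpmulhuw (illustrative, not part of the diff):

    // High 16 bits of the full 32-bit unsigned product.
    fn mulhi_epu16_model(a: u16, b: u16) -> u16 {
        ((a as u32 * b as u32) >> 16) as u16
    }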
@@ -1253,9 +1384,11 @@ pub unsafe fn _mm512_mask_mulhi_epu16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub unsafe fn _mm512_maskz_mulhi_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, mul, u16x32::ZERO)) +pub fn _mm512_maskz_mulhi_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, mul, u16x32::ZERO)) + } } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1265,14 +1398,11 @@ pub unsafe fn _mm512_maskz_mulhi_epu16(k: __mmask32, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub unsafe fn _mm256_mask_mulhi_epu16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, mul, src.as_u16x16())) +pub fn _mm256_mask_mulhi_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, mul, src.as_u16x16())) + } } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1282,9 +1412,11 @@ pub unsafe fn _mm256_mask_mulhi_epu16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub unsafe fn _mm256_maskz_mulhi_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, mul, u16x16::ZERO)) +pub fn _mm256_maskz_mulhi_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, mul, u16x16::ZERO)) + } } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1294,9 +1426,11 @@ pub unsafe fn _mm256_maskz_mulhi_epu16(k: __mmask16, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub unsafe fn _mm_mask_mulhi_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mulhi_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, mul, src.as_u16x8())) +pub fn _mm_mask_mulhi_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhi_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, mul, src.as_u16x8())) + } } /// Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1306,9 +1440,11 @@ pub unsafe fn _mm_mask_mulhi_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhuw))] -pub unsafe fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mulhi_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, mul, u16x8::ZERO)) +pub fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhi_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, mul, u16x8::ZERO)) + } } /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst. @@ -1318,11 +1454,13 @@ pub unsafe fn _mm_maskz_mulhi_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m1 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhw))] -pub unsafe fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i { - let a = simd_cast::<_, i32x32>(a.as_i16x32()); - let b = simd_cast::<_, i32x32>(b.as_i16x32()); - let r = simd_shr(simd_mul(a, b), i32x32::splat(16)); - transmute(simd_cast::<i32x32, i16x32>(r)) +pub fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, i32x32>(a.as_i16x32()); + let b = simd_cast::<_, i32x32>(b.as_i16x32()); + let r = simd_shr(simd_mul(a, b), i32x32::splat(16)); + transmute(simd_cast::<i32x32, i16x32>(r)) + } } /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
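The signed variant differs only in using sign-extending widening and an arithmetic shift; a scalar model (illustrative):

    // High 16 bits of the full 32-bit signed product.
    fn mulhi_epi16_model(a: i16, b: i16) -> i16 {
        ((a as i32 * b as i32) >> 16) as i16
    }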
@@ -1332,14 +1470,11 @@ pub unsafe fn _mm512_mulhi_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhw))] -pub unsafe fn _mm512_mask_mulhi_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let mul = _mm512_mulhi_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, src.as_i16x32())) +pub fn _mm512_mask_mulhi_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, src.as_i16x32())) + } } /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1349,9 +1484,11 @@ pub unsafe fn _mm512_mask_mulhi_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhw))] -pub unsafe fn _mm512_maskz_mulhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mulhi_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) +pub fn _mm512_maskz_mulhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } } /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1361,14 +1498,11 @@ pub unsafe fn _mm512_maskz_mulhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhw))] -pub unsafe fn _mm256_mask_mulhi_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, src.as_i16x16())) +pub fn _mm256_mask_mulhi_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } } /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1378,9 +1512,11 @@ pub unsafe fn _mm256_mask_mulhi_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhw))] -pub unsafe fn _mm256_maskz_mulhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) +pub fn _mm256_maskz_mulhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) + } } /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1390,9 +1526,11 @@ pub unsafe fn _mm256_maskz_mulhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhw))] -pub unsafe fn _mm_mask_mulhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mulhi_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, src.as_i16x8())) +pub fn _mm_mask_mulhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, src.as_i16x8())) + } } /// Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1402,9 +1540,11 @@ pub unsafe fn _mm_mask_mulhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhw))] -pub unsafe fn _mm_maskz_mulhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mulhi_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) +pub fn _mm_maskz_mulhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst. @@ -1414,8 +1554,8 @@ pub unsafe fn _mm_maskz_mulhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m1 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm512_mulhrs_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmulhrsw(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_mulhrs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmulhrsw(a.as_i16x32(), b.as_i16x32())) } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
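vpmulhrsw rounds before taking the high half; the "truncate to the 18 most significant bits, add 1, keep bits [16:1]" wording in the doc comments is equivalent to this scalar model (illustrative, not part of the diff):

    fn mulhrs_epi16_model(a: i16, b: i16) -> i16 {
        let t = (a as i32 * b as i32) >> 14; // 18 most significant bits of the product
        ((t + 1) >> 1) as i16                // round by adding 1, then keep bits [16:1]
    }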
@@ -1425,14 +1565,11 @@ pub unsafe fn _mm512_mulhrs_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm512_mask_mulhrs_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, src.as_i16x32())) +pub fn _mm512_mask_mulhrs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, src.as_i16x32())) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1442,9 +1579,11 @@ pub unsafe fn _mm512_mask_mulhrs_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm512_maskz_mulhrs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) +pub fn _mm512_maskz_mulhrs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mulhrs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1454,14 +1593,11 @@ pub unsafe fn _mm512_maskz_mulhrs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm256_mask_mulhrs_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, src.as_i16x16())) +pub fn _mm256_mask_mulhrs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1471,9 +1607,11 @@ pub unsafe fn _mm256_mask_mulhrs_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm256_maskz_mulhrs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) +pub fn _mm256_maskz_mulhrs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mulhrs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1483,9 +1621,11 @@ pub unsafe fn _mm256_maskz_mulhrs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm_mask_mulhrs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, src.as_i16x8())) +pub fn _mm_mask_mulhrs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, src.as_i16x8())) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits \[16:1\] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1495,9 +1635,11 @@ pub unsafe fn _mm_mask_mulhrs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulhrsw))] -pub unsafe fn _mm_maskz_mulhrs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) +pub fn _mm_maskz_mulhrs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mulhrs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) + } } /// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst. @@ -1507,8 +1649,8 @@ pub unsafe fn _mm_maskz_mulhrs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm512_mullo_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_mul(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_mullo_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i16x32(), b.as_i16x32())) } } /// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
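The vpmullw family keeps the low half instead, which per lane is simply a wrapping multiply (scalar model, illustrative):

    fn mullo_epi16_model(a: i16, b: i16) -> i16 {
        a.wrapping_mul(b) // low 16 bits of the 32-bit product
    }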
@@ -1518,14 +1660,11 @@ pub unsafe fn _mm512_mullo_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm512_mask_mullo_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let mul = _mm512_mullo_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, src.as_i16x32())) +pub fn _mm512_mask_mullo_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, src.as_i16x32())) + } } /// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1535,9 +1674,11 @@ pub unsafe fn _mm512_mask_mullo_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm512_maskz_mullo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mullo_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) +pub fn _mm512_maskz_mullo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, mul, i16x32::ZERO)) + } } /// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1547,14 +1688,11 @@ pub unsafe fn _mm512_maskz_mullo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm256_mask_mullo_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let mul = _mm256_mullo_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, src.as_i16x16())) +pub fn _mm256_mask_mullo_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, src.as_i16x16())) + } } /// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1564,9 +1702,11 @@ pub unsafe fn _mm256_mask_mullo_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm256_maskz_mullo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mullo_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) +pub fn _mm256_maskz_mullo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, mul, i16x16::ZERO)) + } } /// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1576,9 +1716,11 @@ pub unsafe fn _mm256_maskz_mullo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm_mask_mullo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mullo_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, src.as_i16x8())) +pub fn _mm_mask_mullo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mullo_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, src.as_i16x8())) + } } /// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1588,9 +1730,11 @@ pub unsafe fn _mm_mask_mullo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmullw))] -pub unsafe fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mullo_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) +pub fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mullo_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, mul, i16x8::ZERO)) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst. @@ -1600,10 +1744,12 @@ pub unsafe fn _mm_maskz_mullo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m1 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub unsafe fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_u16x32(); - let b = b.as_u16x32(); - transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b)) +pub fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u16x32(); + let b = b.as_u16x32(); + transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b)) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
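The max/min families below all follow the same compare-then-select shape; a scalar model of the masked unsigned 16-bit maximum (illustrative, not part of the diff):

    fn mask_max_epu16_model(src: u16, keep: bool, a: u16, b: u16) -> u16 {
        let max = if a > b { a } else { b }; // simd_gt + simd_select, per lane
        if keep { max } else { src }         // writemask: fall back to src
    }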
@@ -1613,9 +1759,11 @@ pub unsafe fn _mm512_max_epu16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub unsafe fn _mm512_mask_max_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, max, src.as_u16x32())) +pub fn _mm512_mask_max_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, max, src.as_u16x32())) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1625,9 +1773,11 @@ pub unsafe fn _mm512_mask_max_epu16(src: __m512i, k: __mmask32, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub unsafe fn _mm512_maskz_max_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, max, u16x32::ZERO)) +pub fn _mm512_maskz_max_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, max, u16x32::ZERO)) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1637,9 +1787,11 @@ pub unsafe fn _mm512_maskz_max_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub unsafe fn _mm256_mask_max_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, max, src.as_u16x16())) +pub fn _mm256_mask_max_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, max, src.as_u16x16())) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1649,9 +1801,11 @@ pub unsafe fn _mm256_mask_max_epu16(src: __m256i, k: __mmask16, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub unsafe fn _mm256_maskz_max_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, max, u16x16::ZERO)) +pub fn _mm256_maskz_max_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, max, u16x16::ZERO)) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1661,9 +1815,11 @@ pub unsafe fn _mm256_maskz_max_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub unsafe fn _mm_mask_max_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, max, src.as_u16x8())) +pub fn _mm_mask_max_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, max, src.as_u16x8())) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1673,9 +1829,11 @@ pub unsafe fn _mm_mask_max_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxuw))] -pub unsafe fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, max, u16x8::ZERO)) +pub fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, max, u16x8::ZERO)) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst. @@ -1685,10 +1843,12 @@ pub unsafe fn _mm_maskz_max_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxub))] -pub unsafe fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_u8x64(); - let b = b.as_u8x64(); - transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b)) +pub fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u8x64(); + let b = b.as_u8x64(); + transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b)) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1698,9 +1858,11 @@ pub unsafe fn _mm512_max_epu8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxub))] -pub unsafe fn _mm512_mask_max_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, max, src.as_u8x64())) +pub fn _mm512_mask_max_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, max, src.as_u8x64())) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
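For the zeromask variants, lanes whose mask bit is clear become 0 instead of being taken from src; a small worked model for the unsigned 8-bit maximum (illustrative names and values):

    fn maskz_max_epu8_model(k: u8, a: &[u8], b: &[u8]) -> Vec<u8> {
        a.iter()
            .zip(b)
            .enumerate()
            .map(|(i, (&x, &y))| if (k >> i) & 1 == 1 { x.max(y) } else { 0 })
            .collect()
    }
    // maskz_max_epu8_model(0b0101, &[1, 200, 3, 250], &[9, 100, 7, 255]) == [9, 0, 7, 0]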
@@ -1710,9 +1872,11 @@ pub unsafe fn _mm512_mask_max_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxub))] -pub unsafe fn _mm512_maskz_max_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, max, u8x64::ZERO)) +pub fn _mm512_maskz_max_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, max, u8x64::ZERO)) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1722,9 +1886,11 @@ pub unsafe fn _mm512_maskz_max_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxub))] -pub unsafe fn _mm256_mask_max_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, max, src.as_u8x32())) +pub fn _mm256_mask_max_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, max, src.as_u8x32())) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1734,9 +1900,11 @@ pub unsafe fn _mm256_mask_max_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxub))] -pub unsafe fn _mm256_maskz_max_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, max, u8x32::ZERO)) +pub fn _mm256_maskz_max_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, max, u8x32::ZERO)) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1746,9 +1914,11 @@ pub unsafe fn _mm256_maskz_max_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxub))] -pub unsafe fn _mm_mask_max_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, max, src.as_u8x16())) +pub fn _mm_mask_max_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, max, src.as_u8x16())) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1758,9 +1928,11 @@ pub unsafe fn _mm_mask_max_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxub))] -pub unsafe fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, max, u8x16::ZERO)) +pub fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, max, u8x16::ZERO)) + } } /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst. @@ -1770,10 +1942,12 @@ pub unsafe fn _mm_maskz_max_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub unsafe fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i16x32(); - let b = b.as_i16x32(); - transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b)) +pub fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + transmute(simd_select::<i16x32, _>(simd_gt(a, b), a, b)) + } } /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1783,9 +1957,11 @@ pub unsafe fn _mm512_max_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub unsafe fn _mm512_mask_max_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, max, src.as_i16x32())) +pub fn _mm512_mask_max_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, max, src.as_i16x32())) + } } /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1795,9 +1971,11 @@ pub unsafe fn _mm512_mask_max_epi16(src: __m512i, k: __mmask32, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub unsafe fn _mm512_maskz_max_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, max, i16x32::ZERO)) +pub fn _mm512_maskz_max_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, max, i16x32::ZERO)) + } } /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
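The epu/epi split exists because the same bit pattern orders differently under unsigned and signed comparison; a scalar illustration (not part of the diff):

    fn signedness_matters() {
        let x: u16 = 0x8000; // as i16 this is -32768
        let y: u16 = 0x0001;
        assert_eq!(x.max(y), 0x8000);                 // unsigned max, as in vpmaxuw
        assert_eq!((x as i16).max(y as i16), 0x0001); // signed max, as in vpmaxsw
    }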
@@ -1807,9 +1985,11 @@ pub unsafe fn _mm512_maskz_max_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub unsafe fn _mm256_mask_max_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, max, src.as_i16x16())) +pub fn _mm256_mask_max_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, max, src.as_i16x16())) + } } /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1819,9 +1999,11 @@ pub unsafe fn _mm256_mask_max_epi16(src: __m256i, k: __mmask16, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub unsafe fn _mm256_maskz_max_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, max, i16x16::ZERO)) +pub fn _mm256_maskz_max_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, max, i16x16::ZERO)) + } } /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1831,9 +2013,11 @@ pub unsafe fn _mm256_maskz_max_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub unsafe fn _mm_mask_max_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, max, src.as_i16x8())) +pub fn _mm_mask_max_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, max, src.as_i16x8())) + } } /// Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1843,9 +2027,11 @@ pub unsafe fn _mm_mask_max_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsw))] -pub unsafe fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, max, i16x8::ZERO)) +pub fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, max, i16x8::ZERO)) + } } /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst. 
@@ -1855,10 +2041,12 @@ pub unsafe fn _mm_maskz_max_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub unsafe fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i8x64(); - let b = b.as_i8x64(); - transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b)) +pub fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + transmute(simd_select::<i8x64, _>(simd_gt(a, b), a, b)) + } } /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1868,9 +2056,11 @@ pub unsafe fn _mm512_max_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub unsafe fn _mm512_mask_max_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, max, src.as_i8x64())) +pub fn _mm512_mask_max_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, max, src.as_i8x64())) + } } /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1880,9 +2070,11 @@ pub unsafe fn _mm512_mask_max_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub unsafe fn _mm512_maskz_max_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, max, i8x64::ZERO)) +pub fn _mm512_maskz_max_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, max, i8x64::ZERO)) + } } /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1892,9 +2084,11 @@ pub unsafe fn _mm512_maskz_max_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub unsafe fn _mm256_mask_max_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, max, src.as_i8x32())) +pub fn _mm256_mask_max_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, max, src.as_i8x32())) + } } /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1904,9 +2098,11 @@ pub unsafe fn _mm256_mask_max_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub unsafe fn _mm256_maskz_max_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, max, i8x32::ZERO)) +pub fn _mm256_maskz_max_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, max, i8x32::ZERO)) + } } /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1916,9 +2112,11 @@ pub unsafe fn _mm256_maskz_max_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub unsafe fn _mm_mask_max_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, max, src.as_i8x16())) +pub fn _mm_mask_max_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, max, src.as_i8x16())) + } } /// Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1928,9 +2126,11 @@ pub unsafe fn _mm_mask_max_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsb))] -pub unsafe fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, max, i8x16::ZERO)) +pub fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, max, i8x16::ZERO)) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst. @@ -1940,10 +2140,12 @@ pub unsafe fn _mm_maskz_max_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuw))] -pub unsafe fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_u16x32(); - let b = b.as_u16x32(); - transmute(simd_select::<i16x32, _>(simd_lt(a, b), a, b)) +pub fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u16x32(); + let b = b.as_u16x32(); + transmute(simd_select::<i16x32, _>(simd_lt(a, b), a, b)) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
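The minimum family is the mirror image: the comparison flips from simd_gt to simd_lt, and the select and mask handling are otherwise unchanged (scalar model, illustrative):

    fn mask_min_epu16_model(src: u16, keep: bool, a: u16, b: u16) -> u16 {
        let min = if a < b { a } else { b }; // simd_lt + simd_select, per lane
        if keep { min } else { src }
    }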
@@ -1953,9 +2155,11 @@ pub unsafe fn _mm512_min_epu16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuw))] -pub unsafe fn _mm512_mask_min_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let min = _mm512_min_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, min, src.as_u16x32())) +pub fn _mm512_mask_min_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, min, src.as_u16x32())) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1965,9 +2169,11 @@ pub unsafe fn _mm512_mask_min_epu16(src: __m512i, k: __mmask32, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuw))] -pub unsafe fn _mm512_maskz_min_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let min = _mm512_min_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, min, u16x32::ZERO)) +pub fn _mm512_maskz_min_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, min, u16x32::ZERO)) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1977,9 +2183,11 @@ pub unsafe fn _mm512_maskz_min_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuw))] -pub unsafe fn _mm256_mask_min_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, min, src.as_u16x16())) +pub fn _mm256_mask_min_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, min, src.as_u16x16())) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1989,9 +2197,11 @@ pub unsafe fn _mm256_mask_min_epu16(src: __m256i, k: __mmask16, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuw))] -pub unsafe fn _mm256_maskz_min_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, min, u16x16::ZERO)) +pub fn _mm256_maskz_min_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, min, u16x16::ZERO)) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -2001,9 +2211,11 @@ pub unsafe fn _mm256_maskz_min_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuw))] -pub unsafe fn _mm_mask_min_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, min, src.as_u16x8())) +pub fn _mm_mask_min_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, min, src.as_u16x8())) + } } /// Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2013,9 +2225,11 @@ pub unsafe fn _mm_mask_min_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuw))] -pub unsafe fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, min, u16x8::ZERO)) +pub fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, min, u16x8::ZERO)) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst. @@ -2025,10 +2239,12 @@ pub unsafe fn _mm_maskz_min_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminub))] -pub unsafe fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_u8x64(); - let b = b.as_u8x64(); - transmute(simd_select::<i8x64, _>(simd_lt(a, b), a, b)) +pub fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u8x64(); + let b = b.as_u8x64(); + transmute(simd_select::<i8x64, _>(simd_lt(a, b), a, b)) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2038,9 +2254,11 @@ pub unsafe fn _mm512_min_epu8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminub))] -pub unsafe fn _mm512_mask_min_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let min = _mm512_min_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, min, src.as_u8x64())) +pub fn _mm512_mask_min_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, min, src.as_u8x64())) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2050,9 +2268,11 @@ pub unsafe fn _mm512_mask_min_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminub))] -pub unsafe fn _mm512_maskz_min_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let min = _mm512_min_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, min, u8x64::ZERO)) +pub fn _mm512_maskz_min_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, min, u8x64::ZERO)) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2062,9 +2282,11 @@ pub unsafe fn _mm512_maskz_min_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminub))] -pub unsafe fn _mm256_mask_min_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, min, src.as_u8x32())) +pub fn _mm256_mask_min_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, min, src.as_u8x32())) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2074,9 +2296,11 @@ pub unsafe fn _mm256_mask_min_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminub))] -pub unsafe fn _mm256_maskz_min_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, min, u8x32::ZERO)) +pub fn _mm256_maskz_min_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, min, u8x32::ZERO)) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2086,9 +2310,11 @@ pub unsafe fn _mm256_maskz_min_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminub))] -pub unsafe fn _mm_mask_min_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, min, src.as_u8x16())) +pub fn _mm_mask_min_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, min, src.as_u8x16())) + } } /// Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2098,9 +2324,11 @@ pub unsafe fn _mm_mask_min_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminub))] -pub unsafe fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, min, u8x16::ZERO)) +pub fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, min, u8x16::ZERO)) + } } /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst. @@ -2110,10 +2338,12 @@ pub unsafe fn _mm_maskz_min_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsw))] -pub unsafe fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i16x32(); - let b = b.as_i16x32(); - transmute(simd_select::(simd_lt(a, b), a, b)) +pub fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } } /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2123,9 +2353,11 @@ pub unsafe fn _mm512_min_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsw))] -pub unsafe fn _mm512_mask_min_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let min = _mm512_min_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, min, src.as_i16x32())) +pub fn _mm512_mask_min_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, min, src.as_i16x32())) + } } /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2135,9 +2367,11 @@ pub unsafe fn _mm512_mask_min_epi16(src: __m512i, k: __mmask32, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsw))] -pub unsafe fn _mm512_maskz_min_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let min = _mm512_min_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, min, i16x32::ZERO)) +pub fn _mm512_maskz_min_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, min, i16x32::ZERO)) + } } /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
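Not part of the patch — a sketch, under the same assumptions as the first example above, of why the `epu`/`epi` pairs exist: they read the same bit patterns with different signedness. `_mm512_set1_epi16` is taken from the existing avx512f API and assumed to be safe under this series.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512bw")]
fn signedness_of_min() -> (__m512i, __m512i) {
    let a = _mm512_set1_epi16(-1); // 0xFFFF in every lane: 65535 as u16, -1 as i16
    let b = _mm512_set1_epi16(3);
    // Unsigned view: min(65535, 3) = 3 in every lane.
    // Signed view:   min(-1, 3)   = -1 in every lane.
    (_mm512_min_epu16(a, b), _mm512_min_epi16(a, b))
}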
@@ -2147,9 +2381,11 @@ pub unsafe fn _mm512_maskz_min_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsw))] -pub unsafe fn _mm256_mask_min_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, min, src.as_i16x16())) +pub fn _mm256_mask_min_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, min, src.as_i16x16())) + } } /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2159,9 +2395,11 @@ pub unsafe fn _mm256_mask_min_epi16(src: __m256i, k: __mmask16, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsw))] -pub unsafe fn _mm256_maskz_min_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, min, i16x16::ZERO)) +pub fn _mm256_maskz_min_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, min, i16x16::ZERO)) + } } /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2171,9 +2409,11 @@ pub unsafe fn _mm256_maskz_min_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsw))] -pub unsafe fn _mm_mask_min_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, min, src.as_i16x8())) +pub fn _mm_mask_min_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, min, src.as_i16x8())) + } } /// Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2183,9 +2423,11 @@ pub unsafe fn _mm_mask_min_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsw))] -pub unsafe fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, min, i16x8::ZERO)) +pub fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, min, i16x8::ZERO)) + } } /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst. 
@@ -2195,10 +2437,12 @@ pub unsafe fn _mm_maskz_min_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsb))] -pub unsafe fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i8x64(); - let b = b.as_i8x64(); - transmute(simd_select::(simd_lt(a, b), a, b)) +pub fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } } /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2208,9 +2452,11 @@ pub unsafe fn _mm512_min_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsb))] -pub unsafe fn _mm512_mask_min_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let min = _mm512_min_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, min, src.as_i8x64())) +pub fn _mm512_mask_min_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, min, src.as_i8x64())) + } } /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2220,9 +2466,11 @@ pub unsafe fn _mm512_mask_min_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsb))] -pub unsafe fn _mm512_maskz_min_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let min = _mm512_min_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, min, i8x64::ZERO)) +pub fn _mm512_maskz_min_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let min = _mm512_min_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, min, i8x64::ZERO)) + } } /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2232,9 +2480,11 @@ pub unsafe fn _mm512_maskz_min_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsb))] -pub unsafe fn _mm256_mask_min_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, min, src.as_i8x32())) +pub fn _mm256_mask_min_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, min, src.as_i8x32())) + } } /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2244,9 +2494,11 @@ pub unsafe fn _mm256_mask_min_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsb))] -pub unsafe fn _mm256_maskz_min_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, min, i8x32::ZERO)) +pub fn _mm256_maskz_min_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, min, i8x32::ZERO)) + } } /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2256,9 +2508,11 @@ pub unsafe fn _mm256_maskz_min_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsb))] -pub unsafe fn _mm_mask_min_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, min, src.as_i8x16())) +pub fn _mm_mask_min_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, min, src.as_i8x16())) + } } /// Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2268,9 +2522,11 @@ pub unsafe fn _mm_mask_min_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminsb))] -pub unsafe fn _mm_maskz_min_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, min, i8x16::ZERO)) +pub fn _mm_maskz_min_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, min, i8x16::ZERO)) + } } /// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k. @@ -2280,8 +2536,8 @@ pub unsafe fn _mm_maskz_min_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_lt(a.as_u16x32(), b.as_u16x32())) +pub fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_lt(a.as_u16x32(), b.as_u16x32())) } } /// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
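Not part of the patch — a sketch, under the same assumptions as above, of the compare-to-mask intrinsics that follow: each packs one bit per lane into an ordinary integer mask, so the result can be post-processed with plain integer code.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn lanes_below(a: __m512i, b: __m512i) -> u32 {
    // One bit per u16 lane (32 lanes); __mmask32 is just a u32.
    let k: __mmask32 = _mm512_cmplt_epu16_mask(a, b);
    k.count_ones()
}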
@@ -2291,7 +2547,7 @@ pub unsafe fn _mm512_cmplt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 {
 #[target_feature(enable = "avx512bw")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
+pub fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 {
     _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
@@ -2302,8 +2558,8 @@ pub unsafe fn _mm512_mask_cmplt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
-    simd_bitmask::<u16x16, _>(simd_lt(a.as_u16x16(), b.as_u16x16()))
+pub fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
+    unsafe { simd_bitmask::<u16x16, _>(simd_lt(a.as_u16x16(), b.as_u16x16())) }
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -2313,7 +2569,7 @@ pub unsafe fn _mm256_cmplt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
+pub fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 {
     _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b)
 }
 
@@ -2324,8 +2580,8 @@ pub unsafe fn _mm256_mask_cmplt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
-    simd_bitmask::<u16x8, _>(simd_lt(a.as_u16x8(), b.as_u16x8()))
+pub fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u16x8, _>(simd_lt(a.as_u16x8(), b.as_u16x8())) }
 }
 
 /// Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -2335,7 +2591,7 @@ pub unsafe fn _mm_cmplt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2346,8 +2602,8 @@ pub unsafe fn _mm_mask_cmplt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_lt(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_lt(a.as_u8x64(), b.as_u8x64())) } } /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2357,7 +2613,7 @@ pub unsafe fn _mm512_cmplt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2368,8 +2624,8 @@ pub unsafe fn _mm512_mask_cmplt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_lt(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_lt(a.as_u8x32(), b.as_u8x32())) } } /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2379,7 +2635,7 @@ pub unsafe fn _mm256_cmplt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2390,8 +2646,8 @@ pub unsafe fn _mm256_mask_cmplt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_lt(a.as_u8x16(), b.as_u8x16())) +pub fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_u8x16(), b.as_u8x16())) } } /// Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2401,7 +2657,7 @@ pub unsafe fn _mm_cmplt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2412,8 +2668,8 @@ pub unsafe fn _mm_mask_cmplt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_lt(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_lt(a.as_i16x32(), b.as_i16x32())) } } /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2423,7 +2679,7 @@ pub unsafe fn _mm512_cmplt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2434,8 +2690,8 @@ pub unsafe fn _mm512_mask_cmplt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_lt(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_i16x16(), b.as_i16x16())) } } /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2445,7 +2701,7 @@ pub unsafe fn _mm256_cmplt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2456,8 +2712,8 @@ pub unsafe fn _mm256_mask_cmplt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_lt(a.as_i16x8(), b.as_i16x8())) +pub fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_i16x8(), b.as_i16x8())) } } /// Compare packed signed 16-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
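Not part of the patch — a sketch, under the same assumptions as above, of what the `k1` parameter of the `_mask_` compare forms does: it is purely an AND-filter on the result, equivalent to masking the plain compare afterwards. The helper name is hypothetical.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn gated_compare(k1: __mmask32, a: __m512i, b: __m512i) -> (__mmask32, __mmask32) {
    let gated = _mm512_mask_cmplt_epi16_mask(k1, a, b);
    let manual = k1 & _mm512_cmplt_epi16_mask(a, b);
    (gated, manual) // the two masks are expected to be identical
}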
@@ -2467,7 +2723,7 @@ pub unsafe fn _mm_cmplt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2478,8 +2734,8 @@ pub unsafe fn _mm_mask_cmplt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_lt(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_lt(a.as_i8x64(), b.as_i8x64())) } } /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2489,7 +2745,7 @@ pub unsafe fn _mm512_cmplt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2500,8 +2756,8 @@ pub unsafe fn _mm512_mask_cmplt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_lt(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_lt(a.as_i8x32(), b.as_i8x32())) } } /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2511,7 +2767,7 @@ pub unsafe fn _mm256_cmplt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2522,8 +2778,8 @@ pub unsafe fn _mm256_mask_cmplt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_lt(a.as_i8x16(), b.as_i8x16())) +pub fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_i8x16(), b.as_i8x16())) } } /// Compare packed signed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2533,7 +2789,7 @@ pub unsafe fn _mm_cmplt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -2544,8 +2800,8 @@ pub unsafe fn _mm_mask_cmplt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_gt(a.as_u16x32(), b.as_u16x32())) +pub fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_gt(a.as_u16x32(), b.as_u16x32())) } } /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2555,7 +2811,7 @@ pub unsafe fn _mm512_cmpgt_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2566,8 +2822,8 @@ pub unsafe fn _mm512_mask_cmpgt_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_gt(a.as_u16x16(), b.as_u16x16())) +pub fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_u16x16(), b.as_u16x16())) } } /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2577,7 +2833,7 @@ pub unsafe fn _mm256_cmpgt_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2588,8 +2844,8 @@ pub unsafe fn _mm256_mask_cmpgt_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_gt(a.as_u16x8(), b.as_u16x8())) +pub fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_u16x8(), b.as_u16x8())) } } /// Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
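Not part of the patch — a sketch, under the same assumptions as above, of the relationship the masked greater-than forms rely on: greater-than is "not less-or-equal", so the dedicated intrinsic and the generic compare with `_MM_CMPINT_NLE` agree. `_mm512_cmp_epu16_mask` is taken from the existing avx512bw API and assumed to be safe under this series.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512bw")]
fn gt_is_nle(a: __m512i, b: __m512i) -> (__mmask32, __mmask32) {
    let direct = _mm512_cmpgt_epu16_mask(a, b);
    let generic = _mm512_cmp_epu16_mask::<_MM_CMPINT_NLE>(a, b);
    (direct, generic) // expected to be equal for every input
}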
@@ -2599,7 +2855,7 @@ pub unsafe fn _mm_cmpgt_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2610,8 +2866,8 @@ pub unsafe fn _mm_mask_cmpgt_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_gt(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_gt(a.as_u8x64(), b.as_u8x64())) } } /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2621,7 +2877,7 @@ pub unsafe fn _mm512_cmpgt_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2632,8 +2888,8 @@ pub unsafe fn _mm512_mask_cmpgt_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_gt(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_gt(a.as_u8x32(), b.as_u8x32())) } } /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2643,7 +2899,7 @@ pub unsafe fn _mm256_cmpgt_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2654,8 +2910,8 @@ pub unsafe fn _mm256_mask_cmpgt_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_gt(a.as_u8x16(), b.as_u8x16())) +pub fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_u8x16(), b.as_u8x16())) } } /// Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2665,7 +2921,7 @@ pub unsafe fn _mm_cmpgt_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2676,8 +2932,8 @@ pub unsafe fn _mm_mask_cmpgt_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_gt(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_gt(a.as_i16x32(), b.as_i16x32())) } } /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2687,7 +2943,7 @@ pub unsafe fn _mm512_cmpgt_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2698,8 +2954,8 @@ pub unsafe fn _mm512_mask_cmpgt_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_gt(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_i16x16(), b.as_i16x16())) } } /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2709,7 +2965,7 @@ pub unsafe fn _mm256_cmpgt_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2720,8 +2976,8 @@ pub unsafe fn _mm256_mask_cmpgt_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_gt(a.as_i16x8(), b.as_i16x8())) +pub fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_i16x8(), b.as_i16x8())) } } /// Compare packed signed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2731,7 +2987,7 @@ pub unsafe fn _mm_cmpgt_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2742,8 +2998,8 @@ pub unsafe fn _mm_mask_cmpgt_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_gt(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_gt(a.as_i8x64(), b.as_i8x64())) } } /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2753,7 +3009,7 @@ pub unsafe fn _mm512_cmpgt_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2764,8 +3020,8 @@ pub unsafe fn _mm512_mask_cmpgt_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_gt(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_gt(a.as_i8x32(), b.as_i8x32())) } } /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
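Not part of the patch — a sketch, under the same assumptions as above, showing that the `epu8`/`epi8` greater-than pair reads the same bytes with different signedness, just like the min pair earlier. `_mm512_set1_epi8` is taken from the existing avx512f API and assumed to be safe under this series.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512bw")]
fn signedness_of_gt() -> (__mmask64, __mmask64) {
    let a = _mm512_set1_epi8(i8::MIN); // 0x80 in every byte
    let b = _mm512_set1_epi8(1);
    // Unsigned: 128 > 1 in every lane (all mask bits set).
    // Signed:   -128 > 1 in no lane (mask is zero).
    (_mm512_cmpgt_epu8_mask(a, b), _mm512_cmpgt_epi8_mask(a, b))
}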
@@ -2775,7 +3031,7 @@ pub unsafe fn _mm256_cmpgt_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2786,8 +3042,8 @@ pub unsafe fn _mm256_mask_cmpgt_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_gt(a.as_i8x16(), b.as_i8x16())) +pub fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_i8x16(), b.as_i8x16())) } } /// Compare packed signed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2797,7 +3053,7 @@ pub unsafe fn _mm_cmpgt_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -2808,8 +3064,8 @@ pub unsafe fn _mm_mask_cmpgt_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_le(a.as_u16x32(), b.as_u16x32())) +pub fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_le(a.as_u16x32(), b.as_u16x32())) } } /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2819,7 +3075,7 @@ pub unsafe fn _mm512_cmple_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -2830,8 +3086,8 @@ pub unsafe fn _mm512_mask_cmple_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_le(a.as_u16x16(), b.as_u16x16())) +pub fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_u16x16(), b.as_u16x16())) } } /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2841,7 +3097,7 @@ pub unsafe fn _mm256_cmple_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -2852,8 +3108,8 @@ pub unsafe fn _mm256_mask_cmple_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_le(a.as_u16x8(), b.as_u16x8())) +pub fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_u16x8(), b.as_u16x8())) } } /// Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2863,7 +3119,7 @@ pub unsafe fn _mm_cmple_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu16_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -2874,8 +3130,8 @@ pub unsafe fn _mm_mask_cmple_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_le(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_le(a.as_u8x64(), b.as_u8x64())) } } /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2885,7 +3141,7 @@ pub unsafe fn _mm512_cmple_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -2896,8 +3152,8 @@ pub unsafe fn _mm512_mask_cmple_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_le(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_le(a.as_u8x32(), b.as_u8x32())) } } /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
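Not part of the patch — a sketch, under the same assumptions as above, of the duality between less-than-or-equal and greater-than on integer lanes: exactly one of the two holds per lane, so the two masks differ in every bit.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn le_complements_gt(a: __m512i, b: __m512i) -> bool {
    let le = _mm512_cmple_epu16_mask(a, b);
    let gt = _mm512_cmpgt_epu16_mask(a, b);
    // XOR equals all-ones iff the masks disagree in every lane.
    (le ^ gt) == u32::MAX
}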
@@ -2907,7 +3163,7 @@ pub unsafe fn _mm256_cmple_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -2918,8 +3174,8 @@ pub unsafe fn _mm256_mask_cmple_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_le(a.as_u8x16(), b.as_u8x16())) +pub fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_u8x16(), b.as_u8x16())) } } /// Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2929,7 +3185,7 @@ pub unsafe fn _mm_cmple_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epu8_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -2940,8 +3196,8 @@ pub unsafe fn _mm_mask_cmple_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_le(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_le(a.as_i16x32(), b.as_i16x32())) } } /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2951,7 +3207,7 @@ pub unsafe fn _mm512_cmple_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -2962,8 +3218,8 @@ pub unsafe fn _mm512_mask_cmple_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_le(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_i16x16(), b.as_i16x16())) } } /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -2973,7 +3229,7 @@ pub unsafe fn _mm256_cmple_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -2984,8 +3240,8 @@ pub unsafe fn _mm256_mask_cmple_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_le(a.as_i16x8(), b.as_i16x8())) +pub fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_i16x8(), b.as_i16x8())) } } /// Compare packed signed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2995,7 +3251,7 @@ pub unsafe fn _mm_cmple_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi16_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -3006,8 +3262,8 @@ pub unsafe fn _mm_mask_cmple_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_le(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_le(a.as_i8x64(), b.as_i8x64())) } } /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3017,7 +3273,7 @@ pub unsafe fn _mm512_cmple_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -3028,8 +3284,8 @@ pub unsafe fn _mm512_mask_cmple_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_le(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_le(a.as_i8x32(), b.as_i8x32())) } } /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3039,7 +3295,7 @@ pub unsafe fn _mm256_cmple_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -3050,8 +3306,8 @@ pub unsafe fn _mm256_mask_cmple_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_le(a.as_i8x16(), b.as_i8x16())) +pub fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_i8x16(), b.as_i8x16())) } } /// Compare packed signed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3061,7 +3317,7 @@ pub unsafe fn _mm_cmple_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epi8_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -3072,8 +3328,8 @@ pub unsafe fn _mm_mask_cmple_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_ge(a.as_u16x32(), b.as_u16x32())) +pub fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ge(a.as_u16x32(), b.as_u16x32())) } } /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3083,7 +3339,7 @@ pub unsafe fn _mm512_cmpge_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3094,8 +3350,8 @@ pub unsafe fn _mm512_mask_cmpge_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_ge(a.as_u16x16(), b.as_u16x16())) +pub fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_u16x16(), b.as_u16x16())) } } /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3105,7 +3361,7 @@ pub unsafe fn _mm256_cmpge_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3116,8 +3372,8 @@ pub unsafe fn _mm256_mask_cmpge_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_ge(a.as_u16x8(), b.as_u16x8())) +pub fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_u16x8(), b.as_u16x8())) } } /// Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3127,7 +3383,7 @@ pub unsafe fn _mm_cmpge_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3138,8 +3394,8 @@ pub unsafe fn _mm_mask_cmpge_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_ge(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_ge(a.as_u8x64(), b.as_u8x64())) } } /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3149,7 +3405,7 @@ pub unsafe fn _mm512_cmpge_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3160,8 +3416,8 @@ pub unsafe fn _mm512_mask_cmpge_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_ge(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ge(a.as_u8x32(), b.as_u8x32())) } } /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3171,7 +3427,7 @@ pub unsafe fn _mm256_cmpge_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3182,8 +3438,8 @@ pub unsafe fn _mm256_mask_cmpge_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_ge(a.as_u8x16(), b.as_u8x16())) +pub fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_u8x16(), b.as_u8x16())) } } /// Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3193,7 +3449,7 @@ pub unsafe fn _mm_cmpge_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3204,8 +3460,8 @@ pub unsafe fn _mm_mask_cmpge_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_ge(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ge(a.as_i16x32(), b.as_i16x32())) } } /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3215,7 +3471,7 @@ pub unsafe fn _mm512_cmpge_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3226,8 +3482,8 @@ pub unsafe fn _mm512_mask_cmpge_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_ge(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_i16x16(), b.as_i16x16())) } } /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3237,7 +3493,7 @@ pub unsafe fn _mm256_cmpge_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3248,8 +3504,8 @@ pub unsafe fn _mm256_mask_cmpge_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_ge(a.as_i16x8(), b.as_i16x8())) +pub fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_i16x8(), b.as_i16x8())) } } /// Compare packed signed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3259,7 +3515,7 @@ pub unsafe fn _mm_cmpge_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3270,8 +3526,8 @@ pub unsafe fn _mm_mask_cmpge_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_ge(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_ge(a.as_i8x64(), b.as_i8x64())) } } /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3281,7 +3537,7 @@ pub unsafe fn _mm512_cmpge_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3292,8 +3548,8 @@ pub unsafe fn _mm512_mask_cmpge_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_ge(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ge(a.as_i8x32(), b.as_i8x32())) } } /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3303,7 +3559,7 @@ pub unsafe fn _mm256_cmpge_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3314,8 +3570,8 @@ pub unsafe fn _mm256_mask_cmpge_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_ge(a.as_i8x16(), b.as_i8x16())) +pub fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_i8x16(), b.as_i8x16())) } } /// Compare packed signed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3325,7 +3581,7 @@ pub unsafe fn _mm_cmpge_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -3336,8 +3592,8 @@ pub unsafe fn _mm_mask_cmpge_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_eq(a.as_u16x32(), b.as_u16x32())) +pub fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_eq(a.as_u16x32(), b.as_u16x32())) } } /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3347,7 +3603,7 @@ pub unsafe fn _mm512_cmpeq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3358,8 +3614,8 @@ pub unsafe fn _mm512_mask_cmpeq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_eq(a.as_u16x16(), b.as_u16x16())) +pub fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_u16x16(), b.as_u16x16())) } } /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3369,7 +3625,7 @@ pub unsafe fn _mm256_cmpeq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3380,8 +3636,8 @@ pub unsafe fn _mm256_mask_cmpeq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_eq(a.as_u16x8(), b.as_u16x8())) +pub fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_u16x8(), b.as_u16x8())) } } /// Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3391,7 +3647,7 @@ pub unsafe fn _mm_cmpeq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu16_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3402,8 +3658,8 @@ pub unsafe fn _mm_mask_cmpeq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_eq(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_eq(a.as_u8x64(), b.as_u8x64())) } } /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3413,7 +3669,7 @@ pub unsafe fn _mm512_cmpeq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3424,8 +3680,8 @@ pub unsafe fn _mm512_mask_cmpeq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_eq(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_eq(a.as_u8x32(), b.as_u8x32())) } } /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3435,7 +3691,7 @@ pub unsafe fn _mm256_cmpeq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3446,8 +3702,8 @@ pub unsafe fn _mm256_mask_cmpeq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_eq(a.as_u8x16(), b.as_u8x16())) +pub fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_u8x16(), b.as_u8x16())) } } /// Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3457,7 +3713,7 @@ pub unsafe fn _mm_cmpeq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epu8_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3468,8 +3724,8 @@ pub unsafe fn _mm_mask_cmpeq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_eq(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_eq(a.as_i16x32(), b.as_i16x32())) } } /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3479,7 +3735,7 @@ pub unsafe fn _mm512_cmpeq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3490,8 +3746,8 @@ pub unsafe fn _mm512_mask_cmpeq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_eq(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_i16x16(), b.as_i16x16())) } } /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3501,7 +3757,7 @@ pub unsafe fn _mm256_cmpeq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3512,8 +3768,8 @@ pub unsafe fn _mm256_mask_cmpeq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_eq(a.as_i16x8(), b.as_i16x8())) +pub fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_i16x8(), b.as_i16x8())) } } /// Compare packed signed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3523,7 +3779,7 @@ pub unsafe fn _mm_cmpeq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi16_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3534,8 +3790,8 @@ pub unsafe fn _mm_mask_cmpeq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_eq(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_eq(a.as_i8x64(), b.as_i8x64())) } } /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3545,7 +3801,7 @@ pub unsafe fn _mm512_cmpeq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3556,8 +3812,8 @@ pub unsafe fn _mm512_mask_cmpeq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_eq(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_eq(a.as_i8x32(), b.as_i8x32())) } } /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3567,7 +3823,7 @@ pub unsafe fn _mm256_cmpeq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3578,8 +3834,8 @@ pub unsafe fn _mm256_mask_cmpeq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_eq(a.as_i8x16(), b.as_i8x16())) +pub fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_i8x16(), b.as_i8x16())) } } /// Compare packed signed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3589,7 +3845,7 @@ pub unsafe fn _mm_cmpeq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epi8_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -3600,8 +3856,8 @@ pub unsafe fn _mm_mask_cmpeq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_ne(a.as_u16x32(), b.as_u16x32())) +pub fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ne(a.as_u16x32(), b.as_u16x32())) } } /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3611,7 +3867,7 @@ pub unsafe fn _mm512_cmpneq_epu16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3622,8 +3878,8 @@ pub unsafe fn _mm512_mask_cmpneq_epu16_mask(k1: __mmask32, a: __m512i, b: __m512 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_ne(a.as_u16x16(), b.as_u16x16())) +pub fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_u16x16(), b.as_u16x16())) } } /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3633,7 +3889,7 @@ pub unsafe fn _mm256_cmpneq_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3644,8 +3900,8 @@ pub unsafe fn _mm256_mask_cmpneq_epu16_mask(k1: __mmask16, a: __m256i, b: __m256 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_ne(a.as_u16x8(), b.as_u16x8())) +pub fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_u16x8(), b.as_u16x8())) } } /// Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3655,7 +3911,7 @@ pub unsafe fn _mm_cmpneq_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu16_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3666,8 +3922,8 @@ pub unsafe fn _mm_mask_cmpneq_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) - #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_ne(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_ne(a.as_u8x64(), b.as_u8x64())) } } /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3677,7 +3933,7 @@ pub unsafe fn _mm512_cmpneq_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3688,8 +3944,8 @@ pub unsafe fn _mm512_mask_cmpneq_epu8_mask(k1: __mmask64, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_ne(a.as_u8x32(), b.as_u8x32())) +pub fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ne(a.as_u8x32(), b.as_u8x32())) } } /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3699,7 +3955,7 @@ pub unsafe fn _mm256_cmpneq_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { _mm256_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3710,8 +3966,8 @@ pub unsafe fn _mm256_mask_cmpneq_epu8_mask(k1: __mmask32, a: __m256i, b: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - simd_bitmask::(simd_ne(a.as_u8x16(), b.as_u8x16())) +pub fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_u8x16(), b.as_u8x16())) } } /// Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3721,7 +3977,7 @@ pub unsafe fn _mm_cmpneq_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { _mm_mask_cmp_epu8_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3732,8 +3988,8 @@ pub unsafe fn _mm_mask_cmpneq_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) - #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - simd_bitmask::(simd_ne(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ne(a.as_i16x32(), b.as_i16x32())) } } /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3743,7 +3999,7 @@ pub unsafe fn _mm512_cmpneq_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { _mm512_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3754,8 +4010,8 @@ pub unsafe fn _mm512_mask_cmpneq_epi16_mask(k1: __mmask32, a: __m512i, b: __m512 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - simd_bitmask::(simd_ne(a.as_i16x16(), b.as_i16x16())) +pub fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_i16x16(), b.as_i16x16())) } } /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3765,7 +4021,7 @@ pub unsafe fn _mm256_cmpneq_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { _mm256_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3776,8 +4032,8 @@ pub unsafe fn _mm256_mask_cmpneq_epi16_mask(k1: __mmask16, a: __m256i, b: __m256 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_ne(a.as_i16x8(), b.as_i16x8())) +pub fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_i16x8(), b.as_i16x8())) } } /// Compare packed signed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3787,7 +4043,7 @@ pub unsafe fn _mm_cmpneq_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi16_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3798,8 +4054,8 @@ pub unsafe fn _mm_mask_cmpneq_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) - #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - simd_bitmask::(simd_ne(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { simd_bitmask::(simd_ne(a.as_i8x64(), b.as_i8x64())) } } /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -3809,7 +4065,7 @@ pub unsafe fn _mm512_cmpneq_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { _mm512_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -3820,8 +4076,8 @@ pub unsafe fn _mm512_mask_cmpneq_epi8_mask(k1: __mmask64, a: __m512i, b: __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] -pub unsafe fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - simd_bitmask::(simd_ne(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { simd_bitmask::(simd_ne(a.as_i8x32(), b.as_i8x32())) } } /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3831,7 +4087,7 @@ pub unsafe fn _mm256_cmpneq_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
+pub fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i) -> __mmask32 {
     _mm256_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b)
 }

@@ -3842,8 +4098,8 @@ pub unsafe fn _mm256_mask_cmpneq_epi8_mask(k1: __mmask32, a: __m256i, b: __m256i)
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
-    simd_bitmask::<i8x16, _>(simd_ne(a.as_i8x16(), b.as_i8x16()))
+pub fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
+    unsafe { simd_bitmask::<i8x16, _>(simd_ne(a.as_i8x16(), b.as_i8x16())) }
 }

 /// Compare packed signed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -3853,7 +4109,7 @@ pub unsafe fn _mm_cmpneq_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))]
-pub unsafe fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
+pub fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 {
     _mm_mask_cmp_epi8_mask::<_MM_CMPINT_NE>(k1, a, b)
 }

@@ -3865,21 +4121,23 @@ pub unsafe fn _mm_mask_cmpneq_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[rustc_legacy_const_generics(2)]
 #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))]
-pub unsafe fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask32 {
-    static_assert_uimm_bits!(IMM8, 3);
-    let a = a.as_u16x32();
-    let b = b.as_u16x32();
-    let r = match IMM8 {
-        0 => simd_eq(a, b),
-        1 => simd_lt(a, b),
-        2 => simd_le(a, b),
-        3 => i16x32::ZERO,
-        4 => simd_ne(a, b),
-        5 => simd_ge(a, b),
-        6 => simd_gt(a, b),
-        _ => i16x32::splat(-1),
-    };
-    simd_bitmask(r)
+pub fn _mm512_cmp_epu16_mask<const IMM8: i32>(a: __m512i, b: __m512i) -> __mmask32 {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 3);
+        let a = a.as_u16x32();
+        let b = b.as_u16x32();
+        let r = match IMM8 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i16x32::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i16x32::splat(-1),
+        };
+        simd_bitmask(r)
+    }
 }

 /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
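[Illustrative sketch, not part of the patch itself: with the intrinsic above declared safe, a caller that itself enables the required target features can invoke it without an `unsafe` block. The helper name `le_mask_u16` is hypothetical, and on the toolchain this patch targets the intrinsic is still gated behind the unstable `stdarch_x86_avx512` feature.]

    use core::arch::x86_64::*;

    // Hypothetical caller: it enables avx512bw itself, so the now-safe intrinsic
    // can be called without an `unsafe` block. IMM8 selects the predicate
    // (0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = NLT, 6 = NLE, 7 = TRUE).
    #[target_feature(enable = "avx512bw")]
    fn le_mask_u16(a: __m512i, b: __m512i) -> __mmask32 {
        _mm512_cmp_epu16_mask::<_MM_CMPINT_LE>(a, b)
    }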
@@ -3890,26 +4148,28 @@ pub unsafe fn _mm512_cmp_epu16_mask(a: __m512i, b: __m512i) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm512_mask_cmp_epu16_mask( +pub fn _mm512_mask_cmp_epu16_mask( k1: __mmask32, a: __m512i, b: __m512i, ) -> __mmask32 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u16x32(); - let b = b.as_u16x32(); - let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i16x32::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x32(); + let b = b.as_u16x32(); + let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x32::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3920,21 +4180,23 @@ pub unsafe fn _mm512_mask_cmp_epu16_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u16x16(); - let b = b.as_u16x16(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i16x16::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i16x16::splat(-1), - }; - simd_bitmask(r) +pub fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x16(); + let b = b.as_u16x16(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x16::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x16::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3945,26 +4207,28 @@ pub unsafe fn _mm256_cmp_epu16_mask(a: __m256i, b: __m256i) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm256_mask_cmp_epu16_mask( +pub fn _mm256_mask_cmp_epu16_mask( k1: __mmask16, a: __m256i, b: __m256i, ) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u16x16(); - let b = b.as_u16x16(); - let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i16x16::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x16(); + let b = b.as_u16x16(); + let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x16::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -3975,21 +4239,23 @@ pub unsafe fn _mm256_mask_cmp_epu16_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u16x8(); - let b = b.as_u16x8(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i16x8::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i16x8::splat(-1), - }; - simd_bitmask(r) +pub fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x8(); + let b = b.as_u16x8(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x8::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4000,26 +4266,24 @@ pub unsafe fn _mm_cmp_epu16_mask(a: __m128i, b: __m128i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm_mask_cmp_epu16_mask( - k1: __mmask8, - a: __m128i, - b: __m128i, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u16x8(); - let b = b.as_u16x8(); - let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i16x8::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) +pub fn _mm_mask_cmp_epu16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u16x8(); + let b = b.as_u16x8(); + let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4030,21 +4294,23 @@ pub unsafe fn _mm_mask_cmp_epu16_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u8x64(); - let b = b.as_u8x64(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i8x64::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i8x64::splat(-1), - }; - simd_bitmask(r) +pub fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x64(); + let b = b.as_u8x64(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x64::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x64::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4055,26 +4321,28 @@ pub unsafe fn _mm512_cmp_epu8_mask(a: __m512i, b: __m512i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm512_mask_cmp_epu8_mask( +pub fn _mm512_mask_cmp_epu8_mask( k1: __mmask64, a: __m512i, b: __m512i, ) -> __mmask64 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u8x64(); - let b = b.as_u8x64(); - let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i8x64::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x64(); + let b = b.as_u8x64(); + let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x64::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4085,21 +4353,23 @@ pub unsafe fn _mm512_mask_cmp_epu8_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u8x32(); - let b = b.as_u8x32(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i8x32::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i8x32::splat(-1), - }; - simd_bitmask(r) +pub fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x32(); + let b = b.as_u8x32(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x32::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x32::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4110,26 +4380,28 @@ pub unsafe fn _mm256_cmp_epu8_mask(a: __m256i, b: __m256i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm256_mask_cmp_epu8_mask( +pub fn _mm256_mask_cmp_epu8_mask( k1: __mmask32, a: __m256i, b: __m256i, ) -> __mmask32 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u8x32(); - let b = b.as_u8x32(); - let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i8x32::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x32(); + let b = b.as_u8x32(); + let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x32::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4140,21 +4412,23 @@ pub unsafe fn _mm256_mask_cmp_epu8_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u8x16(); - let b = b.as_u8x16(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i8x16::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i8x16::splat(-1), - }; - simd_bitmask(r) +pub fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x16(); + let b = b.as_u8x16(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x16::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x16::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4165,26 +4439,24 @@ pub unsafe fn _mm_cmp_epu8_mask(a: __m128i, b: __m128i) -> __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm_mask_cmp_epu8_mask( - k1: __mmask16, - a: __m128i, - b: __m128i, -) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_u8x16(); - let b = b.as_u8x16(); - let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i8x16::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) +pub fn _mm_mask_cmp_epu8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_u8x16(); + let b = b.as_u8x16(); + let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x16::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4195,21 +4467,23 @@ pub unsafe fn _mm_mask_cmp_epu8_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i16x32(); - let b = b.as_i16x32(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i16x32::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i16x32::splat(-1), - }; - simd_bitmask(r) +pub fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x32(); + let b = b.as_i16x32(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x32::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x32::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4220,26 +4494,28 @@ pub unsafe fn _mm512_cmp_epi16_mask(a: __m512i, b: __m512i) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm512_mask_cmp_epi16_mask( +pub fn _mm512_mask_cmp_epi16_mask( k1: __mmask32, a: __m512i, b: __m512i, ) -> __mmask32 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i16x32(); - let b = b.as_i16x32(); - let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i16x32::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x32(); + let b = b.as_i16x32(); + let k1 = simd_select_bitmask(k1, i16x32::splat(-1), i16x32::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x32::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4250,21 +4526,23 @@ pub unsafe fn _mm512_mask_cmp_epi16_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i16x16(); - let b = b.as_i16x16(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i16x16::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i16x16::splat(-1), - }; - simd_bitmask(r) +pub fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x16(); + let b = b.as_i16x16(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x16::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x16::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4275,26 +4553,28 @@ pub unsafe fn _mm256_cmp_epi16_mask(a: __m256i, b: __m256i) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm256_mask_cmp_epi16_mask( +pub fn _mm256_mask_cmp_epi16_mask( k1: __mmask16, a: __m256i, b: __m256i, ) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i16x16(); - let b = b.as_i16x16(); - let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i16x16::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x16(); + let b = b.as_i16x16(); + let k1 = simd_select_bitmask(k1, i16x16::splat(-1), i16x16::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x16::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4305,21 +4585,23 @@ pub unsafe fn _mm256_mask_cmp_epi16_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i16x8(); - let b = b.as_i16x8(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i16x8::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i16x8::splat(-1), - }; - simd_bitmask(r) +pub fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x8(); + let b = b.as_i16x8(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i16x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i16x8::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
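// Hedged sketch: the masked comparison ANDs the predicate result with an incoming
// mask, as the `simd_select_bitmask(k1, ...)` lowering above shows. `words_gt_in`
// is an illustrative helper name; IMM8 = 6 selects signed greater-than.
#[target_feature(enable = "avx512bw")]
fn words_gt_in(
    k1: core::arch::x86_64::__mmask32,
    a: core::arch::x86_64::__m512i,
    b: core::arch::x86_64::__m512i,
) -> core::arch::x86_64::__mmask32 {
    use core::arch::x86_64::_mm512_mask_cmp_epi16_mask;
    // Lanes whose bit is clear in `k1` come out as 0 regardless of the comparison.
    _mm512_mask_cmp_epi16_mask::<6>(k1, a, b)
}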
@@ -4330,26 +4612,24 @@ pub unsafe fn _mm_cmp_epi16_mask(a: __m128i, b: __m128i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm_mask_cmp_epi16_mask( - k1: __mmask8, - a: __m128i, - b: __m128i, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i16x8(); - let b = b.as_i16x8(); - let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i16x8::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) +pub fn _mm_mask_cmp_epi16_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i16x8(); + let b = b.as_i16x8(); + let k1 = simd_select_bitmask(k1, i16x8::splat(-1), i16x8::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i16x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4360,21 +4640,23 @@ pub unsafe fn _mm_mask_cmp_epi16_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i8x64(); - let b = b.as_i8x64(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i8x64::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i8x64::splat(-1), - }; - simd_bitmask(r) +pub fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x64(); + let b = b.as_i8x64(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x64::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x64::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4385,26 +4667,28 @@ pub unsafe fn _mm512_cmp_epi8_mask(a: __m512i, b: __m512i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm512_mask_cmp_epi8_mask( +pub fn _mm512_mask_cmp_epi8_mask( k1: __mmask64, a: __m512i, b: __m512i, ) -> __mmask64 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i8x64(); - let b = b.as_i8x64(); - let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i8x64::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x64(); + let b = b.as_i8x64(); + let k1 = simd_select_bitmask(k1, i8x64::splat(-1), i8x64::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x64::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4415,21 +4699,23 @@ pub unsafe fn _mm512_mask_cmp_epi8_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i8x32(); - let b = b.as_i8x32(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i8x32::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i8x32::splat(-1), - }; - simd_bitmask(r) +pub fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x32(); + let b = b.as_i8x32(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x32::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x32::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4440,26 +4726,28 @@ pub unsafe fn _mm256_cmp_epi8_mask(a: __m256i, b: __m256i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm256_mask_cmp_epi8_mask( +pub fn _mm256_mask_cmp_epi8_mask( k1: __mmask32, a: __m256i, b: __m256i, ) -> __mmask32 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i8x32(); - let b = b.as_i8x32(); - let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i8x32::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x32(); + let b = b.as_i8x32(); + let k1 = simd_select_bitmask(k1, i8x32::splat(-1), i8x32::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x32::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -4470,21 +4758,23 @@ pub unsafe fn _mm256_mask_cmp_epi8_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i8x16(); - let b = b.as_i8x16(); - let r = match IMM8 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i8x16::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i8x16::splat(-1), - }; - simd_bitmask(r) +pub fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x16(); + let b = b.as_i8x16(); + let r = match IMM8 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i8x16::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i8x16::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
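// Hedged sketch of the two degenerate predicates visible in the match arms above:
// IMM8 = 3 (usually spelled _MM_CMPINT_FALSE) always yields an empty mask and
// IMM8 = 7 (_MM_CMPINT_TRUE) an all-ones mask. `trivial_masks` is an illustrative
// name, not an intrinsic.
#[target_feature(enable = "avx512bw")]
fn trivial_masks(a: core::arch::x86_64::__m512i, b: core::arch::x86_64::__m512i) -> (u64, u64) {
    use core::arch::x86_64::_mm512_cmp_epi8_mask;
    // Expected result: (0, u64::MAX), independent of the inputs.
    (_mm512_cmp_epi8_mask::<3>(a, b), _mm512_cmp_epi8_mask::<7>(a, b))
}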
@@ -4495,26 +4785,24 @@ pub unsafe fn _mm_cmp_epi8_mask(a: __m128i, b: __m128i) -> __mm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM8 = 0))] -pub unsafe fn _mm_mask_cmp_epi8_mask( - k1: __mmask16, - a: __m128i, - b: __m128i, -) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 3); - let a = a.as_i8x16(); - let b = b.as_i8x16(); - let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO); - let r = match IMM8 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i8x16::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) +pub fn _mm_mask_cmp_epi8_mask(k1: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 3); + let a = a.as_i8x16(); + let b = b.as_i8x16(); + let k1 = simd_select_bitmask(k1, i8x16::splat(-1), i8x16::ZERO); + let r = match IMM8 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i8x16::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a. @@ -4523,8 +4811,8 @@ pub unsafe fn _mm_mask_cmp_epi8_mask( #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_add_epi16(a: __m256i) -> i16 { - simd_reduce_add_unordered(a.as_i16x16()) +pub fn _mm256_reduce_add_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_add_unordered(a.as_i16x16()) } } /// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a. @@ -4533,8 +4821,8 @@ pub unsafe fn _mm256_reduce_add_epi16(a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 { - simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) +pub fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) } } /// Reduce the packed 16-bit integers in a by addition. Returns the sum of all elements in a. @@ -4543,8 +4831,8 @@ pub unsafe fn _mm256_mask_reduce_add_epi16(k: __mmask16, a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_add_epi16(a: __m128i) -> i16 { - simd_reduce_add_unordered(a.as_i16x8()) +pub fn _mm_reduce_add_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_add_unordered(a.as_i16x8()) } } /// Reduce the packed 16-bit integers in a by addition using mask k. Returns the sum of all active elements in a. 
@@ -4553,8 +4841,8 @@ pub unsafe fn _mm_reduce_add_epi16(a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 { - simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) +pub fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) } } /// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. @@ -4563,8 +4851,8 @@ pub unsafe fn _mm_mask_reduce_add_epi16(k: __mmask8, a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_add_epi8(a: __m256i) -> i8 { - simd_reduce_add_unordered(a.as_i8x32()) +pub fn _mm256_reduce_add_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_add_unordered(a.as_i8x32()) } } /// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. @@ -4573,8 +4861,8 @@ pub unsafe fn _mm256_reduce_add_epi8(a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 { - simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) +pub fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) } } /// Reduce the packed 8-bit integers in a by addition. Returns the sum of all elements in a. @@ -4583,8 +4871,8 @@ pub unsafe fn _mm256_mask_reduce_add_epi8(k: __mmask32, a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_add_epi8(a: __m128i) -> i8 { - simd_reduce_add_unordered(a.as_i8x16()) +pub fn _mm_reduce_add_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_add_unordered(a.as_i8x16()) } } /// Reduce the packed 8-bit integers in a by addition using mask k. Returns the sum of all active elements in a. @@ -4593,8 +4881,8 @@ pub unsafe fn _mm_reduce_add_epi8(a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 { - simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) +pub fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) } } /// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. @@ -4603,8 +4891,8 @@ pub unsafe fn _mm_mask_reduce_add_epi8(k: __mmask16, a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_and_epi16(a: __m256i) -> i16 { - simd_reduce_and(a.as_i16x16()) +pub fn _mm256_reduce_and_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_and(a.as_i16x16()) } } /// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. 
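// Hedged sketch: horizontal sums. The reduction is a plain wrapping integer add
// (`simd_reduce_add_unordered`), and the masked form treats inactive lanes as 0.
// `word_sums` is an illustrative helper name.
#[target_feature(enable = "avx512bw,avx512vl")]
fn word_sums(v: core::arch::x86_64::__m256i) -> (i16, i16) {
    use core::arch::x86_64::{_mm256_mask_reduce_add_epi16, _mm256_reduce_add_epi16};
    let every_lane = _mm256_reduce_add_epi16(v);
    let low_half = _mm256_mask_reduce_add_epi16(0x00ff, v); // lanes 0..8 only
    (every_lane, low_half)
}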
@@ -4613,12 +4901,14 @@ pub unsafe fn _mm256_reduce_and_epi16(a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_and_epi16(k: __mmask16, a: __m256i) -> i16 { - simd_reduce_and(simd_select_bitmask( - k, - a.as_i16x16(), - _mm256_set1_epi64x(-1).as_i16x16(), - )) +pub fn _mm256_mask_reduce_and_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i16x16(), + _mm256_set1_epi64x(-1).as_i16x16(), + )) + } } /// Reduce the packed 16-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. @@ -4627,8 +4917,8 @@ pub unsafe fn _mm256_mask_reduce_and_epi16(k: __mmask16, a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_and_epi16(a: __m128i) -> i16 { - simd_reduce_and(a.as_i16x8()) +pub fn _mm_reduce_and_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_and(a.as_i16x8()) } } /// Reduce the packed 16-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. @@ -4637,12 +4927,14 @@ pub unsafe fn _mm_reduce_and_epi16(a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_and_epi16(k: __mmask8, a: __m128i) -> i16 { - simd_reduce_and(simd_select_bitmask( - k, - a.as_i16x8(), - _mm_set1_epi64x(-1).as_i16x8(), - )) +pub fn _mm_mask_reduce_and_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i16x8(), + _mm_set1_epi64x(-1).as_i16x8(), + )) + } } /// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. @@ -4651,8 +4943,8 @@ pub unsafe fn _mm_mask_reduce_and_epi16(k: __mmask8, a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_and_epi8(a: __m256i) -> i8 { - simd_reduce_and(a.as_i8x32()) +pub fn _mm256_reduce_and_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_and(a.as_i8x32()) } } /// Reduce the packed 8-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. @@ -4661,12 +4953,14 @@ pub unsafe fn _mm256_reduce_and_epi8(a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_and_epi8(k: __mmask32, a: __m256i) -> i8 { - simd_reduce_and(simd_select_bitmask( - k, - a.as_i8x32(), - _mm256_set1_epi64x(-1).as_i8x32(), - )) +pub fn _mm256_mask_reduce_and_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i8x32(), + _mm256_set1_epi64x(-1).as_i8x32(), + )) + } } /// Reduce the packed 8-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. @@ -4675,8 +4969,8 @@ pub unsafe fn _mm256_mask_reduce_and_epi8(k: __mmask32, a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_and_epi8(a: __m128i) -> i8 { - simd_reduce_and(a.as_i8x16()) +pub fn _mm_reduce_and_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_and(a.as_i8x16()) } } /// Reduce the packed 8-bit integers in a by bitwise AND using mask k. 
Returns the bitwise AND of all active elements in a. @@ -4685,12 +4979,14 @@ pub unsafe fn _mm_reduce_and_epi8(a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_and_epi8(k: __mmask16, a: __m128i) -> i8 { - simd_reduce_and(simd_select_bitmask( - k, - a.as_i8x16(), - _mm_set1_epi64x(-1).as_i8x16(), - )) +pub fn _mm_mask_reduce_and_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i8x16(), + _mm_set1_epi64x(-1).as_i8x16(), + )) + } } /// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -4699,8 +4995,8 @@ pub unsafe fn _mm_mask_reduce_and_epi8(k: __mmask16, a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_max_epi16(a: __m256i) -> i16 { - simd_reduce_max(a.as_i16x16()) +pub fn _mm256_reduce_max_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_max(a.as_i16x16()) } } /// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -4709,8 +5005,8 @@ pub unsafe fn _mm256_reduce_max_epi16(a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_max_epi16(k: __mmask16, a: __m256i) -> i16 { - simd_reduce_max(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(-32768))) +pub fn _mm256_mask_reduce_max_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(-32768))) } } /// Reduce the packed 16-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -4719,8 +5015,8 @@ pub unsafe fn _mm256_mask_reduce_max_epi16(k: __mmask16, a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_max_epi16(a: __m128i) -> i16 { - simd_reduce_max(a.as_i16x8()) +pub fn _mm_reduce_max_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_max(a.as_i16x8()) } } /// Reduce the packed 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -4729,8 +5025,8 @@ pub unsafe fn _mm_reduce_max_epi16(a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_max_epi16(k: __mmask8, a: __m128i) -> i16 { - simd_reduce_max(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(-32768))) +pub fn _mm_mask_reduce_max_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(-32768))) } } /// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -4739,8 +5035,8 @@ pub unsafe fn _mm_mask_reduce_max_epi16(k: __mmask8, a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_max_epi8(a: __m256i) -> i8 { - simd_reduce_max(a.as_i8x32()) +pub fn _mm256_reduce_max_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_max(a.as_i8x32()) } } /// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. 
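// Hedged sketch: for the AND reduction the masked-off lanes must contribute the
// identity !0, which is why the fallback above is `_mm256_set1_epi64x(-1)` rather
// than zero. `low_bytes_and` is an illustrative helper name.
#[target_feature(enable = "avx512bw,avx512vl")]
fn low_bytes_and(v: core::arch::x86_64::__m256i) -> i8 {
    use core::arch::x86_64::_mm256_mask_reduce_and_epi8;
    // AND of the low 16 byte lanes; the upper 16 are treated as 0xff.
    _mm256_mask_reduce_and_epi8(0x0000_ffff, v)
}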
@@ -4749,8 +5045,8 @@ pub unsafe fn _mm256_reduce_max_epi8(a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_max_epi8(k: __mmask32, a: __m256i) -> i8 { - simd_reduce_max(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(-128))) +pub fn _mm256_mask_reduce_max_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(-128))) } } /// Reduce the packed 8-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -4759,8 +5055,8 @@ pub unsafe fn _mm256_mask_reduce_max_epi8(k: __mmask32, a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_max_epi8(a: __m128i) -> i8 { - simd_reduce_max(a.as_i8x16()) +pub fn _mm_reduce_max_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_max(a.as_i8x16()) } } /// Reduce the packed 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -4769,8 +5065,8 @@ pub unsafe fn _mm_reduce_max_epi8(a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_max_epi8(k: __mmask16, a: __m128i) -> i8 { - simd_reduce_max(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(-128))) +pub fn _mm_mask_reduce_max_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(-128))) } } /// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -4779,8 +5075,8 @@ pub unsafe fn _mm_mask_reduce_max_epi8(k: __mmask16, a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_max_epu16(a: __m256i) -> u16 { - simd_reduce_max(a.as_u16x16()) +pub fn _mm256_reduce_max_epu16(a: __m256i) -> u16 { + unsafe { simd_reduce_max(a.as_u16x16()) } } /// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -4789,8 +5085,8 @@ pub unsafe fn _mm256_reduce_max_epu16(a: __m256i) -> u16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_max_epu16(k: __mmask16, a: __m256i) -> u16 { - simd_reduce_max(simd_select_bitmask(k, a.as_u16x16(), u16x16::ZERO)) +pub fn _mm256_mask_reduce_max_epu16(k: __mmask16, a: __m256i) -> u16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u16x16(), u16x16::ZERO)) } } /// Reduce the packed unsigned 16-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -4799,8 +5095,8 @@ pub unsafe fn _mm256_mask_reduce_max_epu16(k: __mmask16, a: __m256i) -> u16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_max_epu16(a: __m128i) -> u16 { - simd_reduce_max(a.as_u16x8()) +pub fn _mm_reduce_max_epu16(a: __m128i) -> u16 { + unsafe { simd_reduce_max(a.as_u16x8()) } } /// Reduce the packed unsigned 16-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. 
@@ -4809,8 +5105,8 @@ pub unsafe fn _mm_reduce_max_epu16(a: __m128i) -> u16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_max_epu16(k: __mmask8, a: __m128i) -> u16 { - simd_reduce_max(simd_select_bitmask(k, a.as_u16x8(), u16x8::ZERO)) +pub fn _mm_mask_reduce_max_epu16(k: __mmask8, a: __m128i) -> u16 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u16x8(), u16x8::ZERO)) } } /// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -4819,8 +5115,8 @@ pub unsafe fn _mm_mask_reduce_max_epu16(k: __mmask8, a: __m128i) -> u16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_max_epu8(a: __m256i) -> u8 { - simd_reduce_max(a.as_u8x32()) +pub fn _mm256_reduce_max_epu8(a: __m256i) -> u8 { + unsafe { simd_reduce_max(a.as_u8x32()) } } /// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -4829,8 +5125,8 @@ pub unsafe fn _mm256_reduce_max_epu8(a: __m256i) -> u8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_max_epu8(k: __mmask32, a: __m256i) -> u8 { - simd_reduce_max(simd_select_bitmask(k, a.as_u8x32(), u8x32::ZERO)) +pub fn _mm256_mask_reduce_max_epu8(k: __mmask32, a: __m256i) -> u8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u8x32(), u8x32::ZERO)) } } /// Reduce the packed unsigned 8-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -4839,8 +5135,8 @@ pub unsafe fn _mm256_mask_reduce_max_epu8(k: __mmask32, a: __m256i) -> u8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_max_epu8(a: __m128i) -> u8 { - simd_reduce_max(a.as_u8x16()) +pub fn _mm_reduce_max_epu8(a: __m128i) -> u8 { + unsafe { simd_reduce_max(a.as_u8x16()) } } /// Reduce the packed unsigned 8-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -4849,8 +5145,8 @@ pub unsafe fn _mm_reduce_max_epu8(a: __m128i) -> u8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_max_epu8(k: __mmask16, a: __m128i) -> u8 { - simd_reduce_max(simd_select_bitmask(k, a.as_u8x16(), u8x16::ZERO)) +pub fn _mm_mask_reduce_max_epu8(k: __mmask16, a: __m128i) -> u8 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u8x16(), u8x16::ZERO)) } } /// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -4859,8 +5155,8 @@ pub unsafe fn _mm_mask_reduce_max_epu8(k: __mmask16, a: __m128i) -> u8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_min_epi16(a: __m256i) -> i16 { - simd_reduce_min(a.as_i16x16()) +pub fn _mm256_reduce_min_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_min(a.as_i16x16()) } } /// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. 
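// Hedged sketch: masked maxima. Inactive lanes are replaced by the smallest value
// of the element type (-32768 for i16, 0 for u16), so they can never win the
// reduction; an all-zero mask returns exactly that identity. `word_maxima` is an
// illustrative helper name.
#[target_feature(enable = "avx512bw,avx512vl")]
fn word_maxima(k: core::arch::x86_64::__mmask8, v: core::arch::x86_64::__m128i) -> (i16, u16) {
    use core::arch::x86_64::{_mm_mask_reduce_max_epi16, _mm_mask_reduce_max_epu16};
    (_mm_mask_reduce_max_epi16(k, v), _mm_mask_reduce_max_epu16(k, v))
}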
@@ -4869,8 +5165,8 @@ pub unsafe fn _mm256_reduce_min_epi16(a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_min_epi16(k: __mmask16, a: __m256i) -> i16 { - simd_reduce_min(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(0x7fff))) +pub fn _mm256_mask_reduce_min_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(0x7fff))) } } /// Reduce the packed 16-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -4879,8 +5175,8 @@ pub unsafe fn _mm256_mask_reduce_min_epi16(k: __mmask16, a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_min_epi16(a: __m128i) -> i16 { - simd_reduce_min(a.as_i16x8()) +pub fn _mm_reduce_min_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_min(a.as_i16x8()) } } /// Reduce the packed 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. @@ -4889,8 +5185,8 @@ pub unsafe fn _mm_reduce_min_epi16(a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_min_epi16(k: __mmask8, a: __m128i) -> i16 { - simd_reduce_min(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(0x7fff))) +pub fn _mm_mask_reduce_min_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(0x7fff))) } } /// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -4899,8 +5195,8 @@ pub unsafe fn _mm_mask_reduce_min_epi16(k: __mmask8, a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_min_epi8(a: __m256i) -> i8 { - simd_reduce_min(a.as_i8x32()) +pub fn _mm256_reduce_min_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_min(a.as_i8x32()) } } /// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. @@ -4909,8 +5205,8 @@ pub unsafe fn _mm256_reduce_min_epi8(a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_min_epi8(k: __mmask32, a: __m256i) -> i8 { - simd_reduce_min(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(0x7f))) +pub fn _mm256_mask_reduce_min_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(0x7f))) } } /// Reduce the packed 8-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -4919,8 +5215,8 @@ pub unsafe fn _mm256_mask_reduce_min_epi8(k: __mmask32, a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_min_epi8(a: __m128i) -> i8 { - simd_reduce_min(a.as_i8x16()) +pub fn _mm_reduce_min_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_min(a.as_i8x16()) } } /// Reduce the packed 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. 
@@ -4929,8 +5225,8 @@ pub unsafe fn _mm_reduce_min_epi8(a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_min_epi8(k: __mmask16, a: __m128i) -> i8 { - simd_reduce_min(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(0x7f))) +pub fn _mm_mask_reduce_min_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(0x7f))) } } /// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -4939,8 +5235,8 @@ pub unsafe fn _mm_mask_reduce_min_epi8(k: __mmask16, a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_min_epu16(a: __m256i) -> u16 { - simd_reduce_min(a.as_u16x16()) +pub fn _mm256_reduce_min_epu16(a: __m256i) -> u16 { + unsafe { simd_reduce_min(a.as_u16x16()) } } /// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. @@ -4949,8 +5245,8 @@ pub unsafe fn _mm256_reduce_min_epu16(a: __m256i) -> u16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_min_epu16(k: __mmask16, a: __m256i) -> u16 { - simd_reduce_min(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0xffff))) +pub fn _mm256_mask_reduce_min_epu16(k: __mmask16, a: __m256i) -> u16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u16x16(), u16x16::splat(0xffff))) } } /// Reduce the packed unsigned 16-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -4959,8 +5255,8 @@ pub unsafe fn _mm256_mask_reduce_min_epu16(k: __mmask16, a: __m256i) -> u16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_min_epu16(a: __m128i) -> u16 { - simd_reduce_min(a.as_u16x8()) +pub fn _mm_reduce_min_epu16(a: __m128i) -> u16 { + unsafe { simd_reduce_min(a.as_u16x8()) } } /// Reduce the packed unsigned 16-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. @@ -4969,8 +5265,8 @@ pub unsafe fn _mm_reduce_min_epu16(a: __m128i) -> u16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_min_epu16(k: __mmask8, a: __m128i) -> u16 { - simd_reduce_min(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0xffff))) +pub fn _mm_mask_reduce_min_epu16(k: __mmask8, a: __m128i) -> u16 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u16x8(), u16x8::splat(0xffff))) } } /// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -4979,8 +5275,8 @@ pub unsafe fn _mm_mask_reduce_min_epu16(k: __mmask8, a: __m128i) -> u16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_min_epu8(a: __m256i) -> u8 { - simd_reduce_min(a.as_u8x32()) +pub fn _mm256_reduce_min_epu8(a: __m256i) -> u8 { + unsafe { simd_reduce_min(a.as_u8x32()) } } /// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. 
@@ -4989,8 +5285,8 @@ pub unsafe fn _mm256_reduce_min_epu8(a: __m256i) -> u8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_min_epu8(k: __mmask32, a: __m256i) -> u8 { - simd_reduce_min(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0xff))) +pub fn _mm256_mask_reduce_min_epu8(k: __mmask32, a: __m256i) -> u8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u8x32(), u8x32::splat(0xff))) } } /// Reduce the packed unsigned 8-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -4999,8 +5295,8 @@ pub unsafe fn _mm256_mask_reduce_min_epu8(k: __mmask32, a: __m256i) -> u8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_min_epu8(a: __m128i) -> u8 { - simd_reduce_min(a.as_u8x16()) +pub fn _mm_reduce_min_epu8(a: __m128i) -> u8 { + unsafe { simd_reduce_min(a.as_u8x16()) } } /// Reduce the packed unsigned 8-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. @@ -5009,8 +5305,8 @@ pub unsafe fn _mm_reduce_min_epu8(a: __m128i) -> u8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 { - simd_reduce_min(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0xff))) +pub fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u8x16(), u8x16::splat(0xff))) } } /// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. @@ -5019,8 +5315,8 @@ pub unsafe fn _mm_mask_reduce_min_epu8(k: __mmask16, a: __m128i) -> u8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 { - simd_reduce_mul_unordered(a.as_i16x16()) +pub fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_mul_unordered(a.as_i16x16()) } } /// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. @@ -5029,8 +5325,8 @@ pub unsafe fn _mm256_reduce_mul_epi16(a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 { - simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1))) +pub fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x16(), i16x16::splat(1))) } } /// Reduce the packed 16-bit integers in a by multiplication. Returns the product of all elements in a. @@ -5039,8 +5335,8 @@ pub unsafe fn _mm256_mask_reduce_mul_epi16(k: __mmask16, a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_mul_epi16(a: __m128i) -> i16 { - simd_reduce_mul_unordered(a.as_i16x8()) +pub fn _mm_reduce_mul_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_mul_unordered(a.as_i16x8()) } } /// Reduce the packed 16-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. 
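// Hedged sketch: the minima mirror the maxima, with inactive lanes pinned to the
// largest value of the element type (0x7fff for i16, 0xffff for u16) so they never
// win. `word_minima` is an illustrative helper name.
#[target_feature(enable = "avx512bw,avx512vl")]
fn word_minima(k: core::arch::x86_64::__mmask8, v: core::arch::x86_64::__m128i) -> (i16, u16) {
    use core::arch::x86_64::{_mm_mask_reduce_min_epi16, _mm_mask_reduce_min_epu16};
    (_mm_mask_reduce_min_epi16(k, v), _mm_mask_reduce_min_epu16(k, v))
}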
@@ -5049,8 +5345,8 @@ pub unsafe fn _mm_reduce_mul_epi16(a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 { - simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1))) +pub fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i16x8(), i16x8::splat(1))) } } /// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a. @@ -5059,8 +5355,8 @@ pub unsafe fn _mm_mask_reduce_mul_epi16(k: __mmask8, a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 { - simd_reduce_mul_unordered(a.as_i8x32()) +pub fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_mul_unordered(a.as_i8x32()) } } /// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. @@ -5069,8 +5365,8 @@ pub unsafe fn _mm256_reduce_mul_epi8(a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 { - simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1))) +pub fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x32(), i8x32::splat(1))) } } /// Reduce the packed 8-bit integers in a by multiplication. Returns the product of all elements in a. @@ -5079,8 +5375,8 @@ pub unsafe fn _mm256_mask_reduce_mul_epi8(k: __mmask32, a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_mul_epi8(a: __m128i) -> i8 { - simd_reduce_mul_unordered(a.as_i8x16()) +pub fn _mm_reduce_mul_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_mul_unordered(a.as_i8x16()) } } /// Reduce the packed 8-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. @@ -5089,8 +5385,8 @@ pub unsafe fn _mm_reduce_mul_epi8(a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 { - simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1))) +pub fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_mul_unordered(simd_select_bitmask(k, a.as_i8x16(), i8x16::splat(1))) } } /// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. @@ -5099,8 +5395,8 @@ pub unsafe fn _mm_mask_reduce_mul_epi8(k: __mmask16, a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_or_epi16(a: __m256i) -> i16 { - simd_reduce_or(a.as_i16x16()) +pub fn _mm256_reduce_or_epi16(a: __m256i) -> i16 { + unsafe { simd_reduce_or(a.as_i16x16()) } } /// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. 
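// Hedged sketch: the product reduction uses 1 as the identity for masked-off lanes
// and, like the sum, wraps on overflow. `odd_lane_product` is an illustrative
// helper name.
#[target_feature(enable = "avx512bw,avx512vl")]
fn odd_lane_product(v: core::arch::x86_64::__m256i) -> i16 {
    use core::arch::x86_64::_mm256_mask_reduce_mul_epi16;
    // 0xaaaa selects lanes 1, 3, 5, ...; even lanes contribute a factor of 1.
    _mm256_mask_reduce_mul_epi16(0xaaaa, v)
}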
@@ -5109,8 +5405,8 @@ pub unsafe fn _mm256_reduce_or_epi16(a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_or_epi16(k: __mmask16, a: __m256i) -> i16 { - simd_reduce_or(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) +pub fn _mm256_mask_reduce_or_epi16(k: __mmask16, a: __m256i) -> i16 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i16x16(), i16x16::ZERO)) } } /// Reduce the packed 16-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. @@ -5119,8 +5415,8 @@ pub unsafe fn _mm256_mask_reduce_or_epi16(k: __mmask16, a: __m256i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_or_epi16(a: __m128i) -> i16 { - simd_reduce_or(a.as_i16x8()) +pub fn _mm_reduce_or_epi16(a: __m128i) -> i16 { + unsafe { simd_reduce_or(a.as_i16x8()) } } /// Reduce the packed 16-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. @@ -5129,8 +5425,8 @@ pub unsafe fn _mm_reduce_or_epi16(a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_or_epi16(k: __mmask8, a: __m128i) -> i16 { - simd_reduce_or(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) +pub fn _mm_mask_reduce_or_epi16(k: __mmask8, a: __m128i) -> i16 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i16x8(), i16x8::ZERO)) } } /// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. @@ -5139,8 +5435,8 @@ pub unsafe fn _mm_mask_reduce_or_epi16(k: __mmask8, a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_or_epi8(a: __m256i) -> i8 { - simd_reduce_or(a.as_i8x32()) +pub fn _mm256_reduce_or_epi8(a: __m256i) -> i8 { + unsafe { simd_reduce_or(a.as_i8x32()) } } /// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. @@ -5149,8 +5445,8 @@ pub unsafe fn _mm256_reduce_or_epi8(a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_or_epi8(k: __mmask32, a: __m256i) -> i8 { - simd_reduce_or(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) +pub fn _mm256_mask_reduce_or_epi8(k: __mmask32, a: __m256i) -> i8 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i8x32(), i8x32::ZERO)) } } /// Reduce the packed 8-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. @@ -5159,8 +5455,8 @@ pub unsafe fn _mm256_mask_reduce_or_epi8(k: __mmask32, a: __m256i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_or_epi8(a: __m128i) -> i8 { - simd_reduce_or(a.as_i8x16()) +pub fn _mm_reduce_or_epi8(a: __m128i) -> i8 { + unsafe { simd_reduce_or(a.as_i8x16()) } } /// Reduce the packed 8-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. 
@@ -5169,8 +5465,8 @@ pub unsafe fn _mm_reduce_or_epi8(a: __m128i) -> i8 { #[inline] #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_or_epi8(k: __mmask16, a: __m128i) -> i8 { - simd_reduce_or(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) +pub fn _mm_mask_reduce_or_epi8(k: __mmask16, a: __m128i) -> i8 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i8x16(), i8x16::ZERO)) } } /// Load 512-bits (composed of 32 packed 16-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary. @@ -5540,8 +5836,8 @@ pub unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a: __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub unsafe fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5551,14 +5847,11 @@ pub unsafe fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub unsafe fn _mm512_mask_madd_epi16( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let madd = _mm512_madd_epi16(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, madd, src.as_i32x16())) +pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_madd_epi16(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, madd, src.as_i32x16())) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5568,9 +5861,11 @@ pub unsafe fn _mm512_mask_madd_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub unsafe fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let madd = _mm512_madd_epi16(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, madd, i32x16::ZERO)) +pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_madd_epi16(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, madd, i32x16::ZERO)) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
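// Hedged sketch: OR-reducing a byte vector to test whether any selected lane has
// its high bit set; inactive lanes contribute 0. `any_high_bit` is an illustrative
// helper name.
#[target_feature(enable = "avx512bw,avx512vl")]
fn any_high_bit(k: core::arch::x86_64::__mmask32, v: core::arch::x86_64::__m256i) -> bool {
    use core::arch::x86_64::_mm256_mask_reduce_or_epi8;
    _mm256_mask_reduce_or_epi8(k, v) < 0
}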
@@ -5580,9 +5875,11 @@ pub unsafe fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub unsafe fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let madd = _mm256_madd_epi16(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, madd, src.as_i32x8())) +pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_madd_epi16(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, madd, src.as_i32x8())) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5592,9 +5889,11 @@ pub unsafe fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub unsafe fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let madd = _mm256_madd_epi16(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, madd, i32x8::ZERO)) +pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_madd_epi16(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, madd, i32x8::ZERO)) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5604,9 +5903,11 @@ pub unsafe fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub unsafe fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let madd = _mm_madd_epi16(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, madd, src.as_i32x4())) +pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_madd_epi16(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, madd, src.as_i32x4())) + } } /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -5616,9 +5917,11 @@ pub unsafe fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddwd))] -pub unsafe fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let madd = _mm_madd_epi16(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, madd, i32x4::ZERO)) +pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_madd_epi16(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, madd, i32x4::ZERO)) + } } /// Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst. @@ -5628,8 +5931,8 @@ pub unsafe fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub unsafe fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmaddubsw(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmaddubsw(a.as_i8x64(), b.as_i8x64())) } } /// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5639,14 +5942,11 @@ pub unsafe fn _mm512_maddubs_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub unsafe fn _mm512_mask_maddubs_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, madd, src.as_i16x32())) +pub fn _mm512_mask_maddubs_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, madd, src.as_i16x32())) + } } /// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5656,9 +5956,11 @@ pub unsafe fn _mm512_mask_maddubs_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub unsafe fn _mm512_maskz_maddubs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, madd, i16x32::ZERO)) +pub fn _mm512_maskz_maddubs_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let madd = _mm512_maddubs_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, madd, i16x32::ZERO)) + } } /// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. 
Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5668,14 +5970,11 @@ pub unsafe fn _mm512_maskz_maddubs_epi16(k: __mmask32, a: __m512i, b: __m512i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub unsafe fn _mm256_mask_maddubs_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, madd, src.as_i16x16())) +pub fn _mm256_mask_maddubs_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, madd, src.as_i16x16())) + } } /// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5685,9 +5984,11 @@ pub unsafe fn _mm256_mask_maddubs_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub unsafe fn _mm256_maskz_maddubs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, madd, i16x16::ZERO)) +pub fn _mm256_maskz_maddubs_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let madd = _mm256_maddubs_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, madd, i16x16::ZERO)) + } } /// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5697,9 +5998,11 @@ pub unsafe fn _mm256_maskz_maddubs_epi16(k: __mmask16, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub unsafe fn _mm_mask_maddubs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let madd = _mm_maddubs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, madd, src.as_i16x8())) +pub fn _mm_mask_maddubs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_maddubs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, madd, src.as_i16x8())) + } } /// Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
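As an illustrative aside (not part of the patch; same assumptions as the first sketch): `vpmaddubsw` reads `a` as unsigned bytes and `b` as signed bytes, and the pair sums saturate to the i16 range, which is easy to demonstrate once the intrinsic can be called safely.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512bw")]
    fn maddubs_saturates() -> bool {
        let a = _mm512_set1_epi8(-1);  // read as unsigned: every byte is 255
        let b = _mm512_set1_epi8(127); // signed multiplier
        // 255*127 + 255*127 = 64770, which saturates to i16::MAX in every lane.
        let r = _mm512_maddubs_epi16(a, b);
        _mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(i16::MAX)) == u32::MAX
    }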
@@ -5709,9 +6012,11 @@ pub unsafe fn _mm_mask_maddubs_epi16(src: __m128i, k: __mmask8, a: __m128i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaddubsw))] -pub unsafe fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let madd = _mm_maddubs_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, madd, i16x8::ZERO)) +pub fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let madd = _mm_maddubs_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, madd, i16x8::ZERO)) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst. @@ -5721,8 +6026,8 @@ pub unsafe fn _mm_maskz_maddubs_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackssdw))] -pub unsafe fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackssdw(a.as_i32x16(), b.as_i32x16())) } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5732,14 +6037,11 @@ pub unsafe fn _mm512_packs_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackssdw))] -pub unsafe fn _mm512_mask_packs_epi32( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let pack = _mm512_packs_epi32(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, pack, src.as_i16x32())) +pub fn _mm512_mask_packs_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, src.as_i16x32())) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5749,9 +6051,11 @@ pub unsafe fn _mm512_mask_packs_epi32( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackssdw))] -pub unsafe fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let pack = _mm512_packs_epi32(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) +pub fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -5761,14 +6065,11 @@ pub unsafe fn _mm512_maskz_packs_epi32(k: __mmask32, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackssdw))] -pub unsafe fn _mm256_mask_packs_epi32( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let pack = _mm256_packs_epi32(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, pack, src.as_i16x16())) +pub fn _mm256_mask_packs_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packs_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, src.as_i16x16())) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5778,9 +6079,11 @@ pub unsafe fn _mm256_mask_packs_epi32( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackssdw))] -pub unsafe fn _mm256_maskz_packs_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let pack = _mm256_packs_epi32(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) +pub fn _mm256_maskz_packs_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packs_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5790,9 +6093,11 @@ pub unsafe fn _mm256_maskz_packs_epi32(k: __mmask16, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackssdw))] -pub unsafe fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let pack = _mm_packs_epi32(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, pack, src.as_i16x8())) +pub fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packs_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, src.as_i16x8())) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5802,9 +6107,11 @@ pub unsafe fn _mm_mask_packs_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackssdw))] -pub unsafe fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let pack = _mm_packs_epi32(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) +pub fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packs_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst. 
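Illustrative sketch only, not from the patch: `vpackssdw` clamps out-of-range 32-bit values to the i16 limits; counting lanes sidesteps the per-128-bit-lane interleaving order of the packed result.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512bw")]
    fn packs_epi32_saturates() -> bool {
        let a = _mm512_set1_epi32(100_000);  // above i16::MAX
        let b = _mm512_set1_epi32(-100_000); // below i16::MIN
        let r = _mm512_packs_epi32(a, b);
        // Half of the 32 output lanes come from a (clamped to 32767),
        // the other half from b (clamped to -32768).
        let hi = _mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(i16::MAX));
        let lo = _mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(i16::MIN));
        hi.count_ones() == 16 && lo.count_ones() == 16
    }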
@@ -5814,8 +6121,8 @@ pub unsafe fn _mm_maskz_packs_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m1 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpacksswb))] -pub unsafe fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpacksswb(a.as_i16x32(), b.as_i16x32())) } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5825,14 +6132,11 @@ pub unsafe fn _mm512_packs_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpacksswb))] -pub unsafe fn _mm512_mask_packs_epi16( - src: __m512i, - k: __mmask64, - a: __m512i, - b: __m512i, -) -> __m512i { - let pack = _mm512_packs_epi16(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, pack, src.as_i8x64())) +pub fn _mm512_mask_packs_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, src.as_i8x64())) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5842,9 +6146,11 @@ pub unsafe fn _mm512_mask_packs_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpacksswb))] -pub unsafe fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let pack = _mm512_packs_epi16(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) +pub fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packs_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5854,14 +6160,11 @@ pub unsafe fn _mm512_maskz_packs_epi16(k: __mmask64, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpacksswb))] -pub unsafe fn _mm256_mask_packs_epi16( - src: __m256i, - k: __mmask32, - a: __m256i, - b: __m256i, -) -> __m256i { - let pack = _mm256_packs_epi16(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, pack, src.as_i8x32())) +pub fn _mm256_mask_packs_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packs_epi16(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, pack, src.as_i8x32())) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
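A short sketch of the writemask behaviour for the 16-to-8-bit pack (illustrative only, not from the patch): lanes whose mask bit is clear keep the bytes of `src`.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512bw")]
    fn mask_packs_epi16_merges() -> bool {
        let a = _mm512_set1_epi16(1);
        let b = _mm512_set1_epi16(2);
        let src = _mm512_set1_epi8(99);
        let k: __mmask64 = 0x0000_0000_FFFF_FFFF; // low half packed, high half from src
        let r = _mm512_mask_packs_epi16(src, k, a, b);
        // Exactly the lanes with a clear mask bit still compare equal to src.
        _mm512_cmpeq_epi8_mask(r, src) == !k
    }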
@@ -5871,9 +6174,11 @@ pub unsafe fn _mm256_mask_packs_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpacksswb))] -pub unsafe fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let pack = _mm256_packs_epi16(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, pack, i8x32::ZERO)) +pub fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packs_epi16(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, pack, i8x32::ZERO)) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5883,9 +6188,11 @@ pub unsafe fn _mm256_maskz_packs_epi16(k: __mmask32, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpacksswb))] -pub unsafe fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let pack = _mm_packs_epi16(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, pack, src.as_i8x16())) +pub fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packs_epi16(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, pack, src.as_i8x16())) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5895,9 +6202,11 @@ pub unsafe fn _mm_mask_packs_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpacksswb))] -pub unsafe fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let pack = _mm_packs_epi16(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, pack, i8x16::ZERO)) +pub fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packs_epi16(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, pack, i8x16::ZERO)) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst. @@ -5907,8 +6216,8 @@ pub unsafe fn _mm_maskz_packs_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackusdw))] -pub unsafe fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackusdw(a.as_i32x16(), b.as_i32x16())) } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -5918,14 +6227,11 @@ pub unsafe fn _mm512_packus_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackusdw))] -pub unsafe fn _mm512_mask_packus_epi32( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let pack = _mm512_packus_epi32(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, pack, src.as_i16x32())) +pub fn _mm512_mask_packus_epi32(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, src.as_i16x32())) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5935,9 +6241,11 @@ pub unsafe fn _mm512_mask_packus_epi32( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackusdw))] -pub unsafe fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let pack = _mm512_packus_epi32(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) +pub fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi32(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, pack, i16x32::ZERO)) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5947,14 +6255,11 @@ pub unsafe fn _mm512_maskz_packus_epi32(k: __mmask32, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackusdw))] -pub unsafe fn _mm256_mask_packus_epi32( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let pack = _mm256_packus_epi32(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, pack, src.as_i16x16())) +pub fn _mm256_mask_packus_epi32(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, src.as_i16x16())) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
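Another illustrative sketch (not part of the patch): unsigned saturation clamps negative inputs to 0 and values above u16::MAX to 0xFFFF.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512bw")]
    fn packus_epi32_clamps() -> bool {
        let a = _mm512_set1_epi32(-5);     // negative clamps to 0
        let b = _mm512_set1_epi32(70_000); // above u16::MAX clamps to 0xFFFF
        let r = _mm512_packus_epi32(a, b);
        let zeros = _mm512_cmpeq_epi16_mask(r, _mm512_setzero_si512());
        let maxed = _mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(-1)); // 0xFFFF bit pattern
        zeros.count_ones() == 16 && maxed.count_ones() == 16
    }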
@@ -5964,9 +6269,11 @@ pub unsafe fn _mm256_mask_packus_epi32( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackusdw))] -pub unsafe fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let pack = _mm256_packus_epi32(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) +pub fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi32(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, pack, i16x16::ZERO)) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5976,9 +6283,11 @@ pub unsafe fn _mm256_maskz_packus_epi32(k: __mmask16, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackusdw))] -pub unsafe fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let pack = _mm_packus_epi32(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, pack, src.as_i16x8())) +pub fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, src.as_i16x8())) + } } /// Convert packed signed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5988,9 +6297,11 @@ pub unsafe fn _mm_mask_packus_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackusdw))] -pub unsafe fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let pack = _mm_packus_epi32(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) +pub fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi32(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, pack, i16x8::ZERO)) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst. @@ -6000,8 +6311,8 @@ pub unsafe fn _mm_maskz_packus_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackuswb))] -pub unsafe fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i { - transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) +pub fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpackuswb(a.as_i16x32(), b.as_i16x32())) } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -6011,14 +6322,11 @@ pub unsafe fn _mm512_packus_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackuswb))] -pub unsafe fn _mm512_mask_packus_epi16( - src: __m512i, - k: __mmask64, - a: __m512i, - b: __m512i, -) -> __m512i { - let pack = _mm512_packus_epi16(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, pack, src.as_i8x64())) +pub fn _mm512_mask_packus_epi16(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, src.as_i8x64())) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6028,9 +6336,11 @@ pub unsafe fn _mm512_mask_packus_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackuswb))] -pub unsafe fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let pack = _mm512_packus_epi16(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) +pub fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let pack = _mm512_packus_epi16(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, pack, i8x64::ZERO)) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6040,14 +6350,11 @@ pub unsafe fn _mm512_maskz_packus_epi16(k: __mmask64, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackuswb))] -pub unsafe fn _mm256_mask_packus_epi16( - src: __m256i, - k: __mmask32, - a: __m256i, - b: __m256i, -) -> __m256i { - let pack = _mm256_packus_epi16(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, pack, src.as_i8x32())) +pub fn _mm256_mask_packus_epi16(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi16(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, pack, src.as_i8x32())) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6057,9 +6364,11 @@ pub unsafe fn _mm256_mask_packus_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackuswb))] -pub unsafe fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let pack = _mm256_packus_epi16(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, pack, i8x32::ZERO)) +pub fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let pack = _mm256_packus_epi16(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, pack, i8x32::ZERO)) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
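Illustrative sketch (not from the patch): unlike the signed pack, `vpackuswb` keeps values in 128..=255, so a 16-bit 200 survives as the byte 0xC8 while negatives clamp to 0.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512bw")]
    fn packus_epi16_keeps_u8_range() -> bool {
        let a = _mm512_set1_epi16(200); // fits u8 but not i8
        let b = _mm512_set1_epi16(-3);  // negative clamps to 0
        let r = _mm512_packus_epi16(a, b);
        let kept = _mm512_cmpeq_epi8_mask(r, _mm512_set1_epi8(200u8 as i8));
        let zeroed = _mm512_cmpeq_epi8_mask(r, _mm512_setzero_si512());
        kept.count_ones() == 32 && zeroed.count_ones() == 32
    }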
@@ -6069,9 +6378,11 @@ pub unsafe fn _mm256_maskz_packus_epi16(k: __mmask32, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackuswb))] -pub unsafe fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let pack = _mm_packus_epi16(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, pack, src.as_i8x16())) +pub fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi16(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, pack, src.as_i8x16())) + } } /// Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6081,9 +6392,11 @@ pub unsafe fn _mm_mask_packus_epi16(src: __m128i, k: __mmask16, a: __m128i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpackuswb))] -pub unsafe fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let pack = _mm_packus_epi16(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, pack, i8x16::ZERO)) +pub fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let pack = _mm_packus_epi16(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, pack, i8x16::ZERO)) + } } /// Average packed unsigned 16-bit integers in a and b, and store the results in dst. @@ -6093,11 +6406,13 @@ pub unsafe fn _mm_maskz_packus_epi16(k: __mmask16, a: __m128i, b: __m128i) -> __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i { - let a = simd_cast::<_, u32x32>(a.as_u16x32()); - let b = simd_cast::<_, u32x32>(b.as_u16x32()); - let r = simd_shr(simd_add(simd_add(a, b), u32x32::splat(1)), u32x32::splat(1)); - transmute(simd_cast::<_, u16x32>(r)) +pub fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, u32x32>(a.as_u16x32()); + let b = simd_cast::<_, u32x32>(b.as_u16x32()); + let r = simd_shr(simd_add(simd_add(a, b), u32x32::splat(1)), u32x32::splat(1)); + transmute(simd_cast::<_, u16x32>(r)) + } } /// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6107,9 +6422,11 @@ pub unsafe fn _mm512_avg_epu16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm512_mask_avg_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let avg = _mm512_avg_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, avg, src.as_u16x32())) +pub fn _mm512_mask_avg_epu16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, avg, src.as_u16x32())) + } } /// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
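The open-coded widening in the avg hunk above implements the rounding average (a + b + 1) >> 1 without intermediate overflow. A small sketch (illustrative, not from the patch) checks both the round-up and the no-overflow case.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512bw")]
    fn avg_epu16_rounds_and_does_not_overflow() -> bool {
        // (1 + 2 + 1) >> 1 = 2: ties round up.
        let r = _mm512_avg_epu16(_mm512_set1_epi16(1), _mm512_set1_epi16(2));
        // (65535 + 65535 + 1) >> 1 = 65535: the 32-bit widening prevents wrap-around.
        let top = _mm512_set1_epi16(-1); // 0xFFFF in every lane
        let s = _mm512_avg_epu16(top, top);
        _mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(2)) == u32::MAX
            && _mm512_cmpeq_epi16_mask(s, top) == u32::MAX
    }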
@@ -6119,9 +6436,11 @@ pub unsafe fn _mm512_mask_avg_epu16(src: __m512i, k: __mmask32, a: __m512i, b: _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm512_maskz_avg_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let avg = _mm512_avg_epu16(a, b).as_u16x32(); - transmute(simd_select_bitmask(k, avg, u16x32::ZERO)) +pub fn _mm512_maskz_avg_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu16(a, b).as_u16x32(); + transmute(simd_select_bitmask(k, avg, u16x32::ZERO)) + } } /// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6131,9 +6450,11 @@ pub unsafe fn _mm512_maskz_avg_epu16(k: __mmask32, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm256_mask_avg_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let avg = _mm256_avg_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, avg, src.as_u16x16())) +pub fn _mm256_mask_avg_epu16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, avg, src.as_u16x16())) + } } /// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6143,9 +6464,11 @@ pub unsafe fn _mm256_mask_avg_epu16(src: __m256i, k: __mmask16, a: __m256i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm256_maskz_avg_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let avg = _mm256_avg_epu16(a, b).as_u16x16(); - transmute(simd_select_bitmask(k, avg, u16x16::ZERO)) +pub fn _mm256_maskz_avg_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu16(a, b).as_u16x16(); + transmute(simd_select_bitmask(k, avg, u16x16::ZERO)) + } } /// Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6155,9 +6478,11 @@ pub unsafe fn _mm256_maskz_avg_epu16(k: __mmask16, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm_mask_avg_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let avg = _mm_avg_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, avg, src.as_u16x8())) +pub fn _mm_mask_avg_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, avg, src.as_u16x8())) + } } /// Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -6167,9 +6492,11 @@ pub unsafe fn _mm_mask_avg_epu16(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgw))] -pub unsafe fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let avg = _mm_avg_epu16(a, b).as_u16x8(); - transmute(simd_select_bitmask(k, avg, u16x8::ZERO)) +pub fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu16(a, b).as_u16x8(); + transmute(simd_select_bitmask(k, avg, u16x8::ZERO)) + } } /// Average packed unsigned 8-bit integers in a and b, and store the results in dst. @@ -6179,11 +6506,13 @@ pub unsafe fn _mm_maskz_avg_epu16(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i { - let a = simd_cast::<_, u16x64>(a.as_u8x64()); - let b = simd_cast::<_, u16x64>(b.as_u8x64()); - let r = simd_shr(simd_add(simd_add(a, b), u16x64::splat(1)), u16x64::splat(1)); - transmute(simd_cast::<_, u8x64>(r)) +pub fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, u16x64>(a.as_u8x64()); + let b = simd_cast::<_, u16x64>(b.as_u8x64()); + let r = simd_shr(simd_add(simd_add(a, b), u16x64::splat(1)), u16x64::splat(1)); + transmute(simd_cast::<_, u8x64>(r)) + } } /// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6193,9 +6522,11 @@ pub unsafe fn _mm512_avg_epu8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm512_mask_avg_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let avg = _mm512_avg_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, avg, src.as_u8x64())) +pub fn _mm512_mask_avg_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, avg, src.as_u8x64())) + } } /// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6205,9 +6536,11 @@ pub unsafe fn _mm512_mask_avg_epu8(src: __m512i, k: __mmask64, a: __m512i, b: __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm512_maskz_avg_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let avg = _mm512_avg_epu8(a, b).as_u8x64(); - transmute(simd_select_bitmask(k, avg, u8x64::ZERO)) +pub fn _mm512_maskz_avg_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let avg = _mm512_avg_epu8(a, b).as_u8x64(); + transmute(simd_select_bitmask(k, avg, u8x64::ZERO)) + } } /// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -6217,9 +6550,11 @@ pub unsafe fn _mm512_maskz_avg_epu8(k: __mmask64, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm256_mask_avg_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let avg = _mm256_avg_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, avg, src.as_u8x32())) +pub fn _mm256_mask_avg_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, avg, src.as_u8x32())) + } } /// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6229,9 +6564,11 @@ pub unsafe fn _mm256_mask_avg_epu8(src: __m256i, k: __mmask32, a: __m256i, b: __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm256_maskz_avg_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let avg = _mm256_avg_epu8(a, b).as_u8x32(); - transmute(simd_select_bitmask(k, avg, u8x32::ZERO)) +pub fn _mm256_maskz_avg_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let avg = _mm256_avg_epu8(a, b).as_u8x32(); + transmute(simd_select_bitmask(k, avg, u8x32::ZERO)) + } } /// Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6241,9 +6578,11 @@ pub unsafe fn _mm256_maskz_avg_epu8(k: __mmask32, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm_mask_avg_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let avg = _mm_avg_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, avg, src.as_u8x16())) +pub fn _mm_mask_avg_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, avg, src.as_u8x16())) + } } /// Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6253,9 +6592,11 @@ pub unsafe fn _mm_mask_avg_epu8(src: __m128i, k: __mmask16, a: __m128i, b: __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpavgb))] -pub unsafe fn _mm_maskz_avg_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let avg = _mm_avg_epu8(a, b).as_u8x16(); - transmute(simd_select_bitmask(k, avg, u8x16::ZERO)) +pub fn _mm_maskz_avg_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let avg = _mm_avg_epu8(a, b).as_u8x16(); + transmute(simd_select_bitmask(k, avg, u8x16::ZERO)) + } } /// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst. 
@@ -6265,8 +6606,8 @@ pub unsafe fn _mm_maskz_avg_epu8(k: __mmask16, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm512_sll_epi16(a: __m512i, count: __m128i) -> __m512i { - transmute(vpsllw(a.as_i16x32(), count.as_i16x8())) +pub fn _mm512_sll_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsllw(a.as_i16x32(), count.as_i16x8())) } } /// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6276,14 +6617,11 @@ pub unsafe fn _mm512_sll_epi16(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm512_mask_sll_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_sll_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +pub fn _mm512_mask_sll_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6293,9 +6631,11 @@ pub unsafe fn _mm512_mask_sll_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm512_maskz_sll_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_sll_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_sll_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6305,14 +6645,11 @@ pub unsafe fn _mm512_maskz_sll_epi16(k: __mmask32, a: __m512i, count: __m128i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm256_mask_sll_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_sll_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) +pub fn _mm256_mask_sll_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } } /// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
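Illustrative sketch (not from the patch): the shift amount for `vpsllw` comes from the low 64 bits of `count`, and any count of 16 or more clears every lane; the count vector is built with AVX-512 casts here to stay within the intrinsics this patch covers.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512bw")]
    fn sll_epi16_by_count() -> bool {
        let a = _mm512_set1_epi16(1);
        let count = _mm512_castsi512_si128(_mm512_set1_epi64(3));
        let shifted = _mm512_sll_epi16(a, count); // 1 << 3 = 8
        let big = _mm512_castsi512_si128(_mm512_set1_epi64(16));
        let cleared = _mm512_sll_epi16(a, big);   // count >= 16 clears the lanes
        _mm512_cmpeq_epi16_mask(shifted, _mm512_set1_epi16(8)) == u32::MAX
            && _mm512_cmpeq_epi16_mask(cleared, _mm512_setzero_si512()) == u32::MAX
    }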
@@ -6322,9 +6659,11 @@ pub unsafe fn _mm256_mask_sll_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm256_maskz_sll_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_sll_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_sll_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6334,9 +6673,11 @@ pub unsafe fn _mm256_maskz_sll_epi16(k: __mmask16, a: __m256i, count: __m128i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm_mask_sll_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sll_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) +pub fn _mm_mask_sll_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } } /// Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6346,9 +6687,11 @@ pub unsafe fn _mm_mask_sll_epi16(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw))] -pub unsafe fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sll_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. 
@@ -6359,12 +6702,14 @@ pub unsafe fn _mm_maskz_sll_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_slli_epi16<const IMM8: u32>(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 16 { - _mm512_setzero_si512() - } else { - transmute(simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16))) +pub fn _mm512_slli_epi16<const IMM8: u32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 16 { + _mm512_setzero_si512() + } else { + transmute(simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16))) + } } } @@ -6376,18 +6721,16 @@ pub unsafe fn _mm512_slli_epi16<const IMM8: u32>(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_slli_epi16<const IMM8: u32>( - src: __m512i, - k: __mmask32, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 16 { - u16x32::ZERO - } else { - simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u16x32())) +pub fn _mm512_mask_slli_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 16 { + u16x32::ZERO + } else { + simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u16x32())) + } } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6398,13 +6741,15 @@ pub unsafe fn _mm512_mask_slli_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_slli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 16 { - _mm512_setzero_si512() - } else { - let shf = simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)); - transmute(simd_select_bitmask(k, shf, u16x32::ZERO)) +pub fn _mm512_maskz_slli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 16 { + _mm512_setzero_si512() + } else { + let shf = simd_shl(a.as_u16x32(), u16x32::splat(IMM8 as u16)); + transmute(simd_select_bitmask(k, shf, u16x32::ZERO)) + } } } @@ -6416,18 +6761,16 @@ pub unsafe fn _mm512_maskz_slli_epi16(k: __mmask32, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_slli_epi16<const IMM8: u32>( - src: __m256i, - k: __mmask16, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 16 { - u16x16::ZERO - } else { - simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u16x16())) +pub fn _mm256_mask_slli_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 16 { + u16x16::ZERO + } else { + simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u16x16())) + } } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask
bit is not set). @@ -6438,13 +6781,15 @@ pub unsafe fn _mm256_mask_slli_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_slli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 16 { - _mm256_setzero_si256() - } else { - let shf = simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)); - transmute(simd_select_bitmask(k, shf, u16x16::ZERO)) +pub fn _mm256_maskz_slli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 16 { + _mm256_setzero_si256() + } else { + let shf = simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)); + transmute(simd_select_bitmask(k, shf, u16x16::ZERO)) + } } } @@ -6456,18 +6801,16 @@ pub unsafe fn _mm256_maskz_slli_epi16(k: __mmask16, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_slli_epi16<const IMM8: u32>( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 16 { - u16x8::ZERO - } else { - simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u16x8())) +pub fn _mm_mask_slli_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 16 { + u16x8::ZERO + } else { + simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u16x8())) + } } /// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6478,13 +6821,15 @@ pub unsafe fn _mm_mask_slli_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_slli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 16 { - _mm_setzero_si128() - } else { - let shf = simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)); - transmute(simd_select_bitmask(k, shf, u16x8::ZERO)) +pub fn _mm_maskz_slli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 16 { + _mm_setzero_si128() + } else { + let shf = simd_shl(a.as_u16x8(), u16x8::splat(IMM8 as u16)); + transmute(simd_select_bitmask(k, shf, u16x8::ZERO)) + } } } @@ -6495,8 +6840,8 @@ pub unsafe fn _mm_maskz_slli_epi16(k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) +pub fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsllvw(a.as_i16x32(), count.as_i16x32())) } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
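For the immediate-count forms, the `IMM8 >= 16` branch above is observable from safe code as well. A small sketch (illustrative, not from the patch), called here with the turbofish form; thanks to `rustc_legacy_const_generics` the immediate can also be passed as a trailing argument.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512bw")]
    fn slli_epi16_immediate() -> bool {
        let a = _mm512_set1_epi16(3);
        let r = _mm512_slli_epi16::<2>(a);  // 3 << 2 = 12
        let z = _mm512_slli_epi16::<16>(a); // IMM8 >= 16 yields all zeros
        _mm512_cmpeq_epi16_mask(r, _mm512_set1_epi16(12)) == u32::MAX
            && _mm512_cmpeq_epi16_mask(z, _mm512_setzero_si512()) == u32::MAX
    }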
@@ -6506,14 +6851,11 @@ pub unsafe fn _mm512_sllv_epi16(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm512_mask_sllv_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_sllv_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +pub fn _mm512_mask_sllv_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6523,9 +6865,11 @@ pub unsafe fn _mm512_mask_sllv_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_sllv_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -6535,8 +6879,8 @@ pub unsafe fn _mm512_maskz_sllv_epi16(k: __mmask32, a: __m512i, count: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i { - transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) +pub fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsllvw256(a.as_i16x16(), count.as_i16x16())) } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6546,14 +6890,11 @@ pub unsafe fn _mm256_sllv_epi16(a: __m256i, count: __m256i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm256_mask_sllv_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_sllv_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) +pub fn _mm256_mask_sllv_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -6563,9 +6904,11 @@ pub unsafe fn _mm256_mask_sllv_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_sllv_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -6575,8 +6918,8 @@ pub unsafe fn _mm256_maskz_sllv_epi16(k: __mmask16, a: __m256i, count: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i { - transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) +pub fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsllvw128(a.as_i16x8(), count.as_i16x8())) } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6586,14 +6929,11 @@ pub unsafe fn _mm_sllv_epi16(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm_mask_sllv_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_sllv_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) +pub fn _mm_mask_sllv_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } } /// Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6603,9 +6943,11 @@ pub unsafe fn _mm_mask_sllv_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvw))] -pub unsafe fn _mm_maskz_sllv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sllv_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_sllv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst. 
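As an aside for reviewers, here is a minimal sketch of how the variable-shift intrinsics above read once they are safe fns. It assumes a nightly crate with `#![feature(stdarch_x86_avx512)]`, and the helper name is illustrative; callers still have to establish avx512bw support (for example via `is_x86_feature_detected!`) before reaching such a function.

    use core::arch::x86_64::*;

    // Shift each 16-bit lane of `a` left by its lane in `shifts`; lanes whose
    // bit in `k` is clear are copied from `src` instead. No `unsafe` is needed
    // in the body once the intrinsics are safe fns.
    #[target_feature(enable = "avx512bw")]
    fn scale_lanes(src: __m512i, k: __mmask32, a: __m512i, shifts: __m512i) -> __m512i {
        _mm512_mask_sllv_epi16(src, k, a, shifts)
    }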
@@ -6615,8 +6957,8 @@ pub unsafe fn _mm_maskz_sllv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm512_srl_epi16(a: __m512i, count: __m128i) -> __m512i { - transmute(vpsrlw(a.as_i16x32(), count.as_i16x8())) +pub fn _mm512_srl_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrlw(a.as_i16x32(), count.as_i16x8())) } } /// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6626,14 +6968,11 @@ pub unsafe fn _mm512_srl_epi16(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm512_mask_srl_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_srl_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +pub fn _mm512_mask_srl_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6643,9 +6982,11 @@ pub unsafe fn _mm512_mask_srl_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm512_maskz_srl_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_srl_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_srl_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6655,14 +6996,11 @@ pub unsafe fn _mm512_maskz_srl_epi16(k: __mmask32, a: __m512i, count: __m128i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm256_mask_srl_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_srl_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) +pub fn _mm256_mask_srl_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } } /// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -6672,9 +7010,11 @@ pub unsafe fn _mm256_mask_srl_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm256_maskz_srl_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_srl_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_srl_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6684,9 +7024,11 @@ pub unsafe fn _mm256_maskz_srl_epi16(k: __mmask16, a: __m256i, count: __m128i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm_mask_srl_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srl_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) +pub fn _mm_mask_srl_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } } /// Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6696,9 +7038,11 @@ pub unsafe fn _mm_mask_srl_epi16(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw))] -pub unsafe fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srl_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst. 
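A similar sketch for the uniform-count right shifts, under the same assumptions as above; the count is taken from the low 64 bits of an `__m128i`.

    use core::arch::x86_64::*;

    // Every lane of `a` is shifted right by the same runtime count; lanes with a
    // cleared mask bit are zeroed.
    #[target_feature(enable = "avx512bw,avx512vl")]
    fn shift_right_or_zero(k: __mmask16, a: __m256i, bits: i64) -> __m256i {
        let count = _mm_set_epi64x(0, bits);
        _mm256_maskz_srl_epi16(k, a, count)
    }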
@@ -6709,12 +7053,14 @@ pub unsafe fn _mm_maskz_srl_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_srli_epi16<const IMM8: u32>(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 16 { - _mm512_setzero_si512() - } else { - transmute(simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16))) +pub fn _mm512_srli_epi16<const IMM8: u32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 16 { + _mm512_setzero_si512() + } else { + transmute(simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16))) + } } } @@ -6726,18 +7072,16 @@ pub unsafe fn _mm512_srli_epi16<const IMM8: u32>(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_srli_epi16<const IMM8: u32>( - src: __m512i, - k: __mmask32, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 16 { - u16x32::ZERO - } else { - simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u16x32())) +pub fn _mm512_mask_srli_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 16 { + u16x32::ZERO + } else { + simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u16x32())) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6748,14 +7092,16 @@ pub unsafe fn _mm512_mask_srli_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_srli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - //imm8 should be u32, it seems the document to verify is incorrect - if IMM8 >= 16 { - _mm512_setzero_si512() - } else { - let shf = simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)); - transmute(simd_select_bitmask(k, shf, u16x32::ZERO)) +pub fn _mm512_maskz_srli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + //imm8 should be u32, it seems the document to verify is incorrect + if IMM8 >= 16 { + _mm512_setzero_si512() + } else { + let shf = simd_shr(a.as_u16x32(), u16x32::splat(IMM8 as u16)); + transmute(simd_select_bitmask(k, shf, u16x32::ZERO)) + } } } @@ -6767,14 +7113,12 @@ pub unsafe fn _mm512_maskz_srli_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_srli_epi16<const IMM8: u32>( - src: __m256i, - k: __mmask16, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_srli_epi16::<IMM8>(a); - transmute(simd_select_bitmask(k, shf.as_i16x16(), src.as_i16x16())) +pub fn _mm256_mask_srli_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_srli_epi16::<IMM8>(a); + transmute(simd_select_bitmask(k, shf.as_i16x16(), src.as_i16x16())) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the
corresponding mask bit is not set). @@ -6785,10 +7129,12 @@ pub unsafe fn _mm256_mask_srli_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_srli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_srli_epi16::<IMM8>(a); - transmute(simd_select_bitmask(k, shf.as_i16x16(), i16x16::ZERO)) +pub fn _mm256_maskz_srli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_srli_epi16::<IMM8>(a); + transmute(simd_select_bitmask(k, shf.as_i16x16(), i16x16::ZERO)) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6799,14 +7145,12 @@ pub unsafe fn _mm256_maskz_srli_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_srli_epi16<const IMM8: u32>( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_srli_epi16::<IMM8>(a); - transmute(simd_select_bitmask(k, shf.as_i16x8(), src.as_i16x8())) +pub fn _mm_mask_srli_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_srli_epi16::<IMM8>(a); + transmute(simd_select_bitmask(k, shf.as_i16x8(), src.as_i16x8())) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6817,10 +7161,12 @@ pub unsafe fn _mm_mask_srli_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_srli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_srli_epi16::<IMM8>(a); - transmute(simd_select_bitmask(k, shf.as_i16x8(), i16x8::ZERO)) +pub fn _mm_maskz_srli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_srli_epi16::<IMM8>(a); + transmute(simd_select_bitmask(k, shf.as_i16x8(), i16x8::ZERO)) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -6830,8 +7176,8 @@ pub unsafe fn _mm_maskz_srli_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) +pub fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsrlvw(a.as_i16x32(), count.as_i16x32())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
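For the immediate forms the shift amount is a const generic (also reachable through `rustc_legacy_const_generics`), so a sketch like the following, under the same assumptions as the earlier example, passes it with a turbofish.

    use core::arch::x86_64::*;

    // Logical right shift by a compile-time immediate; lanes not selected by `k`
    // keep their value from `src`.
    #[target_feature(enable = "avx512bw,avx512vl")]
    fn halve_unsigned_words(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
        _mm256_mask_srli_epi16::<1>(src, k, a)
    }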
@@ -6841,14 +7187,11 @@ pub unsafe fn _mm512_srlv_epi16(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm512_mask_srlv_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_srlv_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +pub fn _mm512_mask_srlv_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6858,9 +7201,11 @@ pub unsafe fn _mm512_mask_srlv_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_srlv_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -6870,8 +7215,8 @@ pub unsafe fn _mm512_maskz_srlv_epi16(k: __mmask32, a: __m512i, count: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i { - transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) +pub fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsrlvw256(a.as_i16x16(), count.as_i16x16())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6881,14 +7226,11 @@ pub unsafe fn _mm256_srlv_epi16(a: __m256i, count: __m256i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm256_mask_srlv_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_srlv_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) +pub fn _mm256_mask_srlv_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -6898,9 +7240,11 @@ pub unsafe fn _mm256_mask_srlv_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_srlv_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -6910,8 +7254,8 @@ pub unsafe fn _mm256_maskz_srlv_epi16(k: __mmask16, a: __m256i, count: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i { - transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) +pub fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsrlvw128(a.as_i16x8(), count.as_i16x8())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6921,14 +7265,11 @@ pub unsafe fn _mm_srlv_epi16(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm_mask_srlv_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_srlv_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) +pub fn _mm_mask_srlv_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6938,9 +7279,11 @@ pub unsafe fn _mm_mask_srlv_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvw))] -pub unsafe fn _mm_maskz_srlv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srlv_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_srlv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst. 
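The variable-count logical right shifts follow the same pattern; an illustrative 128-bit sketch, same assumptions as above.

    use core::arch::x86_64::*;

    // Each lane shifts right by its own count; counts of 16 or more produce zero.
    #[target_feature(enable = "avx512bw,avx512vl")]
    fn per_lane_shift_right(a: __m128i, counts: __m128i) -> __m128i {
        _mm_srlv_epi16(a, counts)
    }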
@@ -6950,8 +7293,8 @@ pub unsafe fn _mm_maskz_srlv_epi16(k: __mmask8, a: __m128i, count: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm512_sra_epi16(a: __m512i, count: __m128i) -> __m512i { - transmute(vpsraw(a.as_i16x32(), count.as_i16x8())) +pub fn _mm512_sra_epi16(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsraw(a.as_i16x32(), count.as_i16x8())) } } /// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6961,14 +7304,11 @@ pub unsafe fn _mm512_sra_epi16(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm512_mask_sra_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_sra_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +pub fn _mm512_mask_sra_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -6978,9 +7318,11 @@ pub unsafe fn _mm512_mask_sra_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm512_maskz_sra_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_sra_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_sra_epi16(k: __mmask32, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -6990,14 +7332,11 @@ pub unsafe fn _mm512_maskz_sra_epi16(k: __mmask32, a: __m512i, count: __m128i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm256_mask_sra_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_sra_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) +pub fn _mm256_mask_sra_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } } /// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -7007,9 +7346,11 @@ pub unsafe fn _mm256_mask_sra_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm256_maskz_sra_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_sra_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_sra_epi16(k: __mmask16, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7019,9 +7360,11 @@ pub unsafe fn _mm256_maskz_sra_epi16(k: __mmask16, a: __m256i, count: __m128i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm_mask_sra_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sra_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) +pub fn _mm_mask_sra_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } } /// Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7031,9 +7374,11 @@ pub unsafe fn _mm_mask_sra_epi16(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw))] -pub unsafe fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sra_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst. @@ -7044,9 +7389,11 @@ pub unsafe fn _mm_maskz_sra_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_srai_epi16<const IMM8: u32>(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - transmute(simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16))) +pub fn _mm512_srai_epi16<const IMM8: u32>(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16))) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
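For the arithmetic shifts the sign bit is replicated, which makes the uniform-count form a handy sign-preserving divide by a power of two; a sketch under the same assumptions as the earlier examples.

    use core::arch::x86_64::*;

    // Divide every signed 16-bit lane by 2^bits, rounding toward negative infinity.
    #[target_feature(enable = "avx512bw")]
    fn scale_down_signed(a: __m512i, bits: i64) -> __m512i {
        let count = _mm_set_epi64x(0, bits);
        _mm512_sra_epi16(a, count)
    }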
@@ -7057,14 +7404,12 @@ pub unsafe fn _mm512_srai_epi16<const IMM8: u32>(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_srai_epi16<const IMM8: u32>( - src: __m512i, - k: __mmask32, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +pub fn _mm512_mask_srai_epi16<const IMM8: u32>(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7075,10 +7420,12 @@ pub unsafe fn _mm512_mask_srai_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_srai_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_srai_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = simd_shr(a.as_i16x32(), i16x32::splat(IMM8.min(15) as i16)); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7089,14 +7436,12 @@ pub unsafe fn _mm512_maskz_srai_epi16<const IMM8: u32>(k: __mmask32, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_srai_epi16<const IMM8: u32>( - src: __m256i, - k: __mmask16, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)); - transmute(simd_select_bitmask(k, r, src.as_i16x16())) +pub fn _mm256_mask_srai_epi16<const IMM8: u32>(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)); + transmute(simd_select_bitmask(k, r, src.as_i16x16())) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -7107,10 +7452,12 @@ pub unsafe fn _mm256_mask_srai_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_srai_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)); - transmute(simd_select_bitmask(k, r, i16x16::ZERO)) +pub fn _mm256_maskz_srai_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)); + transmute(simd_select_bitmask(k, r, i16x16::ZERO)) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7121,14 +7468,12 @@ pub unsafe fn _mm256_maskz_srai_epi16<const IMM8: u32>(k: __mmask16, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_srai_epi16<const IMM8: u32>( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)); - transmute(simd_select_bitmask(k, r, src.as_i16x8())) +pub fn _mm_mask_srai_epi16<const IMM8: u32>(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)); + transmute(simd_select_bitmask(k, r, src.as_i16x8())) + } } /// Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7139,10 +7484,12 @@ pub unsafe fn _mm_mask_srai_epi16<const IMM8: u32>( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraw, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)); - transmute(simd_select_bitmask(k, r, i16x8::ZERO)) +pub fn _mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = simd_shr(a.as_i16x8(), i16x8::splat(IMM8.min(15) as i16)); + transmute(simd_select_bitmask(k, r, i16x8::ZERO)) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. @@ -7152,8 +7499,8 @@ pub unsafe fn _mm_maskz_srai_epi16<const IMM8: u32>(k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) +pub fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsravw(a.as_i16x32(), count.as_i16x32())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
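And the immediate arithmetic shift, again with the amount as a const generic; note that amounts above 15 are clamped to 15 rather than producing zero. Same assumptions as the earlier sketches.

    use core::arch::x86_64::*;

    // Arithmetic shift by an immediate; lanes not selected by `k` are zeroed.
    #[target_feature(enable = "avx512bw,avx512vl")]
    fn quarter_signed_words(k: __mmask8, a: __m128i) -> __m128i {
        _mm_maskz_srai_epi16::<2>(k, a)
    }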
@@ -7163,14 +7510,11 @@ pub unsafe fn _mm512_srav_epi16(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm512_mask_srav_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_srav_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) +pub fn _mm512_mask_srav_epi16(src: __m512i, k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7180,9 +7524,11 @@ pub unsafe fn _mm512_mask_srav_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_srav_epi16(a, count).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi16(a, count).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. @@ -7192,8 +7538,8 @@ pub unsafe fn _mm512_maskz_srav_epi16(k: __mmask32, a: __m512i, count: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i { - transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) +pub fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsravw256(a.as_i16x16(), count.as_i16x16())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7203,14 +7549,11 @@ pub unsafe fn _mm256_srav_epi16(a: __m256i, count: __m256i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm256_mask_srav_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_srav_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) +pub fn _mm256_mask_srav_epi16(src: __m256i, k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -7220,9 +7563,11 @@ pub unsafe fn _mm256_mask_srav_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_srav_epi16(a, count).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi16(a, count).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. @@ -7232,8 +7577,8 @@ pub unsafe fn _mm256_maskz_srav_epi16(k: __mmask16, a: __m256i, count: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i { - transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) +pub fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsravw128(a.as_i16x8(), count.as_i16x8())) } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7243,14 +7588,11 @@ pub unsafe fn _mm_srav_epi16(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm_mask_srav_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_srav_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) +pub fn _mm_mask_srav_epi16(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } } /// Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7260,9 +7602,11 @@ pub unsafe fn _mm_mask_srav_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravw))] -pub unsafe fn _mm_maskz_srav_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srav_epi16(a, count).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_srav_epi16(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi16(a, count).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. 
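The per-lane arithmetic shift rounds out the shift family; an illustrative sketch under the same assumptions.

    use core::arch::x86_64::*;

    // Sign-extending shift with a per-lane count; masked-off lanes become zero.
    #[target_feature(enable = "avx512bw,avx512vl")]
    fn per_lane_signed_shift(k: __mmask16, a: __m256i, counts: __m256i) -> __m256i {
        _mm256_maskz_srav_epi16(k, a, counts)
    }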
@@ -7272,8 +7616,8 @@ pub unsafe fn _mm_maskz_srav_epi16(k: __mmask8, a: __m128i, count: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub unsafe fn _mm512_permutex2var_epi16(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { - transmute(vpermi2w(a.as_i16x32(), idx.as_i16x32(), b.as_i16x32())) +pub fn _mm512_permutex2var_epi16(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2w(a.as_i16x32(), idx.as_i16x32(), b.as_i16x32())) } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -7283,14 +7627,16 @@ pub unsafe fn _mm512_permutex2var_epi16(a: __m512i, idx: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2w))] -pub unsafe fn _mm512_mask_permutex2var_epi16( +pub fn _mm512_mask_permutex2var_epi16( a: __m512i, k: __mmask32, idx: __m512i, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); - transmute(simd_select_bitmask(k, permute, a.as_i16x32())) + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, a.as_i16x32())) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7300,14 +7646,16 @@ pub unsafe fn _mm512_mask_permutex2var_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub unsafe fn _mm512_maskz_permutex2var_epi16( +pub fn _mm512_maskz_permutex2var_epi16( k: __mmask32, a: __m512i, idx: __m512i, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); - transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). @@ -7317,14 +7665,16 @@ pub unsafe fn _mm512_maskz_permutex2var_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2w))] -pub unsafe fn _mm512_mask2_permutex2var_epi16( +pub fn _mm512_mask2_permutex2var_epi16( a: __m512i, idx: __m512i, k: __mmask32, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); - transmute(simd_select_bitmask(k, permute, idx.as_i16x32())) + unsafe { + let permute = _mm512_permutex2var_epi16(a, idx, b).as_i16x32(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x32())) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. 
@@ -7334,8 +7684,8 @@ pub unsafe fn _mm512_mask2_permutex2var_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub unsafe fn _mm256_permutex2var_epi16(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { - transmute(vpermi2w256(a.as_i16x16(), idx.as_i16x16(), b.as_i16x16())) +pub fn _mm256_permutex2var_epi16(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2w256(a.as_i16x16(), idx.as_i16x16(), b.as_i16x16())) } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -7345,14 +7695,16 @@ pub unsafe fn _mm256_permutex2var_epi16(a: __m256i, idx: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2w))] -pub unsafe fn _mm256_mask_permutex2var_epi16( +pub fn _mm256_mask_permutex2var_epi16( a: __m256i, k: __mmask16, idx: __m256i, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); - transmute(simd_select_bitmask(k, permute, a.as_i16x16())) + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, a.as_i16x16())) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7362,14 +7714,16 @@ pub unsafe fn _mm256_mask_permutex2var_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub unsafe fn _mm256_maskz_permutex2var_epi16( +pub fn _mm256_maskz_permutex2var_epi16( k: __mmask16, a: __m256i, idx: __m256i, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); - transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). @@ -7379,14 +7733,16 @@ pub unsafe fn _mm256_maskz_permutex2var_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2w))] -pub unsafe fn _mm256_mask2_permutex2var_epi16( +pub fn _mm256_mask2_permutex2var_epi16( a: __m256i, idx: __m256i, k: __mmask16, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); - transmute(simd_select_bitmask(k, permute, idx.as_i16x16())) + unsafe { + let permute = _mm256_permutex2var_epi16(a, idx, b).as_i16x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x16())) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. 
@@ -7396,8 +7752,8 @@ pub unsafe fn _mm256_mask2_permutex2var_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub unsafe fn _mm_permutex2var_epi16(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - transmute(vpermi2w128(a.as_i16x8(), idx.as_i16x8(), b.as_i16x8())) +pub fn _mm_permutex2var_epi16(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2w128(a.as_i16x8(), idx.as_i16x8(), b.as_i16x8())) } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -7407,14 +7763,11 @@ pub unsafe fn _mm_permutex2var_epi16(a: __m128i, idx: __m128i, b: __m128i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2w))] -pub unsafe fn _mm_mask_permutex2var_epi16( - a: __m128i, - k: __mmask8, - idx: __m128i, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); - transmute(simd_select_bitmask(k, permute, a.as_i16x8())) +pub fn _mm_mask_permutex2var_epi16(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, a.as_i16x8())) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7424,14 +7777,11 @@ pub unsafe fn _mm_mask_permutex2var_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2w or vpermt2w -pub unsafe fn _mm_maskz_permutex2var_epi16( - k: __mmask8, - a: __m128i, - idx: __m128i, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); - transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) +pub fn _mm_maskz_permutex2var_epi16(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) + } } /// Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). @@ -7441,14 +7791,11 @@ pub unsafe fn _mm_maskz_permutex2var_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2w))] -pub unsafe fn _mm_mask2_permutex2var_epi16( - a: __m128i, - idx: __m128i, - k: __mmask8, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); - transmute(simd_select_bitmask(k, permute, idx.as_i16x8())) +pub fn _mm_mask2_permutex2var_epi16(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi16(a, idx, b).as_i16x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i16x8())) + } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
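A sketch of the two-source permute, same assumptions as above; in the 128-bit form each index lane uses bit 3 to choose between `a` and `b` and bits 2:0 to pick the word inside it.

    use core::arch::x86_64::*;

    // Build a vector whose lanes are picked from the 16 words spread across `a`
    // and `b`, as directed by `idx`.
    #[target_feature(enable = "avx512bw,avx512vl")]
    fn pick_from_two(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
        _mm_permutex2var_epi16(a, idx, b)
    }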
@@ -7458,8 +7805,8 @@ pub unsafe fn _mm_mask2_permutex2var_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm512_permutexvar_epi16(idx: __m512i, a: __m512i) -> __m512i { - transmute(vpermw(a.as_i16x32(), idx.as_i16x32())) +pub fn _mm512_permutexvar_epi16(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermw(a.as_i16x32(), idx.as_i16x32())) } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7469,14 +7816,16 @@ pub unsafe fn _mm512_permutexvar_epi16(idx: __m512i, a: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm512_mask_permutexvar_epi16( +pub fn _mm512_mask_permutexvar_epi16( src: __m512i, k: __mmask32, idx: __m512i, a: __m512i, ) -> __m512i { - let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); - transmute(simd_select_bitmask(k, permute, src.as_i16x32())) + unsafe { + let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); + transmute(simd_select_bitmask(k, permute, src.as_i16x32())) + } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7486,9 +7835,11 @@ pub unsafe fn _mm512_mask_permutexvar_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm512_maskz_permutexvar_epi16(k: __mmask32, idx: __m512i, a: __m512i) -> __m512i { - let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); - transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) +pub fn _mm512_maskz_permutexvar_epi16(k: __mmask32, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi16(idx, a).as_i16x32(); + transmute(simd_select_bitmask(k, permute, i16x32::ZERO)) + } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. @@ -7498,8 +7849,8 @@ pub unsafe fn _mm512_maskz_permutexvar_epi16(k: __mmask32, idx: __m512i, a: __m5 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm256_permutexvar_epi16(idx: __m256i, a: __m256i) -> __m256i { - transmute(vpermw256(a.as_i16x16(), idx.as_i16x16())) +pub fn _mm256_permutexvar_epi16(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermw256(a.as_i16x16(), idx.as_i16x16())) } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -7509,14 +7860,16 @@ pub unsafe fn _mm256_permutexvar_epi16(idx: __m256i, a: __m256i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm256_mask_permutexvar_epi16( +pub fn _mm256_mask_permutexvar_epi16( src: __m256i, k: __mmask16, idx: __m256i, a: __m256i, ) -> __m256i { - let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); - transmute(simd_select_bitmask(k, permute, src.as_i16x16())) + unsafe { + let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); + transmute(simd_select_bitmask(k, permute, src.as_i16x16())) + } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7526,9 +7879,11 @@ pub unsafe fn _mm256_mask_permutexvar_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm256_maskz_permutexvar_epi16(k: __mmask16, idx: __m256i, a: __m256i) -> __m256i { - let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); - transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) +pub fn _mm256_maskz_permutexvar_epi16(k: __mmask16, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi16(idx, a).as_i16x16(); + transmute(simd_select_bitmask(k, permute, i16x16::ZERO)) + } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. @@ -7538,8 +7893,8 @@ pub unsafe fn _mm256_maskz_permutexvar_epi16(k: __mmask16, idx: __m256i, a: __m2 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm_permutexvar_epi16(idx: __m128i, a: __m128i) -> __m128i { - transmute(vpermw128(a.as_i16x8(), idx.as_i16x8())) +pub fn _mm_permutexvar_epi16(idx: __m128i, a: __m128i) -> __m128i { + unsafe { transmute(vpermw128(a.as_i16x8(), idx.as_i16x8())) } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7549,14 +7904,11 @@ pub unsafe fn _mm_permutexvar_epi16(idx: __m128i, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm_mask_permutexvar_epi16( - src: __m128i, - k: __mmask8, - idx: __m128i, - a: __m128i, -) -> __m128i { - let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); - transmute(simd_select_bitmask(k, permute, src.as_i16x8())) +pub fn _mm_mask_permutexvar_epi16(src: __m128i, k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); + transmute(simd_select_bitmask(k, permute, src.as_i16x8())) + } } /// Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
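The single-source permute reads the same way; a 256-bit sketch under the same assumptions (only the low four bits of each index lane matter at this width).

    use core::arch::x86_64::*;

    // Gather words of `a` according to `idx`; lanes not selected by `k` are zeroed.
    #[target_feature(enable = "avx512bw,avx512vl")]
    fn gather_words(k: __mmask16, idx: __m256i, a: __m256i) -> __m256i {
        _mm256_maskz_permutexvar_epi16(k, idx, a)
    }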
@@ -7566,9 +7918,11 @@ pub unsafe fn _mm_mask_permutexvar_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermw))] -pub unsafe fn _mm_maskz_permutexvar_epi16(k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { - let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); - transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) +pub fn _mm_maskz_permutexvar_epi16(k: __mmask8, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi16(idx, a).as_i16x8(); + transmute(simd_select_bitmask(k, permute, i16x8::ZERO)) + } } /// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. @@ -7578,8 +7932,8 @@ pub unsafe fn _mm_maskz_permutexvar_epi16(k: __mmask8, idx: __m128i, a: __m128i) #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw -pub unsafe fn _mm512_mask_blend_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - transmute(simd_select_bitmask(k, b.as_i16x32(), a.as_i16x32())) +pub fn _mm512_mask_blend_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x32(), a.as_i16x32())) } } /// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. @@ -7589,8 +7943,8 @@ pub unsafe fn _mm512_mask_blend_epi16(k: __mmask32, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw -pub unsafe fn _mm256_mask_blend_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - transmute(simd_select_bitmask(k, b.as_i16x16(), a.as_i16x16())) +pub fn _mm256_mask_blend_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x16(), a.as_i16x16())) } } /// Blend packed 16-bit integers from a and b using control mask k, and store the results in dst. @@ -7600,8 +7954,8 @@ pub unsafe fn _mm256_mask_blend_epi16(k: __mmask16, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] //should be vpblendmw -pub unsafe fn _mm_mask_blend_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(simd_select_bitmask(k, b.as_i16x8(), a.as_i16x8())) +pub fn _mm_mask_blend_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i16x8(), a.as_i16x8())) } } /// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. @@ -7611,8 +7965,8 @@ pub unsafe fn _mm_mask_blend_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb -pub unsafe fn _mm512_mask_blend_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - transmute(simd_select_bitmask(k, b.as_i8x64(), a.as_i8x64())) +pub fn _mm512_mask_blend_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x64(), a.as_i8x64())) } } /// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. 
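With these intrinsics now safe, a caller that itself enables avx512bw can compose them without an unsafe block. A minimal usage sketch, assuming a nightly toolchain with the unstable stdarch_x86_avx512 crate feature enabled; the helper name is illustrative and not part of this patch:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn permute_then_blend(idx: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i {
    // Gather 16-bit lanes of `a` according to the indices in `idx`,
    // then take lanes from `b` wherever the corresponding bit of `k` is set.
    let permuted = _mm512_permutexvar_epi16(idx, a);
    _mm512_mask_blend_epi16(k, permuted, b)
}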
@@ -7622,8 +7976,8 @@ pub unsafe fn _mm512_mask_blend_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb -pub unsafe fn _mm256_mask_blend_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - transmute(simd_select_bitmask(k, b.as_i8x32(), a.as_i8x32())) +pub fn _mm256_mask_blend_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x32(), a.as_i8x32())) } } /// Blend packed 8-bit integers from a and b using control mask k, and store the results in dst. @@ -7633,8 +7987,8 @@ pub unsafe fn _mm256_mask_blend_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] //should be vpblendmb -pub unsafe fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - transmute(simd_select_bitmask(k, b.as_i8x16(), a.as_i8x16())) +pub fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i8x16(), a.as_i8x16())) } } /// Broadcast the low packed 16-bit integer from a to all elements of dst. @@ -7644,17 +7998,19 @@ pub unsafe fn _mm_mask_blend_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i { - let a = _mm512_castsi128_si512(a).as_i16x32(); - let ret: i16x32 = simd_shuffle!( - a, - a, - [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, - ], - ); - transmute(ret) +pub fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i16x32(); + let ret: i16x32 = simd_shuffle!( + a, + a, + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + ], + ); + transmute(ret) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7664,9 +8020,11 @@ pub unsafe fn _mm512_broadcastw_epi16(a: __m128i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm512_mask_broadcastw_epi16(src: __m512i, k: __mmask32, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, broadcast, src.as_i16x32())) +pub fn _mm512_mask_broadcastw_epi16(src: __m512i, k: __mmask32, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x32())) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -7676,9 +8034,11 @@ pub unsafe fn _mm512_mask_broadcastw_epi16(src: __m512i, k: __mmask32, a: __m128 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm512_maskz_broadcastw_epi16(k: __mmask32, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, broadcast, i16x32::ZERO)) +pub fn _mm512_maskz_broadcastw_epi16(k: __mmask32, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastw_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, broadcast, i16x32::ZERO)) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7688,9 +8048,11 @@ pub unsafe fn _mm512_maskz_broadcastw_epi16(k: __mmask32, a: __m128i) -> __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm256_mask_broadcastw_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_i16x16())) +pub fn _mm256_mask_broadcastw_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x16())) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7700,9 +8062,11 @@ pub unsafe fn _mm256_mask_broadcastw_epi16(src: __m256i, k: __mmask16, a: __m128 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm256_maskz_broadcastw_epi16(k: __mmask16, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, broadcast, i16x16::ZERO)) +pub fn _mm256_maskz_broadcastw_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastw_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, broadcast, i16x16::ZERO)) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7712,9 +8076,11 @@ pub unsafe fn _mm256_maskz_broadcastw_epi16(k: __mmask16, a: __m128i) -> __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm_mask_broadcastw_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i16x8())) +pub fn _mm_mask_broadcastw_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i16x8())) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -7724,9 +8090,11 @@ pub unsafe fn _mm_mask_broadcastw_epi16(src: __m128i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i { - let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, broadcast, i16x8::ZERO)) +pub fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastw_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, broadcast, i16x8::ZERO)) + } } /// Broadcast the low packed 8-bit integer from a to all elements of dst. @@ -7736,18 +8104,20 @@ pub unsafe fn _mm_maskz_broadcastw_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastb))] -pub unsafe fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i { - let a = _mm512_castsi128_si512(a).as_i8x64(); - let ret: i8x64 = simd_shuffle!( - a, - a, - [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, - ], - ); - transmute(ret) +pub fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i8x64(); + let ret: i8x64 = simd_shuffle!( + a, + a, + [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + ); + transmute(ret) + } } /// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7757,9 +8127,11 @@ pub unsafe fn _mm512_broadcastb_epi8(a: __m128i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastb))] -pub unsafe fn _mm512_mask_broadcastb_epi8(src: __m512i, k: __mmask64, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, broadcast, src.as_i8x64())) +pub fn _mm512_mask_broadcastb_epi8(src: __m512i, k: __mmask64, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x64())) + } } /// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -7769,9 +8141,11 @@ pub unsafe fn _mm512_mask_broadcastb_epi8(src: __m512i, k: __mmask64, a: __m128i #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastb))] -pub unsafe fn _mm512_maskz_broadcastb_epi8(k: __mmask64, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, broadcast, i8x64::ZERO)) +pub fn _mm512_maskz_broadcastb_epi8(k: __mmask64, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastb_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, broadcast, i8x64::ZERO)) + } } /// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7781,9 +8155,11 @@ pub unsafe fn _mm512_maskz_broadcastb_epi8(k: __mmask64, a: __m128i) -> __m512i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastb))] -pub unsafe fn _mm256_mask_broadcastb_epi8(src: __m256i, k: __mmask32, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, broadcast, src.as_i8x32())) +pub fn _mm256_mask_broadcastb_epi8(src: __m256i, k: __mmask32, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x32())) + } } /// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7793,9 +8169,11 @@ pub unsafe fn _mm256_mask_broadcastb_epi8(src: __m256i, k: __mmask32, a: __m128i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastb))] -pub unsafe fn _mm256_maskz_broadcastb_epi8(k: __mmask32, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, broadcast, i8x32::ZERO)) +pub fn _mm256_maskz_broadcastb_epi8(k: __mmask32, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastb_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, broadcast, i8x32::ZERO)) + } } /// Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7805,9 +8183,11 @@ pub unsafe fn _mm256_maskz_broadcastb_epi8(k: __mmask32, a: __m128i) -> __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastb))] -pub unsafe fn _mm_mask_broadcastb_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_i8x16())) +pub fn _mm_mask_broadcastb_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i8x16())) + } } /// Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
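A sketch of the masked byte broadcast in use, under the same nightly/stdarch_x86_avx512 assumptions; the helper name is illustrative only:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn splat_byte_into_selected(src: __m512i, k: __mmask64, byte: __m128i) -> __m512i {
    // The low byte of `byte` is written to every lane whose bit in `k` is set;
    // unselected lanes keep their value from `src`.
    _mm512_mask_broadcastb_epi8(src, k, byte)
}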
@@ -7817,9 +8197,11 @@ pub unsafe fn _mm_mask_broadcastb_epi8(src: __m128i, k: __mmask16, a: __m128i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastb))] -pub unsafe fn _mm_maskz_broadcastb_epi8(k: __mmask16, a: __m128i) -> __m128i { - let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, broadcast, i8x16::ZERO)) +pub fn _mm_maskz_broadcastb_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastb_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, broadcast, i8x16::ZERO)) + } } /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. @@ -7829,25 +8211,27 @@ pub unsafe fn _mm_maskz_broadcastb_epi8(k: __mmask16, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhwd))] -pub unsafe fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i16x32(); - let b = b.as_i16x32(); - #[rustfmt::skip] - let r: i16x32 = simd_shuffle!( - a, - b, - [ - 4, 32 + 4, 5, 32 + 5, - 6, 32 + 6, 7, 32 + 7, - 12, 32 + 12, 13, 32 + 13, - 14, 32 + 14, 15, 32 + 15, - 20, 32 + 20, 21, 32 + 21, - 22, 32 + 22, 23, 32 + 23, - 28, 32 + 28, 29, 32 + 29, - 30, 32 + 30, 31, 32 + 31, - ], - ); - transmute(r) +pub fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + #[rustfmt::skip] + let r: i16x32 = simd_shuffle!( + a, + b, + [ + 4, 32 + 4, 5, 32 + 5, + 6, 32 + 6, 7, 32 + 7, + 12, 32 + 12, 13, 32 + 13, + 14, 32 + 14, 15, 32 + 15, + 20, 32 + 20, 21, 32 + 21, + 22, 32 + 22, 23, 32 + 23, + 28, 32 + 28, 29, 32 + 29, + 30, 32 + 30, 31, 32 + 31, + ], + ); + transmute(r) + } } /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7857,14 +8241,11 @@ pub unsafe fn _mm512_unpackhi_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhwd))] -pub unsafe fn _mm512_mask_unpackhi_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i16x32())) +pub fn _mm512_mask_unpackhi_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x32())) + } } /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -7874,9 +8255,11 @@ pub unsafe fn _mm512_mask_unpackhi_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhwd))] -pub unsafe fn _mm512_maskz_unpackhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, unpackhi, i16x32::ZERO)) +pub fn _mm512_maskz_unpackhi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpackhi, i16x32::ZERO)) + } } /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -7886,14 +8269,11 @@ pub unsafe fn _mm512_maskz_unpackhi_epi16(k: __mmask32, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhwd))] -pub unsafe fn _mm256_mask_unpackhi_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i16x16())) +pub fn _mm256_mask_unpackhi_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x16())) + } } /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7903,9 +8283,11 @@ pub unsafe fn _mm256_mask_unpackhi_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhwd))] -pub unsafe fn _mm256_maskz_unpackhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, unpackhi, i16x16::ZERO)) +pub fn _mm256_maskz_unpackhi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpackhi, i16x16::ZERO)) + } } /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -7915,14 +8297,11 @@ pub unsafe fn _mm256_maskz_unpackhi_epi16(k: __mmask16, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhwd))] -pub unsafe fn _mm_mask_unpackhi_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i16x8())) +pub fn _mm_mask_unpackhi_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i16x8())) + } } /// Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7932,9 +8311,11 @@ pub unsafe fn _mm_mask_unpackhi_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhwd))] -pub unsafe fn _mm_maskz_unpackhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, unpackhi, i16x8::ZERO)) +pub fn _mm_maskz_unpackhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpackhi, i16x8::ZERO)) + } } /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. @@ -7944,33 +8325,35 @@ pub unsafe fn _mm_maskz_unpackhi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhbw))] -pub unsafe fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i8x64(); - let b = b.as_i8x64(); - #[rustfmt::skip] - let r: i8x64 = simd_shuffle!( - a, - b, - [ - 8, 64+8, 9, 64+9, - 10, 64+10, 11, 64+11, - 12, 64+12, 13, 64+13, - 14, 64+14, 15, 64+15, - 24, 64+24, 25, 64+25, - 26, 64+26, 27, 64+27, - 28, 64+28, 29, 64+29, - 30, 64+30, 31, 64+31, - 40, 64+40, 41, 64+41, - 42, 64+42, 43, 64+43, - 44, 64+44, 45, 64+45, - 46, 64+46, 47, 64+47, - 56, 64+56, 57, 64+57, - 58, 64+58, 59, 64+59, - 60, 64+60, 61, 64+61, - 62, 64+62, 63, 64+63, - ], - ); - transmute(r) +pub fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + #[rustfmt::skip] + let r: i8x64 = simd_shuffle!( + a, + b, + [ + 8, 64 + 8, 9, 64 + 9, + 10, 64 + 10, 11, 64 + 11, + 12, 64 + 12, 13, 64 + 13, + 14, 64 + 14, 15, 64 + 15, + 24, 64 + 24, 25, 64 + 25, + 26, 64 + 26, 27, 64 + 27, + 28, 64 + 28, 29, 64 + 29, + 30, 64 + 30, 31, 64 + 31, + 40, 64 + 40, 41, 64 + 41, + 42, 64 + 42, 43, 64 + 43, + 44, 64 + 44, 45, 64 + 45, + 46, 64 + 46, 47, 64 + 47, + 56, 64 + 56, 57, 64 + 57, + 58, 64 + 58, 59, 64 + 59, + 60, 64 + 60, 61, 64 + 61, + 62, 64 + 62, 63, 64 + 63, + ], + ); + transmute(r) + } } /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -7980,14 +8363,11 @@ pub unsafe fn _mm512_unpackhi_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhbw))] -pub unsafe fn _mm512_mask_unpackhi_epi8( - src: __m512i, - k: __mmask64, - a: __m512i, - b: __m512i, -) -> __m512i { - let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i8x64())) +pub fn _mm512_mask_unpackhi_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x64())) + } } /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -7997,9 +8377,11 @@ pub unsafe fn _mm512_mask_unpackhi_epi8( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhbw))] -pub unsafe fn _mm512_maskz_unpackhi_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, unpackhi, i8x64::ZERO)) +pub fn _mm512_maskz_unpackhi_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpackhi, i8x64::ZERO)) + } } /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8009,14 +8391,11 @@ pub unsafe fn _mm512_maskz_unpackhi_epi8(k: __mmask64, a: __m512i, b: __m512i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhbw))] -pub unsafe fn _mm256_mask_unpackhi_epi8( - src: __m256i, - k: __mmask32, - a: __m256i, - b: __m256i, -) -> __m256i { - let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i8x32())) +pub fn _mm256_mask_unpackhi_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x32())) + } } /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -8026,9 +8405,11 @@ pub unsafe fn _mm256_mask_unpackhi_epi8( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhbw))] -pub unsafe fn _mm256_maskz_unpackhi_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, unpackhi, i8x32::ZERO)) +pub fn _mm256_maskz_unpackhi_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpackhi, i8x32::ZERO)) + } } /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8038,14 +8419,11 @@ pub unsafe fn _mm256_maskz_unpackhi_epi8(k: __mmask32, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhbw))] -pub unsafe fn _mm_mask_unpackhi_epi8( - src: __m128i, - k: __mmask16, - a: __m128i, - b: __m128i, -) -> __m128i { - let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i8x16())) +pub fn _mm_mask_unpackhi_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i8x16())) + } } /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8055,9 +8433,11 @@ pub unsafe fn _mm_mask_unpackhi_epi8( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhbw))] -pub unsafe fn _mm_maskz_unpackhi_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, unpackhi, i8x16::ZERO)) +pub fn _mm_maskz_unpackhi_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpackhi, i8x16::ZERO)) + } } /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
@@ -8067,25 +8447,27 @@ pub unsafe fn _mm_maskz_unpackhi_epi8(k: __mmask16, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklwd))] -pub unsafe fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i16x32(); - let b = b.as_i16x32(); - #[rustfmt::skip] - let r: i16x32 = simd_shuffle!( - a, - b, - [ - 0, 32+0, 1, 32+1, - 2, 32+2, 3, 32+3, - 8, 32+8, 9, 32+9, - 10, 32+10, 11, 32+11, - 16, 32+16, 17, 32+17, - 18, 32+18, 19, 32+19, - 24, 32+24, 25, 32+25, - 26, 32+26, 27, 32+27 - ], - ); - transmute(r) +pub fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i16x32(); + let b = b.as_i16x32(); + #[rustfmt::skip] + let r: i16x32 = simd_shuffle!( + a, + b, + [ + 0, 32+0, 1, 32+1, + 2, 32+2, 3, 32+3, + 8, 32+8, 9, 32+9, + 10, 32+10, 11, 32+11, + 16, 32+16, 17, 32+17, + 18, 32+18, 19, 32+19, + 24, 32+24, 25, 32+25, + 26, 32+26, 27, 32+27 + ], + ); + transmute(r) + } } /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8095,14 +8477,11 @@ pub unsafe fn _mm512_unpacklo_epi16(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklwd))] -pub unsafe fn _mm512_mask_unpacklo_epi16( - src: __m512i, - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i16x32())) +pub fn _mm512_mask_unpacklo_epi16(src: __m512i, k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x32())) + } } /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8112,9 +8491,11 @@ pub unsafe fn _mm512_mask_unpacklo_epi16( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklwd))] -pub unsafe fn _mm512_maskz_unpacklo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { - let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, unpacklo, i16x32::ZERO)) +pub fn _mm512_maskz_unpacklo_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi16(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, unpacklo, i16x32::ZERO)) + } } /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -8124,14 +8505,11 @@ pub unsafe fn _mm512_maskz_unpacklo_epi16(k: __mmask32, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklwd))] -pub unsafe fn _mm256_mask_unpacklo_epi16( - src: __m256i, - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i16x16())) +pub fn _mm256_mask_unpacklo_epi16(src: __m256i, k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x16())) + } } /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8141,9 +8519,11 @@ pub unsafe fn _mm256_mask_unpacklo_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklwd))] -pub unsafe fn _mm256_maskz_unpacklo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { - let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, unpacklo, i16x16::ZERO)) +pub fn _mm256_maskz_unpacklo_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi16(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, unpacklo, i16x16::ZERO)) + } } /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8153,14 +8533,11 @@ pub unsafe fn _mm256_maskz_unpacklo_epi16(k: __mmask16, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklwd))] -pub unsafe fn _mm_mask_unpacklo_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i16x8())) +pub fn _mm_mask_unpacklo_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i16x8())) + } } /// Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8170,9 +8547,11 @@ pub unsafe fn _mm_mask_unpacklo_epi16( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklwd))] -pub unsafe fn _mm_maskz_unpacklo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, unpacklo, i16x8::ZERO)) +pub fn _mm_maskz_unpacklo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi16(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, unpacklo, i16x8::ZERO)) + } } /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. 
@@ -8182,33 +8561,35 @@ pub unsafe fn _mm_maskz_unpacklo_epi16(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklbw))] -pub unsafe fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i8x64(); - let b = b.as_i8x64(); - #[rustfmt::skip] - let r: i8x64 = simd_shuffle!( - a, - b, - [ - 0, 64+0, 1, 64+1, - 2, 64+2, 3, 64+3, - 4, 64+4, 5, 64+5, - 6, 64+6, 7, 64+7, - 16, 64+16, 17, 64+17, - 18, 64+18, 19, 64+19, - 20, 64+20, 21, 64+21, - 22, 64+22, 23, 64+23, - 32, 64+32, 33, 64+33, - 34, 64+34, 35, 64+35, - 36, 64+36, 37, 64+37, - 38, 64+38, 39, 64+39, - 48, 64+48, 49, 64+49, - 50, 64+50, 51, 64+51, - 52, 64+52, 53, 64+53, - 54, 64+54, 55, 64+55, - ], - ); - transmute(r) +pub fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i8x64(); + let b = b.as_i8x64(); + #[rustfmt::skip] + let r: i8x64 = simd_shuffle!( + a, + b, + [ + 0, 64+0, 1, 64+1, + 2, 64+2, 3, 64+3, + 4, 64+4, 5, 64+5, + 6, 64+6, 7, 64+7, + 16, 64+16, 17, 64+17, + 18, 64+18, 19, 64+19, + 20, 64+20, 21, 64+21, + 22, 64+22, 23, 64+23, + 32, 64+32, 33, 64+33, + 34, 64+34, 35, 64+35, + 36, 64+36, 37, 64+37, + 38, 64+38, 39, 64+39, + 48, 64+48, 49, 64+49, + 50, 64+50, 51, 64+51, + 52, 64+52, 53, 64+53, + 54, 64+54, 55, 64+55, + ], + ); + transmute(r) + } } /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8218,14 +8599,11 @@ pub unsafe fn _mm512_unpacklo_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklbw))] -pub unsafe fn _mm512_mask_unpacklo_epi8( - src: __m512i, - k: __mmask64, - a: __m512i, - b: __m512i, -) -> __m512i { - let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i8x64())) +pub fn _mm512_mask_unpacklo_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x64())) + } } /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8235,9 +8613,11 @@ pub unsafe fn _mm512_mask_unpacklo_epi8( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklbw))] -pub unsafe fn _mm512_maskz_unpacklo_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, unpacklo, i8x64::ZERO)) +pub fn _mm512_maskz_unpacklo_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, unpacklo, i8x64::ZERO)) + } } /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -8247,14 +8627,11 @@ pub unsafe fn _mm512_maskz_unpacklo_epi8(k: __mmask64, a: __m512i, b: __m512i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklbw))] -pub unsafe fn _mm256_mask_unpacklo_epi8( - src: __m256i, - k: __mmask32, - a: __m256i, - b: __m256i, -) -> __m256i { - let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i8x32())) +pub fn _mm256_mask_unpacklo_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x32())) + } } /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8264,9 +8641,11 @@ pub unsafe fn _mm256_mask_unpacklo_epi8( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklbw))] -pub unsafe fn _mm256_maskz_unpacklo_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, unpacklo, i8x32::ZERO)) +pub fn _mm256_maskz_unpacklo_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, unpacklo, i8x32::ZERO)) + } } /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8276,14 +8655,11 @@ pub unsafe fn _mm256_maskz_unpacklo_epi8(k: __mmask32, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklbw))] -pub unsafe fn _mm_mask_unpacklo_epi8( - src: __m128i, - k: __mmask16, - a: __m128i, - b: __m128i, -) -> __m128i { - let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i8x16())) +pub fn _mm_mask_unpacklo_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i8x16())) + } } /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8293,9 +8669,11 @@ pub unsafe fn _mm_mask_unpacklo_epi8( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklbw))] -pub unsafe fn _mm_maskz_unpacklo_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, unpacklo, i8x16::ZERO)) +pub fn _mm_maskz_unpacklo_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, unpacklo, i8x16::ZERO)) + } } /// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
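The unpack intrinsics above are the usual building block for widening; a sketch under the same nightly/stdarch_x86_avx512 assumptions, with an illustrative helper name:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn widen_u8_to_u16(a: __m512i) -> (__m512i, __m512i) {
    // Interleaving with zero puts each source byte in the low half of a
    // 16-bit word, i.e. a zero extension. Note the interleave is per
    // 128-bit lane, so the widened words come out in unpack order rather
    // than in straight ascending order across the register.
    let zero = _mm512_setzero_si512();
    (_mm512_unpacklo_epi8(a, zero), _mm512_unpackhi_epi8(a, zero))
}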
@@ -8305,9 +8683,11 @@ pub unsafe fn _mm_maskz_unpacklo_epi8(k: __mmask16, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] -pub unsafe fn _mm512_mask_mov_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - let mov = a.as_i16x32(); - transmute(simd_select_bitmask(k, mov, src.as_i16x32())) +pub fn _mm512_mask_mov_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i16x32(); + transmute(simd_select_bitmask(k, mov, src.as_i16x32())) + } } /// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8317,9 +8697,11 @@ pub unsafe fn _mm512_mask_mov_epi16(src: __m512i, k: __mmask32, a: __m512i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] -pub unsafe fn _mm512_maskz_mov_epi16(k: __mmask32, a: __m512i) -> __m512i { - let mov = a.as_i16x32(); - transmute(simd_select_bitmask(k, mov, i16x32::ZERO)) +pub fn _mm512_maskz_mov_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i16x32(); + transmute(simd_select_bitmask(k, mov, i16x32::ZERO)) + } } /// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8329,9 +8711,11 @@ pub unsafe fn _mm512_maskz_mov_epi16(k: __mmask32, a: __m512i) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] -pub unsafe fn _mm256_mask_mov_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - let mov = a.as_i16x16(); - transmute(simd_select_bitmask(k, mov, src.as_i16x16())) +pub fn _mm256_mask_mov_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i16x16(); + transmute(simd_select_bitmask(k, mov, src.as_i16x16())) + } } /// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8341,9 +8725,11 @@ pub unsafe fn _mm256_mask_mov_epi16(src: __m256i, k: __mmask16, a: __m256i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] -pub unsafe fn _mm256_maskz_mov_epi16(k: __mmask16, a: __m256i) -> __m256i { - let mov = a.as_i16x16(); - transmute(simd_select_bitmask(k, mov, i16x16::ZERO)) +pub fn _mm256_maskz_mov_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i16x16(); + transmute(simd_select_bitmask(k, mov, i16x16::ZERO)) + } } /// Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -8353,9 +8739,11 @@ pub unsafe fn _mm256_maskz_mov_epi16(k: __mmask16, a: __m256i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] -pub unsafe fn _mm_mask_mov_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let mov = a.as_i16x8(); - transmute(simd_select_bitmask(k, mov, src.as_i16x8())) +pub fn _mm_mask_mov_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i16x8(); + transmute(simd_select_bitmask(k, mov, src.as_i16x8())) + } } /// Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8365,9 +8753,11 @@ pub unsafe fn _mm_mask_mov_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu16))] -pub unsafe fn _mm_maskz_mov_epi16(k: __mmask8, a: __m128i) -> __m128i { - let mov = a.as_i16x8(); - transmute(simd_select_bitmask(k, mov, i16x8::ZERO)) +pub fn _mm_maskz_mov_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i16x8(); + transmute(simd_select_bitmask(k, mov, i16x8::ZERO)) + } } /// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8377,9 +8767,11 @@ pub unsafe fn _mm_maskz_mov_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] -pub unsafe fn _mm512_mask_mov_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - let mov = a.as_i8x64(); - transmute(simd_select_bitmask(k, mov, src.as_i8x64())) +pub fn _mm512_mask_mov_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i8x64(); + transmute(simd_select_bitmask(k, mov, src.as_i8x64())) + } } /// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8389,9 +8781,11 @@ pub unsafe fn _mm512_mask_mov_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] -pub unsafe fn _mm512_maskz_mov_epi8(k: __mmask64, a: __m512i) -> __m512i { - let mov = a.as_i8x64(); - transmute(simd_select_bitmask(k, mov, i8x64::ZERO)) +pub fn _mm512_maskz_mov_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i8x64(); + transmute(simd_select_bitmask(k, mov, i8x64::ZERO)) + } } /// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -8401,9 +8795,11 @@ pub unsafe fn _mm512_maskz_mov_epi8(k: __mmask64, a: __m512i) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] -pub unsafe fn _mm256_mask_mov_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - let mov = a.as_i8x32(); - transmute(simd_select_bitmask(k, mov, src.as_i8x32())) +pub fn _mm256_mask_mov_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i8x32(); + transmute(simd_select_bitmask(k, mov, src.as_i8x32())) + } } /// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8413,9 +8809,11 @@ pub unsafe fn _mm256_mask_mov_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] -pub unsafe fn _mm256_maskz_mov_epi8(k: __mmask32, a: __m256i) -> __m256i { - let mov = a.as_i8x32(); - transmute(simd_select_bitmask(k, mov, i8x32::ZERO)) +pub fn _mm256_maskz_mov_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i8x32(); + transmute(simd_select_bitmask(k, mov, i8x32::ZERO)) + } } /// Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8425,9 +8823,11 @@ pub unsafe fn _mm256_maskz_mov_epi8(k: __mmask32, a: __m256i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] -pub unsafe fn _mm_mask_mov_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - let mov = a.as_i8x16(); - transmute(simd_select_bitmask(k, mov, src.as_i8x16())) +pub fn _mm_mask_mov_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i8x16(); + transmute(simd_select_bitmask(k, mov, src.as_i8x16())) + } } /// Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8437,9 +8837,11 @@ pub unsafe fn _mm_mask_mov_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m12 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqu8))] -pub unsafe fn _mm_maskz_mov_epi8(k: __mmask16, a: __m128i) -> __m128i { - let mov = a.as_i8x16(); - transmute(simd_select_bitmask(k, mov, i8x16::ZERO)) +pub fn _mm_maskz_mov_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i8x16(); + transmute(simd_select_bitmask(k, mov, i8x16::ZERO)) + } } /// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
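mask_mov is the plain masked merge underlying the masked forms above; a sketch under the same nightly/stdarch_x86_avx512 assumptions, helper name illustrative only:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn merge_bytes(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
    // Lane i comes from `a` when bit i of `k` is set, otherwise from `src`.
    _mm512_mask_mov_epi8(src, k, a)
}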
@@ -8449,9 +8851,11 @@ pub unsafe fn _mm_maskz_mov_epi8(k: __mmask16, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm512_mask_set1_epi16(src: __m512i, k: __mmask32, a: i16) -> __m512i { - let r = _mm512_set1_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, r, src.as_i16x32())) +pub fn _mm512_mask_set1_epi16(src: __m512i, k: __mmask32, a: i16) -> __m512i { + unsafe { + let r = _mm512_set1_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, r, src.as_i16x32())) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8461,9 +8865,11 @@ pub unsafe fn _mm512_mask_set1_epi16(src: __m512i, k: __mmask32, a: i16) -> __m5 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm512_maskz_set1_epi16(k: __mmask32, a: i16) -> __m512i { - let r = _mm512_set1_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, r, i16x32::ZERO)) +pub fn _mm512_maskz_set1_epi16(k: __mmask32, a: i16) -> __m512i { + unsafe { + let r = _mm512_set1_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, r, i16x32::ZERO)) + } } /// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8473,9 +8879,11 @@ pub unsafe fn _mm512_maskz_set1_epi16(k: __mmask32, a: i16) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm256_mask_set1_epi16(src: __m256i, k: __mmask16, a: i16) -> __m256i { - let r = _mm256_set1_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, r, src.as_i16x16())) +pub fn _mm256_mask_set1_epi16(src: __m256i, k: __mmask16, a: i16) -> __m256i { + unsafe { + let r = _mm256_set1_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, r, src.as_i16x16())) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8485,9 +8893,11 @@ pub unsafe fn _mm256_mask_set1_epi16(src: __m256i, k: __mmask16, a: i16) -> __m2 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm256_maskz_set1_epi16(k: __mmask16, a: i16) -> __m256i { - let r = _mm256_set1_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, r, i16x16::ZERO)) +pub fn _mm256_maskz_set1_epi16(k: __mmask16, a: i16) -> __m256i { + unsafe { + let r = _mm256_set1_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, r, i16x16::ZERO)) + } } /// Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -8497,9 +8907,11 @@ pub unsafe fn _mm256_maskz_set1_epi16(k: __mmask16, a: i16) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm_mask_set1_epi16(src: __m128i, k: __mmask8, a: i16) -> __m128i { - let r = _mm_set1_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, r, src.as_i16x8())) +pub fn _mm_mask_set1_epi16(src: __m128i, k: __mmask8, a: i16) -> __m128i { + unsafe { + let r = _mm_set1_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, r, src.as_i16x8())) + } } /// Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8509,9 +8921,11 @@ pub unsafe fn _mm_mask_set1_epi16(src: __m128i, k: __mmask8, a: i16) -> __m128i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastw))] -pub unsafe fn _mm_maskz_set1_epi16(k: __mmask8, a: i16) -> __m128i { - let r = _mm_set1_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, r, i16x8::ZERO)) +pub fn _mm_maskz_set1_epi16(k: __mmask8, a: i16) -> __m128i { + unsafe { + let r = _mm_set1_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, r, i16x8::ZERO)) + } } /// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8521,9 +8935,11 @@ pub unsafe fn _mm_maskz_set1_epi16(k: __mmask8, a: i16) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] -pub unsafe fn _mm512_mask_set1_epi8(src: __m512i, k: __mmask64, a: i8) -> __m512i { - let r = _mm512_set1_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, r, src.as_i8x64())) +pub fn _mm512_mask_set1_epi8(src: __m512i, k: __mmask64, a: i8) -> __m512i { + unsafe { + let r = _mm512_set1_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, r, src.as_i8x64())) + } } /// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8533,9 +8949,11 @@ pub unsafe fn _mm512_mask_set1_epi8(src: __m512i, k: __mmask64, a: i8) -> __m512 #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] -pub unsafe fn _mm512_maskz_set1_epi8(k: __mmask64, a: i8) -> __m512i { - let r = _mm512_set1_epi8(a).as_i8x64(); - transmute(simd_select_bitmask(k, r, i8x64::ZERO)) +pub fn _mm512_maskz_set1_epi8(k: __mmask64, a: i8) -> __m512i { + unsafe { + let r = _mm512_set1_epi8(a).as_i8x64(); + transmute(simd_select_bitmask(k, r, i8x64::ZERO)) + } } /// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -8545,9 +8963,11 @@ pub unsafe fn _mm512_maskz_set1_epi8(k: __mmask64, a: i8) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] -pub unsafe fn _mm256_mask_set1_epi8(src: __m256i, k: __mmask32, a: i8) -> __m256i { - let r = _mm256_set1_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, r, src.as_i8x32())) +pub fn _mm256_mask_set1_epi8(src: __m256i, k: __mmask32, a: i8) -> __m256i { + unsafe { + let r = _mm256_set1_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, r, src.as_i8x32())) + } } /// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8557,9 +8977,11 @@ pub unsafe fn _mm256_mask_set1_epi8(src: __m256i, k: __mmask32, a: i8) -> __m256 #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] -pub unsafe fn _mm256_maskz_set1_epi8(k: __mmask32, a: i8) -> __m256i { - let r = _mm256_set1_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, r, i8x32::ZERO)) +pub fn _mm256_maskz_set1_epi8(k: __mmask32, a: i8) -> __m256i { + unsafe { + let r = _mm256_set1_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, r, i8x32::ZERO)) + } } /// Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8569,9 +8991,11 @@ pub unsafe fn _mm256_maskz_set1_epi8(k: __mmask32, a: i8) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] -pub unsafe fn _mm_mask_set1_epi8(src: __m128i, k: __mmask16, a: i8) -> __m128i { - let r = _mm_set1_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, r, src.as_i8x16())) +pub fn _mm_mask_set1_epi8(src: __m128i, k: __mmask16, a: i8) -> __m128i { + unsafe { + let r = _mm_set1_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, r, src.as_i8x16())) + } } /// Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8581,9 +9005,11 @@ pub unsafe fn _mm_mask_set1_epi8(src: __m128i, k: __mmask16, a: i8) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] -pub unsafe fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i { - let r = _mm_set1_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, r, i8x16::ZERO)) +pub fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i { + unsafe { + let r = _mm_set1_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, r, i8x16::ZERO)) + } } /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst. 
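The writemask (`mask_`) and zeromask (`maskz_`) variants throughout this file differ only in the fallback operand handed to simd_select_bitmask. A scalar model of that selection, written out for sixteen 8-bit lanes (illustrative only; the real code operates on whole vectors):

fn select_bitmask_16x8(k: u16, computed: [i8; 16], fallback: [i8; 16]) -> [i8; 16] {
    // lane i takes the computed value when bit i of k is set,
    // otherwise the fallback (src for mask_, zero for maskz_)
    core::array::from_fn(|i| if (k >> i) & 1 == 1 { computed[i] } else { fallback[i] })
}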
@@ -8594,48 +9020,50 @@ pub unsafe fn _mm_maskz_set1_epi8(k: __mmask16, a: i8) -> __m128i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_shufflelo_epi16(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let r: i16x32 = simd_shuffle!( - a, - a, - [ - IMM8 as u32 & 0b11, - (IMM8 as u32 >> 2) & 0b11, - (IMM8 as u32 >> 4) & 0b11, - (IMM8 as u32 >> 6) & 0b11, - 4, - 5, - 6, - 7, - (IMM8 as u32 & 0b11) + 8, - ((IMM8 as u32 >> 2) & 0b11) + 8, - ((IMM8 as u32 >> 4) & 0b11) + 8, - ((IMM8 as u32 >> 6) & 0b11) + 8, - 12, - 13, - 14, - 15, - (IMM8 as u32 & 0b11) + 16, - ((IMM8 as u32 >> 2) & 0b11) + 16, - ((IMM8 as u32 >> 4) & 0b11) + 16, - ((IMM8 as u32 >> 6) & 0b11) + 16, - 20, - 21, - 22, - 23, - (IMM8 as u32 & 0b11) + 24, - ((IMM8 as u32 >> 2) & 0b11) + 24, - ((IMM8 as u32 >> 4) & 0b11) + 24, - ((IMM8 as u32 >> 6) & 0b11) + 24, - 28, - 29, - 30, - 31, - ], - ); - transmute(r) +pub fn _mm512_shufflelo_epi16(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i16x32(); + let r: i16x32 = simd_shuffle!( + a, + a, + [ + IMM8 as u32 & 0b11, + (IMM8 as u32 >> 2) & 0b11, + (IMM8 as u32 >> 4) & 0b11, + (IMM8 as u32 >> 6) & 0b11, + 4, + 5, + 6, + 7, + (IMM8 as u32 & 0b11) + 8, + ((IMM8 as u32 >> 2) & 0b11) + 8, + ((IMM8 as u32 >> 4) & 0b11) + 8, + ((IMM8 as u32 >> 6) & 0b11) + 8, + 12, + 13, + 14, + 15, + (IMM8 as u32 & 0b11) + 16, + ((IMM8 as u32 >> 2) & 0b11) + 16, + ((IMM8 as u32 >> 4) & 0b11) + 16, + ((IMM8 as u32 >> 6) & 0b11) + 16, + 20, + 21, + 22, + 23, + (IMM8 as u32 & 0b11) + 24, + ((IMM8 as u32 >> 2) & 0b11) + 24, + ((IMM8 as u32 >> 4) & 0b11) + 24, + ((IMM8 as u32 >> 6) & 0b11) + 24, + 28, + 29, + 30, + 31, + ], + ); + transmute(r) + } } /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8646,14 +9074,16 @@ pub unsafe fn _mm512_shufflelo_epi16(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_shufflelo_epi16( +pub fn _mm512_mask_shufflelo_epi16( src: __m512i, k: __mmask32, a: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_shufflelo_epi16::(a); - transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_shufflelo_epi16::(a); + transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32())) + } } /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
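The shufflelo control byte is decoded as four 2-bit selectors, one per destination word in the low 64 bits of each 128-bit lane; the indices in the simd_shuffle! call above are exactly these selectors plus the lane base. A small check of that decoding (plain integer arithmetic, no intrinsics involved):

fn main() {
    const IMM8: u32 = 0b00_01_10_11; // reverse the four low words of each lane
    let sel = |n: u32| (IMM8 >> (2 * n)) & 0b11;
    // destination word n of the low half takes source word sel(n)
    assert_eq!([sel(0), sel(1), sel(2), sel(3)], [3, 2, 1, 0]);
}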
@@ -8664,10 +9094,12 @@ pub unsafe fn _mm512_mask_shufflelo_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_shufflelo_epi16(k: __mmask32, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_shufflelo_epi16::(a); - transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO)) +pub fn _mm512_maskz_shufflelo_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_shufflelo_epi16::(a); + transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO)) + } } /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8678,14 +9110,16 @@ pub unsafe fn _mm512_maskz_shufflelo_epi16(k: __mmask32, a: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_shufflelo_epi16( +pub fn _mm256_mask_shufflelo_epi16( src: __m256i, k: __mmask16, a: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm256_shufflelo_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shuffle = _mm256_shufflelo_epi16::(a); + transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16())) + } } /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8696,10 +9130,12 @@ pub unsafe fn _mm256_mask_shufflelo_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_shufflelo_epi16(k: __mmask16, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm256_shufflelo_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO)) +pub fn _mm256_maskz_shufflelo_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shuffle = _mm256_shufflelo_epi16::(a); + transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO)) + } } /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -8710,14 +9146,12 @@ pub unsafe fn _mm256_maskz_shufflelo_epi16(k: __mmask16, a: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_shufflelo_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm_shufflelo_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8())) +pub fn _mm_mask_shufflelo_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shuffle = _mm_shufflelo_epi16::(a); + transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8())) + } } /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8728,10 +9162,12 @@ pub unsafe fn _mm_mask_shufflelo_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_shufflelo_epi16(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm_shufflelo_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO)) +pub fn _mm_maskz_shufflelo_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shuffle = _mm_shufflelo_epi16::(a); + transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO)) + } } /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst. @@ -8742,66 +9178,70 @@ pub unsafe fn _mm_maskz_shufflelo_epi16(k: __mmask8, a: __m128i #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_shufflehi_epi16(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i16x32(); - let r: i16x32 = simd_shuffle!( - a, - a, - [ - 0, - 1, - 2, - 3, - (IMM8 as u32 & 0b11) + 4, - ((IMM8 as u32 >> 2) & 0b11) + 4, - ((IMM8 as u32 >> 4) & 0b11) + 4, - ((IMM8 as u32 >> 6) & 0b11) + 4, - 8, - 9, - 10, - 11, - (IMM8 as u32 & 0b11) + 12, - ((IMM8 as u32 >> 2) & 0b11) + 12, - ((IMM8 as u32 >> 4) & 0b11) + 12, - ((IMM8 as u32 >> 6) & 0b11) + 12, - 16, - 17, - 18, - 19, - (IMM8 as u32 & 0b11) + 20, - ((IMM8 as u32 >> 2) & 0b11) + 20, - ((IMM8 as u32 >> 4) & 0b11) + 20, - ((IMM8 as u32 >> 6) & 0b11) + 20, - 24, - 25, - 26, - 27, - (IMM8 as u32 & 0b11) + 28, - ((IMM8 as u32 >> 2) & 0b11) + 28, - ((IMM8 as u32 >> 4) & 0b11) + 28, - ((IMM8 as u32 >> 6) & 0b11) + 28, - ], - ); - transmute(r) -} - -/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// +pub fn _mm512_shufflehi_epi16(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i16x32(); + let r: i16x32 = simd_shuffle!( + a, + a, + [ + 0, + 1, + 2, + 3, + (IMM8 as u32 & 0b11) + 4, + ((IMM8 as u32 >> 2) & 0b11) + 4, + ((IMM8 as u32 >> 4) & 0b11) + 4, + ((IMM8 as u32 >> 6) & 0b11) + 4, + 8, + 9, + 10, + 11, + (IMM8 as u32 & 0b11) + 12, + ((IMM8 as u32 >> 2) & 0b11) + 12, + ((IMM8 as u32 >> 4) & 0b11) + 12, + ((IMM8 as u32 >> 6) & 0b11) + 12, + 16, + 17, + 18, + 19, + (IMM8 as u32 & 0b11) + 20, + ((IMM8 as u32 >> 2) & 0b11) + 20, + ((IMM8 as u32 >> 4) & 0b11) + 20, + ((IMM8 as u32 >> 6) & 0b11) + 20, + 24, + 25, + 26, + 27, + (IMM8 as u32 & 0b11) + 28, + ((IMM8 as u32 >> 2) & 0b11) + 28, + ((IMM8 as u32 >> 4) & 0b11) + 28, + ((IMM8 as u32 >> 6) & 0b11) + 28, + ], + ); + transmute(r) + } +} + +/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_shufflehi_epi16&expand=5210) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_shufflehi_epi16( +pub fn _mm512_mask_shufflehi_epi16( src: __m512i, k: __mmask32, a: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_shufflehi_epi16::(a); + transmute(simd_select_bitmask(k, r.as_i16x32(), src.as_i16x32())) + } } /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8812,10 +9252,12 @@ pub unsafe fn _mm512_mask_shufflehi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_shufflehi_epi16(k: __mmask32, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO)) +pub fn _mm512_maskz_shufflehi_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_shufflehi_epi16::(a); + transmute(simd_select_bitmask(k, r.as_i16x32(), i16x32::ZERO)) + } } /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). 
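The shuffle control is supplied as a const generic (rustc_legacy_const_generics maps the C-style immediate argument onto it), so a safe call passes IMM8 through the turbofish. A short sketch combining it with a zeromask, under the same crate setup as the earlier sketches; the wrapper name and constant are illustrative:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn swap_high_word_pairs(k: __mmask32, a: __m512i) -> __m512i {
    // 0b10_11_00_01: within each 128-bit lane, the high four words are taken
    // from positions 1, 0, 3, 2 of the high half (adjacent pairs swapped);
    // lanes with a cleared mask bit are zeroed
    _mm512_maskz_shufflehi_epi16::<0b10_11_00_01>(k, a)
}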
@@ -8826,14 +9268,16 @@ pub unsafe fn _mm512_maskz_shufflehi_epi16(k: __mmask32, a: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_shufflehi_epi16( +pub fn _mm256_mask_shufflehi_epi16( src: __m256i, k: __mmask16, a: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm256_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shuffle = _mm256_shufflehi_epi16::(a); + transmute(simd_select_bitmask(k, shuffle.as_i16x16(), src.as_i16x16())) + } } /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8844,10 +9288,12 @@ pub unsafe fn _mm256_mask_shufflehi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_shufflehi_epi16(k: __mmask16, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm256_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO)) +pub fn _mm256_maskz_shufflehi_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shuffle = _mm256_shufflehi_epi16::(a); + transmute(simd_select_bitmask(k, shuffle.as_i16x16(), i16x16::ZERO)) + } } /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8858,14 +9304,12 @@ pub unsafe fn _mm256_maskz_shufflehi_epi16(k: __mmask16, a: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_shufflehi_epi16( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8())) +pub fn _mm_mask_shufflehi_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shuffle = _mm_shufflehi_epi16::(a); + transmute(simd_select_bitmask(k, shuffle.as_i16x8(), src.as_i16x8())) + } } /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -8876,10 +9320,12 @@ pub unsafe fn _mm_mask_shufflehi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_shufflehi_epi16(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shuffle = _mm_shufflehi_epi16::(a); - transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO)) +pub fn _mm_maskz_shufflehi_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shuffle = _mm_shufflehi_epi16::(a); + transmute(simd_select_bitmask(k, shuffle.as_i16x8(), i16x8::ZERO)) + } } /// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst. @@ -8889,8 +9335,8 @@ pub unsafe fn _mm_maskz_shufflehi_epi16(k: __mmask8, a: __m128i #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufb))] -pub unsafe fn _mm512_shuffle_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpshufb(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_shuffle_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpshufb(a.as_i8x64(), b.as_i8x64())) } } /// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8900,14 +9346,11 @@ pub unsafe fn _mm512_shuffle_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufb))] -pub unsafe fn _mm512_mask_shuffle_epi8( - src: __m512i, - k: __mmask64, - a: __m512i, - b: __m512i, -) -> __m512i { - let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, shuffle, src.as_i8x64())) +pub fn _mm512_mask_shuffle_epi8(src: __m512i, k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, shuffle, src.as_i8x64())) + } } /// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8917,9 +9360,11 @@ pub unsafe fn _mm512_mask_shuffle_epi8( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufb))] -pub unsafe fn _mm512_maskz_shuffle_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, shuffle, i8x64::ZERO)) +pub fn _mm512_maskz_shuffle_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let shuffle = _mm512_shuffle_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, shuffle, i8x64::ZERO)) + } } /// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -8929,14 +9374,11 @@ pub unsafe fn _mm512_maskz_shuffle_epi8(k: __mmask64, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufb))] -pub unsafe fn _mm256_mask_shuffle_epi8( - src: __m256i, - k: __mmask32, - a: __m256i, - b: __m256i, -) -> __m256i { - let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, shuffle, src.as_i8x32())) +pub fn _mm256_mask_shuffle_epi8(src: __m256i, k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, shuffle, src.as_i8x32())) + } } /// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -8946,9 +9388,11 @@ pub unsafe fn _mm256_mask_shuffle_epi8( #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufb))] -pub unsafe fn _mm256_maskz_shuffle_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, shuffle, i8x32::ZERO)) +pub fn _mm256_maskz_shuffle_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let shuffle = _mm256_shuffle_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, shuffle, i8x32::ZERO)) + } } /// Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -8958,9 +9402,11 @@ pub unsafe fn _mm256_maskz_shuffle_epi8(k: __mmask32, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufb))] -pub unsafe fn _mm_mask_shuffle_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, shuffle, src.as_i8x16())) +pub fn _mm_mask_shuffle_epi8(src: __m128i, k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, shuffle, src.as_i8x16())) + } } /// Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
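vpshufb indexes bytes within each 128-bit lane of `a` using the low four bits of the corresponding byte of `b` (a set high bit in `b` zeroes the destination byte), and the masked form then blends against `src`. A minimal safe wrapper, with the index vector passed in rather than constructed here (wrapper name illustrative):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn masked_byte_lookup(src: __m512i, k: __mmask64, table: __m512i, idx: __m512i) -> __m512i {
    // lanes whose mask bit is clear are copied from `src` instead of shuffled
    _mm512_mask_shuffle_epi8(src, k, table, idx)
}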
@@ -8970,9 +9416,11 @@ pub unsafe fn _mm_mask_shuffle_epi8(src: __m128i, k: __mmask16, a: __m128i, b: _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufb))] -pub unsafe fn _mm_maskz_shuffle_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, shuffle, i8x16::ZERO)) +pub fn _mm_maskz_shuffle_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let shuffle = _mm_shuffle_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, shuffle, i8x16::ZERO)) + } } /// Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. @@ -8982,7 +9430,7 @@ pub unsafe fn _mm_maskz_shuffle_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmw))] -pub unsafe fn _mm512_test_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_test_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { let and = _mm512_and_si512(a, b); let zero = _mm512_setzero_si512(); _mm512_cmpneq_epi16_mask(and, zero) @@ -8995,7 +9443,7 @@ pub unsafe fn _mm512_test_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmw))] -pub unsafe fn _mm512_mask_test_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_test_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { let and = _mm512_and_si512(a, b); let zero = _mm512_setzero_si512(); _mm512_mask_cmpneq_epi16_mask(k, and, zero) @@ -9008,7 +9456,7 @@ pub unsafe fn _mm512_mask_test_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmw))] -pub unsafe fn _mm256_test_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_test_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_cmpneq_epi16_mask(and, zero) @@ -9021,7 +9469,7 @@ pub unsafe fn _mm256_test_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmw))] -pub unsafe fn _mm256_mask_test_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_test_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_mask_cmpneq_epi16_mask(k, and, zero) @@ -9034,7 +9482,7 @@ pub unsafe fn _mm256_mask_test_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmw))] -pub unsafe fn _mm_test_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_test_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_cmpneq_epi16_mask(and, zero) @@ -9047,7 +9495,7 @@ pub unsafe fn _mm_test_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = 
"avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmw))] -pub unsafe fn _mm_mask_test_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_test_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_mask_cmpneq_epi16_mask(k, and, zero) @@ -9060,7 +9508,7 @@ pub unsafe fn _mm_mask_test_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmb))] -pub unsafe fn _mm512_test_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_test_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { let and = _mm512_and_si512(a, b); let zero = _mm512_setzero_si512(); _mm512_cmpneq_epi8_mask(and, zero) @@ -9073,7 +9521,7 @@ pub unsafe fn _mm512_test_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmb))] -pub unsafe fn _mm512_mask_test_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_test_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { let and = _mm512_and_si512(a, b); let zero = _mm512_setzero_si512(); _mm512_mask_cmpneq_epi8_mask(k, and, zero) @@ -9086,7 +9534,7 @@ pub unsafe fn _mm512_mask_test_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmb))] -pub unsafe fn _mm256_test_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_test_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_cmpneq_epi8_mask(and, zero) @@ -9099,7 +9547,7 @@ pub unsafe fn _mm256_test_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmb))] -pub unsafe fn _mm256_mask_test_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_test_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_mask_cmpneq_epi8_mask(k, and, zero) @@ -9112,7 +9560,7 @@ pub unsafe fn _mm256_mask_test_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmb))] -pub unsafe fn _mm_test_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_test_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_cmpneq_epi8_mask(and, zero) @@ -9125,7 +9573,7 @@ pub unsafe fn _mm_test_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmb))] -pub unsafe fn _mm_mask_test_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_test_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_mask_cmpneq_epi8_mask(k, and, zero) @@ -9138,7 +9586,7 @@ pub unsafe fn 
_mm_mask_test_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmw))] -pub unsafe fn _mm512_testn_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_testn_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { let and = _mm512_and_si512(a, b); let zero = _mm512_setzero_si512(); _mm512_cmpeq_epi16_mask(and, zero) @@ -9151,7 +9599,7 @@ pub unsafe fn _mm512_testn_epi16_mask(a: __m512i, b: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmw))] -pub unsafe fn _mm512_mask_testn_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { +pub fn _mm512_mask_testn_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) -> __mmask32 { let and = _mm512_and_si512(a, b); let zero = _mm512_setzero_si512(); _mm512_mask_cmpeq_epi16_mask(k, and, zero) @@ -9164,7 +9612,7 @@ pub unsafe fn _mm512_mask_testn_epi16_mask(k: __mmask32, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmw))] -pub unsafe fn _mm256_testn_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_testn_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_cmpeq_epi16_mask(and, zero) @@ -9177,7 +9625,7 @@ pub unsafe fn _mm256_testn_epi16_mask(a: __m256i, b: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmw))] -pub unsafe fn _mm256_mask_testn_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { +pub fn _mm256_mask_testn_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) -> __mmask16 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_mask_cmpeq_epi16_mask(k, and, zero) @@ -9190,7 +9638,7 @@ pub unsafe fn _mm256_mask_testn_epi16_mask(k: __mmask16, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmw))] -pub unsafe fn _mm_testn_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_testn_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_cmpeq_epi16_mask(and, zero) @@ -9203,7 +9651,7 @@ pub unsafe fn _mm_testn_epi16_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmw))] -pub unsafe fn _mm_mask_testn_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_testn_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_mask_cmpeq_epi16_mask(k, and, zero) @@ -9216,7 +9664,7 @@ pub unsafe fn _mm_mask_testn_epi16_mask(k: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmb))] -pub unsafe fn _mm512_testn_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_testn_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { let and = _mm512_and_si512(a, b); let zero = _mm512_setzero_si512(); 
_mm512_cmpeq_epi8_mask(and, zero) @@ -9229,7 +9677,7 @@ pub unsafe fn _mm512_testn_epi8_mask(a: __m512i, b: __m512i) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmb))] -pub unsafe fn _mm512_mask_testn_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { +pub fn _mm512_mask_testn_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) -> __mmask64 { let and = _mm512_and_si512(a, b); let zero = _mm512_setzero_si512(); _mm512_mask_cmpeq_epi8_mask(k, and, zero) @@ -9242,7 +9690,7 @@ pub unsafe fn _mm512_mask_testn_epi8_mask(k: __mmask64, a: __m512i, b: __m512i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmb))] -pub unsafe fn _mm256_testn_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_testn_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_cmpeq_epi8_mask(and, zero) @@ -9255,7 +9703,7 @@ pub unsafe fn _mm256_testn_epi8_mask(a: __m256i, b: __m256i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmb))] -pub unsafe fn _mm256_mask_testn_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { +pub fn _mm256_mask_testn_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) -> __mmask32 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_mask_cmpeq_epi8_mask(k, and, zero) @@ -9268,7 +9716,7 @@ pub unsafe fn _mm256_mask_testn_epi8_mask(k: __mmask32, a: __m256i, b: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmb))] -pub unsafe fn _mm_testn_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_testn_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_cmpeq_epi8_mask(and, zero) @@ -9281,7 +9729,7 @@ pub unsafe fn _mm_testn_epi8_mask(a: __m128i, b: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmb))] -pub unsafe fn _mm_mask_testn_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { +pub fn _mm_mask_testn_epi8_mask(k: __mmask16, a: __m128i, b: __m128i) -> __mmask16 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_mask_cmpeq_epi8_mask(k, and, zero) @@ -9338,8 +9786,8 @@ pub unsafe fn _load_mask32(mem_addr: *const __mmask32) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsadbw))] -pub unsafe fn _mm512_sad_epu8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpsadbw(a.as_u8x64(), b.as_u8x64())) +pub fn _mm512_sad_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpsadbw(a.as_u8x64(), b.as_u8x64())) } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. 
Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -9350,12 +9798,14 @@ pub unsafe fn _mm512_sad_epu8(a: __m512i, b: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm512_dbsad_epu8(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x64(); - let b = b.as_u8x64(); - let r = vdbpsadbw(a, b, IMM8); - transmute(r) +pub fn _mm512_dbsad_epu8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x64(); + let b = b.as_u8x64(); + let r = vdbpsadbw(a, b, IMM8); + transmute(r) + } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -9366,17 +9816,19 @@ pub unsafe fn _mm512_dbsad_epu8(a: __m512i, b: __m512i) -> __m5 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(4)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm512_mask_dbsad_epu8( +pub fn _mm512_mask_dbsad_epu8( src: __m512i, k: __mmask32, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x64(); - let b = b.as_u8x64(); - let r = vdbpsadbw(a, b, IMM8); - transmute(simd_select_bitmask(k, r, src.as_u16x32())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x64(); + let b = b.as_u8x64(); + let r = vdbpsadbw(a, b, IMM8); + transmute(simd_select_bitmask(k, r, src.as_u16x32())) + } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. 
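Both vpsadbw (_mm512_sad_epu8, converted above) and the vdbpsadbw family here reduce to byte-wise sums of absolute differences. A scalar model of one 8-byte group, illustrative only; the intrinsics evaluate all groups of the vector at once:

fn sad_u8x8(a: [u8; 8], b: [u8; 8]) -> u64 {
    // sum of |a[i] - b[i]| over the eight unsigned bytes of the group
    a.iter()
        .zip(b)
        .map(|(&x, y)| u64::from(x.abs_diff(y)))
        .sum()
}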
@@ -9387,16 +9839,14 @@ pub unsafe fn _mm512_mask_dbsad_epu8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm512_maskz_dbsad_epu8( - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x64(); - let b = b.as_u8x64(); - let r = vdbpsadbw(a, b, IMM8); - transmute(simd_select_bitmask(k, r, u16x32::ZERO)) +pub fn _mm512_maskz_dbsad_epu8(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x64(); + let b = b.as_u8x64(); + let r = vdbpsadbw(a, b, IMM8); + transmute(simd_select_bitmask(k, r, u16x32::ZERO)) + } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -9407,12 +9857,14 @@ pub unsafe fn _mm512_maskz_dbsad_epu8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm256_dbsad_epu8(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x32(); - let b = b.as_u8x32(); - let r = vdbpsadbw256(a, b, IMM8); - transmute(r) +pub fn _mm256_dbsad_epu8(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x32(); + let b = b.as_u8x32(); + let r = vdbpsadbw256(a, b, IMM8); + transmute(r) + } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. 
@@ -9423,17 +9875,19 @@ pub unsafe fn _mm256_dbsad_epu8(a: __m256i, b: __m256i) -> __m2 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(4)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm256_mask_dbsad_epu8( +pub fn _mm256_mask_dbsad_epu8( src: __m256i, k: __mmask16, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x32(); - let b = b.as_u8x32(); - let r = vdbpsadbw256(a, b, IMM8); - transmute(simd_select_bitmask(k, r, src.as_u16x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x32(); + let b = b.as_u8x32(); + let r = vdbpsadbw256(a, b, IMM8); + transmute(simd_select_bitmask(k, r, src.as_u16x16())) + } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -9444,16 +9898,14 @@ pub unsafe fn _mm256_mask_dbsad_epu8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm256_maskz_dbsad_epu8( - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x32(); - let b = b.as_u8x32(); - let r = vdbpsadbw256(a, b, IMM8); - transmute(simd_select_bitmask(k, r, u16x16::ZERO)) +pub fn _mm256_maskz_dbsad_epu8(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x32(); + let b = b.as_u8x32(); + let r = vdbpsadbw256(a, b, IMM8); + transmute(simd_select_bitmask(k, r, u16x16::ZERO)) + } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst. Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. 
@@ -9464,12 +9916,14 @@ pub unsafe fn _mm256_maskz_dbsad_epu8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm_dbsad_epu8(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x16(); - let b = b.as_u8x16(); - let r = vdbpsadbw128(a, b, IMM8); - transmute(r) +pub fn _mm_dbsad_epu8(a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x16(); + let b = b.as_u8x16(); + let r = vdbpsadbw128(a, b, IMM8); + transmute(r) + } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -9480,17 +9934,19 @@ pub unsafe fn _mm_dbsad_epu8(a: __m128i, b: __m128i) -> __m128i #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(4)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm_mask_dbsad_epu8( +pub fn _mm_mask_dbsad_epu8( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x16(); - let b = b.as_u8x16(); - let r = vdbpsadbw128(a, b, IMM8); - transmute(simd_select_bitmask(k, r, src.as_u16x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x16(); + let b = b.as_u8x16(); + let r = vdbpsadbw128(a, b, IMM8); + transmute(simd_select_bitmask(k, r, src.as_u16x8())) + } } /// Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the uppper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets. @@ -9501,16 +9957,14 @@ pub unsafe fn _mm_mask_dbsad_epu8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vdbpsadbw, IMM8 = 0))] -pub unsafe fn _mm_maskz_dbsad_epu8( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_u8x16(); - let b = b.as_u8x16(); - let r = vdbpsadbw128(a, b, IMM8); - transmute(simd_select_bitmask(k, r, u16x8::ZERO)) +pub fn _mm_maskz_dbsad_epu8(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_u8x16(); + let b = b.as_u8x16(); + let r = vdbpsadbw128(a, b, IMM8); + transmute(simd_select_bitmask(k, r, u16x8::ZERO)) + } } /// Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a. 
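As with the shuffles, the vdbpsadbw immediate is a const generic; its 2-bit fields pick which 32-bit quadruplet of the `b` lane each SAD group compares against. A short safe-call sketch at 128-bit width (wrapper name illustrative, and the per-field interpretation of IMM8 is summarized loosely here; see Intel's pseudocode for the exact offsets):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw,avx512vl")]
fn dbsad_lowest_quadruplet(a: __m128i, b: __m128i) -> __m128i {
    // IMM8 = 0: every selector picks the lowest 32-bit quadruplet of its b lane
    _mm_dbsad_epu8::<0>(a, b)
}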
@@ -9520,7 +9974,7 @@ pub unsafe fn _mm_maskz_dbsad_epu8( #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovw2m))] -pub unsafe fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 { +pub fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 { let filter = _mm512_set1_epi16(1 << 15); let a = _mm512_and_si512(a, filter); _mm512_cmpeq_epi16_mask(a, filter) @@ -9533,7 +9987,7 @@ pub unsafe fn _mm512_movepi16_mask(a: __m512i) -> __mmask32 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovw2m))] -pub unsafe fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 { +pub fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 { let filter = _mm256_set1_epi16(1 << 15); let a = _mm256_and_si256(a, filter); _mm256_cmpeq_epi16_mask(a, filter) @@ -9546,7 +10000,7 @@ pub unsafe fn _mm256_movepi16_mask(a: __m256i) -> __mmask16 { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovw2m))] -pub unsafe fn _mm_movepi16_mask(a: __m128i) -> __mmask8 { +pub fn _mm_movepi16_mask(a: __m128i) -> __mmask8 { let filter = _mm_set1_epi16(1 << 15); let a = _mm_and_si128(a, filter); _mm_cmpeq_epi16_mask(a, filter) @@ -9559,7 +10013,7 @@ pub unsafe fn _mm_movepi16_mask(a: __m128i) -> __mmask8 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovb2m))] -pub unsafe fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 { +pub fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 { let filter = _mm512_set1_epi8(1 << 7); let a = _mm512_and_si512(a, filter); _mm512_cmpeq_epi8_mask(a, filter) @@ -9573,7 +10027,7 @@ pub unsafe fn _mm512_movepi8_mask(a: __m512i) -> __mmask64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes less cycles than // using vpmovb2m plus converting the mask register to a standard register. -pub unsafe fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 { +pub fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 { let filter = _mm256_set1_epi8(1 << 7); let a = _mm256_and_si256(a, filter); _mm256_cmpeq_epi8_mask(a, filter) @@ -9587,7 +10041,7 @@ pub unsafe fn _mm256_movepi8_mask(a: __m256i) -> __mmask32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovmskb))] // should be vpmovb2m but compiled to vpmovmskb in the test shim because that takes less cycles than // using vpmovb2m plus converting the mask register to a standard register. 
-pub unsafe fn _mm_movepi8_mask(a: __m128i) -> __mmask16 { +pub fn _mm_movepi8_mask(a: __m128i) -> __mmask16 { let filter = _mm_set1_epi8(1 << 7); let a = _mm_and_si128(a, filter); _mm_cmpeq_epi8_mask(a, filter) @@ -9600,27 +10054,29 @@ pub unsafe fn _mm_movepi8_mask(a: __m128i) -> __mmask16 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovm2w))] -pub unsafe fn _mm512_movm_epi16(k: __mmask32) -> __m512i { - let one = _mm512_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - ) - .as_i16x32(); - transmute(simd_select_bitmask(k, one, i16x32::ZERO)) +pub fn _mm512_movm_epi16(k: __mmask32) -> __m512i { + unsafe { + let one = _mm512_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x32(); + transmute(simd_select_bitmask(k, one, i16x32::ZERO)) + } } /// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. @@ -9630,27 +10086,29 @@ pub unsafe fn _mm512_movm_epi16(k: __mmask32) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovm2w))] -pub unsafe fn _mm256_movm_epi16(k: __mmask16) -> __m256i { - let one = _mm256_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - ) - .as_i16x16(); - transmute(simd_select_bitmask(k, one, i16x16::ZERO)) +pub fn _mm256_movm_epi16(k: __mmask16) -> __m256i { + unsafe { + let one = _mm256_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x16(); + transmute(simd_select_bitmask(k, one, i16x16::ZERO)) + } } /// Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. @@ -9660,27 +10118,29 @@ pub unsafe fn _mm256_movm_epi16(k: __mmask16) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovm2w))] -pub unsafe fn _mm_movm_epi16(k: __mmask8) -> __m128i { - let one = _mm_set1_epi16( - 1 << 15 - | 1 << 14 - | 1 << 13 - | 1 << 12 - | 1 << 11 - | 1 << 10 - | 1 << 9 - | 1 << 8 - | 1 << 7 - | 1 << 6 - | 1 << 5 - | 1 << 4 - | 1 << 3 - | 1 << 2 - | 1 << 1 - | 1 << 0, - ) - .as_i16x8(); - transmute(simd_select_bitmask(k, one, i16x8::ZERO)) +pub fn _mm_movm_epi16(k: __mmask8) -> __m128i { + unsafe { + let one = _mm_set1_epi16( + 1 << 15 + | 1 << 14 + | 1 << 13 + | 1 << 12 + | 1 << 11 + | 1 << 10 + | 1 << 9 + | 1 << 8 + | 1 << 7 + | 1 << 6 + | 1 << 5 + | 1 << 4 + | 1 << 3 + | 1 << 2 + | 1 << 1 + | 1 << 0, + ) + .as_i16x8(); + transmute(simd_select_bitmask(k, one, i16x8::ZERO)) + } } /// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. 
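_mm512_movm_epi16 expands each mask bit into an all-ones or all-zeros 16-bit lane, and _mm512_movepi16_mask (converted above) reads the sign bits back out, so the two compose to the identity on the mask. A small safe sketch under the same crate setup as the earlier examples:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn mask_round_trip(k: __mmask32) -> __mmask32 {
    // all-ones lanes have their sign bit set, all-zero lanes do not,
    // so the recovered mask equals the original
    _mm512_movepi16_mask(_mm512_movm_epi16(k))
}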
@@ -9690,11 +10150,13 @@ pub unsafe fn _mm_movm_epi16(k: __mmask8) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovm2b))] -pub unsafe fn _mm512_movm_epi8(k: __mmask64) -> __m512i { - let one = - _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) - .as_i8x64(); - transmute(simd_select_bitmask(k, one, i8x64::ZERO)) +pub fn _mm512_movm_epi8(k: __mmask64) -> __m512i { + unsafe { + let one = + _mm512_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) + .as_i8x64(); + transmute(simd_select_bitmask(k, one, i8x64::ZERO)) + } } /// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. @@ -9704,11 +10166,13 @@ pub unsafe fn _mm512_movm_epi8(k: __mmask64) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovm2b))] -pub unsafe fn _mm256_movm_epi8(k: __mmask32) -> __m256i { - let one = - _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) - .as_i8x32(); - transmute(simd_select_bitmask(k, one, i8x32::ZERO)) +pub fn _mm256_movm_epi8(k: __mmask32) -> __m256i { + unsafe { + let one = + _mm256_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) + .as_i8x32(); + transmute(simd_select_bitmask(k, one, i8x32::ZERO)) + } } /// Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k. @@ -9718,10 +10182,13 @@ pub unsafe fn _mm256_movm_epi8(k: __mmask32) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovm2b))] -pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i { - let one = _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) - .as_i8x16(); - transmute(simd_select_bitmask(k, one, i8x16::ZERO)) +pub fn _mm_movm_epi8(k: __mmask16) -> __m128i { + unsafe { + let one = + _mm_set1_epi8(1 << 7 | 1 << 6 | 1 << 5 | 1 << 4 | 1 << 3 | 1 << 2 | 1 << 1 | 1 << 0) + .as_i8x16(); + transmute(simd_select_bitmask(k, one, i8x16::ZERO)) + } } /// Convert 32-bit mask a into an integer value, and store the result in dst. 
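On the Rust side the mask registers are plain integer types, so the vector-to-mask conversions above compose directly with ordinary integer methods once they are safe to call. For example, counting the bytes of a vector with their sign bit set (illustrative wrapper):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn count_negative_bytes(v: __m512i) -> u32 {
    // one mask bit per byte, set when that byte is negative (sign bit set)
    _mm512_movepi8_mask(v).count_ones()
}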
@@ -9730,7 +10197,7 @@ pub unsafe fn _mm_movm_epi8(k: __mmask16) -> __m128i { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _cvtmask32_u32(a: __mmask32) -> u32 { +pub fn _cvtmask32_u32(a: __mmask32) -> u32 { a } @@ -9740,7 +10207,7 @@ pub unsafe fn _cvtmask32_u32(a: __mmask32) -> u32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _cvtu32_mask32(a: u32) -> __mmask32 { +pub fn _cvtu32_mask32(a: u32) -> __mmask32 { a } @@ -9750,7 +10217,7 @@ pub unsafe fn _cvtu32_mask32(a: u32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { +pub fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { a + b } @@ -9760,7 +10227,7 @@ pub unsafe fn _kadd_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { +pub fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { a + b } @@ -9770,7 +10237,7 @@ pub unsafe fn _kadd_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { +pub fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { a & b } @@ -9780,7 +10247,7 @@ pub unsafe fn _kand_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { +pub fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { a & b } @@ -9790,7 +10257,7 @@ pub unsafe fn _kand_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 { +pub fn _knot_mask32(a: __mmask32) -> __mmask32 { !a } @@ -9800,7 +10267,7 @@ pub unsafe fn _knot_mask32(a: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 { +pub fn _knot_mask64(a: __mmask64) -> __mmask64 { !a } @@ -9810,7 +10277,7 @@ pub unsafe fn _knot_mask64(a: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { +pub fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { _knot_mask32(a) & b } @@ -9820,7 +10287,7 @@ pub unsafe fn _kandn_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { +pub fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { _knot_mask64(a) & b } @@ -9830,7 +10297,7 @@ pub unsafe fn _kandn_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 
{ +pub fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { a | b } @@ -9840,7 +10307,7 @@ pub unsafe fn _kor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { +pub fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { a | b } @@ -9850,7 +10317,7 @@ pub unsafe fn _kor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { +pub fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { a ^ b } @@ -9860,7 +10327,7 @@ pub unsafe fn _kxor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { +pub fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { a ^ b } @@ -9870,7 +10337,7 @@ pub unsafe fn _kxor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { +pub fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { _knot_mask32(a ^ b) } @@ -9880,7 +10347,7 @@ pub unsafe fn _kxnor_mask32(a: __mmask32, b: __mmask32) -> __mmask32 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { +pub fn _kxnor_mask64(a: __mmask64, b: __mmask64) -> __mmask64 { _knot_mask64(a ^ b) } @@ -9917,7 +10384,7 @@ pub unsafe fn _kortest_mask64_u8(a: __mmask64, b: __mmask64, all_ones: *mut u8) #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kortestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { +pub fn _kortestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { (_kor_mask32(a, b) == 0xffffffff) as u8 } @@ -9928,7 +10395,7 @@ pub unsafe fn _kortestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kortestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { +pub fn _kortestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { (_kor_mask64(a, b) == 0xffffffff_ffffffff) as u8 } @@ -9939,7 +10406,7 @@ pub unsafe fn _kortestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kortestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { +pub fn _kortestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { (_kor_mask32(a, b) == 0) as u8 } @@ -9950,7 +10417,7 @@ pub unsafe fn _kortestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { +pub fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { (_kor_mask64(a, b) == 0) as u8 } @@ -9961,7 +10428,7 @@ pub unsafe fn _kortestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { #[target_feature(enable = "avx512bw")] #[rustc_legacy_const_generics(1)] #[unstable(feature = 
"stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kshiftli_mask32(a: __mmask32) -> __mmask32 { +pub fn _kshiftli_mask32(a: __mmask32) -> __mmask32 { a << COUNT } @@ -9972,7 +10439,7 @@ pub unsafe fn _kshiftli_mask32(a: __mmask32) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kshiftli_mask64(a: __mmask64) -> __mmask64 { +pub fn _kshiftli_mask64(a: __mmask64) -> __mmask64 { a << COUNT } @@ -9983,7 +10450,7 @@ pub unsafe fn _kshiftli_mask64(a: __mmask64) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kshiftri_mask32(a: __mmask32) -> __mmask32 { +pub fn _kshiftri_mask32(a: __mmask32) -> __mmask32 { a >> COUNT } @@ -9994,7 +10461,7 @@ pub unsafe fn _kshiftri_mask32(a: __mmask32) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kshiftri_mask64(a: __mmask64) -> __mmask64 { +pub fn _kshiftri_mask64(a: __mmask64) -> __mmask64 { a >> COUNT } @@ -10031,7 +10498,7 @@ pub unsafe fn _ktest_mask64_u8(a: __mmask64, b: __mmask64, and_not: *mut u8) -> #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _ktestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { +pub fn _ktestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { (_kandn_mask32(a, b) == 0) as u8 } @@ -10042,7 +10509,7 @@ pub unsafe fn _ktestc_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _ktestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { +pub fn _ktestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { (_kandn_mask64(a, b) == 0) as u8 } @@ -10053,7 +10520,7 @@ pub unsafe fn _ktestc_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _ktestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { +pub fn _ktestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { (_kand_mask32(a, b) == 0) as u8 } @@ -10064,7 +10531,7 @@ pub unsafe fn _ktestz_mask32_u8(a: __mmask32, b: __mmask32) -> u8 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _ktestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { +pub fn _ktestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { (_kand_mask64(a, b) == 0) as u8 } @@ -10075,7 +10542,7 @@ pub unsafe fn _ktestz_mask64_u8(a: __mmask64, b: __mmask64) -> u8 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckwd -pub unsafe fn _mm512_kunpackw(a: __mmask32, b: __mmask32) -> __mmask32 { +pub fn _mm512_kunpackw(a: __mmask32, b: __mmask32) -> __mmask32 { ((a & 0xffff) << 16) | (b & 0xffff) } @@ -10086,7 +10553,7 @@ pub unsafe fn _mm512_kunpackw(a: __mmask32, b: __mmask32) -> __mmask32 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckdq -pub unsafe fn _mm512_kunpackd(a: __mmask64, b: __mmask64) -> __mmask64 { +pub fn _mm512_kunpackd(a: __mmask64, b: 
__mmask64) -> __mmask64 { ((a & 0xffffffff) << 32) | (b & 0xffffffff) } @@ -10097,9 +10564,11 @@ pub unsafe fn _mm512_kunpackd(a: __mmask64, b: __mmask64) -> __mmask64 { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm512_cvtepi16_epi8(a: __m512i) -> __m256i { - let a = a.as_i16x32(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi16_epi8(a: __m512i) -> __m256i { + unsafe { + let a = a.as_i16x32(); + transmute::(simd_cast(a)) + } } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10109,9 +10578,11 @@ pub unsafe fn _mm512_cvtepi16_epi8(a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm512_mask_cvtepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { - let convert = _mm512_cvtepi16_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, convert, src.as_i8x32())) +pub fn _mm512_mask_cvtepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi16_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, convert, src.as_i8x32())) + } } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10121,9 +10592,11 @@ pub unsafe fn _mm512_mask_cvtepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm512_maskz_cvtepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { - let convert = _mm512_cvtepi16_epi8(a).as_i8x32(); - transmute(simd_select_bitmask(k, convert, i8x32::ZERO)) +pub fn _mm512_maskz_cvtepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi16_epi8(a).as_i8x32(); + transmute(simd_select_bitmask(k, convert, i8x32::ZERO)) + } } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. @@ -10133,9 +10606,11 @@ pub unsafe fn _mm512_maskz_cvtepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm256_cvtepi16_epi8(a: __m256i) -> __m128i { - let a = a.as_i16x16(); - transmute::(simd_cast(a)) +pub fn _mm256_cvtepi16_epi8(a: __m256i) -> __m128i { + unsafe { + let a = a.as_i16x16(); + transmute::(simd_cast(a)) + } } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
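The mask helpers above (`_kand_mask32`, `_kor_mask32`, `_ktestz_mask32_u8`, and friends) are plain integer operations on the `__mmask32`/`__mmask64` types, so once safe they compose directly. A hedged sketch under the same assumptions; `masks_overlap` is illustrative:

use std::arch::x86_64::*;

// Returns true when the two masks have at least one bit in common.
// `_ktestz_mask32_u8` yields 1 exactly when `a & b` is all zeros.
#[target_feature(enable = "avx512bw")]
fn masks_overlap(a: __mmask32, b: __mmask32) -> bool {
    _ktestz_mask32_u8(a, b) == 0
}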
@@ -10145,9 +10620,11 @@ pub unsafe fn _mm256_cvtepi16_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm256_mask_cvtepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { - let convert = _mm256_cvtepi16_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, convert, src.as_i8x16())) +pub fn _mm256_mask_cvtepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi16_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, convert, src.as_i8x16())) + } } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10157,9 +10634,11 @@ pub unsafe fn _mm256_mask_cvtepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { - let convert = _mm256_cvtepi16_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) +pub fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi16_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) + } } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. @@ -10169,14 +10648,16 @@ pub unsafe fn _mm256_maskz_cvtepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i { - let a = a.as_i16x8(); - let v256: i16x16 = simd_shuffle!( - a, - i16x8::ZERO, - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] - ); - transmute::(simd_cast(v256)) +pub fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i16x8(); + let v256: i16x16 = simd_shuffle!( + a, + i16x8::ZERO, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] + ); + transmute::(simd_cast(v256)) + } } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10186,10 +10667,12 @@ pub unsafe fn _mm_cvtepi16_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm_mask_cvtepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi16_epi8(a).as_i8x16(); - let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; - transmute(simd_select_bitmask(k, convert, src.as_i8x16())) +pub fn _mm_mask_cvtepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi8(a).as_i8x16(); + let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; + transmute(simd_select_bitmask(k, convert, src.as_i8x16())) + } } /// Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
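A sketch of the truncating narrowings just converted (wrapper and parameter names are illustrative): `_mm512_cvtepi16_epi8` keeps only the low byte of each 16-bit lane, and the writemask variant merges with `fallback` wherever the corresponding bit of `keep` is clear.

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn narrow_i16_to_i8(a: __m512i, keep: __mmask32, fallback: __m256i) -> (__m256i, __m256i) {
    let truncated = _mm512_cvtepi16_epi8(a);                   // plain truncation
    let merged = _mm512_mask_cvtepi16_epi8(fallback, keep, a); // writemask merge
    (truncated, merged)
}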
@@ -10199,10 +10682,12 @@ pub unsafe fn _mm_mask_cvtepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovwb))] -pub unsafe fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi16_epi8(a).as_i8x16(); - let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; - transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) +pub fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi8(a).as_i8x16(); + let k: __mmask16 = 0b11111111_11111111 & k as __mmask16; + transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) + } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. @@ -10212,12 +10697,14 @@ pub unsafe fn _mm_maskz_cvtepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i { - transmute(vpmovswb( - a.as_i16x32(), - i8x32::ZERO, - 0b11111111_11111111_11111111_11111111, - )) +pub fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i { + unsafe { + transmute(vpmovswb( + a.as_i16x32(), + i8x32::ZERO, + 0b11111111_11111111_11111111_11111111, + )) + } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10227,8 +10714,8 @@ pub unsafe fn _mm512_cvtsepi16_epi8(a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { - transmute(vpmovswb(a.as_i16x32(), src.as_i8x32(), k)) +pub fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovswb(a.as_i16x32(), src.as_i8x32(), k)) } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10238,8 +10725,8 @@ pub unsafe fn _mm512_mask_cvtsepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { - transmute(vpmovswb(a.as_i16x32(), i8x32::ZERO, k)) +pub fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovswb(a.as_i16x32(), i8x32::ZERO, k)) } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
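The saturating form differs from plain truncation for out-of-range values; a small sketch (illustrative wrapper, same nightly assumptions):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512bw")]
fn truncate_vs_saturate() -> (__m256i, __m256i) {
    let a = _mm512_set1_epi16(300);
    // Truncation keeps the low 8 bits: 300 (0x012C) becomes 0x2C == 44.
    let truncated = _mm512_cvtepi16_epi8(a);
    // Signed saturation clamps to the i8 range: 300 becomes 127.
    let saturated = _mm512_cvtsepi16_epi8(a);
    (truncated, saturated)
}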
@@ -10249,8 +10736,8 @@ pub unsafe fn _mm512_maskz_cvtsepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i { - transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, 0b11111111_11111111)) +pub fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, 0b11111111_11111111)) } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10260,8 +10747,8 @@ pub unsafe fn _mm256_cvtsepi16_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { - transmute(vpmovswb256(a.as_i16x16(), src.as_i8x16(), k)) +pub fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), src.as_i8x16(), k)) } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10271,8 +10758,8 @@ pub unsafe fn _mm256_mask_cvtsepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { - transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, k)) +pub fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovswb256(a.as_i16x16(), i8x16::ZERO, k)) } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. @@ -10282,8 +10769,8 @@ pub unsafe fn _mm256_maskz_cvtsepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm_cvtsepi16_epi8(a: __m128i) -> __m128i { - transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, 0b11111111)) +pub fn _mm_cvtsepi16_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, 0b11111111)) } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -10293,8 +10780,8 @@ pub unsafe fn _mm_cvtsepi16_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm_mask_cvtsepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovswb128(a.as_i16x8(), src.as_i8x16(), k)) +pub fn _mm_mask_cvtsepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), src.as_i8x16(), k)) } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10304,8 +10791,8 @@ pub unsafe fn _mm_mask_cvtsepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovswb))] -pub unsafe fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, k)) +pub fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovswb128(a.as_i16x8(), i8x16::ZERO, k)) } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. @@ -10315,12 +10802,14 @@ pub unsafe fn _mm_maskz_cvtsepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i { - transmute(vpmovuswb( - a.as_u16x32(), - u8x32::ZERO, - 0b11111111_11111111_11111111_11111111, - )) +pub fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i { + unsafe { + transmute(vpmovuswb( + a.as_u16x32(), + u8x32::ZERO, + 0b11111111_11111111_11111111_11111111, + )) + } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10330,8 +10819,8 @@ pub unsafe fn _mm512_cvtusepi16_epi8(a: __m512i) -> __m256i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { - transmute(vpmovuswb(a.as_u16x32(), src.as_u8x32(), k)) +pub fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovuswb(a.as_u16x32(), src.as_u8x32(), k)) } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
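For the unsigned-saturating family, 16-bit values above 255 clamp to 0xFF rather than wrapping. Another illustrative wrapper under the same assumptions:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn narrow_unsigned(a: __m512i, keep: __mmask32, fallback: __m256i) -> __m256i {
    // Lanes with a clear bit in `keep` are copied from `fallback` instead.
    _mm512_mask_cvtusepi16_epi8(fallback, keep, a)
}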
@@ -10341,8 +10830,8 @@ pub unsafe fn _mm512_mask_cvtusepi16_epi8(src: __m256i, k: __mmask32, a: __m512i #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { - transmute(vpmovuswb(a.as_u16x32(), u8x32::ZERO, k)) +pub fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i { + unsafe { transmute(vpmovuswb(a.as_u16x32(), u8x32::ZERO, k)) } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. @@ -10352,12 +10841,14 @@ pub unsafe fn _mm512_maskz_cvtusepi16_epi8(k: __mmask32, a: __m512i) -> __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i { - transmute(vpmovuswb256( - a.as_u16x16(), - u8x16::ZERO, - 0b11111111_11111111, - )) +pub fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i { + unsafe { + transmute(vpmovuswb256( + a.as_u16x16(), + u8x16::ZERO, + 0b11111111_11111111, + )) + } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10367,8 +10858,8 @@ pub unsafe fn _mm256_cvtusepi16_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { - transmute(vpmovuswb256(a.as_u16x16(), src.as_u8x16(), k)) +pub fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovuswb256(a.as_u16x16(), src.as_u8x16(), k)) } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10378,8 +10869,8 @@ pub unsafe fn _mm256_mask_cvtusepi16_epi8(src: __m128i, k: __mmask16, a: __m256i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { - transmute(vpmovuswb256(a.as_u16x16(), u8x16::ZERO, k)) +pub fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i { + unsafe { transmute(vpmovuswb256(a.as_u16x16(), u8x16::ZERO, k)) } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
@@ -10389,8 +10880,8 @@ pub unsafe fn _mm256_maskz_cvtusepi16_epi8(k: __mmask16, a: __m256i) -> __m128i #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm_cvtusepi16_epi8(a: __m128i) -> __m128i { - transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, 0b11111111)) +pub fn _mm_cvtusepi16_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, 0b11111111)) } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10400,8 +10891,8 @@ pub unsafe fn _mm_cvtusepi16_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm_mask_cvtusepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovuswb128(a.as_u16x8(), src.as_u8x16(), k)) +pub fn _mm_mask_cvtusepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovuswb128(a.as_u16x8(), src.as_u8x16(), k)) } } /// Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10411,8 +10902,8 @@ pub unsafe fn _mm_mask_cvtusepi16_epi8(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovuswb))] -pub unsafe fn _mm_maskz_cvtusepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, k)) +pub fn _mm_maskz_cvtusepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovuswb128(a.as_u16x8(), u8x16::ZERO, k)) } } /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst. @@ -10422,9 +10913,11 @@ pub unsafe fn _mm_maskz_cvtusepi16_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbw))] -pub unsafe fn _mm512_cvtepi8_epi16(a: __m256i) -> __m512i { - let a = a.as_i8x32(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi8_epi16(a: __m256i) -> __m512i { + unsafe { + let a = a.as_i8x32(); + transmute::(simd_cast(a)) + } } /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
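The widening direction is analogous; a sketch of the sign-extending conversion (the wrapper name is made up):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn widen_signed(a: __m256i) -> __m512i {
    // Each i8 lane becomes an i16 lane with the sign preserved, e.g. -5i8 -> -5i16.
    _mm512_cvtepi8_epi16(a)
}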
@@ -10434,9 +10927,11 @@ pub unsafe fn _mm512_cvtepi8_epi16(a: __m256i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbw))] -pub unsafe fn _mm512_mask_cvtepi8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { - let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, convert, src.as_i16x32())) +pub fn _mm512_mask_cvtepi8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, src.as_i16x32())) + } } /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10446,9 +10941,11 @@ pub unsafe fn _mm512_mask_cvtepi8_epi16(src: __m512i, k: __mmask32, a: __m256i) #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbw))] -pub unsafe fn _mm512_maskz_cvtepi8_epi16(k: __mmask32, a: __m256i) -> __m512i { - let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) +pub fn _mm512_maskz_cvtepi8_epi16(k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) + } } /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10458,9 +10955,11 @@ pub unsafe fn _mm512_maskz_cvtepi8_epi16(k: __mmask32, a: __m256i) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbw))] -pub unsafe fn _mm256_mask_cvtepi8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, src.as_i16x16())) +pub fn _mm256_mask_cvtepi8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, src.as_i16x16())) + } } /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10470,9 +10969,11 @@ pub unsafe fn _mm256_mask_cvtepi8_epi16(src: __m256i, k: __mmask16, a: __m128i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbw))] -pub unsafe fn _mm256_maskz_cvtepi8_epi16(k: __mmask16, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) +pub fn _mm256_maskz_cvtepi8_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) + } } /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -10482,9 +10983,11 @@ pub unsafe fn _mm256_maskz_cvtepi8_epi16(k: __mmask16, a: __m128i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbw))] -pub unsafe fn _mm_mask_cvtepi8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi8_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, src.as_i16x8())) +pub fn _mm_mask_cvtepi8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } } /// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10494,9 +10997,11 @@ pub unsafe fn _mm_mask_cvtepi8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbw))] -pub unsafe fn _mm_maskz_cvtepi8_epi16(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi8_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) +pub fn _mm_maskz_cvtepi8_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst. @@ -10506,9 +11011,11 @@ pub unsafe fn _mm_maskz_cvtepi8_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbw))] -pub unsafe fn _mm512_cvtepu8_epi16(a: __m256i) -> __m512i { - let a = a.as_u8x32(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepu8_epi16(a: __m256i) -> __m512i { + unsafe { + let a = a.as_u8x32(); + transmute::(simd_cast(a)) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10518,9 +11025,11 @@ pub unsafe fn _mm512_cvtepu8_epi16(a: __m256i) -> __m512i { #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbw))] -pub unsafe fn _mm512_mask_cvtepu8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { - let convert = _mm512_cvtepu8_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, convert, src.as_i16x32())) +pub fn _mm512_mask_cvtepu8_epi16(src: __m512i, k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, src.as_i16x32())) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
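And the zero-extending counterpart, again as a hedged sketch: 0x80u8 widens to 0x0080, whereas the sign-extending `_mm512_cvtepi8_epi16` above would give 0xFF80.

use std::arch::x86_64::*;

#[target_feature(enable = "avx512bw")]
fn widen_unsigned(a: __m256i, keep: __mmask32, fallback: __m512i) -> __m512i {
    // Writemask merge: lanes with a clear `keep` bit come from `fallback`.
    _mm512_mask_cvtepu8_epi16(fallback, keep, a)
}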
@@ -10530,9 +11039,11 @@ pub unsafe fn _mm512_mask_cvtepu8_epi16(src: __m512i, k: __mmask32, a: __m256i) #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbw))] -pub unsafe fn _mm512_maskz_cvtepu8_epi16(k: __mmask32, a: __m256i) -> __m512i { - let convert = _mm512_cvtepu8_epi16(a).as_i16x32(); - transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) +pub fn _mm512_maskz_cvtepu8_epi16(k: __mmask32, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi16(a).as_i16x32(); + transmute(simd_select_bitmask(k, convert, i16x32::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10542,9 +11053,11 @@ pub unsafe fn _mm512_maskz_cvtepu8_epi16(k: __mmask32, a: __m256i) -> __m512i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbw))] -pub unsafe fn _mm256_mask_cvtepu8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, src.as_i16x16())) +pub fn _mm256_mask_cvtepu8_epi16(src: __m256i, k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, src.as_i16x16())) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10554,9 +11067,11 @@ pub unsafe fn _mm256_mask_cvtepu8_epi16(src: __m256i, k: __mmask16, a: __m128i) #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbw))] -pub unsafe fn _mm256_maskz_cvtepu8_epi16(k: __mmask16, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) +pub fn _mm256_maskz_cvtepu8_epi16(k: __mmask16, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10566,9 +11081,11 @@ pub unsafe fn _mm256_maskz_cvtepu8_epi16(k: __mmask16, a: __m128i) -> __m256i { #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbw))] -pub unsafe fn _mm_mask_cvtepu8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu8_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, src.as_i16x8())) +pub fn _mm_mask_cvtepu8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -10578,9 +11095,11 @@ pub unsafe fn _mm_mask_cvtepu8_epi16(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512bw,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbw))] -pub unsafe fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu8_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) +pub fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } } /// Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst. @@ -10591,89 +11110,91 @@ pub unsafe fn _mm_maskz_cvtepu8_epi16(k: __mmask8, a: __m128i) -> __m128i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_bslli_epi128(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - const fn mask(shift: i32, i: u32) -> u32 { - let shift = shift as u32 & 0xff; - if shift > 15 || i % 16 < shift { - 0 - } else { - 64 + (i - shift) +pub fn _mm512_bslli_epi128(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + const fn mask(shift: i32, i: u32) -> u32 { + let shift = shift as u32 & 0xff; + if shift > 15 || i % 16 < shift { + 0 + } else { + 64 + (i - shift) + } } + let a = a.as_i8x64(); + let zero = i8x64::ZERO; + let r: i8x64 = simd_shuffle!( + zero, + a, + [ + mask(IMM8, 0), + mask(IMM8, 1), + mask(IMM8, 2), + mask(IMM8, 3), + mask(IMM8, 4), + mask(IMM8, 5), + mask(IMM8, 6), + mask(IMM8, 7), + mask(IMM8, 8), + mask(IMM8, 9), + mask(IMM8, 10), + mask(IMM8, 11), + mask(IMM8, 12), + mask(IMM8, 13), + mask(IMM8, 14), + mask(IMM8, 15), + mask(IMM8, 16), + mask(IMM8, 17), + mask(IMM8, 18), + mask(IMM8, 19), + mask(IMM8, 20), + mask(IMM8, 21), + mask(IMM8, 22), + mask(IMM8, 23), + mask(IMM8, 24), + mask(IMM8, 25), + mask(IMM8, 26), + mask(IMM8, 27), + mask(IMM8, 28), + mask(IMM8, 29), + mask(IMM8, 30), + mask(IMM8, 31), + mask(IMM8, 32), + mask(IMM8, 33), + mask(IMM8, 34), + mask(IMM8, 35), + mask(IMM8, 36), + mask(IMM8, 37), + mask(IMM8, 38), + mask(IMM8, 39), + mask(IMM8, 40), + mask(IMM8, 41), + mask(IMM8, 42), + mask(IMM8, 43), + mask(IMM8, 44), + mask(IMM8, 45), + mask(IMM8, 46), + mask(IMM8, 47), + mask(IMM8, 48), + mask(IMM8, 49), + mask(IMM8, 50), + mask(IMM8, 51), + mask(IMM8, 52), + mask(IMM8, 53), + mask(IMM8, 54), + mask(IMM8, 55), + mask(IMM8, 56), + mask(IMM8, 57), + mask(IMM8, 58), + mask(IMM8, 59), + mask(IMM8, 60), + mask(IMM8, 61), + mask(IMM8, 62), + mask(IMM8, 63), + ], + ); + transmute(r) } - let a = a.as_i8x64(); - let zero = i8x64::ZERO; - let r: i8x64 = simd_shuffle!( - zero, - a, - [ - mask(IMM8, 0), - mask(IMM8, 1), - mask(IMM8, 2), - mask(IMM8, 3), - mask(IMM8, 4), - mask(IMM8, 5), - mask(IMM8, 6), - mask(IMM8, 7), - mask(IMM8, 8), - mask(IMM8, 9), - mask(IMM8, 10), - mask(IMM8, 11), - mask(IMM8, 12), - mask(IMM8, 13), - mask(IMM8, 14), - mask(IMM8, 15), - mask(IMM8, 16), - mask(IMM8, 17), - mask(IMM8, 18), - mask(IMM8, 19), - mask(IMM8, 20), - mask(IMM8, 21), - mask(IMM8, 22), - mask(IMM8, 23), - mask(IMM8, 24), - mask(IMM8, 25), - mask(IMM8, 26), - mask(IMM8, 27), - mask(IMM8, 28), - mask(IMM8, 29), - mask(IMM8, 30), - mask(IMM8, 31), - mask(IMM8, 32), - mask(IMM8, 33), - mask(IMM8, 34), - mask(IMM8, 35), - mask(IMM8, 36), - mask(IMM8, 37), - mask(IMM8, 38), - 
mask(IMM8, 39), - mask(IMM8, 40), - mask(IMM8, 41), - mask(IMM8, 42), - mask(IMM8, 43), - mask(IMM8, 44), - mask(IMM8, 45), - mask(IMM8, 46), - mask(IMM8, 47), - mask(IMM8, 48), - mask(IMM8, 49), - mask(IMM8, 50), - mask(IMM8, 51), - mask(IMM8, 52), - mask(IMM8, 53), - mask(IMM8, 54), - mask(IMM8, 55), - mask(IMM8, 56), - mask(IMM8, 57), - mask(IMM8, 58), - mask(IMM8, 59), - mask(IMM8, 60), - mask(IMM8, 61), - mask(IMM8, 62), - mask(IMM8, 63), - ], - ); - transmute(r) } /// Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst. @@ -10684,171 +11205,208 @@ pub unsafe fn _mm512_bslli_epi128(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 3))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_bsrli_epi128(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i8x64(); - let zero = i8x64::ZERO; - let r: i8x64 = match IMM8 % 16 { - 0 => simd_shuffle!( - a, - zero, - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, - 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - ], - ), - 1 => simd_shuffle!( - a, - zero, - [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, - ], - ), - 2 => simd_shuffle!( - a, - zero, - [ - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, - ], - ), - 3 => simd_shuffle!( - a, - zero, - [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, - 114, - ], - ), - 4 => simd_shuffle!( - a, - zero, - [ - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, - 47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, - 115, - ], - ), - 5 => simd_shuffle!( - a, - zero, - [ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, 25, 26, - 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, - 115, 116, - ], - ), - 6 => simd_shuffle!( - a, - zero, - [ - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96, - 97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, - 116, 117, - ], - ), - 7 => simd_shuffle!( - a, - zero, - [ - 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96, - 97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, - 116, 117, 118, - ], - ), - 8 => simd_shuffle!( - a, - zero, - [ - 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 
66, 67, 68, 69, 70, 71, 24, 25, 26, 27, 28, - 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, 46, 47, 96, 97, - 98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, - 116, 117, 118, 119, - ], - ), - 9 => simd_shuffle!( - a, - zero, - [ - 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, 28, 29, - 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, 47, 96, 97, 98, - 99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, - 117, 118, 119, 120, - ], - ), - 10 => simd_shuffle!( - a, - zero, - [ - 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, 29, 30, - 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, 96, 97, 98, 99, - 100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117, - 118, 119, 120, 121, - ], - ), - 11 => simd_shuffle!( - a, - zero, - [ - 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, 30, 31, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, 97, 98, 99, - 100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, - 117, 118, 119, 120, 121, 122, - ], - ), - 12 => simd_shuffle!( - a, - zero, - [ - 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, 31, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, 98, 99, 100, - 101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117, - 118, 119, 120, 121, 122, 123, - ], - ), - 13 => simd_shuffle!( - a, - zero, - [ - 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, 80, 81, - 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, 99, 100, 101, - 102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, 115, 116, 117, 118, - 119, 120, 121, 122, 123, 124, - ], - ), - 14 => simd_shuffle!( - a, - zero, - [ - 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, 81, 82, - 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, 100, 101, 102, - 103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, - ], - ), - 15 => simd_shuffle!( - a, - zero, - [ - 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, 82, 83, - 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, 100, 101, 102, 103, - 104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, 115, 116, 117, 118, 119, 120, - 121, 122, 123, 124, 125, 126, - ], - ), - _ => zero, - }; - transmute(r) +pub fn _mm512_bsrli_epi128(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i8x64(); + let zero = i8x64::ZERO; + let r: i8x64 = match IMM8 % 16 { + 0 => { + simd_shuffle!( + a, + zero, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, + ], + ) + } + 1 => { + simd_shuffle!( + a, + zero, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 112, + ], + ) + } + 2 => { + simd_shuffle!( + a, + zero, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, + 23, 24, 
25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 112, 113, + ], + ) + } + 3 => { + simd_shuffle!( + a, + zero, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 112, 113, 114, + ], + ) + } + 4 => { + simd_shuffle!( + a, + zero, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 112, 113, 114, 115, + ], + ) + } + 5 => { + simd_shuffle!( + a, + zero, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 112, 113, 114, 115, 116, + ], + ) + } + 6 => { + simd_shuffle!( + a, + zero, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 96, 97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 112, 113, 114, 115, 116, 117, + ], + ) + } + 7 => { + simd_shuffle!( + a, + zero, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 96, 97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 112, 113, 114, 115, 116, 117, 118, + ], + ) + } + 8 => { + simd_shuffle!( + a, + zero, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, + 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, + 46, 47, 96, 97, 98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, + 112, 113, 114, 115, 116, 117, 118, 119, + ], + ) + } + 9 => { + simd_shuffle!( + a, + zero, + [ + 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, + 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, + 47, 96, 97, 98, 99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, + 112, 113, 114, 115, 116, 117, 118, 119, 120, + ], + ) + } + 10 => { + simd_shuffle!( + a, + zero, + [ + 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, + 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, + ], + ) + } + 11 => { + simd_shuffle!( + a, + zero, + [ + 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, + 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, + ], + ) + } + 12 => { + simd_shuffle!( + a, + zero, + [ + 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, + 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, + 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, + 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, + ], + ) + } + 13 => { + simd_shuffle!( + a, + zero, + [ + 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 
71, 72, 73, 74, 75, 76, 29, 30, 31, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, + 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, + ], + ) + } + 14 => { + simd_shuffle!( + a, + zero, + [ + 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, + ], + ) + } + 15 => { + simd_shuffle!( + a, + zero, + [ + 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + ], + ) + } + _ => zero, + }; + transmute(r) + } } /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst. @@ -10861,187 +11419,222 @@ pub unsafe fn _mm512_bsrli_epi128(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_alignr_epi8(a: __m512i, b: __m512i) -> __m512i { - // If palignr is shifting the pair of vectors more than the size of two - // lanes, emit zero. - if IMM8 >= 32 { - return _mm512_setzero_si512(); - } - // If palignr is shifting the pair of input vectors more than one lane, - // but less than two lanes, convert to shifting in zeroes. - let (a, b) = if IMM8 > 16 { - (_mm512_setzero_si512(), a) - } else { - (a, b) - }; - let a = a.as_i8x64(); - let b = b.as_i8x64(); - - if IMM8 == 16 { - return transmute(a); - } - - let r: i8x64 = match IMM8 % 16 { - 0 => simd_shuffle!( - b, - a, - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, - 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - ], - ), - 1 => simd_shuffle!( - b, - a, - [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, - 45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, - ], - ), - 2 => simd_shuffle!( - b, - a, - [ - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, - ], - ), - 3 => simd_shuffle!( - b, - a, - [ - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, 23, 24, - 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, - 114, - ], - ), - 4 => simd_shuffle!( - b, - a, - [ - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, 24, 25, - 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, - 47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, - 115, - ], - ), - 5 => simd_shuffle!( - b, - a, - [ - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 
66, 67, 68, 21, 22, 23, 24, 25, 26, - 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, - 115, 116, - ], - ), - 6 => simd_shuffle!( - b, - a, - [ - 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96, - 97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, - 116, 117, - ], - ), - 7 => simd_shuffle!( - b, - a, - [ - 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, 26, 27, - 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, 45, 46, 47, 96, - 97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, - 116, 117, 118, - ], - ), - 8 => simd_shuffle!( - b, - a, - [ - 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, 27, 28, - 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, 46, 47, 96, 97, - 98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, - 116, 117, 118, 119, - ], - ), - 9 => simd_shuffle!( - b, - a, - [ - 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, 28, 29, - 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, 47, 96, 97, 98, - 99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, - 117, 118, 119, 120, - ], - ), - 10 => simd_shuffle!( - b, - a, - [ - 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, 29, 30, - 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, 96, 97, 98, 99, - 100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117, - 118, 119, 120, 121, - ], - ), - 11 => simd_shuffle!( - b, - a, - [ - 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, 30, 31, - 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, 97, 98, 99, - 100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, 113, 114, 115, 116, - 117, 118, 119, 120, 121, 122, - ], - ), - 12 => simd_shuffle!( - b, - a, - [ - 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, 31, 80, - 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, 98, 99, 100, - 101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, 114, 115, 116, 117, - 118, 119, 120, 121, 122, 123, - ], - ), - 13 => simd_shuffle!( - b, - a, - [ - 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, 80, 81, - 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, 99, 100, 101, - 102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, 115, 116, 117, 118, - 119, 120, 121, 122, 123, 124, - ], - ), - 14 => simd_shuffle!( - b, - a, - [ - 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, 81, 82, - 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, 100, 101, 102, - 103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, 115, 116, 117, 118, 119, - 120, 121, 122, 123, 124, 125, - ], - ), - 15 => simd_shuffle!( - b, - a, - [ - 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, 82, 83, - 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, 100, 101, 102, 103, - 104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, 115, 116, 117, 118, 119, 120, - 121, 122, 123, 124, 125, 126, - ], - ), - _ => unreachable_unchecked(), - }; - transmute(r) +pub fn 
_mm512_alignr_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { + // If palignr is shifting the pair of vectors more than the size of two + // lanes, emit zero. + if IMM8 >= 32 { + return _mm512_setzero_si512(); + } + // If palignr is shifting the pair of input vectors more than one lane, + // but less than two lanes, convert to shifting in zeroes. + let (a, b) = if IMM8 > 16 { + (_mm512_setzero_si512(), a) + } else { + (a, b) + }; + let a = a.as_i8x64(); + let b = b.as_i8x64(); + if IMM8 == 16 { + return transmute(a); + } + let r: i8x64 = match IMM8 % 16 { + 0 => { + simd_shuffle!( + b, + a, + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, + ], + ) + } + 1 => { + simd_shuffle!( + b, + a, + [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 96, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 112, + ], + ) + } + 2 => { + simd_shuffle!( + b, + a, + [ + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 96, 97, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 112, 113, + ], + ) + } + 3 => { + simd_shuffle!( + b, + a, + [ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 96, 97, 98, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, + 61, 62, 63, 112, 113, 114, + ], + ) + } + 4 => { + simd_shuffle!( + b, + a, + [ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 96, 97, 98, 99, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 112, 113, 114, 115, + ], + ) + } + 5 => { + simd_shuffle!( + b, + a, + [ + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 37, 38, 39, 40, 41, 42, 43, + 44, 45, 46, 47, 96, 97, 98, 99, 100, 53, 54, 55, 56, 57, 58, 59, 60, 61, + 62, 63, 112, 113, 114, 115, 116, + ], + ) + } + 6 => { + simd_shuffle!( + b, + a, + [ + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 96, 97, 98, 99, 100, 101, 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 112, 113, 114, 115, 116, 117, + ], + ) + } + 7 => { + simd_shuffle!( + b, + a, + [ + 7, 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 96, 97, 98, 99, 100, 101, 102, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 112, 113, 114, 115, 116, 117, 118, + ], + ) + } + 8 => { + simd_shuffle!( + b, + a, + [ + 8, 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 24, 25, 26, + 27, 28, 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 40, 41, 42, 43, 44, 45, + 46, 47, 96, 97, 98, 99, 100, 101, 102, 103, 56, 57, 58, 59, 60, 61, 62, 63, + 112, 113, 114, 115, 116, 117, 118, 119, + ], + ) + } + 9 => { + simd_shuffle!( + b, + a, + [ + 9, 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 25, 26, 27, + 28, 29, 30, 31, 80, 81, 
82, 83, 84, 85, 86, 87, 88, 41, 42, 43, 44, 45, 46, + 47, 96, 97, 98, 99, 100, 101, 102, 103, 104, 57, 58, 59, 60, 61, 62, 63, + 112, 113, 114, 115, 116, 117, 118, 119, 120, + ], + ) + } + 10 => { + simd_shuffle!( + b, + a, + [ + 10, 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 26, 27, 28, + 29, 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 42, 43, 44, 45, 46, 47, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 58, 59, 60, 61, 62, 63, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, + ], + ) + } + 11 => { + simd_shuffle!( + b, + a, + [ + 11, 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 27, 28, 29, + 30, 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 43, 44, 45, 46, 47, 96, + 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 59, 60, 61, 62, 63, 112, + 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, + ], + ) + } + 12 => { + simd_shuffle!( + b, + a, + [ + 12, 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 28, 29, 30, + 31, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 44, 45, 46, 47, 96, 97, + 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 60, 61, 62, 63, 112, 113, + 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, + ], + ) + } + 13 => { + simd_shuffle!( + b, + a, + [ + 13, 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 29, 30, 31, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 45, 46, 47, 96, 97, 98, + 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 61, 62, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, + ], + ) + } + 14 => { + simd_shuffle!( + b, + a, + [ + 14, 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 30, 31, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 46, 47, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 62, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, + ], + ) + } + 15 => { + simd_shuffle!( + b, + a, + [ + 15, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 31, 80, 81, + 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 47, 96, 97, 98, 99, + 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 63, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, + ], + ) + } + _ => unreachable_unchecked(), + }; + transmute(r) + } } /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11052,15 +11645,17 @@ pub unsafe fn _mm512_alignr_epi8(a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_alignr_epi8( +pub fn _mm512_mask_alignr_epi8( src: __m512i, k: __mmask64, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x64(), src.as_i8x64())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi8::(a, b); + transmute(simd_select_bitmask(k, r.as_i8x64(), src.as_i8x64())) + } } /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
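To make the `vpalignr` semantics above concrete, here is a minimal sketch of the now-safe 512-bit byte-align intrinsics. It assumes a nightly toolchain with `#![feature(stdarch_x86_avx512)]`, an AVX512BW-capable target, and safe `#[target_feature]` functions (the same mechanism this patch leans on); the `alignr_demo` name and the chosen constants are illustrative only.

    // Hypothetical usage sketch; not part of this patch.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512f,avx512bw")]
    fn alignr_demo() {
        use core::arch::x86_64::*;
        let a = _mm512_set1_epi8(1);
        let b = _mm512_set1_epi8(2);
        // Within each 16-byte lane, bytes 0..=11 come from `b` shifted right by
        // 4 bytes, and bytes 12..=15 are filled from the low bytes of `a`.
        let full = _mm512_alignr_epi8::<4>(a, b);
        // The zero-masked form keeps only the byte positions whose mask bit is
        // set (here the even positions) and zeroes the rest.
        let masked = _mm512_maskz_alignr_epi8::<4>(0x5555_5555_5555_5555, a, b);
        let _ = (full, masked);
    }

No `unsafe` block is needed inside `alignr_demo`, because the caller's enabled target features cover those required by the intrinsics.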
@@ -11071,14 +11666,12 @@ pub unsafe fn _mm512_mask_alignr_epi8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_alignr_epi8( - k: __mmask64, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x64(), i8x64::ZERO)) +pub fn _mm512_maskz_alignr_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi8::(a, b); + transmute(simd_select_bitmask(k, r.as_i8x64(), i8x64::ZERO)) + } } /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11089,15 +11682,17 @@ pub unsafe fn _mm512_maskz_alignr_epi8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(4)] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] -pub unsafe fn _mm256_mask_alignr_epi8( +pub fn _mm256_mask_alignr_epi8( src: __m256i, k: __mmask32, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x32(), src.as_i8x32())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi8::(a, b); + transmute(simd_select_bitmask(k, r.as_i8x32(), src.as_i8x32())) + } } /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11108,14 +11703,12 @@ pub unsafe fn _mm256_mask_alignr_epi8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] -pub unsafe fn _mm256_maskz_alignr_epi8( - k: __mmask32, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x32(), i8x32::ZERO)) +pub fn _mm256_maskz_alignr_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi8::(a, b); + transmute(simd_select_bitmask(k, r.as_i8x32(), i8x32::ZERO)) + } } /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -11126,15 +11719,17 @@ pub unsafe fn _mm256_maskz_alignr_epi8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(4)] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] -pub unsafe fn _mm_mask_alignr_epi8( +pub fn _mm_mask_alignr_epi8( src: __m128i, k: __mmask16, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x16(), src.as_i8x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi8::(a, b); + transmute(simd_select_bitmask(k, r.as_i8x16(), src.as_i8x16())) + } } /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11145,14 +11740,12 @@ pub unsafe fn _mm_mask_alignr_epi8( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 5))] -pub unsafe fn _mm_maskz_alignr_epi8( - k: __mmask16, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi8::(a, b); - transmute(simd_select_bitmask(k, r.as_i8x16(), i8x16::ZERO)) +pub fn _mm_maskz_alignr_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi8::(a, b); + transmute(simd_select_bitmask(k, r.as_i8x16(), i8x16::ZERO)) + } } /// Convert packed signed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr. diff --git a/crates/core_arch/src/x86/avx512cd.rs b/crates/core_arch/src/x86/avx512cd.rs index 71eceab6bd..3982c55fa6 100644 --- a/crates/core_arch/src/x86/avx512cd.rs +++ b/crates/core_arch/src/x86/avx512cd.rs @@ -11,7 +11,7 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d -pub unsafe fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i { +pub fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i { _mm512_set1_epi32(k as i32) } @@ -22,7 +22,7 @@ pub unsafe fn _mm512_broadcastmw_epi32(k: __mmask16) -> __m512i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d -pub unsafe fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i { +pub fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i { _mm256_set1_epi32(k as i32) } @@ -33,7 +33,7 @@ pub unsafe fn _mm256_broadcastmw_epi32(k: __mmask16) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmw2d -pub unsafe fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i { +pub fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i { _mm_set1_epi32(k as i32) } @@ -44,7 +44,7 @@ pub unsafe fn _mm_broadcastmw_epi32(k: __mmask16) -> __m128i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q -pub unsafe fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i { +pub fn 
_mm512_broadcastmb_epi64(k: __mmask8) -> __m512i { _mm512_set1_epi64(k as i64) } @@ -55,7 +55,7 @@ pub unsafe fn _mm512_broadcastmb_epi64(k: __mmask8) -> __m512i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q -pub unsafe fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i { +pub fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i { _mm256_set1_epi64x(k as i64) } @@ -66,7 +66,7 @@ pub unsafe fn _mm256_broadcastmb_epi64(k: __mmask8) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastmb2q -pub unsafe fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i { +pub fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i { _mm_set1_epi64x(k as i64) } @@ -77,8 +77,8 @@ pub unsafe fn _mm_broadcastmb_epi64(k: __mmask8) -> __m128i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm512_conflict_epi32(a: __m512i) -> __m512i { - transmute(vpconflictd(a.as_i32x16())) +pub fn _mm512_conflict_epi32(a: __m512i) -> __m512i { + unsafe { transmute(vpconflictd(a.as_i32x16())) } } /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -88,9 +88,11 @@ pub unsafe fn _mm512_conflict_epi32(a: __m512i) -> __m512i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - let conflict = _mm512_conflict_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, conflict, src.as_i32x16())) +pub fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x16())) + } } /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -100,9 +102,11 @@ pub unsafe fn _mm512_mask_conflict_epi32(src: __m512i, k: __mmask16, a: __m512i) #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i { - let conflict = _mm512_conflict_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, conflict, i32x16::ZERO)) +pub fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, conflict, i32x16::ZERO)) + } } /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. 
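The `vpconflictd` behaviour described above, where each lane receives a bit vector marking equal lower-indexed lanes, can be seen in a small hedged sketch; the function name and inputs are invented for illustration and assume a nightly toolchain plus AVX512CD/AVX512VL support.

    // Hypothetical usage sketch; not part of this patch.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512f,avx512cd,avx512vl")]
    fn conflict_epi32_demo() {
        use core::arch::x86_64::*;
        // All eight 32-bit lanes are equal, so lane i reports a conflict with
        // every lower lane: the results are 0, 1, 3, 7, 15, 31, 63, 127.
        let dup = _mm256_conflict_epi32(_mm256_set1_epi32(42));
        // Pairwise-distinct values produce an all-zero result.
        let uniq = _mm256_conflict_epi32(_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7));
        let _ = (dup, uniq);
    }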
@@ -112,8 +116,8 @@ pub unsafe fn _mm512_maskz_conflict_epi32(k: __mmask16, a: __m512i) -> __m512i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm256_conflict_epi32(a: __m256i) -> __m256i { - transmute(vpconflictd256(a.as_i32x8())) +pub fn _mm256_conflict_epi32(a: __m256i) -> __m256i { + unsafe { transmute(vpconflictd256(a.as_i32x8())) } } /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -123,9 +127,11 @@ pub unsafe fn _mm256_conflict_epi32(a: __m256i) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - let conflict = _mm256_conflict_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, conflict, src.as_i32x8())) +pub fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x8())) + } } /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -135,9 +141,11 @@ pub unsafe fn _mm256_mask_conflict_epi32(src: __m256i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i { - let conflict = _mm256_conflict_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, conflict, i32x8::ZERO)) +pub fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, conflict, i32x8::ZERO)) + } } /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. @@ -147,8 +155,8 @@ pub unsafe fn _mm256_maskz_conflict_epi32(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm_conflict_epi32(a: __m128i) -> __m128i { - transmute(vpconflictd128(a.as_i32x4())) +pub fn _mm_conflict_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpconflictd128(a.as_i32x4())) } } /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
@@ -158,9 +166,11 @@ pub unsafe fn _mm_conflict_epi32(a: __m128i) -> __m128i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let conflict = _mm_conflict_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, conflict, src.as_i32x4())) +pub fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, conflict, src.as_i32x4())) + } } /// Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -170,9 +180,11 @@ pub unsafe fn _mm_mask_conflict_epi32(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictd))] -pub unsafe fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i { - let conflict = _mm_conflict_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, conflict, i32x4::ZERO)) +pub fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, conflict, i32x4::ZERO)) + } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. @@ -182,8 +194,8 @@ pub unsafe fn _mm_maskz_conflict_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm512_conflict_epi64(a: __m512i) -> __m512i { - transmute(vpconflictq(a.as_i64x8())) +pub fn _mm512_conflict_epi64(a: __m512i) -> __m512i { + unsafe { transmute(vpconflictq(a.as_i64x8())) } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -193,9 +205,11 @@ pub unsafe fn _mm512_conflict_epi64(a: __m512i) -> __m512i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - let conflict = _mm512_conflict_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, conflict, src.as_i64x8())) +pub fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x8())) + } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. 
@@ -205,9 +219,11 @@ pub unsafe fn _mm512_mask_conflict_epi64(src: __m512i, k: __mmask8, a: __m512i) #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i { - let conflict = _mm512_conflict_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, conflict, i64x8::ZERO)) +pub fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let conflict = _mm512_conflict_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, conflict, i64x8::ZERO)) + } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. @@ -217,8 +233,8 @@ pub unsafe fn _mm512_maskz_conflict_epi64(k: __mmask8, a: __m512i) -> __m512i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm256_conflict_epi64(a: __m256i) -> __m256i { - transmute(vpconflictq256(a.as_i64x4())) +pub fn _mm256_conflict_epi64(a: __m256i) -> __m256i { + unsafe { transmute(vpconflictq256(a.as_i64x4())) } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -228,9 +244,11 @@ pub unsafe fn _mm256_conflict_epi64(a: __m256i) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - let conflict = _mm256_conflict_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, conflict, src.as_i64x4())) +pub fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x4())) + } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -240,9 +258,11 @@ pub unsafe fn _mm256_mask_conflict_epi64(src: __m256i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i { - let conflict = _mm256_conflict_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, conflict, i64x4::ZERO)) +pub fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let conflict = _mm256_conflict_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, conflict, i64x4::ZERO)) + } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst. 
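The 64-bit conflict variants work the same way with fewer, wider lanes; the sketch below also shows the writemask form, which copies unselected lanes from `src`. Names and values are illustrative, not taken from the patch.

    // Hypothetical usage sketch; not part of this patch.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512f,avx512cd,avx512vl")]
    fn conflict_epi64_demo() {
        use core::arch::x86_64::*;
        let a = _mm_set_epi64x(9, 9); // both 64-bit lanes hold the same value
        // Lane 0 has no lower lanes (-> 0); lane 1 conflicts with lane 0 (-> 1).
        let r = _mm_conflict_epi64(a);
        // Writemask form: only lane 1 is computed, lane 0 is copied from `src`.
        let src = _mm_set1_epi64x(-1);
        let rm = _mm_mask_conflict_epi64(src, 0b10, a);
        let _ = (r, rm);
    }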
@@ -252,8 +272,8 @@ pub unsafe fn _mm256_maskz_conflict_epi64(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm_conflict_epi64(a: __m128i) -> __m128i { - transmute(vpconflictq128(a.as_i64x2())) +pub fn _mm_conflict_epi64(a: __m128i) -> __m128i { + unsafe { transmute(vpconflictq128(a.as_i64x2())) } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -263,9 +283,11 @@ pub unsafe fn _mm_conflict_epi64(a: __m128i) -> __m128i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let conflict = _mm_conflict_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, conflict, src.as_i64x2())) +pub fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, conflict, src.as_i64x2())) + } } /// Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst. @@ -275,9 +297,11 @@ pub unsafe fn _mm_mask_conflict_epi64(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpconflictq))] -pub unsafe fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i { - let conflict = _mm_conflict_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, conflict, i64x2::ZERO)) +pub fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let conflict = _mm_conflict_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, conflict, i64x2::ZERO)) + } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. @@ -287,8 +311,8 @@ pub unsafe fn _mm_maskz_conflict_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i { - transmute(simd_ctlz(a.as_i32x16())) +pub fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctlz(a.as_i32x16())) } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -298,9 +322,11 @@ pub unsafe fn _mm512_lzcnt_epi32(a: __m512i) -> __m512i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, zerocount, src.as_i32x16())) +pub fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x16())) + } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -310,9 +336,11 @@ pub unsafe fn _mm512_mask_lzcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { - let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO)) +pub fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, zerocount, i32x16::ZERO)) + } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. @@ -322,8 +350,8 @@ pub unsafe fn _mm512_maskz_lzcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i { - transmute(simd_ctlz(a.as_i32x8())) +pub fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctlz(a.as_i32x8())) } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -333,9 +361,11 @@ pub unsafe fn _mm256_lzcnt_epi32(a: __m256i) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, zerocount, src.as_i32x8())) +pub fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x8())) + } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
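The writemask/zeromask pattern used throughout these leading-zero-count variants follows the rest of the module; a hedged sketch of the difference, using invented inputs on the 256-bit forms:

    // Hypothetical usage sketch; not part of this patch.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512f,avx512cd,avx512vl")]
    fn lzcnt_epi32_mask_demo() {
        use core::arch::x86_64::*;
        let a = _mm256_set1_epi32(1); // 31 leading zeros in every 32-bit lane
        let src = _mm256_set1_epi32(-1);
        // Writemask: lanes whose mask bit is clear keep the value from `src`.
        let merged = _mm256_mask_lzcnt_epi32(src, 0b0000_1111, a);
        // Zeromask: lanes whose mask bit is clear are set to zero instead.
        let zeroed = _mm256_maskz_lzcnt_epi32(0b0000_1111, a);
        let _ = (merged, zeroed);
    }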
@@ -345,9 +375,11 @@ pub unsafe fn _mm256_mask_lzcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { - let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO)) +pub fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, zerocount, i32x8::ZERO)) + } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst. @@ -357,8 +389,8 @@ pub unsafe fn _mm256_maskz_lzcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm_lzcnt_epi32(a: __m128i) -> __m128i { - transmute(simd_ctlz(a.as_i32x4())) +pub fn _mm_lzcnt_epi32(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctlz(a.as_i32x4())) } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -368,9 +400,11 @@ pub unsafe fn _mm_lzcnt_epi32(a: __m128i) -> __m128i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, zerocount, src.as_i32x4())) +pub fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, zerocount, src.as_i32x4())) + } } /// Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -380,9 +414,11 @@ pub unsafe fn _mm_mask_lzcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntd))] -pub unsafe fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { - let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO)) +pub fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, zerocount, i32x4::ZERO)) + } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. 
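For the 64-bit element width, `vplzcntq` returns 64 for an all-zero lane; a short hedged sketch with illustrative names, nightly assumed:

    // Hypothetical usage sketch; not part of this patch.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx512f,avx512cd")]
    fn lzcnt_epi64_demo() {
        use core::arch::x86_64::*;
        // A zero lane yields the full element width, i.e. 64.
        let zeros = _mm512_lzcnt_epi64(_mm512_setzero_si512());
        // 1 << 40 has its highest set bit at position 40, so 63 - 40 = 23.
        let shifted = _mm512_lzcnt_epi64(_mm512_set1_epi64(1 << 40));
        let _ = (zeros, shifted);
    }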
@@ -392,8 +428,8 @@ pub unsafe fn _mm_maskz_lzcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i { - transmute(simd_ctlz(a.as_i64x8())) +pub fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctlz(a.as_i64x8())) } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -403,9 +439,11 @@ pub unsafe fn _mm512_lzcnt_epi64(a: __m512i) -> __m512i { #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, zerocount, src.as_i64x8())) +pub fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x8())) + } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -415,9 +453,11 @@ pub unsafe fn _mm512_mask_lzcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> #[target_feature(enable = "avx512cd")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { - let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO)) +pub fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let zerocount = _mm512_lzcnt_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, zerocount, i64x8::ZERO)) + } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. @@ -427,8 +467,8 @@ pub unsafe fn _mm512_maskz_lzcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i { - transmute(simd_ctlz(a.as_i64x4())) +pub fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctlz(a.as_i64x4())) } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -438,9 +478,11 @@ pub unsafe fn _mm256_lzcnt_epi64(a: __m256i) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, zerocount, src.as_i64x4())) +pub fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x4())) + } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -450,9 +492,11 @@ pub unsafe fn _mm256_mask_lzcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { - let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO)) +pub fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let zerocount = _mm256_lzcnt_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, zerocount, i64x4::ZERO)) + } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst. @@ -462,8 +506,8 @@ pub unsafe fn _mm256_maskz_lzcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm_lzcnt_epi64(a: __m128i) -> __m128i { - transmute(simd_ctlz(a.as_i64x2())) +pub fn _mm_lzcnt_epi64(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctlz(a.as_i64x2())) } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -473,9 +517,11 @@ pub unsafe fn _mm_lzcnt_epi64(a: __m128i) -> __m128i { #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, zerocount, src.as_i64x2())) +pub fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, zerocount, src.as_i64x2())) + } } /// Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
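One practical note on what this change does and does not remove: calling a safe `#[target_feature]` function from code that is not itself compiled with those features still requires runtime detection and an `unsafe` block, as in the hedged sketch below (the helper name is an assumption, not part of the patch).

    // Hypothetical usage sketch; not part of this patch.
    #[cfg(target_arch = "x86_64")]
    fn leading_zeros_epi64(a: std::arch::x86_64::__m128i) -> std::arch::x86_64::__m128i {
        use std::arch::x86_64::*;
        if is_x86_feature_detected!("avx512cd") && is_x86_feature_detected!("avx512vl") {
            // SAFETY: the required target features were just verified at runtime;
            // after this patch the intrinsic has no other safety conditions.
            unsafe { _mm_lzcnt_epi64(a) }
        } else {
            unimplemented!("scalar fallback elided from this sketch")
        }
    }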
@@ -485,9 +531,11 @@ pub unsafe fn _mm_mask_lzcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512cd,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vplzcntq))] -pub unsafe fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { - let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO)) +pub fn _mm_maskz_lzcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let zerocount = _mm_lzcnt_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, zerocount, i64x2::ZERO)) + } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86/avx512dq.rs b/crates/core_arch/src/x86/avx512dq.rs index 66d0feebb6..20ae01b393 100644 --- a/crates/core_arch/src/x86/avx512dq.rs +++ b/crates/core_arch/src/x86/avx512dq.rs @@ -15,9 +15,11 @@ use crate::{ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let and = _mm_and_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, and, src.as_f64x2())) +pub fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let and = _mm_and_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, and, src.as_f64x2())) + } } /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and @@ -28,9 +30,11 @@ pub unsafe fn _mm_mask_and_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let and = _mm_and_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, and, f64x2::ZERO)) +pub fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let and = _mm_and_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, and, f64x2::ZERO)) + } } /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b @@ -42,9 +46,11 @@ pub unsafe fn _mm_maskz_and_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let and = _mm256_and_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, and, src.as_f64x4())) +pub fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let and = _mm256_and_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, and, src.as_f64x4())) + } } /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and @@ -55,9 +61,11 @@ pub unsafe fn _mm256_mask_and_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let and = _mm256_and_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, and, f64x4::ZERO)) +pub fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let and = 
_mm256_and_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, and, f64x4::ZERO)) + } } /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b @@ -68,8 +76,8 @@ pub unsafe fn _mm256_maskz_and_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256 #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandp))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_and_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(simd_and(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) +pub fn _mm512_and_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_and(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } } /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b @@ -81,9 +89,11 @@ pub unsafe fn _mm512_and_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let and = _mm512_and_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, and, src.as_f64x8())) +pub fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let and = _mm512_and_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, and, src.as_f64x8())) + } } /// Compute the bitwise AND of packed double-precision (64-bit) floating point numbers in a and b and @@ -94,9 +104,11 @@ pub unsafe fn _mm512_mask_and_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51 #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let and = _mm512_and_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, and, f64x8::ZERO)) +pub fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let and = _mm512_and_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, and, f64x8::ZERO)) + } } /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b @@ -108,9 +120,11 @@ pub unsafe fn _mm512_maskz_and_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let and = _mm_and_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, and, src.as_f32x4())) +pub fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let and = _mm_and_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, and, src.as_f32x4())) + } } /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and @@ -121,9 +135,11 @@ pub unsafe fn _mm_mask_and_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let and = _mm_and_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, and, f32x4::ZERO)) +pub fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let and = _mm_and_ps(a, 
b).as_f32x4(); + transmute(simd_select_bitmask(k, and, f32x4::ZERO)) + } } /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b @@ -135,9 +151,11 @@ pub unsafe fn _mm_maskz_and_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let and = _mm256_and_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, and, src.as_f32x8())) +pub fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let and = _mm256_and_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, and, src.as_f32x8())) + } } /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and @@ -148,9 +166,11 @@ pub unsafe fn _mm256_mask_and_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let and = _mm256_and_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, and, f32x8::ZERO)) +pub fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let and = _mm256_and_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, and, f32x8::ZERO)) + } } /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b @@ -161,11 +181,13 @@ pub unsafe fn _mm256_maskz_and_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_and_ps(a: __m512, b: __m512) -> __m512 { - transmute(simd_and( - transmute::<_, u32x16>(a), - transmute::<_, u32x16>(b), - )) +pub fn _mm512_and_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_and( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } } /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b @@ -177,9 +199,11 @@ pub unsafe fn _mm512_and_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_and_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let and = _mm512_and_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, and, src.as_f32x16())) +pub fn _mm512_mask_and_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let and = _mm512_and_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, and, src.as_f32x16())) + } } /// Compute the bitwise AND of packed single-precision (32-bit) floating point numbers in a and b and @@ -190,9 +214,11 @@ pub unsafe fn _mm512_mask_and_ps(src: __m512, k: __mmask16, a: __m512, b: __m512 #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_and_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let and = _mm512_and_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, and, f32x16::ZERO)) +pub fn _mm512_maskz_and_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let and = 
_mm512_and_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, and, f32x16::ZERO)) + } } // Andnot @@ -206,9 +232,11 @@ pub unsafe fn _mm512_maskz_and_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandnpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let andnot = _mm_andnot_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, andnot, src.as_f64x2())) +pub fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let andnot = _mm_andnot_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x2())) + } } /// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then @@ -220,9 +248,11 @@ pub unsafe fn _mm_mask_andnot_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m12 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandnpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let andnot = _mm_andnot_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, andnot, f64x2::ZERO)) +pub fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let andnot = _mm_andnot_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, andnot, f64x2::ZERO)) + } } /// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then @@ -234,9 +264,11 @@ pub unsafe fn _mm_maskz_andnot_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandnpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let andnot = _mm256_andnot_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, andnot, src.as_f64x4())) +pub fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let andnot = _mm256_andnot_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x4())) + } } /// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then @@ -248,9 +280,11 @@ pub unsafe fn _mm256_mask_andnot_pd(src: __m256d, k: __mmask8, a: __m256d, b: __ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandnpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let andnot = _mm256_andnot_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, andnot, f64x4::ZERO)) +pub fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let andnot = _mm256_andnot_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, andnot, f64x4::ZERO)) + } } /// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then @@ -261,8 +295,8 @@ pub unsafe fn _mm256_maskz_andnot_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandnp))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_andnot_pd(a: __m512d, b: __m512d) -> __m512d { - _mm512_and_pd(_mm512_xor_pd(a, transmute(_mm512_set1_epi64(-1))), b) +pub fn 
_mm512_andnot_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { _mm512_and_pd(_mm512_xor_pd(a, transmute(_mm512_set1_epi64(-1))), b) } } /// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then @@ -274,9 +308,11 @@ pub unsafe fn _mm512_andnot_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandnpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let andnot = _mm512_andnot_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, andnot, src.as_f64x8())) +pub fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let andnot = _mm512_andnot_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, andnot, src.as_f64x8())) + } } /// Compute the bitwise NOT of packed double-precision (64-bit) floating point numbers in a and then @@ -288,9 +324,11 @@ pub unsafe fn _mm512_mask_andnot_pd(src: __m512d, k: __mmask8, a: __m512d, b: __ #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandnpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let andnot = _mm512_andnot_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, andnot, f64x8::ZERO)) +pub fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let andnot = _mm512_andnot_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, andnot, f64x8::ZERO)) + } } /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then @@ -302,9 +340,11 @@ pub unsafe fn _mm512_maskz_andnot_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandnps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let andnot = _mm_andnot_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, andnot, src.as_f32x4())) +pub fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let andnot = _mm_andnot_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x4())) + } } /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then @@ -316,9 +356,11 @@ pub unsafe fn _mm_mask_andnot_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandnps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let andnot = _mm_andnot_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, andnot, f32x4::ZERO)) +pub fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let andnot = _mm_andnot_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, andnot, f32x4::ZERO)) + } } /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then @@ -330,9 +372,11 @@ pub unsafe fn _mm_maskz_andnot_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandnps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_andnot_ps(src: __m256, 
k: __mmask8, a: __m256, b: __m256) -> __m256 { - let andnot = _mm256_andnot_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, andnot, src.as_f32x8())) +pub fn _mm256_mask_andnot_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let andnot = _mm256_andnot_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x8())) + } } /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then @@ -344,9 +388,11 @@ pub unsafe fn _mm256_mask_andnot_ps(src: __m256, k: __mmask8, a: __m256, b: __m2 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vandnps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let andnot = _mm256_andnot_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, andnot, f32x8::ZERO)) +pub fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let andnot = _mm256_andnot_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, andnot, f32x8::ZERO)) + } } /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then @@ -357,8 +403,8 @@ pub unsafe fn _mm256_maskz_andnot_ps(k: __mmask8, a: __m256, b: __m256) -> __m25 #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandnps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_andnot_ps(a: __m512, b: __m512) -> __m512 { - _mm512_and_ps(_mm512_xor_ps(a, transmute(_mm512_set1_epi32(-1))), b) +pub fn _mm512_andnot_ps(a: __m512, b: __m512) -> __m512 { + unsafe { _mm512_and_ps(_mm512_xor_ps(a, transmute(_mm512_set1_epi32(-1))), b) } } /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then @@ -370,9 +416,11 @@ pub unsafe fn _mm512_andnot_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandnps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let andnot = _mm512_andnot_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, andnot, src.as_f32x16())) +pub fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let andnot = _mm512_andnot_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, andnot, src.as_f32x16())) + } } /// Compute the bitwise NOT of packed single-precision (32-bit) floating point numbers in a and then @@ -384,9 +432,11 @@ pub unsafe fn _mm512_mask_andnot_ps(src: __m512, k: __mmask16, a: __m512, b: __m #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vandnps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_andnot_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let andnot = _mm512_andnot_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, andnot, f32x16::ZERO)) +pub fn _mm512_maskz_andnot_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let andnot = _mm512_andnot_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, andnot, f32x16::ZERO)) + } } // Or @@ -400,9 +450,11 @@ pub unsafe fn _mm512_maskz_andnot_ps(k: __mmask16, a: __m512, b: __m512) -> __m5 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_or_pd(src: __m128d, k: 
__mmask8, a: __m128d, b: __m128d) -> __m128d { - let or = _mm_or_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, or, src.as_f64x2())) +pub fn _mm_mask_or_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let or = _mm_or_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, or, src.as_f64x2())) + } } /// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and @@ -413,9 +465,11 @@ pub unsafe fn _mm_mask_or_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let or = _mm_or_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, or, f64x2::ZERO)) +pub fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let or = _mm_or_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, or, f64x2::ZERO)) + } } /// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b @@ -427,9 +481,11 @@ pub unsafe fn _mm_maskz_or_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let or = _mm256_or_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, or, src.as_f64x4())) +pub fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let or = _mm256_or_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, or, src.as_f64x4())) + } } /// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and @@ -440,9 +496,11 @@ pub unsafe fn _mm256_mask_or_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let or = _mm256_or_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, or, f64x4::ZERO)) +pub fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let or = _mm256_or_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, or, f64x4::ZERO)) + } } /// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b @@ -453,8 +511,8 @@ pub unsafe fn _mm256_maskz_or_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vorp))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_or_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(simd_or(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) +pub fn _mm512_or_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_or(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } } /// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and @@ -466,9 +524,11 @@ pub unsafe fn _mm512_or_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d 
{ - let or = _mm512_or_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, or, src.as_f64x8())) +pub fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let or = _mm512_or_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, or, src.as_f64x8())) + } } /// Compute the bitwise OR of packed double-precision (64-bit) floating point numbers in a and b and @@ -479,9 +539,11 @@ pub unsafe fn _mm512_mask_or_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512 #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let or = _mm512_or_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, or, f64x8::ZERO)) +pub fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let or = _mm512_or_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, or, f64x8::ZERO)) + } } /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b @@ -493,9 +555,11 @@ pub unsafe fn _mm512_maskz_or_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let or = _mm_or_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, or, src.as_f32x4())) +pub fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let or = _mm_or_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, or, src.as_f32x4())) + } } /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and @@ -506,9 +570,11 @@ pub unsafe fn _mm_mask_or_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let or = _mm_or_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, or, f32x4::ZERO)) +pub fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let or = _mm_or_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, or, f32x4::ZERO)) + } } /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b @@ -520,9 +586,11 @@ pub unsafe fn _mm_maskz_or_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let or = _mm256_or_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, or, src.as_f32x8())) +pub fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let or = _mm256_or_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, or, src.as_f32x8())) + } } /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and @@ -533,9 +601,11 @@ pub unsafe fn _mm256_mask_or_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn 
_mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let or = _mm256_or_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, or, f32x8::ZERO)) +pub fn _mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let or = _mm256_or_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, or, f32x8::ZERO)) + } } /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b @@ -546,11 +616,13 @@ pub unsafe fn _mm256_maskz_or_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_or_ps(a: __m512, b: __m512) -> __m512 { - transmute(simd_or( - transmute::<_, u32x16>(a), - transmute::<_, u32x16>(b), - )) +pub fn _mm512_or_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_or( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } } /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and @@ -562,9 +634,11 @@ pub unsafe fn _mm512_or_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_or_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let or = _mm512_or_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, or, src.as_f32x16())) +pub fn _mm512_mask_or_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let or = _mm512_or_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, or, src.as_f32x16())) + } } /// Compute the bitwise OR of packed single-precision (32-bit) floating point numbers in a and b and @@ -575,9 +649,11 @@ pub unsafe fn _mm512_mask_or_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_or_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let or = _mm512_or_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, or, f32x16::ZERO)) +pub fn _mm512_maskz_or_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let or = _mm512_or_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, or, f32x16::ZERO)) + } } // Xor @@ -591,9 +667,11 @@ pub unsafe fn _mm512_maskz_or_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vxorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let xor = _mm_xor_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, xor, src.as_f64x2())) +pub fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let xor = _mm_xor_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, xor, src.as_f64x2())) + } } /// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and @@ -604,9 +682,11 @@ pub unsafe fn _mm_mask_xor_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vxorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let xor = _mm_xor_pd(a, b).as_f64x2(); - 
transmute(simd_select_bitmask(k, xor, f64x2::ZERO)) +pub fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let xor = _mm_xor_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, xor, f64x2::ZERO)) + } } /// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b @@ -618,9 +698,11 @@ pub unsafe fn _mm_maskz_xor_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vxorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let xor = _mm256_xor_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, xor, src.as_f64x4())) +pub fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let xor = _mm256_xor_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, xor, src.as_f64x4())) + } } /// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and @@ -631,9 +713,11 @@ pub unsafe fn _mm256_mask_xor_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vxorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let xor = _mm256_xor_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, xor, f64x4::ZERO)) +pub fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let xor = _mm256_xor_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, xor, f64x4::ZERO)) + } } /// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b @@ -644,8 +728,8 @@ pub unsafe fn _mm256_maskz_xor_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256 #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vxorp))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_xor_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(simd_xor(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) +pub fn _mm512_xor_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_xor(transmute::<_, u64x8>(a), transmute::<_, u64x8>(b))) } } /// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and @@ -657,9 +741,11 @@ pub unsafe fn _mm512_xor_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vxorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let xor = _mm512_xor_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, xor, src.as_f64x8())) +pub fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let xor = _mm512_xor_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, xor, src.as_f64x8())) + } } /// Compute the bitwise XOR of packed double-precision (64-bit) floating point numbers in a and b and @@ -670,9 +756,11 @@ pub unsafe fn _mm512_mask_xor_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51 #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vxorpd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let xor = _mm512_xor_pd(a, 
b).as_f64x8(); - transmute(simd_select_bitmask(k, xor, f64x8::ZERO)) +pub fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let xor = _mm512_xor_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, xor, f64x8::ZERO)) + } } /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b @@ -684,9 +772,11 @@ pub unsafe fn _mm512_maskz_xor_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vxorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let xor = _mm_xor_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, xor, src.as_f32x4())) +pub fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let xor = _mm_xor_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, xor, src.as_f32x4())) + } } /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and @@ -697,9 +787,11 @@ pub unsafe fn _mm_mask_xor_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vxorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let xor = _mm_xor_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, xor, f32x4::ZERO)) +pub fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let xor = _mm_xor_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, xor, f32x4::ZERO)) + } } /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b @@ -711,9 +803,11 @@ pub unsafe fn _mm_maskz_xor_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vxorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let xor = _mm256_xor_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, xor, src.as_f32x8())) +pub fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let xor = _mm256_xor_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, xor, src.as_f32x8())) + } } /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and @@ -724,9 +818,11 @@ pub unsafe fn _mm256_mask_xor_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vxorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let xor = _mm256_xor_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, xor, f32x8::ZERO)) +pub fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let xor = _mm256_xor_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, xor, f32x8::ZERO)) + } } /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b @@ -737,11 +833,13 @@ pub unsafe fn _mm256_maskz_xor_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vxorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn 
_mm512_xor_ps(a: __m512, b: __m512) -> __m512 { - transmute(simd_xor( - transmute::<_, u32x16>(a), - transmute::<_, u32x16>(b), - )) +pub fn _mm512_xor_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(simd_xor( + transmute::<_, u32x16>(a), + transmute::<_, u32x16>(b), + )) + } } /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and @@ -753,9 +851,11 @@ pub unsafe fn _mm512_xor_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vxorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_xor_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let xor = _mm512_xor_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, xor, src.as_f32x16())) +pub fn _mm512_mask_xor_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let xor = _mm512_xor_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, xor, src.as_f32x16())) + } } /// Compute the bitwise XOR of packed single-precision (32-bit) floating point numbers in a and b and @@ -766,9 +866,11 @@ pub unsafe fn _mm512_mask_xor_ps(src: __m512, k: __mmask16, a: __m512, b: __m512 #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vxorps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let xor = _mm512_xor_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, xor, f32x16::ZERO)) +pub fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let xor = _mm512_xor_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, xor, f32x16::ZERO)) + } } // Broadcast @@ -780,9 +882,11 @@ pub unsafe fn _mm512_maskz_xor_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_broadcast_f32x2(a: __m128) -> __m256 { - let b: f32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) +pub fn _mm256_broadcast_f32x2(a: __m128) -> __m256 { + unsafe { + let b: f32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all @@ -793,9 +897,11 @@ pub unsafe fn _mm256_broadcast_f32x2(a: __m128) -> __m256 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vbroadcastf32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) -> __m256 { - let b = _mm256_broadcast_f32x2(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) +pub fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let b = _mm256_broadcast_f32x2(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } } /// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all @@ -806,9 +912,11 @@ pub unsafe fn _mm256_mask_broadcast_f32x2(src: __m256, k: __mmask8, a: __m128) - #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vbroadcastf32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 { - let b = _mm256_broadcast_f32x2(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) +pub fn 
_mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let b = _mm256_broadcast_f32x2(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } } /// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all @@ -818,9 +926,11 @@ pub unsafe fn _mm256_maskz_broadcast_f32x2(k: __mmask8, a: __m128) -> __m256 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_f32x2(a: __m128) -> __m512 { - let b: f32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) +pub fn _mm512_broadcast_f32x2(a: __m128) -> __m512 { + unsafe { + let b: f32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all @@ -831,9 +941,11 @@ pub unsafe fn _mm512_broadcast_f32x2(a: __m128) -> __m512 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vbroadcastf32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128) -> __m512 { - let b = _mm512_broadcast_f32x2(a).as_f32x16(); - transmute(simd_select_bitmask(k, b, src.as_f32x16())) +pub fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x2(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, src.as_f32x16())) + } } /// Broadcasts the lower 2 packed single-precision (32-bit) floating-point elements from a to all @@ -844,9 +956,11 @@ pub unsafe fn _mm512_mask_broadcast_f32x2(src: __m512, k: __mmask16, a: __m128) #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vbroadcastf32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 { - let b = _mm512_broadcast_f32x2(a).as_f32x16(); - transmute(simd_select_bitmask(k, b, f32x16::ZERO)) +pub fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x2(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, f32x16::ZERO)) + } } /// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all @@ -856,9 +970,11 @@ pub unsafe fn _mm512_maskz_broadcast_f32x2(k: __mmask16, a: __m128) -> __m512 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_f32x8(a: __m256) -> __m512 { - let b: f32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); - transmute(b) +pub fn _mm512_broadcast_f32x8(a: __m256) -> __m512 { + unsafe { + let b: f32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); + transmute(b) + } } /// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all @@ -868,9 +984,11 @@ pub unsafe fn _mm512_broadcast_f32x8(a: __m256) -> __m512 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256) -> __m512 { - let b = _mm512_broadcast_f32x8(a).as_f32x16(); - transmute(simd_select_bitmask(k, b, src.as_f32x16())) +pub fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256) -> __m512 { + unsafe { + let b = 
_mm512_broadcast_f32x8(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, src.as_f32x16())) + } } /// Broadcasts the 8 packed single-precision (32-bit) floating-point elements from a to all @@ -880,9 +998,11 @@ pub unsafe fn _mm512_mask_broadcast_f32x8(src: __m512, k: __mmask16, a: __m256) #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 { - let b = _mm512_broadcast_f32x8(a).as_f32x16(); - transmute(simd_select_bitmask(k, b, f32x16::ZERO)) +pub fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 { + unsafe { + let b = _mm512_broadcast_f32x8(a).as_f32x16(); + transmute(simd_select_bitmask(k, b, f32x16::ZERO)) + } } /// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all @@ -892,9 +1012,11 @@ pub unsafe fn _mm512_maskz_broadcast_f32x8(k: __mmask16, a: __m256) -> __m512 { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_broadcast_f64x2(a: __m128d) -> __m256d { - let b: f64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); - transmute(b) +pub fn _mm256_broadcast_f64x2(a: __m128d) -> __m256d { + unsafe { + let b: f64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all @@ -904,9 +1026,11 @@ pub unsafe fn _mm256_broadcast_f64x2(a: __m128d) -> __m256d { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { - let b = _mm256_broadcast_f64x2(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, src.as_f64x4())) +pub fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let b = _mm256_broadcast_f64x2(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } } /// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all @@ -916,9 +1040,11 @@ pub unsafe fn _mm256_mask_broadcast_f64x2(src: __m256d, k: __mmask8, a: __m128d) #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d { - let b = _mm256_broadcast_f64x2(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, f64x4::ZERO)) +pub fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let b = _mm256_broadcast_f64x2(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } } /// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all @@ -928,9 +1054,11 @@ pub unsafe fn _mm256_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m256d { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_f64x2(a: __m128d) -> __m512d { - let b: f64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) +pub fn _mm512_broadcast_f64x2(a: __m128d) -> __m512d { + unsafe { + let b: f64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all @@ -940,9 +1068,11 @@ pub unsafe fn _mm512_broadcast_f64x2(a: __m128d) -> __m512d { #[inline] 
#[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { - let b = _mm512_broadcast_f64x2(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) +pub fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let b = _mm512_broadcast_f64x2(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } } /// Broadcasts the 2 packed double-precision (64-bit) floating-point elements from a to all @@ -952,9 +1082,11 @@ pub unsafe fn _mm512_mask_broadcast_f64x2(src: __m512d, k: __mmask8, a: __m128d) #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d { - let b = _mm512_broadcast_f64x2(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) +pub fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let b = _mm512_broadcast_f64x2(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. @@ -963,10 +1095,12 @@ pub unsafe fn _mm512_maskz_broadcast_f64x2(k: __mmask8, a: __m128d) -> __m512d { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_broadcast_i32x2(a: __m128i) -> __m128i { - let a = a.as_i32x4(); - let b: i32x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); - transmute(b) +pub fn _mm_broadcast_i32x2(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i32x4(); + let b: i32x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k @@ -977,9 +1111,11 @@ pub unsafe fn _mm_broadcast_i32x2(a: __m128i) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vbroadcasti32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let b = _mm_broadcast_i32x2(a).as_i32x4(); - transmute(simd_select_bitmask(k, b, src.as_i32x4())) +pub fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let b = _mm_broadcast_i32x2(a).as_i32x4(); + transmute(simd_select_bitmask(k, b, src.as_i32x4())) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k @@ -990,9 +1126,11 @@ pub unsafe fn _mm_mask_broadcast_i32x2(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vbroadcasti32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i { - let b = _mm_broadcast_i32x2(a).as_i32x4(); - transmute(simd_select_bitmask(k, b, i32x4::ZERO)) +pub fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let b = _mm_broadcast_i32x2(a).as_i32x4(); + transmute(simd_select_bitmask(k, b, i32x4::ZERO)) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. 
@@ -1001,10 +1139,12 @@ pub unsafe fn _mm_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m128i { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_broadcast_i32x2(a: __m128i) -> __m256i { - let a = a.as_i32x4(); - let b: i32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) +pub fn _mm256_broadcast_i32x2(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i32x4(); + let b: i32x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k @@ -1015,9 +1155,11 @@ pub unsafe fn _mm256_broadcast_i32x2(a: __m128i) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vbroadcasti32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let b = _mm256_broadcast_i32x2(a).as_i32x8(); - transmute(simd_select_bitmask(k, b, src.as_i32x8())) +pub fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i32x2(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, src.as_i32x8())) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k @@ -1028,9 +1170,11 @@ pub unsafe fn _mm256_mask_broadcast_i32x2(src: __m256i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vbroadcasti32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i { - let b = _mm256_broadcast_i32x2(a).as_i32x8(); - transmute(simd_select_bitmask(k, b, i32x8::ZERO)) +pub fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i32x2(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, i32x8::ZERO)) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst. 
@@ -1039,10 +1183,12 @@ pub unsafe fn _mm256_maskz_broadcast_i32x2(k: __mmask8, a: __m128i) -> __m256i { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_i32x2(a: __m128i) -> __m512i { - let a = a.as_i32x4(); - let b: i32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) +pub fn _mm512_broadcast_i32x2(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i32x4(); + let b: i32x16 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using writemask k @@ -1053,9 +1199,11 @@ pub unsafe fn _mm512_broadcast_i32x2(a: __m128i) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vbroadcasti32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - let b = _mm512_broadcast_i32x2(a).as_i32x16(); - transmute(simd_select_bitmask(k, b, src.as_i32x16())) +pub fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x2(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, src.as_i32x16())) + } } /// Broadcasts the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k @@ -1066,9 +1214,11 @@ pub unsafe fn _mm512_mask_broadcast_i32x2(src: __m512i, k: __mmask16, a: __m128i #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vbroadcasti32x2))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i { - let b = _mm512_broadcast_i32x2(a).as_i32x16(); - transmute(simd_select_bitmask(k, b, i32x16::ZERO)) +pub fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x2(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, i32x16::ZERO)) + } } /// Broadcasts the 8 packed 32-bit integers from a to all elements of dst. 
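Every hunk applies the same mechanical transformation: the intrinsic stops being an `unsafe fn`, its formerly-unsafe body is wrapped in an `unsafe { ... }` block, and `#[target_feature]` keeps the call gated on the CPU feature. The sketch below shows that shape on a hypothetical, non-intrinsic function; `example_and`, the `avx2` feature, and the caller are all illustrative, and it assumes an x86_64 target with a toolchain that accepts safe `#[target_feature]` functions, which is the language capability this patch builds on.

// The pattern applied throughout: a safe signature, an internal `unsafe`
// block around the body, and `#[target_feature]` on the function.
#[allow(unused_unsafe)] // the block is only illustrative here
#[target_feature(enable = "avx2")]
fn example_and(a: [u32; 8], b: [u32; 8]) -> [u32; 8] {
    unsafe {
        // In the real intrinsics this block holds `transmute`/`simd_*` calls.
        let mut out = [0u32; 8];
        for i in 0..8 {
            out[i] = a[i] & b[i];
        }
        out
    }
}

fn main() {
    // Callers that cannot prove the feature statically still need `unsafe`,
    // but a runtime check makes that call sound.
    if is_x86_feature_detected!("avx2") {
        assert_eq!(unsafe { example_and([1; 8], [3; 8]) }, [1; 8]);
    }
}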
@@ -1077,10 +1227,12 @@ pub unsafe fn _mm512_maskz_broadcast_i32x2(k: __mmask16, a: __m128i) -> __m512i #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_i32x8(a: __m256i) -> __m512i { - let a = a.as_i32x8(); - let b: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); - transmute(b) +pub fn _mm512_broadcast_i32x8(a: __m256i) -> __m512i { + unsafe { + let a = a.as_i32x8(); + let b: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7]); + transmute(b) + } } /// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using writemask k @@ -1090,9 +1242,11 @@ pub unsafe fn _mm512_broadcast_i32x8(a: __m256i) -> __m512i { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { - let b = _mm512_broadcast_i32x8(a).as_i32x16(); - transmute(simd_select_bitmask(k, b, src.as_i32x16())) +pub fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x8(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, src.as_i32x16())) + } } /// Broadcasts the 8 packed 32-bit integers from a to all elements of dst using zeromask k @@ -1102,9 +1256,11 @@ pub unsafe fn _mm512_mask_broadcast_i32x8(src: __m512i, k: __mmask16, a: __m256i #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i { - let b = _mm512_broadcast_i32x8(a).as_i32x16(); - transmute(simd_select_bitmask(k, b, i32x16::ZERO)) +pub fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i32x8(a).as_i32x16(); + transmute(simd_select_bitmask(k, b, i32x16::ZERO)) + } } /// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. 
@@ -1113,10 +1269,12 @@ pub unsafe fn _mm512_maskz_broadcast_i32x8(k: __mmask16, a: __m256i) -> __m512i #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_broadcast_i64x2(a: __m128i) -> __m256i { - let a = a.as_i64x2(); - let b: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); - transmute(b) +pub fn _mm256_broadcast_i64x2(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i64x2(); + let b: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k @@ -1126,9 +1284,11 @@ pub unsafe fn _mm256_broadcast_i64x2(a: __m128i) -> __m256i { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let b = _mm256_broadcast_i64x2(a).as_i64x4(); - transmute(simd_select_bitmask(k, b, src.as_i64x4())) +pub fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i64x2(a).as_i64x4(); + transmute(simd_select_bitmask(k, b, src.as_i64x4())) + } } /// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k @@ -1138,9 +1298,11 @@ pub unsafe fn _mm256_mask_broadcast_i64x2(src: __m256i, k: __mmask8, a: __m128i) #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i { - let b = _mm256_broadcast_i64x2(a).as_i64x4(); - transmute(simd_select_bitmask(k, b, i64x4::ZERO)) +pub fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let b = _mm256_broadcast_i64x2(a).as_i64x4(); + transmute(simd_select_bitmask(k, b, i64x4::ZERO)) + } } /// Broadcasts the 2 packed 64-bit integers from a to all elements of dst. 
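The `broadcast_f32x2`/`broadcast_i32x2` hunks above all tile lanes 0 and 1 of the source across the destination via `simd_shuffle!(a, a, [0, 1, 0, 1, ...])`. A scalar model of that tiling is sketched here; the name is made up and `N` stands in for the destination lane count.

// Lanes 0 and 1 of `a` are repeated across the whole destination,
// mirroring the `[0, 1, 0, 1, ...]` shuffle indices in the hunks above.
fn broadcast_x2_model<const N: usize>(a: [i32; 4]) -> [i32; N] {
    let mut dst = [0i32; N];
    for i in 0..N {
        dst[i] = a[i % 2];
    }
    dst
}

fn main() {
    assert_eq!(broadcast_x2_model::<8>([7, 8, 9, 10]), [7, 8, 7, 8, 7, 8, 7, 8]);
}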
@@ -1149,10 +1311,12 @@ pub unsafe fn _mm256_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m256i { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_i64x2(a: __m128i) -> __m512i { - let a = a.as_i64x2(); - let b: i64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); - transmute(b) +pub fn _mm512_broadcast_i64x2(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i64x2(); + let b: i64x8 = simd_shuffle!(a, a, [0, 1, 0, 1, 0, 1, 0, 1]); + transmute(b) + } } /// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using writemask k @@ -1162,9 +1326,11 @@ pub unsafe fn _mm512_broadcast_i64x2(a: __m128i) -> __m512i { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - let b = _mm512_broadcast_i64x2(a).as_i64x8(); - transmute(simd_select_bitmask(k, b, src.as_i64x8())) +pub fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i64x2(a).as_i64x8(); + transmute(simd_select_bitmask(k, b, src.as_i64x8())) + } } /// Broadcasts the 2 packed 64-bit integers from a to all elements of dst using zeromask k @@ -1174,9 +1340,11 @@ pub unsafe fn _mm512_mask_broadcast_i64x2(src: __m512i, k: __mmask8, a: __m128i) #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i { - let b = _mm512_broadcast_i64x2(a).as_i64x8(); - transmute(simd_select_bitmask(k, b, i64x8::ZERO)) +pub fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let b = _mm512_broadcast_i64x2(a).as_i64x8(); + transmute(simd_select_bitmask(k, b, i64x8::ZERO)) + } } // Extract @@ -1189,11 +1357,13 @@ pub unsafe fn _mm512_maskz_broadcast_i64x2(k: __mmask8, a: __m128i) -> __m512i { #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_extractf32x8_ps(a: __m512) -> __m256 { - static_assert_uimm_bits!(IMM8, 1); - match IMM8 & 1 { - 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), +pub fn _mm512_extractf32x8_ps(a: __m512) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + } } } @@ -1207,14 +1377,12 @@ pub unsafe fn _mm512_extractf32x8_ps(a: __m512) -> __m256 { #[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_extractf32x8_ps( - src: __m256, - k: __mmask8, - a: __m512, -) -> __m256 { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_extractf32x8_ps::(a); - transmute(simd_select_bitmask(k, b.as_f32x8(), src.as_f32x8())) +pub fn _mm512_mask_extractf32x8_ps(src: __m256, k: __mmask8, a: __m512) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extractf32x8_ps::(a); + transmute(simd_select_bitmask(k, b.as_f32x8(), src.as_f32x8())) + } } /// Extracts 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, @@ -1227,10 +1395,12 @@ pub unsafe fn _mm512_mask_extractf32x8_ps( 
#[cfg_attr(test, assert_instr(vextractf32x8, IMM8 = 1))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_extractf32x8_ps(k: __mmask8, a: __m512) -> __m256 { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_extractf32x8_ps::(a); - transmute(simd_select_bitmask(k, b.as_f32x8(), f32x8::ZERO)) +pub fn _mm512_maskz_extractf32x8_ps(k: __mmask8, a: __m512) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extractf32x8_ps::(a); + transmute(simd_select_bitmask(k, b.as_f32x8(), f32x8::ZERO)) + } } /// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, @@ -1241,11 +1411,13 @@ pub unsafe fn _mm512_maskz_extractf32x8_ps(k: __mmask8, a: __m5 #[target_feature(enable = "avx512dq,avx512vl")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_extractf64x2_pd(a: __m256d) -> __m128d { - static_assert_uimm_bits!(IMM8, 1); - match IMM8 & 1 { - 0 => simd_shuffle!(a, a, [0, 1]), - _ => simd_shuffle!(a, a, [2, 3]), +pub fn _mm256_extractf64x2_pd(a: __m256d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1]), + _ => simd_shuffle!(a, a, [2, 3]), + } } } @@ -1259,14 +1431,16 @@ pub unsafe fn _mm256_extractf64x2_pd(a: __m256d) -> __m128d { #[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_extractf64x2_pd( +pub fn _mm256_mask_extractf64x2_pd( src: __m128d, k: __mmask8, a: __m256d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_extractf64x2_pd::(a); - transmute(simd_select_bitmask(k, b.as_f64x2(), src.as_f64x2())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extractf64x2_pd::(a); + transmute(simd_select_bitmask(k, b.as_f64x2(), src.as_f64x2())) + } } /// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, @@ -1279,10 +1453,12 @@ pub unsafe fn _mm256_mask_extractf64x2_pd( #[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 1))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_extractf64x2_pd(k: __mmask8, a: __m256d) -> __m128d { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_extractf64x2_pd::(a); - transmute(simd_select_bitmask(k, b.as_f64x2(), f64x2::ZERO)) +pub fn _mm256_maskz_extractf64x2_pd(k: __mmask8, a: __m256d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extractf64x2_pd::(a); + transmute(simd_select_bitmask(k, b.as_f64x2(), f64x2::ZERO)) + } } /// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, @@ -1293,13 +1469,15 @@ pub unsafe fn _mm256_maskz_extractf64x2_pd(k: __mmask8, a: __m2 #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_extractf64x2_pd(a: __m512d) -> __m128d { - static_assert_uimm_bits!(IMM8, 2); - match IMM8 & 3 { - 0 => simd_shuffle!(a, a, [0, 1]), - 1 => simd_shuffle!(a, a, [2, 3]), - 2 => simd_shuffle!(a, a, [4, 5]), - _ => simd_shuffle!(a, a, [6, 7]), +pub fn _mm512_extractf64x2_pd(a: __m512d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + match IMM8 & 3 { + 0 => simd_shuffle!(a, a, [0, 1]), + 1 => simd_shuffle!(a, a, 
[2, 3]), + 2 => simd_shuffle!(a, a, [4, 5]), + _ => simd_shuffle!(a, a, [6, 7]), + } } } @@ -1313,14 +1491,16 @@ pub unsafe fn _mm512_extractf64x2_pd(a: __m512d) -> __m128d { #[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_extractf64x2_pd( +pub fn _mm512_mask_extractf64x2_pd( src: __m128d, k: __mmask8, a: __m512d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_extractf64x2_pd::(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, src.as_f64x2())) + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extractf64x2_pd::(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) + } } /// Extracts 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, @@ -1333,10 +1513,12 @@ pub unsafe fn _mm512_mask_extractf64x2_pd( #[cfg_attr(test, assert_instr(vextractf64x2, IMM8 = 3))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_extractf64x2_pd(k: __mmask8, a: __m512d) -> __m128d { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_extractf64x2_pd::(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, f64x2::ZERO)) +pub fn _mm512_maskz_extractf64x2_pd(k: __mmask8, a: __m512d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extractf64x2_pd::(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, f64x2::ZERO)) + } } /// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores @@ -1347,14 +1529,16 @@ pub unsafe fn _mm512_maskz_extractf64x2_pd(k: __mmask8, a: __m5 #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_extracti32x8_epi32(a: __m512i) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i32x16(); - let b: i32x8 = match IMM8 & 1 { - 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - }; - transmute(b) +pub fn _mm512_extracti32x8_epi32(a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i32x16(); + let b: i32x8 = match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + _ => simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + }; + transmute(b) + } } /// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores @@ -1366,14 +1550,16 @@ pub unsafe fn _mm512_extracti32x8_epi32(a: __m512i) -> __m256i #[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_extracti32x8_epi32( +pub fn _mm512_mask_extracti32x8_epi32( src: __m256i, k: __mmask8, a: __m512i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_extracti32x8_epi32::(a).as_i32x8(); - transmute(simd_select_bitmask(k, b, src.as_i32x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extracti32x8_epi32::(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, src.as_i32x8())) + } } /// Extracts 256 bits (composed of 8 packed 32-bit integers) from a, selected with IMM8, and stores @@ -1385,10 +1571,12 @@ pub unsafe fn _mm512_mask_extracti32x8_epi32( #[cfg_attr(test, assert_instr(vextracti32x8, IMM8 = 1))] #[rustc_legacy_const_generics(2)] #[unstable(feature = 
"stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_extracti32x8_epi32(k: __mmask8, a: __m512i) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_extracti32x8_epi32::(a).as_i32x8(); - transmute(simd_select_bitmask(k, b, i32x8::ZERO)) +pub fn _mm512_maskz_extracti32x8_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_extracti32x8_epi32::(a).as_i32x8(); + transmute(simd_select_bitmask(k, b, i32x8::ZERO)) + } } /// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores @@ -1399,12 +1587,14 @@ pub unsafe fn _mm512_maskz_extracti32x8_epi32(k: __mmask8, a: _ #[target_feature(enable = "avx512dq,avx512vl")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_extracti64x2_epi64(a: __m256i) -> __m128i { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i64x4(); - match IMM8 & 1 { - 0 => simd_shuffle!(a, a, [0, 1]), - _ => simd_shuffle!(a, a, [2, 3]), +pub fn _mm256_extracti64x2_epi64(a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i64x4(); + match IMM8 & 1 { + 0 => simd_shuffle!(a, a, [0, 1]), + _ => simd_shuffle!(a, a, [2, 3]), + } } } @@ -1417,14 +1607,16 @@ pub unsafe fn _mm256_extracti64x2_epi64(a: __m256i) -> __m128i #[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_extracti64x2_epi64( +pub fn _mm256_mask_extracti64x2_epi64( src: __m128i, k: __mmask8, a: __m256i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_extracti64x2_epi64::(a).as_i64x2(); - transmute(simd_select_bitmask(k, b, src.as_i64x2())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extracti64x2_epi64::(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) + } } /// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores @@ -1436,10 +1628,12 @@ pub unsafe fn _mm256_mask_extracti64x2_epi64( #[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 1))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_extracti64x2_epi64(k: __mmask8, a: __m256i) -> __m128i { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_extracti64x2_epi64::(a).as_i64x2(); - transmute(simd_select_bitmask(k, b, i64x2::ZERO)) +pub fn _mm256_maskz_extracti64x2_epi64(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_extracti64x2_epi64::(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, i64x2::ZERO)) + } } /// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores @@ -1450,14 +1644,16 @@ pub unsafe fn _mm256_maskz_extracti64x2_epi64(k: __mmask8, a: _ #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_extracti64x2_epi64(a: __m512i) -> __m128i { - static_assert_uimm_bits!(IMM8, 2); - let a = a.as_i64x8(); - match IMM8 & 3 { - 0 => simd_shuffle!(a, a, [0, 1]), - 1 => simd_shuffle!(a, a, [2, 3]), - 2 => simd_shuffle!(a, a, [4, 5]), - _ => simd_shuffle!(a, a, [6, 7]), +pub fn _mm512_extracti64x2_epi64(a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let a = a.as_i64x8(); + match IMM8 & 3 { + 0 => simd_shuffle!(a, a, [0, 
1]), + 1 => simd_shuffle!(a, a, [2, 3]), + 2 => simd_shuffle!(a, a, [4, 5]), + _ => simd_shuffle!(a, a, [6, 7]), + } } } @@ -1470,14 +1666,16 @@ pub unsafe fn _mm512_extracti64x2_epi64(a: __m512i) -> __m128i #[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_extracti64x2_epi64( +pub fn _mm512_mask_extracti64x2_epi64( src: __m128i, k: __mmask8, a: __m512i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_extracti64x2_epi64::(a).as_i64x2(); - transmute(simd_select_bitmask(k, b, src.as_i64x2())) + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extracti64x2_epi64::(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) + } } /// Extracts 128 bits (composed of 2 packed 64-bit integers) from a, selected with IMM8, and stores @@ -1489,10 +1687,12 @@ pub unsafe fn _mm512_mask_extracti64x2_epi64( #[cfg_attr(test, assert_instr(vextracti64x2, IMM8 = 3))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_extracti64x2_epi64(k: __mmask8, a: __m512i) -> __m128i { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_extracti64x2_epi64::(a).as_i64x2(); - transmute(simd_select_bitmask(k, b, i64x2::ZERO)) +pub fn _mm512_maskz_extracti64x2_epi64(k: __mmask8, a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_extracti64x2_epi64::(a).as_i64x2(); + transmute(simd_select_bitmask(k, b, i64x2::ZERO)) + } } // Insert @@ -1505,20 +1705,26 @@ pub unsafe fn _mm512_maskz_extracti64x2_epi64(k: __mmask8, a: _ #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_insertf32x8(a: __m512, b: __m256) -> __m512 { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_castps256_ps512(b); - match IMM8 & 1 { - 0 => simd_shuffle!( - a, - b, - [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] - ), - _ => simd_shuffle!( - a, - b, - [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] - ), +pub fn _mm512_insertf32x8(a: __m512, b: __m256) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_castps256_ps512(b); + match IMM8 & 1 { + 0 => { + simd_shuffle!( + a, + b, + [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } + _ => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + ) + } + } } } @@ -1532,15 +1738,17 @@ pub unsafe fn _mm512_insertf32x8(a: __m512, b: __m256) -> __m51 #[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_insertf32x8( +pub fn _mm512_mask_insertf32x8( src: __m512, k: __mmask16, a: __m512, b: __m256, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm512_insertf32x8::(a, b); - transmute(simd_select_bitmask(k, c.as_f32x16(), src.as_f32x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_insertf32x8::(a, b); + transmute(simd_select_bitmask(k, c.as_f32x16(), src.as_f32x16())) + } } /// Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point @@ -1553,14 +1761,12 @@ pub unsafe fn _mm512_mask_insertf32x8( #[cfg_attr(test, assert_instr(vinsertf32x8, IMM8 = 1))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", 
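// Illustrative sketch, not part of the patch: with the extract intrinsics above
// marked safe, pulling a 128-bit or 256-bit lane out of a 512-bit vector no longer
// needs an `unsafe` block inside a function that already enables the required
// target features. Assumes a recent nightly toolchain (safe `#[target_feature]`
// functions plus `#![feature(stdarch_x86_avx512)]`); the helper names are hypothetical.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512dq")]
fn third_f64_lane(a: __m512d) -> __m128d {
    // IMM8 selects one of the four 128-bit lanes (0..=3).
    _mm512_extractf64x2_pd::<2>(a)
}

#[target_feature(enable = "avx512f,avx512dq")]
fn upper_i32x8_or_zero(k: __mmask8, v: __m512i) -> __m256i {
    // Zero-masking variant: elements whose mask bit is clear are zeroed in the result.
    _mm512_maskz_extracti32x8_epi32::<1>(k, v)
}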
issue = "111137")] -pub unsafe fn _mm512_maskz_insertf32x8( - k: __mmask16, - a: __m512, - b: __m256, -) -> __m512 { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm512_insertf32x8::(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, c, f32x16::ZERO)) +pub fn _mm512_maskz_insertf32x8(k: __mmask16, a: __m512, b: __m256) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_insertf32x8::(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, c, f32x16::ZERO)) + } } /// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point @@ -1571,12 +1777,14 @@ pub unsafe fn _mm512_maskz_insertf32x8( #[target_feature(enable = "avx512dq,avx512vl")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_insertf64x2(a: __m256d, b: __m128d) -> __m256d { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_castpd128_pd256(b); - match IMM8 & 1 { - 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), - _ => simd_shuffle!(a, b, [0, 1, 4, 5]), +pub fn _mm256_insertf64x2(a: __m256d, b: __m128d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_castpd128_pd256(b); + match IMM8 & 1 { + 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), + _ => simd_shuffle!(a, b, [0, 1, 4, 5]), + } } } @@ -1590,15 +1798,17 @@ pub unsafe fn _mm256_insertf64x2(a: __m256d, b: __m128d) -> __m #[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_insertf64x2( +pub fn _mm256_mask_insertf64x2( src: __m256d, k: __mmask8, a: __m256d, b: __m128d, ) -> __m256d { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm256_insertf64x2::(a, b); - transmute(simd_select_bitmask(k, c.as_f64x4(), src.as_f64x4())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_insertf64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_f64x4(), src.as_f64x4())) + } } /// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point @@ -1611,14 +1821,12 @@ pub unsafe fn _mm256_mask_insertf64x2( #[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 1))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_insertf64x2( - k: __mmask8, - a: __m256d, - b: __m128d, -) -> __m256d { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm256_insertf64x2::(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, c, f64x4::ZERO)) +pub fn _mm256_maskz_insertf64x2(k: __mmask8, a: __m256d, b: __m128d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_insertf64x2::(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, c, f64x4::ZERO)) + } } /// Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point @@ -1629,14 +1837,16 @@ pub unsafe fn _mm256_maskz_insertf64x2( #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_insertf64x2(a: __m512d, b: __m128d) -> __m512d { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_castpd128_pd512(b); - match IMM8 & 3 { - 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), +pub fn _mm512_insertf64x2(a: __m512d, b: __m128d) -> __m512d { + unsafe { + 
static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_castpd128_pd512(b); + match IMM8 & 3 { + 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), + } } } @@ -1650,15 +1860,17 @@ pub unsafe fn _mm512_insertf64x2(a: __m512d, b: __m128d) -> __m #[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_insertf64x2( +pub fn _mm512_mask_insertf64x2( src: __m512d, k: __mmask8, a: __m512d, b: __m128d, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 2); - let c = _mm512_insertf64x2::(a, b); - transmute(simd_select_bitmask(k, c.as_f64x8(), src.as_f64x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_insertf64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_f64x8(), src.as_f64x8())) + } } /// Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point @@ -1671,14 +1883,12 @@ pub unsafe fn _mm512_mask_insertf64x2( #[cfg_attr(test, assert_instr(vinsertf64x2, IMM8 = 3))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_insertf64x2( - k: __mmask8, - a: __m512d, - b: __m128d, -) -> __m512d { - static_assert_uimm_bits!(IMM8, 2); - let c = _mm512_insertf64x2::(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, c, f64x8::ZERO)) +pub fn _mm512_maskz_insertf64x2(k: __mmask8, a: __m512d, b: __m128d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_insertf64x2::(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, c, f64x8::ZERO)) + } } /// Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the @@ -1689,23 +1899,29 @@ pub unsafe fn _mm512_maskz_insertf64x2( #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_inserti32x8(a: __m512i, b: __m256i) -> __m512i { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i32x16(); - let b = _mm512_castsi256_si512(b).as_i32x16(); - let r: i32x16 = match IMM8 & 1 { - 0 => simd_shuffle!( - a, - b, - [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] - ), - _ => simd_shuffle!( - a, - b, - [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] - ), - }; - transmute(r) +pub fn _mm512_inserti32x8(a: __m512i, b: __m256i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i32x16(); + let b = _mm512_castsi256_si512(b).as_i32x16(); + let r: i32x16 = match IMM8 & 1 { + 0 => { + simd_shuffle!( + a, + b, + [16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15] + ) + } + _ => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] + ) + } + }; + transmute(r) + } } /// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the @@ -1718,15 +1934,17 @@ pub unsafe fn _mm512_inserti32x8(a: __m512i, b: __m256i) -> __m #[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_inserti32x8( +pub fn _mm512_mask_inserti32x8( src: __m512i, k: __mmask16, a: __m512i, b: __m256i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm512_inserti32x8::(a, b); - 
transmute(simd_select_bitmask(k, c.as_i32x16(), src.as_i32x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_inserti32x8::(a, b); + transmute(simd_select_bitmask(k, c.as_i32x16(), src.as_i32x16())) + } } /// Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the @@ -1739,14 +1957,12 @@ pub unsafe fn _mm512_mask_inserti32x8( #[cfg_attr(test, assert_instr(vinserti32x8, IMM8 = 1))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_inserti32x8( - k: __mmask16, - a: __m512i, - b: __m256i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm512_inserti32x8::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, c, i32x16::ZERO)) +pub fn _mm512_maskz_inserti32x8(k: __mmask16, a: __m512i, b: __m256i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm512_inserti32x8::(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, c, i32x16::ZERO)) + } } /// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the @@ -1757,13 +1973,15 @@ pub unsafe fn _mm512_maskz_inserti32x8( #[target_feature(enable = "avx512dq,avx512vl")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_inserti64x2(a: __m256i, b: __m128i) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i64x4(); - let b = _mm256_castsi128_si256(b).as_i64x4(); - match IMM8 & 1 { - 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), - _ => simd_shuffle!(a, b, [0, 1, 4, 5]), +pub fn _mm256_inserti64x2(a: __m256i, b: __m128i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i64x4(); + let b = _mm256_castsi128_si256(b).as_i64x4(); + match IMM8 & 1 { + 0 => simd_shuffle!(a, b, [4, 5, 2, 3]), + _ => simd_shuffle!(a, b, [0, 1, 4, 5]), + } } } @@ -1777,15 +1995,17 @@ pub unsafe fn _mm256_inserti64x2(a: __m256i, b: __m128i) -> __m #[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_inserti64x2( +pub fn _mm256_mask_inserti64x2( src: __m256i, k: __mmask8, a: __m256i, b: __m128i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm256_inserti64x2::(a, b); - transmute(simd_select_bitmask(k, c.as_i64x4(), src.as_i64x4())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_inserti64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_i64x4(), src.as_i64x4())) + } } /// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the @@ -1798,14 +2018,12 @@ pub unsafe fn _mm256_mask_inserti64x2( #[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 1))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_inserti64x2( - k: __mmask8, - a: __m256i, - b: __m128i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let c = _mm256_inserti64x2::(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, c, i64x4::ZERO)) +pub fn _mm256_maskz_inserti64x2(k: __mmask8, a: __m256i, b: __m128i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let c = _mm256_inserti64x2::(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, c, i64x4::ZERO)) + } } /// Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the @@ -1816,15 +2034,17 @@ pub unsafe fn _mm256_maskz_inserti64x2( 
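// Illustrative sketch, not part of the patch: the merge-masked insert above blends
// the spliced result with `src` under control of `k`, so a caller can write a
// 256-bit half into a 512-bit vector while keeping selected elements untouched.
// Hypothetical helper, same nightly-feature assumptions as the sketch above.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512dq")]
fn splice_upper_half(src: __m512i, k: __mmask16, a: __m512i, b: __m256i) -> __m512i {
    // IMM8 = 1 places `b` in the upper 256 bits of `a`; per 32-bit element,
    // a set bit in `k` takes the spliced value and a clear bit keeps `src`.
    _mm512_mask_inserti32x8::<1>(src, k, a, b)
}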
#[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_inserti64x2(a: __m512i, b: __m128i) -> __m512i { - static_assert_uimm_bits!(IMM8, 2); - let a = a.as_i64x8(); - let b = _mm512_castsi128_si512(b).as_i64x8(); - match IMM8 & 3 { - 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), - 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), +pub fn _mm512_inserti64x2(a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let a = a.as_i64x8(); + let b = _mm512_castsi128_si512(b).as_i64x8(); + match IMM8 & 3 { + 0 => simd_shuffle!(a, b, [8, 9, 2, 3, 4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [0, 1, 8, 9, 4, 5, 6, 7]), + 2 => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 8, 9]), + } } } @@ -1838,15 +2058,17 @@ pub unsafe fn _mm512_inserti64x2(a: __m512i, b: __m128i) -> __m #[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_inserti64x2( +pub fn _mm512_mask_inserti64x2( src: __m512i, k: __mmask8, a: __m512i, b: __m128i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 2); - let c = _mm512_inserti64x2::(a, b); - transmute(simd_select_bitmask(k, c.as_i64x8(), src.as_i64x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_inserti64x2::(a, b); + transmute(simd_select_bitmask(k, c.as_i64x8(), src.as_i64x8())) + } } /// Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the @@ -1859,14 +2081,12 @@ pub unsafe fn _mm512_mask_inserti64x2( #[cfg_attr(test, assert_instr(vinserti64x2, IMM8 = 3))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_inserti64x2( - k: __mmask8, - a: __m512i, - b: __m128i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 2); - let c = _mm512_inserti64x2::(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, c, i64x8::ZERO)) +pub fn _mm512_maskz_inserti64x2(k: __mmask8, a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let c = _mm512_inserti64x2::(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, c, i64x8::ZERO)) + } } // Convert @@ -1886,9 +2106,11 @@ pub unsafe fn _mm512_maskz_inserti64x2( #[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvt_roundepi64_pd(a: __m512i) -> __m512d { - static_assert_rounding!(ROUNDING); - transmute(vcvtqq2pd_512(a.as_i64x8(), ROUNDING)) +pub fn _mm512_cvt_roundepi64_pd(a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtqq2pd_512(a.as_i64x8(), ROUNDING)) + } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -1907,14 +2129,16 @@ pub unsafe fn _mm512_cvt_roundepi64_pd(a: __m512i) -> __m51 #[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvt_roundepi64_pd( +pub fn _mm512_mask_cvt_roundepi64_pd( src: __m512d, k: __mmask8, a: __m512i, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let b = 
_mm512_cvt_roundepi64_pd::(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -1933,13 +2157,12 @@ pub unsafe fn _mm512_mask_cvt_roundepi64_pd( #[cfg_attr(test, assert_instr(vcvtqq2pd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvt_roundepi64_pd( - k: __mmask8, - a: __m512i, -) -> __m512d { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) +pub fn _mm512_maskz_cvt_roundepi64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -1950,8 +2173,8 @@ pub unsafe fn _mm512_maskz_cvt_roundepi64_pd( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtepi64_pd(a: __m128i) -> __m128d { - transmute(vcvtqq2pd_128(a.as_i64x2(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm_cvtepi64_pd(a: __m128i) -> __m128d { + unsafe { transmute(vcvtqq2pd_128(a.as_i64x2(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -1963,9 +2186,11 @@ pub unsafe fn _mm_cvtepi64_pd(a: __m128i) -> __m128d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtepi64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { - let b = _mm_cvtepi64_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, src.as_f64x2())) +pub fn _mm_mask_cvtepi64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepi64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) + } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -1976,9 +2201,11 @@ pub unsafe fn _mm_mask_cvtepi64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtepi64_pd(k: __mmask8, a: __m128i) -> __m128d { - let b = _mm_cvtepi64_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, f64x2::ZERO)) +pub fn _mm_maskz_cvtepi64_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepi64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, f64x2::ZERO)) + } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -1989,8 +2216,8 @@ pub unsafe fn _mm_maskz_cvtepi64_pd(k: __mmask8, a: __m128i) -> __m128d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtepi64_pd(a: __m256i) -> __m256d { - transmute(vcvtqq2pd_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) +pub 
fn _mm256_cvtepi64_pd(a: __m256i) -> __m256d { + unsafe { transmute(vcvtqq2pd_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2002,9 +2229,11 @@ pub unsafe fn _mm256_cvtepi64_pd(a: __m256i) -> __m256d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtepi64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { - let b = _mm256_cvtepi64_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, src.as_f64x4())) +pub fn _mm256_mask_cvtepi64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepi64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2015,9 +2244,11 @@ pub unsafe fn _mm256_mask_cvtepi64_pd(src: __m256d, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtepi64_pd(k: __mmask8, a: __m256i) -> __m256d { - let b = _mm256_cvtepi64_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, f64x4::ZERO)) +pub fn _mm256_maskz_cvtepi64_pd(k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepi64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2028,8 +2259,8 @@ pub unsafe fn _mm256_maskz_cvtepi64_pd(k: __mmask8, a: __m256i) -> __m256d { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtepi64_pd(a: __m512i) -> __m512d { - transmute(vcvtqq2pd_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm512_cvtepi64_pd(a: __m512i) -> __m512d { + unsafe { transmute(vcvtqq2pd_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2041,9 +2272,11 @@ pub unsafe fn _mm512_cvtepi64_pd(a: __m512i) -> __m512d { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtepi64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d { - let b = _mm512_cvtepi64_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) +pub fn _mm512_mask_cvtepi64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepi64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } } /// Convert packed signed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2054,9 +2287,11 @@ pub unsafe fn _mm512_mask_cvtepi64_pd(src: __m512d, k: __mmask8, a: __m512i) -> #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d { - let b = _mm512_cvtepi64_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) +pub fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = 
_mm512_cvtepi64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2074,9 +2309,11 @@ pub unsafe fn _mm512_maskz_cvtepi64_pd(k: __mmask8, a: __m512i) -> __m512d { #[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvt_roundepi64_ps(a: __m512i) -> __m256 { - static_assert_rounding!(ROUNDING); - transmute(vcvtqq2ps_512(a.as_i64x8(), ROUNDING)) +pub fn _mm512_cvt_roundepi64_ps(a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtqq2ps_512(a.as_i64x8(), ROUNDING)) + } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2095,14 +2332,16 @@ pub unsafe fn _mm512_cvt_roundepi64_ps(a: __m512i) -> __m25 #[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvt_roundepi64_ps( +pub fn _mm512_mask_cvt_roundepi64_ps( src: __m256, k: __mmask8, a: __m512i, ) -> __m256 { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepi64_ps::(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_ps::(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2121,13 +2360,12 @@ pub unsafe fn _mm512_mask_cvt_roundepi64_ps( #[cfg_attr(test, assert_instr(vcvtqq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvt_roundepi64_ps( - k: __mmask8, - a: __m512i, -) -> __m256 { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepi64_ps::(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) +pub fn _mm512_maskz_cvt_roundepi64_ps(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepi64_ps::(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2138,7 +2376,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepi64_ps( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtepi64_ps(a: __m128i) -> __m128 { +pub fn _mm_cvtepi64_ps(a: __m128i) -> __m128 { _mm_mask_cvtepi64_ps(_mm_undefined_ps(), 0xff, a) } @@ -2151,8 +2389,8 @@ pub unsafe fn _mm_cvtepi64_ps(a: __m128i) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { - transmute(vcvtqq2ps_128(a.as_i64x2(), src.as_f32x4(), k)) +pub fn _mm_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { transmute(vcvtqq2ps_128(a.as_i64x2(), src.as_f32x4(), k)) } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2163,7 +2401,7 @@ pub unsafe fn _mm_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: 
__m128i) -> __m1 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtepi64_ps(k: __mmask8, a: __m128i) -> __m128 { +pub fn _mm_maskz_cvtepi64_ps(k: __mmask8, a: __m128i) -> __m128 { _mm_mask_cvtepi64_ps(_mm_setzero_ps(), k, a) } @@ -2175,8 +2413,8 @@ pub unsafe fn _mm_maskz_cvtepi64_ps(k: __mmask8, a: __m128i) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtepi64_ps(a: __m256i) -> __m128 { - transmute(vcvtqq2ps_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm256_cvtepi64_ps(a: __m256i) -> __m128 { + unsafe { transmute(vcvtqq2ps_256(a.as_i64x4(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2188,9 +2426,11 @@ pub unsafe fn _mm256_cvtepi64_ps(a: __m256i) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 { - let b = _mm256_cvtepi64_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, b, src.as_f32x4())) +pub fn _mm256_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepi64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, src.as_f32x4())) + } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2201,9 +2441,11 @@ pub unsafe fn _mm256_mask_cvtepi64_ps(src: __m128, k: __mmask8, a: __m256i) -> _ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtepi64_ps(k: __mmask8, a: __m256i) -> __m128 { - let b = _mm256_cvtepi64_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, b, f32x4::ZERO)) +pub fn _mm256_maskz_cvtepi64_ps(k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepi64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, f32x4::ZERO)) + } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2214,8 +2456,8 @@ pub unsafe fn _mm256_maskz_cvtepi64_ps(k: __mmask8, a: __m256i) -> __m128 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtepi64_ps(a: __m512i) -> __m256 { - transmute(vcvtqq2ps_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm512_cvtepi64_ps(a: __m512i) -> __m256 { + unsafe { transmute(vcvtqq2ps_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2227,9 +2469,11 @@ pub unsafe fn _mm512_cvtepi64_ps(a: __m512i) -> __m256 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtepi64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { - let b = _mm512_cvtepi64_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) +pub fn _mm512_mask_cvtepi64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = 
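// Illustrative sketch, not part of the patch: the signed 64-bit to single-precision
// conversions above follow the usual plain / `mask_` / `maskz_` triple. A
// hypothetical helper using the merge-masked form, under the same nightly
// assumptions as the earlier sketches:
use core::arch::x86_64::*;

#[target_feature(enable = "avx512dq,avx512vl")]
fn narrow_i64_to_f32(src: __m128, k: __mmask8, a: __m256i) -> __m128 {
    // Elements whose mask bit is clear are copied from `src` instead of being
    // converted from `a`.
    _mm256_mask_cvtepi64_ps(src, k, a)
}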
_mm512_cvtepi64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } } /// Convert packed signed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2240,9 +2484,11 @@ pub unsafe fn _mm512_mask_cvtepi64_ps(src: __m256, k: __mmask8, a: __m512i) -> _ #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 { - let b = _mm512_cvtepi64_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) +pub fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepi64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2260,9 +2506,11 @@ pub unsafe fn _mm512_maskz_cvtepi64_ps(k: __mmask8, a: __m512i) -> __m256 { #[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvt_roundepu64_pd(a: __m512i) -> __m512d { - static_assert_rounding!(ROUNDING); - transmute(vcvtuqq2pd_512(a.as_u64x8(), ROUNDING)) +pub fn _mm512_cvt_roundepu64_pd(a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtuqq2pd_512(a.as_u64x8(), ROUNDING)) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2281,14 +2529,16 @@ pub unsafe fn _mm512_cvt_roundepu64_pd(a: __m512i) -> __m51 #[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvt_roundepu64_pd( +pub fn _mm512_mask_cvt_roundepu64_pd( src: __m512d, k: __mmask8, a: __m512i, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepu64_pd::(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2307,13 +2557,12 @@ pub unsafe fn _mm512_mask_cvt_roundepu64_pd( #[cfg_attr(test, assert_instr(vcvtuqq2pd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvt_roundepu64_pd( - k: __mmask8, - a: __m512i, -) -> __m512d { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepu64_pd::(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) +pub fn _mm512_maskz_cvt_roundepu64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_pd::(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2324,8 +2573,8 @@ pub unsafe fn _mm512_maskz_cvt_roundepu64_pd( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtepu64_pd(a: __m128i) -> __m128d { - transmute(vcvtuqq2pd_128(a.as_u64x2(), _MM_FROUND_CUR_DIRECTION)) +pub fn 
_mm_cvtepu64_pd(a: __m128i) -> __m128d { + unsafe { transmute(vcvtuqq2pd_128(a.as_u64x2(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2337,9 +2586,11 @@ pub unsafe fn _mm_cvtepu64_pd(a: __m128i) -> __m128d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtepu64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { - let b = _mm_cvtepu64_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, src.as_f64x2())) +pub fn _mm_mask_cvtepu64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepu64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, src.as_f64x2())) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2350,9 +2601,11 @@ pub unsafe fn _mm_mask_cvtepu64_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtepu64_pd(k: __mmask8, a: __m128i) -> __m128d { - let b = _mm_cvtepu64_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, b, f64x2::ZERO)) +pub fn _mm_maskz_cvtepu64_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let b = _mm_cvtepu64_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, b, f64x2::ZERO)) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2363,8 +2616,8 @@ pub unsafe fn _mm_maskz_cvtepu64_pd(k: __mmask8, a: __m128i) -> __m128d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtepu64_pd(a: __m256i) -> __m256d { - transmute(vcvtuqq2pd_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm256_cvtepu64_pd(a: __m256i) -> __m256d { + unsafe { transmute(vcvtuqq2pd_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2376,9 +2629,11 @@ pub unsafe fn _mm256_cvtepu64_pd(a: __m256i) -> __m256d { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtepu64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { - let b = _mm256_cvtepu64_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, src.as_f64x4())) +pub fn _mm256_mask_cvtepu64_pd(src: __m256d, k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = _mm256_cvtepu64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, src.as_f64x4())) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2389,9 +2644,11 @@ pub unsafe fn _mm256_mask_cvtepu64_pd(src: __m256d, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtepu64_pd(k: __mmask8, a: __m256i) -> __m256d { - let b = _mm256_cvtepu64_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, b, f64x4::ZERO)) +pub fn _mm256_maskz_cvtepu64_pd(k: __mmask8, a: __m256i) -> __m256d { + unsafe { + let b = 
_mm256_cvtepu64_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, b, f64x4::ZERO)) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2402,8 +2659,8 @@ pub unsafe fn _mm256_maskz_cvtepu64_pd(k: __mmask8, a: __m256i) -> __m256d { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtepu64_pd(a: __m512i) -> __m512d { - transmute(vcvtuqq2pd_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm512_cvtepu64_pd(a: __m512i) -> __m512d { + unsafe { transmute(vcvtuqq2pd_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2415,9 +2672,11 @@ pub unsafe fn _mm512_cvtepu64_pd(a: __m512i) -> __m512d { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtepu64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d { - let b = _mm512_cvtepu64_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, src.as_f64x8())) +pub fn _mm512_mask_cvtepu64_pd(src: __m512d, k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepu64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, src.as_f64x8())) + } } /// Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, @@ -2428,9 +2687,11 @@ pub unsafe fn _mm512_mask_cvtepu64_pd(src: __m512d, k: __mmask8, a: __m512i) -> #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtuqq2pd))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d { - let b = _mm512_cvtepu64_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, b, f64x8::ZERO)) +pub fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d { + unsafe { + let b = _mm512_cvtepu64_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, b, f64x8::ZERO)) + } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2448,9 +2709,11 @@ pub unsafe fn _mm512_maskz_cvtepu64_pd(k: __mmask8, a: __m512i) -> __m512d { #[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvt_roundepu64_ps(a: __m512i) -> __m256 { - static_assert_rounding!(ROUNDING); - transmute(vcvtuqq2ps_512(a.as_u64x8(), ROUNDING)) +pub fn _mm512_cvt_roundepu64_ps(a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtuqq2ps_512(a.as_u64x8(), ROUNDING)) + } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2469,14 +2732,16 @@ pub unsafe fn _mm512_cvt_roundepu64_ps(a: __m512i) -> __m25 #[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvt_roundepu64_ps( +pub fn _mm512_mask_cvt_roundepu64_ps( src: __m256, k: __mmask8, a: __m512i, ) -> __m256 { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepu64_ps::(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let b = 
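// Illustrative sketch, not part of the patch: the `_mm512_cvt_round*` variants above
// take the rounding mode as a const generic. A hypothetical helper selecting
// round-to-nearest with exceptions suppressed (both constants come from
// core::arch::x86_64), nightly intrinsics assumed:
use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512dq")]
fn u64_to_f32_nearest(a: __m512i) -> __m256 {
    // The rounding-mode constant must be a valid combination accepted by
    // static_assert_rounding!, e.g. nearest-even with floating-point exceptions suppressed.
    _mm512_cvt_roundepu64_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
}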
_mm512_cvt_roundepu64_ps::(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2495,13 +2760,12 @@ pub unsafe fn _mm512_mask_cvt_roundepu64_ps( #[cfg_attr(test, assert_instr(vcvtuqq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvt_roundepu64_ps( - k: __mmask8, - a: __m512i, -) -> __m256 { - static_assert_rounding!(ROUNDING); - let b = _mm512_cvt_roundepu64_ps::(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) +pub fn _mm512_maskz_cvt_roundepu64_ps(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let b = _mm512_cvt_roundepu64_ps::(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2512,7 +2776,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepu64_ps( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtepu64_ps(a: __m128i) -> __m128 { +pub fn _mm_cvtepu64_ps(a: __m128i) -> __m128 { _mm_mask_cvtepu64_ps(_mm_undefined_ps(), 0xff, a) } @@ -2525,8 +2789,8 @@ pub unsafe fn _mm_cvtepu64_ps(a: __m128i) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { - transmute(vcvtuqq2ps_128(a.as_u64x2(), src.as_f32x4(), k)) +pub fn _mm_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { transmute(vcvtuqq2ps_128(a.as_u64x2(), src.as_f32x4(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2537,7 +2801,7 @@ pub unsafe fn _mm_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m128i) -> __m1 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtepu64_ps(k: __mmask8, a: __m128i) -> __m128 { +pub fn _mm_maskz_cvtepu64_ps(k: __mmask8, a: __m128i) -> __m128 { _mm_mask_cvtepu64_ps(_mm_setzero_ps(), k, a) } @@ -2549,8 +2813,8 @@ pub unsafe fn _mm_maskz_cvtepu64_ps(k: __mmask8, a: __m128i) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtepu64_ps(a: __m256i) -> __m128 { - transmute(vcvtuqq2ps_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm256_cvtepu64_ps(a: __m256i) -> __m128 { + unsafe { transmute(vcvtuqq2ps_256(a.as_u64x4(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2562,9 +2826,11 @@ pub unsafe fn _mm256_cvtepu64_ps(a: __m256i) -> __m128 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m256i) -> __m128 { - let b = _mm256_cvtepu64_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, b, src.as_f32x4())) +pub fn _mm256_mask_cvtepu64_ps(src: 
__m128, k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepu64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, src.as_f32x4())) + } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2575,9 +2841,11 @@ pub unsafe fn _mm256_mask_cvtepu64_ps(src: __m128, k: __mmask8, a: __m256i) -> _ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtepu64_ps(k: __mmask8, a: __m256i) -> __m128 { - let b = _mm256_cvtepu64_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, b, f32x4::ZERO)) +pub fn _mm256_maskz_cvtepu64_ps(k: __mmask8, a: __m256i) -> __m128 { + unsafe { + let b = _mm256_cvtepu64_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, b, f32x4::ZERO)) + } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2588,8 +2856,8 @@ pub unsafe fn _mm256_maskz_cvtepu64_ps(k: __mmask8, a: __m256i) -> __m128 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtepu64_ps(a: __m512i) -> __m256 { - transmute(vcvtuqq2ps_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm512_cvtepu64_ps(a: __m512i) -> __m256 { + unsafe { transmute(vcvtuqq2ps_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)) } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2601,9 +2869,11 @@ pub unsafe fn _mm512_cvtepu64_ps(a: __m512i) -> __m256 { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtepu64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { - let b = _mm512_cvtepu64_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, src.as_f32x8())) +pub fn _mm512_mask_cvtepu64_ps(src: __m256, k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepu64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, src.as_f32x8())) + } } /// Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, @@ -2614,9 +2884,11 @@ pub unsafe fn _mm512_mask_cvtepu64_ps(src: __m256, k: __mmask8, a: __m512i) -> _ #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtuqq2ps))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 { - let b = _mm512_cvtepu64_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, b, f32x8::ZERO)) +pub fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 { + unsafe { + let b = _mm512_cvtepu64_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, b, f32x8::ZERO)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2634,7 +2906,7 @@ pub unsafe fn _mm512_maskz_cvtepu64_ps(k: __mmask8, a: __m512i) -> __m256 { #[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvt_roundpd_epi64(a: __m512d) -> __m512i { +pub fn _mm512_cvt_roundpd_epi64(a: __m512d) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundpd_epi64::(_mm512_undefined_epi32(), 0xff, a) } @@ -2655,13 +2927,15 @@ pub unsafe 
fn _mm512_cvt_roundpd_epi64(a: __m512d) -> __m51 #[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvt_roundpd_epi64( +pub fn _mm512_mask_cvt_roundpd_epi64( src: __m512i, k: __mmask8, a: __m512d, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, ROUNDING)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2680,10 +2954,7 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epi64( #[cfg_attr(test, assert_instr(vcvtpd2qq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvt_roundpd_epi64( - k: __mmask8, - a: __m512d, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundpd_epi64(k: __mmask8, a: __m512d) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundpd_epi64::(_mm512_setzero_si512(), k, a) } @@ -2696,7 +2967,7 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_epi64( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtpd_epi64(a: __m128d) -> __m128i { +pub fn _mm_cvtpd_epi64(a: __m128d) -> __m128i { _mm_mask_cvtpd_epi64(_mm_undefined_si128(), 0xff, a) } @@ -2709,8 +2980,8 @@ pub unsafe fn _mm_cvtpd_epi64(a: __m128d) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvtpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) +pub fn _mm_mask_cvtpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2721,7 +2992,7 @@ pub unsafe fn _mm_mask_cvtpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtpd_epi64(k: __mmask8, a: __m128d) -> __m128i { +pub fn _mm_maskz_cvtpd_epi64(k: __mmask8, a: __m128d) -> __m128i { _mm_mask_cvtpd_epi64(_mm_setzero_si128(), k, a) } @@ -2733,7 +3004,7 @@ pub unsafe fn _mm_maskz_cvtpd_epi64(k: __mmask8, a: __m128d) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtpd_epi64(a: __m256d) -> __m256i { +pub fn _mm256_cvtpd_epi64(a: __m256d) -> __m256i { _mm256_mask_cvtpd_epi64(_mm256_undefined_si256(), 0xff, a) } @@ -2746,8 +3017,8 @@ pub unsafe fn _mm256_cvtpd_epi64(a: __m256d) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { - transmute(vcvtpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) +pub fn _mm256_mask_cvtpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { 
transmute(vcvtpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2758,7 +3029,7 @@ pub unsafe fn _mm256_mask_cvtpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtpd_epi64(k: __mmask8, a: __m256d) -> __m256i { +pub fn _mm256_maskz_cvtpd_epi64(k: __mmask8, a: __m256d) -> __m256i { _mm256_mask_cvtpd_epi64(_mm256_setzero_si256(), k, a) } @@ -2770,7 +3041,7 @@ pub unsafe fn _mm256_maskz_cvtpd_epi64(k: __mmask8, a: __m256d) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtpd_epi64(a: __m512d) -> __m512i { +pub fn _mm512_cvtpd_epi64(a: __m512d) -> __m512i { _mm512_mask_cvtpd_epi64(_mm512_undefined_epi32(), 0xff, a) } @@ -2783,13 +3054,15 @@ pub unsafe fn _mm512_cvtpd_epi64(a: __m512d) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { - transmute(vcvtpd2qq_512( - a.as_f64x8(), - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { + unsafe { + transmute(vcvtpd2qq_512( + a.as_f64x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2800,7 +3073,7 @@ pub unsafe fn _mm512_mask_cvtpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtpd_epi64(k: __mmask8, a: __m512d) -> __m512i { +pub fn _mm512_maskz_cvtpd_epi64(k: __mmask8, a: __m512d) -> __m512i { _mm512_mask_cvtpd_epi64(_mm512_setzero_si512(), k, a) } @@ -2819,7 +3092,7 @@ pub unsafe fn _mm512_maskz_cvtpd_epi64(k: __mmask8, a: __m512d) -> __m512i { #[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvt_roundps_epi64(a: __m256) -> __m512i { +pub fn _mm512_cvt_roundps_epi64(a: __m256) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundps_epi64::(_mm512_undefined_epi32(), 0xff, a) } @@ -2840,13 +3113,15 @@ pub unsafe fn _mm512_cvt_roundps_epi64(a: __m256) -> __m512 #[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvt_roundps_epi64( +pub fn _mm512_mask_cvt_roundps_epi64( src: __m512i, k: __mmask8, a: __m256, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtps2qq_512(a.as_f32x8(), src.as_i64x8(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtps2qq_512(a.as_f32x8(), src.as_i64x8(), k, ROUNDING)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2865,10 +3140,7 @@ pub unsafe fn _mm512_mask_cvt_roundps_epi64( #[cfg_attr(test, assert_instr(vcvtps2qq, ROUNDING = 8))] 
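// Illustrative sketch, not part of the patch: even though the conversions above are
// now safe `fn`s, they still carry `#[target_feature]`, so calling them from code
// that has not statically enabled AVX-512DQ/VL remains gated. One hypothetical
// pattern is runtime detection plus an `unsafe` call into a feature-enabled helper;
// a nightly toolchain with `#![feature(stdarch_x86_avx512)]` is assumed.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512dq,avx512vl")]
fn doubles_to_i64(a: __m128d) -> __m128i {
    // No `unsafe` needed here: the enclosing function guarantees the features.
    _mm_cvtpd_epi64(a)
}

fn convert_if_supported(a: __m128d) -> Option<__m128i> {
    if is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512vl") {
        // SAFETY: the required CPU features were just detected at runtime.
        Some(unsafe { doubles_to_i64(a) })
    } else {
        None
    }
}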
#[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvt_roundps_epi64( - k: __mmask8, - a: __m256, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundps_epi64(k: __mmask8, a: __m256) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundps_epi64::(_mm512_setzero_si512(), k, a) } @@ -2881,7 +3153,7 @@ pub unsafe fn _mm512_maskz_cvt_roundps_epi64( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtps_epi64(a: __m128) -> __m128i { +pub fn _mm_cvtps_epi64(a: __m128) -> __m128i { _mm_mask_cvtps_epi64(_mm_undefined_si128(), 0xff, a) } @@ -2894,8 +3166,8 @@ pub unsafe fn _mm_cvtps_epi64(a: __m128) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - transmute(vcvtps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) +pub fn _mm_mask_cvtps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2906,7 +3178,7 @@ pub unsafe fn _mm_mask_cvtps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m1 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m128i { +pub fn _mm_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m128i { _mm_mask_cvtps_epi64(_mm_setzero_si128(), k, a) } @@ -2918,7 +3190,7 @@ pub unsafe fn _mm_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtps_epi64(a: __m128) -> __m256i { +pub fn _mm256_cvtps_epi64(a: __m128) -> __m256i { _mm256_mask_cvtps_epi64(_mm256_undefined_si256(), 0xff, a) } @@ -2931,8 +3203,8 @@ pub unsafe fn _mm256_cvtps_epi64(a: __m128) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { - transmute(vcvtps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) +pub fn _mm256_mask_cvtps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvtps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2943,7 +3215,7 @@ pub unsafe fn _mm256_mask_cvtps_epi64(src: __m256i, k: __mmask8, a: __m128) -> _ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m256i { +pub fn _mm256_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m256i { _mm256_mask_cvtps_epi64(_mm256_setzero_si256(), k, a) } @@ -2955,7 +3227,7 @@ pub unsafe fn _mm256_maskz_cvtps_epi64(k: __mmask8, a: __m128) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = 
"stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtps_epi64(a: __m256) -> __m512i { +pub fn _mm512_cvtps_epi64(a: __m256) -> __m512i { _mm512_mask_cvtps_epi64(_mm512_undefined_epi32(), 0xff, a) } @@ -2968,13 +3240,15 @@ pub unsafe fn _mm512_cvtps_epi64(a: __m256) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { - transmute(vcvtps2qq_512( - a.as_f32x8(), - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvtps2qq_512( + a.as_f32x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers, @@ -2985,7 +3259,7 @@ pub unsafe fn _mm512_mask_cvtps_epi64(src: __m512i, k: __mmask8, a: __m256) -> _ #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtps_epi64(k: __mmask8, a: __m256) -> __m512i { +pub fn _mm512_maskz_cvtps_epi64(k: __mmask8, a: __m256) -> __m512i { _mm512_mask_cvtps_epi64(_mm512_setzero_si512(), k, a) } @@ -3004,7 +3278,7 @@ pub unsafe fn _mm512_maskz_cvtps_epi64(k: __mmask8, a: __m256) -> __m512i { #[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvt_roundpd_epu64(a: __m512d) -> __m512i { +pub fn _mm512_cvt_roundpd_epu64(a: __m512d) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundpd_epu64::(_mm512_undefined_epi32(), 0xff, a) } @@ -3025,13 +3299,15 @@ pub unsafe fn _mm512_cvt_roundpd_epu64(a: __m512d) -> __m51 #[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvt_roundpd_epu64( +pub fn _mm512_mask_cvt_roundpd_epu64( src: __m512i, k: __mmask8, a: __m512d, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, ROUNDING)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, @@ -3050,10 +3326,7 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epu64( #[cfg_attr(test, assert_instr(vcvtpd2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvt_roundpd_epu64( - k: __mmask8, - a: __m512d, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundpd_epu64(k: __mmask8, a: __m512d) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundpd_epu64::(_mm512_setzero_si512(), k, a) } @@ -3066,7 +3339,7 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_epu64( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtpd_epu64(a: __m128d) -> __m128i { +pub fn _mm_cvtpd_epu64(a: __m128d) -> __m128i { _mm_mask_cvtpd_epu64(_mm_undefined_si128(), 0xff, a) } @@ -3079,8 +3352,8 @@ pub unsafe fn _mm_cvtpd_epu64(a: __m128d) -> __m128i { #[target_feature(enable 
= "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvtpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) +pub fn _mm_mask_cvtpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, @@ -3091,7 +3364,7 @@ pub unsafe fn _mm_mask_cvtpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvtpd_epu64(k: __mmask8, a: __m128d) -> __m128i { +pub fn _mm_maskz_cvtpd_epu64(k: __mmask8, a: __m128d) -> __m128i { _mm_mask_cvtpd_epu64(_mm_setzero_si128(), k, a) } @@ -3103,7 +3376,7 @@ pub unsafe fn _mm_maskz_cvtpd_epu64(k: __mmask8, a: __m128d) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtpd_epu64(a: __m256d) -> __m256i { +pub fn _mm256_cvtpd_epu64(a: __m256d) -> __m256i { _mm256_mask_cvtpd_epu64(_mm256_undefined_si256(), 0xff, a) } @@ -3116,8 +3389,8 @@ pub unsafe fn _mm256_cvtpd_epu64(a: __m256d) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { - transmute(vcvtpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) +pub fn _mm256_mask_cvtpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvtpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, @@ -3128,7 +3401,7 @@ pub unsafe fn _mm256_mask_cvtpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtpd_epu64(k: __mmask8, a: __m256d) -> __m256i { +pub fn _mm256_maskz_cvtpd_epu64(k: __mmask8, a: __m256d) -> __m256i { _mm256_mask_cvtpd_epu64(_mm256_setzero_si256(), k, a) } @@ -3140,7 +3413,7 @@ pub unsafe fn _mm256_maskz_cvtpd_epu64(k: __mmask8, a: __m256d) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtpd_epu64(a: __m512d) -> __m512i { +pub fn _mm512_cvtpd_epu64(a: __m512d) -> __m512i { _mm512_mask_cvtpd_epu64(_mm512_undefined_epi32(), 0xff, a) } @@ -3153,13 +3426,15 @@ pub unsafe fn _mm512_cvtpd_epu64(a: __m512d) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { - transmute(vcvtpd2uqq_512( - a.as_f64x8(), - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { + unsafe { + transmute(vcvtpd2uqq_512( + a.as_f64x8(), + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, 
+ )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, @@ -3170,7 +3445,7 @@ pub unsafe fn _mm512_mask_cvtpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtpd_epu64(k: __mmask8, a: __m512d) -> __m512i { +pub fn _mm512_maskz_cvtpd_epu64(k: __mmask8, a: __m512d) -> __m512i { _mm512_mask_cvtpd_epu64(_mm512_setzero_si512(), k, a) } @@ -3189,7 +3464,7 @@ pub unsafe fn _mm512_maskz_cvtpd_epu64(k: __mmask8, a: __m512d) -> __m512i { #[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvt_roundps_epu64(a: __m256) -> __m512i { +pub fn _mm512_cvt_roundps_epu64(a: __m256) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundps_epu64::(_mm512_undefined_epi32(), 0xff, a) } @@ -3210,13 +3485,15 @@ pub unsafe fn _mm512_cvt_roundps_epu64(a: __m256) -> __m512 #[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvt_roundps_epu64( +pub fn _mm512_mask_cvt_roundps_epu64( src: __m512i, k: __mmask8, a: __m256, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, ROUNDING)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, @@ -3235,10 +3512,7 @@ pub unsafe fn _mm512_mask_cvt_roundps_epu64( #[cfg_attr(test, assert_instr(vcvtps2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvt_roundps_epu64( - k: __mmask8, - a: __m256, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundps_epu64(k: __mmask8, a: __m256) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundps_epu64::(_mm512_setzero_si512(), k, a) } @@ -3251,7 +3525,7 @@ pub unsafe fn _mm512_maskz_cvt_roundps_epu64( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtps_epu64(a: __m128) -> __m128i { +pub fn _mm_cvtps_epu64(a: __m128) -> __m128i { _mm_mask_cvtps_epu64(_mm_undefined_si128(), 0xff, a) } @@ -3264,8 +3538,8 @@ pub unsafe fn _mm_cvtps_epu64(a: __m128) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvtps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - transmute(vcvtps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) +pub fn _mm_mask_cvtps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, @@ -3276,7 +3550,7 @@ pub unsafe fn _mm_mask_cvtps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m1 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn 
_mm_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m128i { +pub fn _mm_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m128i { _mm_mask_cvtps_epu64(_mm_setzero_si128(), k, a) } @@ -3288,7 +3562,7 @@ pub unsafe fn _mm_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtps_epu64(a: __m128) -> __m256i { +pub fn _mm256_cvtps_epu64(a: __m128) -> __m256i { _mm256_mask_cvtps_epu64(_mm256_undefined_si256(), 0xff, a) } @@ -3301,8 +3575,8 @@ pub unsafe fn _mm256_cvtps_epu64(a: __m128) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvtps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { - transmute(vcvtps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) +pub fn _mm256_mask_cvtps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvtps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, @@ -3313,7 +3587,7 @@ pub unsafe fn _mm256_mask_cvtps_epu64(src: __m256i, k: __mmask8, a: __m128) -> _ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m256i { +pub fn _mm256_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m256i { _mm256_mask_cvtps_epu64(_mm256_setzero_si256(), k, a) } @@ -3325,7 +3599,7 @@ pub unsafe fn _mm256_maskz_cvtps_epu64(k: __mmask8, a: __m128) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtps_epu64(a: __m256) -> __m512i { +pub fn _mm512_cvtps_epu64(a: __m256) -> __m512i { _mm512_mask_cvtps_epu64(_mm512_undefined_epi32(), 0xff, a) } @@ -3338,13 +3612,15 @@ pub unsafe fn _mm512_cvtps_epu64(a: __m256) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { - transmute(vcvtps2uqq_512( - a.as_f32x8(), - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvtps2uqq_512( + a.as_f32x8(), + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, @@ -3355,7 +3631,7 @@ pub unsafe fn _mm512_mask_cvtps_epu64(src: __m512i, k: __mmask8, a: __m256) -> _ #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvtps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtps_epu64(k: __mmask8, a: __m256) -> __m512i { +pub fn _mm512_maskz_cvtps_epu64(k: __mmask8, a: __m256) -> __m512i { _mm512_mask_cvtps_epu64(_mm512_setzero_si512(), k, a) } @@ -3369,7 +3645,7 @@ pub unsafe fn _mm512_maskz_cvtps_epu64(k: __mmask8, a: __m256) -> __m512i { #[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe 
fn _mm512_cvtt_roundpd_epi64(a: __m512d) -> __m512i { +pub fn _mm512_cvtt_roundpd_epi64(a: __m512d) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundpd_epi64::(_mm512_undefined_epi32(), 0xff, a) } @@ -3384,13 +3660,15 @@ pub unsafe fn _mm512_cvtt_roundpd_epi64(a: __m512d) -> __m512i { #[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtt_roundpd_epi64( +pub fn _mm512_mask_cvtt_roundpd_epi64( src: __m512i, k: __mmask8, a: __m512d, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttpd2qq_512(a.as_f64x8(), src.as_i64x8(), k, SAE)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers @@ -3403,7 +3681,7 @@ pub unsafe fn _mm512_mask_cvtt_roundpd_epi64( #[cfg_attr(test, assert_instr(vcvttpd2qq, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtt_roundpd_epi64(k: __mmask8, a: __m512d) -> __m512i { +pub fn _mm512_maskz_cvtt_roundpd_epi64(k: __mmask8, a: __m512d) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundpd_epi64::(_mm512_setzero_si512(), k, a) } @@ -3416,7 +3694,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundpd_epi64(k: __mmask8, a: __ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvttpd_epi64(a: __m128d) -> __m128i { +pub fn _mm_cvttpd_epi64(a: __m128d) -> __m128i { _mm_mask_cvttpd_epi64(_mm_undefined_si128(), 0xff, a) } @@ -3429,8 +3707,8 @@ pub unsafe fn _mm_cvttpd_epi64(a: __m128d) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvttpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvttpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) +pub fn _mm_mask_cvttpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2qq_128(a.as_f64x2(), src.as_i64x2(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers @@ -3442,7 +3720,7 @@ pub unsafe fn _mm_mask_cvttpd_epi64(src: __m128i, k: __mmask8, a: __m128d) -> __ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvttpd_epi64(k: __mmask8, a: __m128d) -> __m128i { +pub fn _mm_maskz_cvttpd_epi64(k: __mmask8, a: __m128d) -> __m128i { _mm_mask_cvttpd_epi64(_mm_setzero_si128(), k, a) } @@ -3454,7 +3732,7 @@ pub unsafe fn _mm_maskz_cvttpd_epi64(k: __mmask8, a: __m128d) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvttpd_epi64(a: __m256d) -> __m256i { +pub fn _mm256_cvttpd_epi64(a: __m256d) -> __m256i { _mm256_mask_cvttpd_epi64(_mm256_undefined_si256(), 0xff, a) } @@ -3467,8 +3745,8 @@ pub unsafe fn _mm256_cvttpd_epi64(a: __m256d) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn 
_mm256_mask_cvttpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { - transmute(vcvttpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) +pub fn _mm256_mask_cvttpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvttpd2qq_256(a.as_f64x4(), src.as_i64x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers @@ -3480,7 +3758,7 @@ pub unsafe fn _mm256_mask_cvttpd_epi64(src: __m256i, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvttpd_epi64(k: __mmask8, a: __m256d) -> __m256i { +pub fn _mm256_maskz_cvttpd_epi64(k: __mmask8, a: __m256d) -> __m256i { _mm256_mask_cvttpd_epi64(_mm256_setzero_si256(), k, a) } @@ -3492,7 +3770,7 @@ pub unsafe fn _mm256_maskz_cvttpd_epi64(k: __mmask8, a: __m256d) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvttpd_epi64(a: __m512d) -> __m512i { +pub fn _mm512_cvttpd_epi64(a: __m512d) -> __m512i { _mm512_mask_cvttpd_epi64(_mm512_undefined_epi32(), 0xff, a) } @@ -3505,13 +3783,15 @@ pub unsafe fn _mm512_cvttpd_epi64(a: __m512d) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvttpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { - transmute(vcvttpd2qq_512( - a.as_f64x8(), - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { + unsafe { + transmute(vcvttpd2qq_512( + a.as_f64x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed signed 64-bit integers @@ -3523,7 +3803,7 @@ pub unsafe fn _mm512_mask_cvttpd_epi64(src: __m512i, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttpd2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvttpd_epi64(k: __mmask8, a: __m512d) -> __m512i { +pub fn _mm512_maskz_cvttpd_epi64(k: __mmask8, a: __m512d) -> __m512i { _mm512_mask_cvttpd_epi64(_mm512_setzero_si512(), k, a) } @@ -3537,7 +3817,7 @@ pub unsafe fn _mm512_maskz_cvttpd_epi64(k: __mmask8, a: __m512d) -> __m512i { #[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtt_roundps_epi64(a: __m256) -> __m512i { +pub fn _mm512_cvtt_roundps_epi64(a: __m256) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundps_epi64::(_mm512_undefined_epi32(), 0xff, a) } @@ -3552,13 +3832,15 @@ pub unsafe fn _mm512_cvtt_roundps_epi64(a: __m256) -> __m512i { #[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtt_roundps_epi64( +pub fn _mm512_mask_cvtt_roundps_epi64( src: __m512i, k: __mmask8, a: __m256, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttps2qq_512(a.as_f32x8(), src.as_i64x8(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttps2qq_512(a.as_f32x8(), src.as_i64x8(), k, SAE)) + } } /// Convert packed 
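// --- Usage sketch (not part of the patch) ---------------------------------
// The `cvtt*` intrinsics truncate toward zero regardless of MXCSR, while the
// plain `cvt*` forms use the current rounding mode. A small hypothetical
// comparison of the two on the same input:
#[target_feature(enable = "avx512dq")]
fn truncate_vs_round(a: __m512d) -> (__m512i, __m512i) {
    let truncated = _mm512_cvttpd_epi64(a); // e.g. 2.9 -> 2, -2.9 -> -2
    let rounded = _mm512_cvtpd_epi64(a); // honours the current MXCSR rounding mode
    (truncated, rounded)
}
// ---------------------------------------------------------------------------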
single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers @@ -3571,7 +3853,7 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epi64( #[cfg_attr(test, assert_instr(vcvttps2qq, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtt_roundps_epi64(k: __mmask8, a: __m256) -> __m512i { +pub fn _mm512_maskz_cvtt_roundps_epi64(k: __mmask8, a: __m256) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundps_epi64::(_mm512_setzero_si512(), k, a) } @@ -3584,7 +3866,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundps_epi64(k: __mmask8, a: __ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvttps_epi64(a: __m128) -> __m128i { +pub fn _mm_cvttps_epi64(a: __m128) -> __m128i { _mm_mask_cvttps_epi64(_mm_undefined_si128(), 0xff, a) } @@ -3597,8 +3879,8 @@ pub unsafe fn _mm_cvttps_epi64(a: __m128) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvttps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - transmute(vcvttps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) +pub fn _mm_mask_cvttps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2qq_128(a.as_f32x4(), src.as_i64x2(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers @@ -3610,7 +3892,7 @@ pub unsafe fn _mm_mask_cvttps_epi64(src: __m128i, k: __mmask8, a: __m128) -> __m #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m128i { +pub fn _mm_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m128i { _mm_mask_cvttps_epi64(_mm_setzero_si128(), k, a) } @@ -3622,7 +3904,7 @@ pub unsafe fn _mm_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvttps_epi64(a: __m128) -> __m256i { +pub fn _mm256_cvttps_epi64(a: __m128) -> __m256i { _mm256_mask_cvttps_epi64(_mm256_undefined_si256(), 0xff, a) } @@ -3635,8 +3917,8 @@ pub unsafe fn _mm256_cvttps_epi64(a: __m128) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvttps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { - transmute(vcvttps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) +pub fn _mm256_mask_cvttps_epi64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvttps2qq_256(a.as_f32x4(), src.as_i64x4(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers @@ -3648,7 +3930,7 @@ pub unsafe fn _mm256_mask_cvttps_epi64(src: __m256i, k: __mmask8, a: __m128) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m256i { +pub fn _mm256_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m256i { 
_mm256_mask_cvttps_epi64(_mm256_setzero_si256(), k, a) } @@ -3660,7 +3942,7 @@ pub unsafe fn _mm256_maskz_cvttps_epi64(k: __mmask8, a: __m128) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvttps_epi64(a: __m256) -> __m512i { +pub fn _mm512_cvttps_epi64(a: __m256) -> __m512i { _mm512_mask_cvttps_epi64(_mm512_undefined_epi32(), 0xff, a) } @@ -3673,13 +3955,15 @@ pub unsafe fn _mm512_cvttps_epi64(a: __m256) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvttps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { - transmute(vcvttps2qq_512( - a.as_f32x8(), - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttps_epi64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvttps2qq_512( + a.as_f32x8(), + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed signed 64-bit integers @@ -3691,7 +3975,7 @@ pub unsafe fn _mm512_mask_cvttps_epi64(src: __m512i, k: __mmask8, a: __m256) -> #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttps2qq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvttps_epi64(k: __mmask8, a: __m256) -> __m512i { +pub fn _mm512_maskz_cvttps_epi64(k: __mmask8, a: __m256) -> __m512i { _mm512_mask_cvttps_epi64(_mm512_setzero_si512(), k, a) } @@ -3705,7 +3989,7 @@ pub unsafe fn _mm512_maskz_cvttps_epi64(k: __mmask8, a: __m256) -> __m512i { #[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtt_roundpd_epu64(a: __m512d) -> __m512i { +pub fn _mm512_cvtt_roundpd_epu64(a: __m512d) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundpd_epu64::(_mm512_undefined_epi32(), 0xff, a) } @@ -3720,13 +4004,15 @@ pub unsafe fn _mm512_cvtt_roundpd_epu64(a: __m512d) -> __m512i { #[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtt_roundpd_epu64( +pub fn _mm512_mask_cvtt_roundpd_epu64( src: __m512i, k: __mmask8, a: __m512d, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttpd2uqq_512(a.as_f64x8(), src.as_u64x8(), k, SAE)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers @@ -3739,7 +4025,7 @@ pub unsafe fn _mm512_mask_cvtt_roundpd_epu64( #[cfg_attr(test, assert_instr(vcvttpd2uqq, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtt_roundpd_epu64(k: __mmask8, a: __m512d) -> __m512i { +pub fn _mm512_maskz_cvtt_roundpd_epu64(k: __mmask8, a: __m512d) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundpd_epu64::(_mm512_setzero_si512(), k, a) } @@ -3752,7 +4038,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundpd_epu64(k: __mmask8, a: __ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn 
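// --- Usage sketch (not part of the patch) ---------------------------------
// The `_round` variants of the truncating conversions only take an SAE
// ("suppress all exceptions") const generic, checked by `static_assert_sae!`:
// the value must be _MM_FROUND_CUR_DIRECTION or _MM_FROUND_NO_EXC. Illustrative:
#[target_feature(enable = "avx512dq")]
fn truncate_no_exceptions(a: __m512d) -> __m512i {
    _mm512_cvtt_roundpd_epu64::<_MM_FROUND_NO_EXC>(a)
}
// ---------------------------------------------------------------------------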
_mm_cvttpd_epu64(a: __m128d) -> __m128i { +pub fn _mm_cvttpd_epu64(a: __m128d) -> __m128i { _mm_mask_cvttpd_epu64(_mm_undefined_si128(), 0xff, a) } @@ -3765,8 +4051,8 @@ pub unsafe fn _mm_cvttpd_epu64(a: __m128d) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvttpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvttpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) +pub fn _mm_mask_cvttpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2uqq_128(a.as_f64x2(), src.as_u64x2(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers @@ -3778,7 +4064,7 @@ pub unsafe fn _mm_mask_cvttpd_epu64(src: __m128i, k: __mmask8, a: __m128d) -> __ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvttpd_epu64(k: __mmask8, a: __m128d) -> __m128i { +pub fn _mm_maskz_cvttpd_epu64(k: __mmask8, a: __m128d) -> __m128i { _mm_mask_cvttpd_epu64(_mm_setzero_si128(), k, a) } @@ -3790,7 +4076,7 @@ pub unsafe fn _mm_maskz_cvttpd_epu64(k: __mmask8, a: __m128d) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvttpd_epu64(a: __m256d) -> __m256i { +pub fn _mm256_cvttpd_epu64(a: __m256d) -> __m256i { _mm256_mask_cvttpd_epu64(_mm256_undefined_si256(), 0xff, a) } @@ -3803,8 +4089,8 @@ pub unsafe fn _mm256_cvttpd_epu64(a: __m256d) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvttpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { - transmute(vcvttpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) +pub fn _mm256_mask_cvttpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> __m256i { + unsafe { transmute(vcvttpd2uqq_256(a.as_f64x4(), src.as_u64x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers @@ -3816,7 +4102,7 @@ pub unsafe fn _mm256_mask_cvttpd_epu64(src: __m256i, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvttpd_epu64(k: __mmask8, a: __m256d) -> __m256i { +pub fn _mm256_maskz_cvttpd_epu64(k: __mmask8, a: __m256d) -> __m256i { _mm256_mask_cvttpd_epu64(_mm256_setzero_si256(), k, a) } @@ -3828,7 +4114,7 @@ pub unsafe fn _mm256_maskz_cvttpd_epu64(k: __mmask8, a: __m256d) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvttpd_epu64(a: __m512d) -> __m512i { +pub fn _mm512_cvttpd_epu64(a: __m512d) -> __m512i { _mm512_mask_cvttpd_epu64(_mm512_undefined_epi32(), 0xff, a) } @@ -3841,13 +4127,15 @@ pub unsafe fn _mm512_cvttpd_epu64(a: __m512d) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvttpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { - 
transmute(vcvttpd2uqq_512( - a.as_f64x8(), - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> __m512i { + unsafe { + transmute(vcvttpd2uqq_512( + a.as_f64x8(), + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers @@ -3859,7 +4147,7 @@ pub unsafe fn _mm512_mask_cvttpd_epu64(src: __m512i, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttpd2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvttpd_epu64(k: __mmask8, a: __m512d) -> __m512i { +pub fn _mm512_maskz_cvttpd_epu64(k: __mmask8, a: __m512d) -> __m512i { _mm512_mask_cvttpd_epu64(_mm512_setzero_si512(), k, a) } @@ -3873,7 +4161,7 @@ pub unsafe fn _mm512_maskz_cvttpd_epu64(k: __mmask8, a: __m512d) -> __m512i { #[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtt_roundps_epu64(a: __m256) -> __m512i { +pub fn _mm512_cvtt_roundps_epu64(a: __m256) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundps_epu64::(_mm512_undefined_epi32(), 0xff, a) } @@ -3888,13 +4176,15 @@ pub unsafe fn _mm512_cvtt_roundps_epu64(a: __m256) -> __m512i { #[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvtt_roundps_epu64( +pub fn _mm512_mask_cvtt_roundps_epu64( src: __m512i, k: __mmask8, a: __m256, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttps2uqq_512(a.as_f32x8(), src.as_u64x8(), k, SAE)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers @@ -3907,7 +4197,7 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epu64( #[cfg_attr(test, assert_instr(vcvttps2uqq, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvtt_roundps_epu64(k: __mmask8, a: __m256) -> __m512i { +pub fn _mm512_maskz_cvtt_roundps_epu64(k: __mmask8, a: __m256) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundps_epu64::(_mm512_setzero_si512(), k, a) } @@ -3920,7 +4210,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundps_epu64(k: __mmask8, a: __ #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvttps_epu64(a: __m128) -> __m128i { +pub fn _mm_cvttps_epu64(a: __m128) -> __m128i { _mm_mask_cvttps_epu64(_mm_undefined_si128(), 0xff, a) } @@ -3933,8 +4223,8 @@ pub unsafe fn _mm_cvttps_epu64(a: __m128) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_cvttps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - transmute(vcvttps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) +pub fn _mm_mask_cvttps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2uqq_128(a.as_f32x4(), src.as_u64x2(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers @@ 
-3946,7 +4236,7 @@ pub unsafe fn _mm_mask_cvttps_epu64(src: __m128i, k: __mmask8, a: __m128) -> __m #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m128i { +pub fn _mm_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m128i { _mm_mask_cvttps_epu64(_mm_setzero_si128(), k, a) } @@ -3958,7 +4248,7 @@ pub unsafe fn _mm_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvttps_epu64(a: __m128) -> __m256i { +pub fn _mm256_cvttps_epu64(a: __m128) -> __m256i { _mm256_mask_cvttps_epu64(_mm256_undefined_si256(), 0xff, a) } @@ -3971,8 +4261,8 @@ pub unsafe fn _mm256_cvttps_epu64(a: __m128) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_cvttps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { - transmute(vcvttps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) +pub fn _mm256_mask_cvttps_epu64(src: __m256i, k: __mmask8, a: __m128) -> __m256i { + unsafe { transmute(vcvttps2uqq_256(a.as_f32x4(), src.as_u64x4(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers @@ -3984,7 +4274,7 @@ pub unsafe fn _mm256_mask_cvttps_epu64(src: __m256i, k: __mmask8, a: __m128) -> #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m256i { +pub fn _mm256_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m256i { _mm256_mask_cvttps_epu64(_mm256_setzero_si256(), k, a) } @@ -3996,7 +4286,7 @@ pub unsafe fn _mm256_maskz_cvttps_epu64(k: __mmask8, a: __m128) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvttps_epu64(a: __m256) -> __m512i { +pub fn _mm512_cvttps_epu64(a: __m256) -> __m512i { _mm512_mask_cvttps_epu64(_mm512_undefined_epi32(), 0xff, a) } @@ -4009,13 +4299,15 @@ pub unsafe fn _mm512_cvttps_epu64(a: __m256) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_cvttps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { - transmute(vcvttps2uqq_512( - a.as_f32x8(), - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttps_epu64(src: __m512i, k: __mmask8, a: __m256) -> __m512i { + unsafe { + transmute(vcvttps2uqq_512( + a.as_f32x8(), + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers @@ -4027,7 +4319,7 @@ pub unsafe fn _mm512_mask_cvttps_epu64(src: __m512i, k: __mmask8, a: __m256) -> #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vcvttps2uqq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_cvttps_epu64(k: __mmask8, a: __m256) -> __m512i { +pub fn _mm512_maskz_cvttps_epu64(k: __mmask8, a: __m256) -> __m512i { 
_mm512_mask_cvttps_epu64(_mm512_setzero_si512(), k, a) } @@ -4041,8 +4333,8 @@ pub unsafe fn _mm512_maskz_cvttps_epu64(k: __mmask8, a: __m256) -> __m512i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mullo_epi64(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_mullo_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) } } /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store @@ -4054,9 +4346,11 @@ pub unsafe fn _mm_mullo_epi64(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_mullo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let b = _mm_mullo_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, b, src.as_i64x2())) +pub fn _mm_mask_mullo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let b = _mm_mullo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, b, src.as_i64x2())) + } } /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store @@ -4068,9 +4362,11 @@ pub unsafe fn _mm_mask_mullo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_mullo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let b = _mm_mullo_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, b, i64x2::ZERO)) +pub fn _mm_maskz_mullo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let b = _mm_mullo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, b, i64x2::ZERO)) + } } /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store @@ -4081,8 +4377,8 @@ pub unsafe fn _mm_maskz_mullo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m1 #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mullo_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_mul(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_mullo_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_mul(a.as_i64x4(), b.as_i64x4())) } } /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store @@ -4094,14 +4390,11 @@ pub unsafe fn _mm256_mullo_epi64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_mullo_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let b = _mm256_mullo_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, b, src.as_i64x4())) +pub fn _mm256_mask_mullo_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let b = _mm256_mullo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, b, src.as_i64x4())) + } } /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store @@ -4113,9 +4406,11 @@ pub 
unsafe fn _mm256_mask_mullo_epi64( #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_mullo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let b = _mm256_mullo_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, b, i64x4::ZERO)) +pub fn _mm256_maskz_mullo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let b = _mm256_mullo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, b, i64x4::ZERO)) + } } /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store @@ -4126,8 +4421,8 @@ pub unsafe fn _mm256_maskz_mullo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mullo_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_mullo_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) } } /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store @@ -4139,14 +4434,11 @@ pub unsafe fn _mm512_mullo_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_mullo_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - let b = _mm512_mullo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, b, src.as_i64x8())) +pub fn _mm512_mask_mullo_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let b = _mm512_mullo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, b, src.as_i64x8())) + } } /// Multiply packed 64-bit integers in `a` and `b`, producing intermediate 128-bit integers, and store @@ -4158,9 +4450,11 @@ pub unsafe fn _mm512_mask_mullo_epi64( #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vpmullq))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_mullo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let b = _mm512_mullo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, b, i64x8::ZERO)) +pub fn _mm512_maskz_mullo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let b = _mm512_mullo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, b, i64x8::ZERO)) + } } // Mask Registers @@ -4171,7 +4465,7 @@ pub unsafe fn _mm512_maskz_mullo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> _ #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _cvtmask8_u32(a: __mmask8) -> u32 { +pub fn _cvtmask8_u32(a: __mmask8) -> u32 { a as u32 } @@ -4181,7 +4475,7 @@ pub unsafe fn _cvtmask8_u32(a: __mmask8) -> u32 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _cvtu32_mask8(a: u32) -> __mmask8 { +pub fn _cvtu32_mask8(a: u32) -> __mmask8 { a as __mmask8 } @@ -4191,7 +4485,7 @@ pub unsafe fn _cvtu32_mask8(a: u32) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kadd_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _kadd_mask16(a: 
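// --- Usage sketch (not part of the patch) ---------------------------------
// `vpmullq`: each pair of 64-bit lanes is multiplied to a 128-bit intermediate
// and only the low 64 bits are kept. The wrapper name is illustrative.
#[target_feature(enable = "avx512dq,avx512vl")]
fn mul_low_64(a: __m256i, b: __m256i) -> __m256i {
    _mm256_mullo_epi64(a, b)
}
// ---------------------------------------------------------------------------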
__mmask16, b: __mmask16) -> __mmask16 { a + b } @@ -4201,7 +4495,7 @@ pub unsafe fn _kadd_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kadd_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { +pub fn _kadd_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { a + b } @@ -4211,7 +4505,7 @@ pub unsafe fn _kadd_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kand_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { +pub fn _kand_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { a & b } @@ -4221,7 +4515,7 @@ pub unsafe fn _kand_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kandn_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { +pub fn _kandn_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { _knot_mask8(a) & b } @@ -4231,7 +4525,7 @@ pub unsafe fn _kandn_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _knot_mask8(a: __mmask8) -> __mmask8 { +pub fn _knot_mask8(a: __mmask8) -> __mmask8 { a ^ 0b11111111 } @@ -4241,7 +4535,7 @@ pub unsafe fn _knot_mask8(a: __mmask8) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { +pub fn _kor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { a | b } @@ -4251,7 +4545,7 @@ pub unsafe fn _kor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxnor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { +pub fn _kxnor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { _knot_mask8(_kxor_mask8(a, b)) } @@ -4261,7 +4555,7 @@ pub unsafe fn _kxnor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kxor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { +pub fn _kxor_mask8(a: __mmask8, b: __mmask8) -> __mmask8 { a ^ b } @@ -4285,7 +4579,7 @@ pub unsafe fn _kortest_mask8_u8(a: __mmask8, b: __mmask8, all_ones: *mut u8) -> #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kortestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { +pub fn _kortestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { (_kor_mask8(a, b) == 0xff) as u8 } @@ -4296,7 +4590,7 @@ pub unsafe fn _kortestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kortestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { +pub fn _kortestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { (_kor_mask8(a, b) == 0) as u8 } @@ -4307,7 +4601,7 @@ pub unsafe fn _kortestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kshiftli_mask8(a: __mmask8) -> __mmask8 { +pub fn _kshiftli_mask8(a: __mmask8) -> __mmask8 { a << COUNT } @@ -4318,7 +4612,7 @@ pub unsafe fn _kshiftli_mask8(a: __mmask8) -> 
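// --- Usage sketch (not part of the patch) ---------------------------------
// The mask-register helpers above are plain integer operations on __mmask8
// (an alias for u8), which is why marking them safe is straightforward. A
// small illustrative combination:
#[target_feature(enable = "avx512dq")]
fn mask_logic(a: __mmask8, b: __mmask8) -> (__mmask8, u8) {
    let not_a_and_b = _kandn_mask8(a, b); // !a & b
    let is_empty = _kortestz_mask8_u8(not_a_and_b, not_a_and_b); // 1 if the OR is all zeros
    (not_a_and_b, is_empty)
}
// ---------------------------------------------------------------------------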
__mmask8 { #[target_feature(enable = "avx512dq")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kshiftri_mask8(a: __mmask8) -> __mmask8 { +pub fn _kshiftri_mask8(a: __mmask8) -> __mmask8 { a >> COUNT } @@ -4355,7 +4649,7 @@ pub unsafe fn _ktest_mask8_u8(a: __mmask8, b: __mmask8, and_not: *mut u8) -> u8 #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _ktestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { +pub fn _ktestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { (_kandn_mask16(a, b) == 0) as u8 } @@ -4366,7 +4660,7 @@ pub unsafe fn _ktestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _ktestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { +pub fn _ktestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { (_kandn_mask8(a, b) == 0) as u8 } @@ -4377,7 +4671,7 @@ pub unsafe fn _ktestc_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _ktestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { +pub fn _ktestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { (_kand_mask16(a, b) == 0) as u8 } @@ -4388,7 +4682,7 @@ pub unsafe fn _ktestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _ktestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { +pub fn _ktestz_mask8_u8(a: __mmask8, b: __mmask8) -> u8 { (_kand_mask8(a, b) == 0) as u8 } @@ -4419,7 +4713,7 @@ pub unsafe fn _store_mask8(mem_addr: *mut __mmask8, a: __mmask8) { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_movepi32_mask(a: __m128i) -> __mmask8 { +pub fn _mm_movepi32_mask(a: __m128i) -> __mmask8 { let zero = _mm_setzero_si128(); _mm_cmplt_epi32_mask(a, zero) } @@ -4431,7 +4725,7 @@ pub unsafe fn _mm_movepi32_mask(a: __m128i) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_movepi32_mask(a: __m256i) -> __mmask8 { +pub fn _mm256_movepi32_mask(a: __m256i) -> __mmask8 { let zero = _mm256_setzero_si256(); _mm256_cmplt_epi32_mask(a, zero) } @@ -4443,7 +4737,7 @@ pub unsafe fn _mm256_movepi32_mask(a: __m256i) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_movepi32_mask(a: __m512i) -> __mmask16 { +pub fn _mm512_movepi32_mask(a: __m512i) -> __mmask16 { let zero = _mm512_setzero_si512(); _mm512_cmplt_epi32_mask(a, zero) } @@ -4455,7 +4749,7 @@ pub unsafe fn _mm512_movepi32_mask(a: __m512i) -> __mmask16 { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_movepi64_mask(a: __m128i) -> __mmask8 { +pub fn _mm_movepi64_mask(a: __m128i) -> __mmask8 { let zero = _mm_setzero_si128(); _mm_cmplt_epi64_mask(a, zero) } @@ -4467,7 +4761,7 @@ pub unsafe fn _mm_movepi64_mask(a: __m128i) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_movepi64_mask(a: __m256i) -> __mmask8 { +pub fn 
_mm256_movepi64_mask(a: __m256i) -> __mmask8 { let zero = _mm256_setzero_si256(); _mm256_cmplt_epi64_mask(a, zero) } @@ -4479,7 +4773,7 @@ pub unsafe fn _mm256_movepi64_mask(a: __m256i) -> __mmask8 { #[inline] #[target_feature(enable = "avx512dq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_movepi64_mask(a: __m512i) -> __mmask8 { +pub fn _mm512_movepi64_mask(a: __m512i) -> __mmask8 { let zero = _mm512_setzero_si512(); _mm512_cmplt_epi64_mask(a, zero) } @@ -4492,7 +4786,7 @@ pub unsafe fn _mm512_movepi64_mask(a: __m512i) -> __mmask8 { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmovm2d))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_movm_epi32(k: __mmask8) -> __m128i { +pub fn _mm_movm_epi32(k: __mmask8) -> __m128i { let ones = _mm_set1_epi32(-1); _mm_maskz_mov_epi32(k, ones) } @@ -4505,7 +4799,7 @@ pub unsafe fn _mm_movm_epi32(k: __mmask8) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmovm2d))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_movm_epi32(k: __mmask8) -> __m256i { +pub fn _mm256_movm_epi32(k: __mmask8) -> __m256i { let ones = _mm256_set1_epi32(-1); _mm256_maskz_mov_epi32(k, ones) } @@ -4518,7 +4812,7 @@ pub unsafe fn _mm256_movm_epi32(k: __mmask8) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vpmovm2d))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_movm_epi32(k: __mmask16) -> __m512i { +pub fn _mm512_movm_epi32(k: __mmask16) -> __m512i { let ones = _mm512_set1_epi32(-1); _mm512_maskz_mov_epi32(k, ones) } @@ -4531,7 +4825,7 @@ pub unsafe fn _mm512_movm_epi32(k: __mmask16) -> __m512i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmovm2q))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_movm_epi64(k: __mmask8) -> __m128i { +pub fn _mm_movm_epi64(k: __mmask8) -> __m128i { let ones = _mm_set1_epi64x(-1); _mm_maskz_mov_epi64(k, ones) } @@ -4544,7 +4838,7 @@ pub unsafe fn _mm_movm_epi64(k: __mmask8) -> __m128i { #[target_feature(enable = "avx512dq,avx512vl")] #[cfg_attr(test, assert_instr(vpmovm2q))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_movm_epi64(k: __mmask8) -> __m256i { +pub fn _mm256_movm_epi64(k: __mmask8) -> __m256i { let ones = _mm256_set1_epi64x(-1); _mm256_maskz_mov_epi64(k, ones) } @@ -4557,7 +4851,7 @@ pub unsafe fn _mm256_movm_epi64(k: __mmask8) -> __m256i { #[target_feature(enable = "avx512dq")] #[cfg_attr(test, assert_instr(vpmovm2q))] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_movm_epi64(k: __mmask8) -> __m512i { +pub fn _mm512_movm_epi64(k: __mmask8) -> __m512i { let ones = _mm512_set1_epi64(-1); _mm512_maskz_mov_epi64(k, ones) } @@ -4578,10 +4872,7 @@ pub unsafe fn _mm512_movm_epi64(k: __mmask8) -> __m512i { #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_range_round_pd( - a: __m512d, - b: __m512d, -) -> __m512d { +pub fn _mm512_range_round_pd(a: __m512d, b: __m512d) -> __m512d { static_assert_uimm_bits!(IMM8, 4); static_assert_sae!(SAE); _mm512_mask_range_round_pd::(_mm512_setzero_pd(), 0xff, a, b) @@ -4602,22 +4893,24 @@ pub unsafe fn _mm512_range_round_pd( #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 
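// --- Usage sketch (not part of the patch) ---------------------------------
// `movm`/`movepi` round trip: expand a mask into all-ones / all-zeros lanes,
// then recover it from the lane sign bits. For a 128-bit vector of 32-bit
// lanes only the low 4 mask bits are meaningful. Wrapper name is illustrative.
#[target_feature(enable = "avx512dq,avx512vl")]
fn mask_roundtrip(k: __mmask8) -> __mmask8 {
    let v = _mm_movm_epi32(k);
    _mm_movepi32_mask(v)
}
// ---------------------------------------------------------------------------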
8))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_range_round_pd( +pub fn _mm512_mask_range_round_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - transmute(vrangepd_512( - a.as_f64x8(), - b.as_f64x8(), - IMM8, - src.as_f64x8(), - k, - SAE, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + transmute(vrangepd_512( + a.as_f64x8(), + b.as_f64x8(), + IMM8, + src.as_f64x8(), + k, + SAE, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed @@ -4635,7 +4928,7 @@ pub unsafe fn _mm512_mask_range_round_pd( #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_range_round_pd( +pub fn _mm512_maskz_range_round_pd( k: __mmask8, a: __m512d, b: __m512d, @@ -4658,7 +4951,7 @@ pub unsafe fn _mm512_maskz_range_round_pd( #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_range_pd(a: __m128d, b: __m128d) -> __m128d { +pub fn _mm_range_pd(a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 4); _mm_mask_range_pd::(_mm_setzero_pd(), 0xff, a, b) } @@ -4677,20 +4970,22 @@ pub unsafe fn _mm_range_pd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_range_pd( +pub fn _mm_mask_range_pd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangepd_128( - a.as_f64x2(), - b.as_f64x2(), - IMM8, - src.as_f64x2(), - k, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangepd_128( + a.as_f64x2(), + b.as_f64x2(), + IMM8, + src.as_f64x2(), + k, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed @@ -4707,7 +5002,7 @@ pub unsafe fn _mm_mask_range_pd( #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_range_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { +pub fn _mm_maskz_range_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 4); _mm_mask_range_pd::(_mm_setzero_pd(), k, a, b) } @@ -4725,7 +5020,7 @@ pub unsafe fn _mm_maskz_range_pd(k: __mmask8, a: __m128d, b: __ #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_range_pd(a: __m256d, b: __m256d) -> __m256d { +pub fn _mm256_range_pd(a: __m256d, b: __m256d) -> __m256d { static_assert_uimm_bits!(IMM8, 4); _mm256_mask_range_pd::(_mm256_setzero_pd(), 0xff, a, b) } @@ -4744,20 +5039,22 @@ pub unsafe fn _mm256_range_pd(a: __m256d, b: __m256d) -> __m256 #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_range_pd( +pub fn _mm256_mask_range_pd( src: __m256d, k: __mmask8, a: __m256d, b: __m256d, ) -> __m256d { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangepd_256( - a.as_f64x4(), 
- b.as_f64x4(), - IMM8, - src.as_f64x4(), - k, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangepd_256( + a.as_f64x4(), + b.as_f64x4(), + IMM8, + src.as_f64x4(), + k, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed @@ -4774,11 +5071,7 @@ pub unsafe fn _mm256_mask_range_pd( #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_range_pd( - k: __mmask8, - a: __m256d, - b: __m256d, -) -> __m256d { +pub fn _mm256_maskz_range_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { static_assert_uimm_bits!(IMM8, 4); _mm256_mask_range_pd::(_mm256_setzero_pd(), k, a, b) } @@ -4796,7 +5089,7 @@ pub unsafe fn _mm256_maskz_range_pd( #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_range_pd(a: __m512d, b: __m512d) -> __m512d { +pub fn _mm512_range_pd(a: __m512d, b: __m512d) -> __m512d { static_assert_uimm_bits!(IMM8, 4); _mm512_mask_range_pd::(_mm512_setzero_pd(), 0xff, a, b) } @@ -4815,21 +5108,23 @@ pub unsafe fn _mm512_range_pd(a: __m512d, b: __m512d) -> __m512 #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_range_pd( +pub fn _mm512_mask_range_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangepd_512( - a.as_f64x8(), - b.as_f64x8(), - IMM8, - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangepd_512( + a.as_f64x8(), + b.as_f64x8(), + IMM8, + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed @@ -4846,11 +5141,7 @@ pub unsafe fn _mm512_mask_range_pd( #[cfg_attr(test, assert_instr(vrangepd, IMM8 = 5))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_range_pd( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { +pub fn _mm512_maskz_range_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { static_assert_uimm_bits!(IMM8, 4); _mm512_mask_range_pd::(_mm512_setzero_pd(), k, a, b) } @@ -4869,10 +5160,7 @@ pub unsafe fn _mm512_maskz_range_pd( #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_range_round_ps( - a: __m512, - b: __m512, -) -> __m512 { +pub fn _mm512_range_round_ps(a: __m512, b: __m512) -> __m512 { static_assert_uimm_bits!(IMM8, 4); static_assert_sae!(SAE); _mm512_mask_range_round_ps::(_mm512_setzero_ps(), 0xffff, a, b) @@ -4892,22 +5180,24 @@ pub unsafe fn _mm512_range_round_ps( #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_range_round_ps( +pub fn _mm512_mask_range_round_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - transmute(vrangeps_512( - a.as_f32x16(), - b.as_f32x16(), - IMM8, - src.as_f32x16(), - k, - SAE, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + 
static_assert_sae!(SAE); + transmute(vrangeps_512( + a.as_f32x16(), + b.as_f32x16(), + IMM8, + src.as_f32x16(), + k, + SAE, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed @@ -4924,7 +5214,7 @@ pub unsafe fn _mm512_mask_range_round_ps( #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_range_round_ps( +pub fn _mm512_maskz_range_round_ps( k: __mmask16, a: __m512, b: __m512, @@ -4947,7 +5237,7 @@ pub unsafe fn _mm512_maskz_range_round_ps( #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_range_ps(a: __m128, b: __m128) -> __m128 { +pub fn _mm_range_ps(a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 4); _mm_mask_range_ps::(_mm_setzero_ps(), 0xff, a, b) } @@ -4966,20 +5256,22 @@ pub unsafe fn _mm_range_ps(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_range_ps( +pub fn _mm_mask_range_ps( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangeps_128( - a.as_f32x4(), - b.as_f32x4(), - IMM8, - src.as_f32x4(), - k, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangeps_128( + a.as_f32x4(), + b.as_f32x4(), + IMM8, + src.as_f32x4(), + k, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed @@ -4996,7 +5288,7 @@ pub unsafe fn _mm_mask_range_ps( #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_range_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { +pub fn _mm_maskz_range_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 4); _mm_mask_range_ps::(_mm_setzero_ps(), k, a, b) } @@ -5014,7 +5306,7 @@ pub unsafe fn _mm_maskz_range_ps(k: __mmask8, a: __m128, b: __m #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_range_ps(a: __m256, b: __m256) -> __m256 { +pub fn _mm256_range_ps(a: __m256, b: __m256) -> __m256 { static_assert_uimm_bits!(IMM8, 4); _mm256_mask_range_ps::(_mm256_setzero_ps(), 0xff, a, b) } @@ -5033,20 +5325,22 @@ pub unsafe fn _mm256_range_ps(a: __m256, b: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_range_ps( +pub fn _mm256_mask_range_ps( src: __m256, k: __mmask8, a: __m256, b: __m256, ) -> __m256 { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangeps_256( - a.as_f32x8(), - b.as_f32x8(), - IMM8, - src.as_f32x8(), - k, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangeps_256( + a.as_f32x8(), + b.as_f32x8(), + IMM8, + src.as_f32x8(), + k, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed @@ -5063,7 +5357,7 @@ pub unsafe fn _mm256_mask_range_ps( #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", 
issue = "111137")] -pub unsafe fn _mm256_maskz_range_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { +pub fn _mm256_maskz_range_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { static_assert_uimm_bits!(IMM8, 4); _mm256_mask_range_ps::(_mm256_setzero_ps(), k, a, b) } @@ -5081,7 +5375,7 @@ pub unsafe fn _mm256_maskz_range_ps(k: __mmask8, a: __m256, b: #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_range_ps(a: __m512, b: __m512) -> __m512 { +pub fn _mm512_range_ps(a: __m512, b: __m512) -> __m512 { static_assert_uimm_bits!(IMM8, 4); _mm512_mask_range_ps::(_mm512_setzero_ps(), 0xffff, a, b) } @@ -5100,21 +5394,23 @@ pub unsafe fn _mm512_range_ps(a: __m512, b: __m512) -> __m512 { #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_range_ps( +pub fn _mm512_mask_range_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangeps_512( - a.as_f32x16(), - b.as_f32x16(), - IMM8, - src.as_f32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangeps_512( + a.as_f32x16(), + b.as_f32x16(), + IMM8, + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed @@ -5131,7 +5427,7 @@ pub unsafe fn _mm512_mask_range_ps( #[cfg_attr(test, assert_instr(vrangeps, IMM8 = 5))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_range_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { +pub fn _mm512_maskz_range_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { static_assert_uimm_bits!(IMM8, 4); _mm512_mask_range_ps::(_mm512_setzero_ps(), k, a, b) } @@ -5151,10 +5447,7 @@ pub unsafe fn _mm512_maskz_range_ps(k: __mmask16, a: __m512, b: #[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_range_round_sd( - a: __m128d, - b: __m128d, -) -> __m128d { +pub fn _mm_range_round_sd(a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 4); static_assert_sae!(SAE); _mm_mask_range_round_sd::(_mm_setzero_pd(), 0xff, a, b) @@ -5176,22 +5469,24 @@ pub unsafe fn _mm_range_round_sd( #[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_range_round_sd( +pub fn _mm_mask_range_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - transmute(vrangesd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - IMM8, - SAE, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + transmute(vrangesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + IMM8, + SAE, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower @@ -5210,7 +5505,7 @@ pub unsafe fn _mm_mask_range_round_sd( #[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_range_round_sd( 
+pub fn _mm_maskz_range_round_sd( k: __mmask8, a: __m128d, b: __m128d, @@ -5235,21 +5530,23 @@ pub unsafe fn _mm_maskz_range_round_sd( #[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_range_sd( +pub fn _mm_mask_range_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangesd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - IMM8, - _MM_FROUND_CUR_DIRECTION, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + IMM8, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower @@ -5267,7 +5564,7 @@ pub unsafe fn _mm_mask_range_sd( #[cfg_attr(test, assert_instr(vrangesd, IMM8 = 5))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_range_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { +pub fn _mm_maskz_range_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 4); _mm_mask_range_sd::(_mm_setzero_pd(), k, a, b) } @@ -5287,7 +5584,7 @@ pub unsafe fn _mm_maskz_range_sd(k: __mmask8, a: __m128d, b: __ #[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_range_round_ss(a: __m128, b: __m128) -> __m128 { +pub fn _mm_range_round_ss(a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 4); static_assert_sae!(SAE); _mm_mask_range_round_ss::(_mm_setzero_ps(), 0xff, a, b) @@ -5309,22 +5606,24 @@ pub unsafe fn _mm_range_round_ss(a: __m128, b: #[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_range_round_ss( +pub fn _mm_mask_range_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 4); - static_assert_sae!(SAE); - transmute(vrangess( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - IMM8, - SAE, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + static_assert_sae!(SAE); + transmute(vrangess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + SAE, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower @@ -5343,7 +5642,7 @@ pub unsafe fn _mm_mask_range_round_ss( #[cfg_attr(test, assert_instr(vrangess, IMM8 = 5, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_range_round_ss( +pub fn _mm_maskz_range_round_ss( k: __mmask8, a: __m128, b: __m128, @@ -5368,21 +5667,23 @@ pub unsafe fn _mm_maskz_range_round_ss( #[cfg_attr(test, assert_instr(vrangess, IMM8 = 5))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_range_ss( +pub fn _mm_mask_range_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 4); - transmute(vrangess( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - IMM8, - _MM_FROUND_CUR_DIRECTION, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 4); + transmute(vrangess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + 
_MM_FROUND_CUR_DIRECTION, + )) + } } /// Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower @@ -5400,7 +5701,7 @@ pub unsafe fn _mm_mask_range_ss( #[cfg_attr(test, assert_instr(vrangess, IMM8 = 5))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_range_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { +pub fn _mm_maskz_range_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 4); _mm_mask_range_ss::(_mm_setzero_ps(), k, a, b) } @@ -5425,7 +5726,7 @@ pub unsafe fn _mm_maskz_range_ss(k: __mmask8, a: __m128, b: __m #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(1, 2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_round_pd(a: __m512d) -> __m512d { +pub fn _mm512_reduce_round_pd(a: __m512d) -> __m512d { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); _mm512_mask_reduce_round_pd::(_mm512_undefined_pd(), 0xff, a) @@ -5450,14 +5751,16 @@ pub unsafe fn _mm512_reduce_round_pd(a: __m512d #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_round_pd( +pub fn _mm512_mask_reduce_round_pd( src: __m512d, k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - transmute(vreducepd_512(a.as_f64x8(), IMM8, src.as_f64x8(), k, SAE)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreducepd_512(a.as_f64x8(), IMM8, src.as_f64x8(), k, SAE)) + } } /// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by @@ -5479,7 +5782,7 @@ pub unsafe fn _mm512_mask_reduce_round_pd( #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_reduce_round_pd( +pub fn _mm512_maskz_reduce_round_pd( k: __mmask8, a: __m512d, ) -> __m512d { @@ -5504,7 +5807,7 @@ pub unsafe fn _mm512_maskz_reduce_round_pd( #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_pd(a: __m128d) -> __m128d { +pub fn _mm_reduce_pd(a: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_pd::(_mm_undefined_pd(), 0xff, a) } @@ -5526,13 +5829,11 @@ pub unsafe fn _mm_reduce_pd(a: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_pd( - src: __m128d, - k: __mmask8, - a: __m128d, -) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducepd_128(a.as_f64x2(), IMM8, src.as_f64x2(), k)) +pub fn _mm_mask_reduce_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_128(a.as_f64x2(), IMM8, src.as_f64x2(), k)) + } } /// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by @@ -5552,7 +5853,7 @@ pub unsafe fn _mm_mask_reduce_pd( #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_reduce_pd(k: 
__mmask8, a: __m128d) -> __m128d { +pub fn _mm_maskz_reduce_pd(k: __mmask8, a: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_pd::(_mm_setzero_pd(), k, a) } @@ -5573,7 +5874,7 @@ pub unsafe fn _mm_maskz_reduce_pd(k: __mmask8, a: __m128d) -> _ #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_pd(a: __m256d) -> __m256d { +pub fn _mm256_reduce_pd(a: __m256d) -> __m256d { static_assert_uimm_bits!(IMM8, 8); _mm256_mask_reduce_pd::(_mm256_undefined_pd(), 0xff, a) } @@ -5595,13 +5896,11 @@ pub unsafe fn _mm256_reduce_pd(a: __m256d) -> __m256d { #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_pd( - src: __m256d, - k: __mmask8, - a: __m256d, -) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducepd_256(a.as_f64x4(), IMM8, src.as_f64x4(), k)) +pub fn _mm256_mask_reduce_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_256(a.as_f64x4(), IMM8, src.as_f64x4(), k)) + } } /// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by @@ -5621,7 +5920,7 @@ pub unsafe fn _mm256_mask_reduce_pd( #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_reduce_pd(k: __mmask8, a: __m256d) -> __m256d { +pub fn _mm256_maskz_reduce_pd(k: __mmask8, a: __m256d) -> __m256d { static_assert_uimm_bits!(IMM8, 8); _mm256_mask_reduce_pd::(_mm256_setzero_pd(), k, a) } @@ -5642,7 +5941,7 @@ pub unsafe fn _mm256_maskz_reduce_pd(k: __mmask8, a: __m256d) - #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_pd(a: __m512d) -> __m512d { +pub fn _mm512_reduce_pd(a: __m512d) -> __m512d { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_reduce_pd::(_mm512_undefined_pd(), 0xff, a) } @@ -5664,19 +5963,17 @@ pub unsafe fn _mm512_reduce_pd(a: __m512d) -> __m512d { #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_pd( - src: __m512d, - k: __mmask8, - a: __m512d, -) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducepd_512( - a.as_f64x8(), - IMM8, - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_reduce_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducepd_512( + a.as_f64x8(), + IMM8, + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by @@ -5696,7 +5993,7 @@ pub unsafe fn _mm512_mask_reduce_pd( #[cfg_attr(test, assert_instr(vreducepd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_reduce_pd(k: __mmask8, a: __m512d) -> __m512d { +pub fn _mm512_maskz_reduce_pd(k: __mmask8, a: __m512d) -> __m512d { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_reduce_pd::(_mm512_setzero_pd(), k, a) } @@ -5719,7 +6016,7 @@ pub unsafe fn _mm512_maskz_reduce_pd(k: 
__mmask8, a: __m512d) - #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(1, 2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_round_ps(a: __m512) -> __m512 { +pub fn _mm512_reduce_round_ps(a: __m512) -> __m512 { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); _mm512_mask_reduce_round_ps::(_mm512_undefined_ps(), 0xffff, a) @@ -5744,14 +6041,16 @@ pub unsafe fn _mm512_reduce_round_ps(a: __m512) #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_round_ps( +pub fn _mm512_mask_reduce_round_ps( src: __m512, k: __mmask16, a: __m512, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - transmute(vreduceps_512(a.as_f32x16(), IMM8, src.as_f32x16(), k, SAE)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreduceps_512(a.as_f32x16(), IMM8, src.as_f32x16(), k, SAE)) + } } /// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by @@ -5773,7 +6072,7 @@ pub unsafe fn _mm512_mask_reduce_round_ps( #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_reduce_round_ps( +pub fn _mm512_maskz_reduce_round_ps( k: __mmask16, a: __m512, ) -> __m512 { @@ -5798,7 +6097,7 @@ pub unsafe fn _mm512_maskz_reduce_round_ps( #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_ps(a: __m128) -> __m128 { +pub fn _mm_reduce_ps(a: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_ps::(_mm_undefined_ps(), 0xff, a) } @@ -5820,9 +6119,11 @@ pub unsafe fn _mm_reduce_ps(a: __m128) -> __m128 { #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreduceps_128(a.as_f32x4(), IMM8, src.as_f32x4(), k)) +pub fn _mm_mask_reduce_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreduceps_128(a.as_f32x4(), IMM8, src.as_f32x4(), k)) + } } /// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by @@ -5842,7 +6143,7 @@ pub unsafe fn _mm_mask_reduce_ps(src: __m128, k: __mmask8, a: _ #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_reduce_ps(k: __mmask8, a: __m128) -> __m128 { +pub fn _mm_maskz_reduce_ps(k: __mmask8, a: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_ps::(_mm_setzero_ps(), k, a) } @@ -5863,7 +6164,7 @@ pub unsafe fn _mm_maskz_reduce_ps(k: __mmask8, a: __m128) -> __ #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_reduce_ps(a: __m256) -> __m256 { +pub fn _mm256_reduce_ps(a: __m256) -> __m256 { static_assert_uimm_bits!(IMM8, 8); _mm256_mask_reduce_ps::(_mm256_undefined_ps(), 0xff, a) } @@ -5885,13 +6186,11 @@ pub unsafe 
fn _mm256_reduce_ps(a: __m256) -> __m256 { #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_reduce_ps( - src: __m256, - k: __mmask8, - a: __m256, -) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreduceps_256(a.as_f32x8(), IMM8, src.as_f32x8(), k)) +pub fn _mm256_mask_reduce_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreduceps_256(a.as_f32x8(), IMM8, src.as_f32x8(), k)) + } } /// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by @@ -5911,7 +6210,7 @@ pub unsafe fn _mm256_mask_reduce_ps( #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_reduce_ps(k: __mmask8, a: __m256) -> __m256 { +pub fn _mm256_maskz_reduce_ps(k: __mmask8, a: __m256) -> __m256 { static_assert_uimm_bits!(IMM8, 8); _mm256_mask_reduce_ps::(_mm256_setzero_ps(), k, a) } @@ -5932,7 +6231,7 @@ pub unsafe fn _mm256_maskz_reduce_ps(k: __mmask8, a: __m256) -> #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_ps(a: __m512) -> __m512 { +pub fn _mm512_reduce_ps(a: __m512) -> __m512 { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_reduce_ps::(_mm512_undefined_ps(), 0xffff, a) } @@ -5954,19 +6253,17 @@ pub unsafe fn _mm512_reduce_ps(a: __m512) -> __m512 { #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_ps( - src: __m512, - k: __mmask16, - a: __m512, -) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreduceps_512( - a.as_f32x16(), - IMM8, - src.as_f32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_reduce_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreduceps_512( + a.as_f32x16(), + IMM8, + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by @@ -5986,7 +6283,7 @@ pub unsafe fn _mm512_mask_reduce_ps( #[cfg_attr(test, assert_instr(vreduceps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_reduce_ps(k: __mmask16, a: __m512) -> __m512 { +pub fn _mm512_maskz_reduce_ps(k: __mmask16, a: __m512) -> __m512 { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_reduce_ps::(_mm512_setzero_ps(), k, a) } @@ -6010,10 +6307,7 @@ pub unsafe fn _mm512_maskz_reduce_ps(k: __mmask16, a: __m512) - #[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_round_sd( - a: __m128d, - b: __m128d, -) -> __m128d { +pub fn _mm_reduce_round_sd(a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); _mm_mask_reduce_round_sd::(_mm_undefined_pd(), 0xff, a, b) @@ -6039,22 +6333,24 @@ pub unsafe fn _mm_reduce_round_sd( #[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub 
unsafe fn _mm_mask_reduce_round_sd( +pub fn _mm_mask_reduce_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - transmute(vreducesd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - IMM8, - SAE, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreducesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + IMM8, + SAE, + )) + } } /// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b @@ -6077,7 +6373,7 @@ pub unsafe fn _mm_mask_reduce_round_sd( #[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_reduce_round_sd( +pub fn _mm_maskz_reduce_round_sd( k: __mmask8, a: __m128d, b: __m128d, @@ -6105,7 +6401,7 @@ pub unsafe fn _mm_maskz_reduce_round_sd( #[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_sd(a: __m128d, b: __m128d) -> __m128d { +pub fn _mm_reduce_sd(a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_sd::(_mm_undefined_pd(), 0xff, a, b) } @@ -6128,21 +6424,23 @@ pub unsafe fn _mm_reduce_sd(a: __m128d, b: __m128d) -> __m128d #[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_sd( +pub fn _mm_mask_reduce_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducesd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - IMM8, - _MM_FROUND_CUR_DIRECTION, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + IMM8, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Extract the reduced argument of the lower double-precision (64-bit) floating-point element in b @@ -6163,7 +6461,7 @@ pub unsafe fn _mm_mask_reduce_sd( #[cfg_attr(test, assert_instr(vreducesd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_reduce_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { +pub fn _mm_maskz_reduce_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_sd::(_mm_setzero_pd(), k, a, b) } @@ -6188,7 +6486,7 @@ pub unsafe fn _mm_maskz_reduce_sd(k: __mmask8, a: __m128d, b: _ #[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_round_ss(a: __m128, b: __m128) -> __m128 { +pub fn _mm_reduce_round_ss(a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); _mm_mask_reduce_round_ss::(_mm_undefined_ps(), 0xff, a, b) @@ -6214,22 +6512,24 @@ pub unsafe fn _mm_reduce_round_ss(a: __m128, b: #[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_round_ss( +pub fn _mm_mask_reduce_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - transmute(vreducess( - 
a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - IMM8, - SAE, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + transmute(vreducess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + SAE, + )) + } } /// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b @@ -6252,7 +6552,7 @@ pub unsafe fn _mm_mask_reduce_round_ss( #[cfg_attr(test, assert_instr(vreducess, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_reduce_round_ss( +pub fn _mm_maskz_reduce_round_ss( k: __mmask8, a: __m128, b: __m128, @@ -6280,7 +6580,7 @@ pub unsafe fn _mm_maskz_reduce_round_ss( #[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_reduce_ss(a: __m128, b: __m128) -> __m128 { +pub fn _mm_reduce_ss(a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_ss::(_mm_undefined_ps(), 0xff, a, b) } @@ -6303,21 +6603,23 @@ pub unsafe fn _mm_reduce_ss(a: __m128, b: __m128) -> __m128 { #[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_reduce_ss( +pub fn _mm_mask_reduce_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vreducess( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - IMM8, - _MM_FROUND_CUR_DIRECTION, - )) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vreducess( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + IMM8, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Extract the reduced argument of the lower single-precision (32-bit) floating-point element in b @@ -6338,7 +6640,7 @@ pub unsafe fn _mm_mask_reduce_ss( #[cfg_attr(test, assert_instr(vreducess, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_maskz_reduce_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { +pub fn _mm_maskz_reduce_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_ss::(_mm_setzero_ps(), k, a, b) } @@ -6364,7 +6666,7 @@ pub unsafe fn _mm_maskz_reduce_ss(k: __mmask8, a: __m128, b: __ #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_fpclass_pd_mask(a: __m128d) -> __mmask8 { +pub fn _mm_fpclass_pd_mask(a: __m128d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); _mm_mask_fpclass_pd_mask::(0xff, a) } @@ -6389,9 +6691,11 @@ pub unsafe fn _mm_fpclass_pd_mask(a: __m128d) -> __mmask8 { #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_fpclass_pd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclasspd_128(a.as_f64x2(), IMM8, k1)) +pub fn _mm_mask_fpclass_pd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclasspd_128(a.as_f64x2(), IMM8, k1)) + } } /// Test packed double-precision (64-bit) floating-point elements in a for special categories specified @@ -6413,7 +6717,7 @@ pub unsafe fn _mm_mask_fpclass_pd_mask(k1: __mmask8, a: __m128d 
#[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_fpclass_pd_mask(a: __m256d) -> __mmask8 { +pub fn _mm256_fpclass_pd_mask(a: __m256d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); _mm256_mask_fpclass_pd_mask::(0xff, a) } @@ -6438,9 +6742,11 @@ pub unsafe fn _mm256_fpclass_pd_mask(a: __m256d) -> __mmask8 { #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_fpclass_pd_mask(k1: __mmask8, a: __m256d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclasspd_256(a.as_f64x4(), IMM8, k1)) +pub fn _mm256_mask_fpclass_pd_mask(k1: __mmask8, a: __m256d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclasspd_256(a.as_f64x4(), IMM8, k1)) + } } /// Test packed double-precision (64-bit) floating-point elements in a for special categories specified @@ -6462,7 +6768,7 @@ pub unsafe fn _mm256_mask_fpclass_pd_mask(k1: __mmask8, a: __m2 #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_fpclass_pd_mask(a: __m512d) -> __mmask8 { +pub fn _mm512_fpclass_pd_mask(a: __m512d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_fpclass_pd_mask::(0xff, a) } @@ -6487,9 +6793,11 @@ pub unsafe fn _mm512_fpclass_pd_mask(a: __m512d) -> __mmask8 { #[cfg_attr(test, assert_instr(vfpclasspd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_fpclass_pd_mask(k1: __mmask8, a: __m512d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclasspd_512(a.as_f64x8(), IMM8, k1)) +pub fn _mm512_mask_fpclass_pd_mask(k1: __mmask8, a: __m512d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclasspd_512(a.as_f64x8(), IMM8, k1)) + } } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6511,7 +6819,7 @@ pub unsafe fn _mm512_mask_fpclass_pd_mask(k1: __mmask8, a: __m5 #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_fpclass_ps_mask(a: __m128) -> __mmask8 { +pub fn _mm_fpclass_ps_mask(a: __m128) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); _mm_mask_fpclass_ps_mask::(0xff, a) } @@ -6536,9 +6844,11 @@ pub unsafe fn _mm_fpclass_ps_mask(a: __m128) -> __mmask8 { #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_fpclass_ps_mask(k1: __mmask8, a: __m128) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclassps_128(a.as_f32x4(), IMM8, k1)) +pub fn _mm_mask_fpclass_ps_mask(k1: __mmask8, a: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclassps_128(a.as_f32x4(), IMM8, k1)) + } } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6560,7 +6870,7 @@ pub unsafe fn _mm_mask_fpclass_ps_mask(k1: __mmask8, a: __m128) #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_fpclass_ps_mask(a: 
__m256) -> __mmask8 { +pub fn _mm256_fpclass_ps_mask(a: __m256) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); _mm256_mask_fpclass_ps_mask::(0xff, a) } @@ -6585,9 +6895,11 @@ pub unsafe fn _mm256_fpclass_ps_mask(a: __m256) -> __mmask8 { #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_fpclass_ps_mask(k1: __mmask8, a: __m256) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclassps_256(a.as_f32x8(), IMM8, k1)) +pub fn _mm256_mask_fpclass_ps_mask(k1: __mmask8, a: __m256) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclassps_256(a.as_f32x8(), IMM8, k1)) + } } /// Test packed single-precision (32-bit) floating-point elements in a for special categories specified @@ -6609,7 +6921,7 @@ pub unsafe fn _mm256_mask_fpclass_ps_mask(k1: __mmask8, a: __m2 #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_fpclass_ps_mask(a: __m512) -> __mmask16 { +pub fn _mm512_fpclass_ps_mask(a: __m512) -> __mmask16 { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_fpclass_ps_mask::(0xffff, a) } @@ -6634,9 +6946,11 @@ pub unsafe fn _mm512_fpclass_ps_mask(a: __m512) -> __mmask16 { #[cfg_attr(test, assert_instr(vfpclassps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_fpclass_ps_mask(k1: __mmask16, a: __m512) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 8); - transmute(vfpclassps_512(a.as_f32x16(), IMM8, k1)) +pub fn _mm512_mask_fpclass_ps_mask(k1: __mmask16, a: __m512) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(vfpclassps_512(a.as_f32x16(), IMM8, k1)) + } } /// Test the lower double-precision (64-bit) floating-point element in a for special categories specified @@ -6658,7 +6972,7 @@ pub unsafe fn _mm512_mask_fpclass_ps_mask(k1: __mmask16, a: __m #[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_fpclass_sd_mask(a: __m128d) -> __mmask8 { +pub fn _mm_fpclass_sd_mask(a: __m128d) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); _mm_mask_fpclass_sd_mask::(0xff, a) } @@ -6683,9 +6997,11 @@ pub unsafe fn _mm_fpclass_sd_mask(a: __m128d) -> __mmask8 { #[cfg_attr(test, assert_instr(vfpclasssd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_fpclass_sd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - vfpclasssd(a.as_f64x2(), IMM8, k1) +pub fn _mm_mask_fpclass_sd_mask(k1: __mmask8, a: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vfpclasssd(a.as_f64x2(), IMM8, k1) + } } /// Test the lower single-precision (32-bit) floating-point element in a for special categories specified @@ -6707,7 +7023,7 @@ pub unsafe fn _mm_mask_fpclass_sd_mask(k1: __mmask8, a: __m128d #[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_fpclass_ss_mask(a: __m128) -> __mmask8 { +pub fn _mm_fpclass_ss_mask(a: __m128) -> __mmask8 { static_assert_uimm_bits!(IMM8, 8); _mm_mask_fpclass_ss_mask::(0xff, a) } @@ -6732,9 +7048,11 @@ pub unsafe fn _mm_fpclass_ss_mask(a: __m128) -> 
__mmask8 { #[cfg_attr(test, assert_instr(vfpclassss, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_mask_fpclass_ss_mask(k1: __mmask8, a: __m128) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - vfpclassss(a.as_f32x4(), IMM8, k1) +pub fn _mm_mask_fpclass_ss_mask(k1: __mmask8, a: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vfpclassss(a.as_f32x4(), IMM8, k1) + } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7b084a3ee7..d751b44119 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -17,10 +17,12 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsd))] -pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i { - let a = a.as_i32x16(); - let r = simd_select::(simd_lt(a, i32x16::ZERO), simd_neg(a), a); - transmute(r) +pub fn _mm512_abs_epi32(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let r = simd_select::(simd_lt(a, i32x16::ZERO), simd_neg(a), a); + transmute(r) + } } /// Computes the absolute value of packed 32-bit integers in `a`, and store the @@ -32,9 +34,11 @@ pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsd))] -pub unsafe fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - let abs = _mm512_abs_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, abs, src.as_i32x16())) +pub fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, abs, src.as_i32x16())) + } } /// Computes the absolute value of packed 32-bit integers in `a`, and store the @@ -46,9 +50,11 @@ pub unsafe fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsd))] -pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { - let abs = _mm512_abs_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, abs, i32x16::ZERO)) +pub fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, abs, i32x16::ZERO)) + } } /// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
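For readers skimming the masked `abs` conversions that follow, the writemask ("mask") variants merge into `src` rather than zeroing. Below is a minimal scalar sketch of that documented behaviour, for illustration only; the function name and layout are ours, not part of stdarch.

```rust
/// Scalar model of the _mm512_mask_abs_epi32 semantics documented above:
/// lane i takes |a[i]| when bit i of k is set, otherwise it is copied from src.
fn mask_abs_epi32_model(src: [i32; 16], k: u16, a: [i32; 16]) -> [i32; 16] {
    let mut dst = [0i32; 16];
    for i in 0..16 {
        dst[i] = if ((k >> i) & 1) != 0 {
            // vpabsd is a two's-complement abs, so i32::MIN maps to itself.
            a[i].wrapping_abs()
        } else {
            src[i]
        };
    }
    dst
}
```

The `maskz` variants behave the same way except that masked-off lanes become 0 instead of `src[i]`.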
@@ -58,9 +64,11 @@ pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsd))] -pub unsafe fn _mm256_mask_abs_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - let abs = _mm256_abs_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, abs, src.as_i32x8())) +pub fn _mm256_mask_abs_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, abs, src.as_i32x8())) + } } /// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -70,9 +78,11 @@ pub unsafe fn _mm256_mask_abs_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsd))] -pub unsafe fn _mm256_maskz_abs_epi32(k: __mmask8, a: __m256i) -> __m256i { - let abs = _mm256_abs_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, abs, i32x8::ZERO)) +pub fn _mm256_maskz_abs_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, abs, i32x8::ZERO)) + } } /// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -82,9 +92,11 @@ pub unsafe fn _mm256_maskz_abs_epi32(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsd))] -pub unsafe fn _mm_mask_abs_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let abs = _mm_abs_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, abs, src.as_i32x4())) +pub fn _mm_mask_abs_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, abs, src.as_i32x4())) + } } /// Compute the absolute value of packed signed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -94,9 +106,11 @@ pub unsafe fn _mm_mask_abs_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsd))] -pub unsafe fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i { - let abs = _mm_abs_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, abs, i32x4::ZERO)) +pub fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, abs, i32x4::ZERO)) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. 
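The 128-bit forms take the same `__mmask8` type even though only four lanes exist; the upper mask bits simply do not participate. A hedged scalar sketch of the zero-masked case (ours, not stdarch code):

```rust
/// Scalar model of _mm_maskz_abs_epi32-style zero-masking: four lanes,
/// so only bits 0..4 of k are consulted; masked-off lanes become 0.
fn maskz_abs_epi32_model(k: u8, a: [i32; 4]) -> [i32; 4] {
    let mut dst = [0i32; 4];
    for i in 0..4 {
        if ((k >> i) & 1) != 0 {
            dst[i] = a[i].wrapping_abs();
        } // else: lane stays zero
    }
    dst
}
```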
@@ -106,10 +120,12 @@ pub unsafe fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i { - let a = a.as_i64x8(); - let r = simd_select::(simd_lt(a, i64x8::ZERO), simd_neg(a), a); - transmute(r) +pub fn _mm512_abs_epi64(a: __m512i) -> __m512i { + unsafe { + let a = a.as_i64x8(); + let r = simd_select::(simd_lt(a, i64x8::ZERO), simd_neg(a), a); + transmute(r) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -119,9 +135,11 @@ pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - let abs = _mm512_abs_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, abs, src.as_i64x8())) +pub fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, abs, src.as_i64x8())) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -131,9 +149,11 @@ pub unsafe fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i { - let abs = _mm512_abs_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, abs, i64x8::ZERO)) +pub fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let abs = _mm512_abs_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, abs, i64x8::ZERO)) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. @@ -143,10 +163,12 @@ pub unsafe fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm256_abs_epi64(a: __m256i) -> __m256i { - let a = a.as_i64x4(); - let r = simd_select::(simd_lt(a, i64x4::ZERO), simd_neg(a), a); - transmute(r) +pub fn _mm256_abs_epi64(a: __m256i) -> __m256i { + unsafe { + let a = a.as_i64x4(); + let r = simd_select::(simd_lt(a, i64x4::ZERO), simd_neg(a), a); + transmute(r) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
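The 64-bit element versions follow the same pattern; with eight 64-bit lanes in a 512-bit vector the full `__mmask8` is consumed, and `vpabsq` is again a two's-complement abs, so `i64::MIN` is unchanged. A small scalar sketch, assuming nothing beyond the documentation above:

```rust
/// Scalar model of _mm512_mask_abs_epi64 semantics: eight i64 lanes, one mask bit each.
fn mask_abs_epi64_model(src: [i64; 8], k: u8, a: [i64; 8]) -> [i64; 8] {
    core::array::from_fn(|i| {
        if ((k >> i) & 1) != 0 {
            a[i].wrapping_abs() // i64::MIN stays i64::MIN, matching vpabsq
        } else {
            src[i]
        }
    })
}
```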
@@ -156,9 +178,11 @@ pub unsafe fn _mm256_abs_epi64(a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - let abs = _mm256_abs_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, abs, src.as_i64x4())) +pub fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, abs, src.as_i64x4())) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -168,9 +192,11 @@ pub unsafe fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i { - let abs = _mm256_abs_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, abs, i64x4::ZERO)) +pub fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let abs = _mm256_abs_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, abs, i64x4::ZERO)) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst. @@ -180,10 +206,12 @@ pub unsafe fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm_abs_epi64(a: __m128i) -> __m128i { - let a = a.as_i64x2(); - let r = simd_select::(simd_lt(a, i64x2::ZERO), simd_neg(a), a); - transmute(r) +pub fn _mm_abs_epi64(a: __m128i) -> __m128i { + unsafe { + let a = a.as_i64x2(); + let r = simd_select::(simd_lt(a, i64x2::ZERO), simd_neg(a), a); + transmute(r) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -193,9 +221,11 @@ pub unsafe fn _mm_abs_epi64(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm_mask_abs_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let abs = _mm_abs_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, abs, src.as_i64x2())) +pub fn _mm_mask_abs_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, abs, src.as_i64x2())) + } } /// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
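At 128-bit width there are only two 64-bit lanes, so only bits 0 and 1 of the mask matter. For completeness, a scalar model of the zero-masked form (illustrative only):

```rust
/// Scalar model of _mm_maskz_abs_epi64 semantics: two i64 lanes, bits 0..2 of k used.
/// e.g. maskz_abs_epi64_model(0b01, [-3, -4]) == [3, 0]
fn maskz_abs_epi64_model(k: u8, a: [i64; 2]) -> [i64; 2] {
    core::array::from_fn(|i| if ((k >> i) & 1) != 0 { a[i].wrapping_abs() } else { 0 })
}
```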
@@ -205,9 +235,11 @@ pub unsafe fn _mm_mask_abs_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpabsq))] -pub unsafe fn _mm_maskz_abs_epi64(k: __mmask8, a: __m128i) -> __m128i { - let abs = _mm_abs_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, abs, i64x2::ZERO)) +pub fn _mm_maskz_abs_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let abs = _mm_abs_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, abs, i64x2::ZERO)) + } } /// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst. @@ -217,8 +249,8 @@ pub unsafe fn _mm_maskz_abs_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 { - simd_fabs(v2) +pub fn _mm512_abs_ps(v2: __m512) -> __m512 { + unsafe { simd_fabs(v2) } } /// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -228,8 +260,8 @@ pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 { - simd_select_bitmask(k, simd_fabs(v2), src) +pub fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fabs(v2), src) } } /// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst. @@ -239,8 +271,8 @@ pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d { - simd_fabs(v2) +pub fn _mm512_abs_pd(v2: __m512d) -> __m512d { + unsafe { simd_fabs(v2) } } /// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -250,8 +282,8 @@ pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d { - simd_select_bitmask(k, simd_fabs(v2), src) +pub fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fabs(v2), src) } } /// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
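
With the signatures now safe, a caller that enables the feature statically can use these intrinsics without writing any unsafe of its own. A minimal sketch, assuming a nightly toolchain with the unstable stdarch_x86_avx512 library feature and support for safe #[target_feature] functions (target_feature_11); the intrinsic name is real, the wrapper below is hypothetical:

    #![feature(stdarch_x86_avx512)]
    use core::arch::x86_64::*;

    // Hypothetical wrapper: because it enables avx512f itself, it may call the
    // now-safe #[target_feature] intrinsic without an `unsafe` block.
    #[target_feature(enable = "avx512f")]
    fn abs_lanes(v: __m512i) -> __m512i {
        _mm512_abs_epi64(v)
    }
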
@@ -261,9 +293,11 @@ pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] -pub unsafe fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - let mov = a.as_i32x16(); - transmute(simd_select_bitmask(k, mov, src.as_i32x16())) +pub fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i32x16(); + transmute(simd_select_bitmask(k, mov, src.as_i32x16())) + } } /// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -273,9 +307,11 @@ pub unsafe fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] -pub unsafe fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i { - let mov = a.as_i32x16(); - transmute(simd_select_bitmask(k, mov, i32x16::ZERO)) +pub fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i32x16(); + transmute(simd_select_bitmask(k, mov, i32x16::ZERO)) + } } /// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -285,9 +321,11 @@ pub unsafe fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] -pub unsafe fn _mm256_mask_mov_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - let mov = a.as_i32x8(); - transmute(simd_select_bitmask(k, mov, src.as_i32x8())) +pub fn _mm256_mask_mov_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i32x8(); + transmute(simd_select_bitmask(k, mov, src.as_i32x8())) + } } /// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -297,9 +335,11 @@ pub unsafe fn _mm256_mask_mov_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] -pub unsafe fn _mm256_maskz_mov_epi32(k: __mmask8, a: __m256i) -> __m256i { - let mov = a.as_i32x8(); - transmute(simd_select_bitmask(k, mov, i32x8::ZERO)) +pub fn _mm256_maskz_mov_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i32x8(); + transmute(simd_select_bitmask(k, mov, i32x8::ZERO)) + } } /// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -309,9 +349,11 @@ pub unsafe fn _mm256_maskz_mov_epi32(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] -pub unsafe fn _mm_mask_mov_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let mov = a.as_i32x4(); - transmute(simd_select_bitmask(k, mov, src.as_i32x4())) +pub fn _mm_mask_mov_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i32x4(); + transmute(simd_select_bitmask(k, mov, src.as_i32x4())) + } } /// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -321,9 +363,11 @@ pub unsafe fn _mm_mask_mov_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] -pub unsafe fn _mm_maskz_mov_epi32(k: __mmask8, a: __m128i) -> __m128i { - let mov = a.as_i32x4(); - transmute(simd_select_bitmask(k, mov, i32x4::ZERO)) +pub fn _mm_maskz_mov_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i32x4(); + transmute(simd_select_bitmask(k, mov, i32x4::ZERO)) + } } /// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -333,9 +377,11 @@ pub unsafe fn _mm_maskz_mov_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] -pub unsafe fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - let mov = a.as_i64x8(); - transmute(simd_select_bitmask(k, mov, src.as_i64x8())) +pub fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i64x8(); + transmute(simd_select_bitmask(k, mov, src.as_i64x8())) + } } /// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -345,9 +391,11 @@ pub unsafe fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] -pub unsafe fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i { - let mov = a.as_i64x8(); - transmute(simd_select_bitmask(k, mov, i64x8::ZERO)) +pub fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + let mov = a.as_i64x8(); + transmute(simd_select_bitmask(k, mov, i64x8::ZERO)) + } } /// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -357,9 +405,11 @@ pub unsafe fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] -pub unsafe fn _mm256_mask_mov_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - let mov = a.as_i64x4(); - transmute(simd_select_bitmask(k, mov, src.as_i64x4())) +pub fn _mm256_mask_mov_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i64x4(); + transmute(simd_select_bitmask(k, mov, src.as_i64x4())) + } } /// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -369,9 +419,11 @@ pub unsafe fn _mm256_mask_mov_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] -pub unsafe fn _mm256_maskz_mov_epi64(k: __mmask8, a: __m256i) -> __m256i { - let mov = a.as_i64x4(); - transmute(simd_select_bitmask(k, mov, i64x4::ZERO)) +pub fn _mm256_maskz_mov_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let mov = a.as_i64x4(); + transmute(simd_select_bitmask(k, mov, i64x4::ZERO)) + } } /// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -381,9 +433,11 @@ pub unsafe fn _mm256_maskz_mov_epi64(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] -pub unsafe fn _mm_mask_mov_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let mov = a.as_i64x2(); - transmute(simd_select_bitmask(k, mov, src.as_i64x2())) +pub fn _mm_mask_mov_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i64x2(); + transmute(simd_select_bitmask(k, mov, src.as_i64x2())) + } } /// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -393,9 +447,11 @@ pub unsafe fn _mm_mask_mov_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] -pub unsafe fn _mm_maskz_mov_epi64(k: __mmask8, a: __m128i) -> __m128i { - let mov = a.as_i64x2(); - transmute(simd_select_bitmask(k, mov, i64x2::ZERO)) +pub fn _mm_maskz_mov_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let mov = a.as_i64x2(); + transmute(simd_select_bitmask(k, mov, i64x2::ZERO)) + } } /// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -405,9 +461,11 @@ pub unsafe fn _mm_maskz_mov_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] -pub unsafe fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - let mov = a.as_f32x16(); - transmute(simd_select_bitmask(k, mov, src.as_f32x16())) +pub fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov = a.as_f32x16(); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } } /// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -417,9 +475,11 @@ pub unsafe fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] -pub unsafe fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 { - let mov = a.as_f32x16(); - transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) +pub fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov = a.as_f32x16(); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } } /// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -429,9 +489,11 @@ pub unsafe fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] -pub unsafe fn _mm256_mask_mov_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - let mov = a.as_f32x8(); - transmute(simd_select_bitmask(k, mov, src.as_f32x8())) +pub fn _mm256_mask_mov_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = a.as_f32x8(); + transmute(simd_select_bitmask(k, mov, src.as_f32x8())) + } } /// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -441,9 +503,11 @@ pub unsafe fn _mm256_mask_mov_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] -pub unsafe fn _mm256_maskz_mov_ps(k: __mmask8, a: __m256) -> __m256 { - let mov = a.as_f32x8(); - transmute(simd_select_bitmask(k, mov, f32x8::ZERO)) +pub fn _mm256_maskz_mov_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = a.as_f32x8(); + transmute(simd_select_bitmask(k, mov, f32x8::ZERO)) + } } /// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -453,9 +517,11 @@ pub unsafe fn _mm256_maskz_mov_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] -pub unsafe fn _mm_mask_mov_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - let mov = a.as_f32x4(); - transmute(simd_select_bitmask(k, mov, src.as_f32x4())) +pub fn _mm_mask_mov_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = a.as_f32x4(); + transmute(simd_select_bitmask(k, mov, src.as_f32x4())) + } } /// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -465,9 +531,11 @@ pub unsafe fn _mm_mask_mov_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] -pub unsafe fn _mm_maskz_mov_ps(k: __mmask8, a: __m128) -> __m128 { - let mov = a.as_f32x4(); - transmute(simd_select_bitmask(k, mov, f32x4::ZERO)) +pub fn _mm_maskz_mov_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = a.as_f32x4(); + transmute(simd_select_bitmask(k, mov, f32x4::ZERO)) + } } /// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -477,9 +545,11 @@ pub unsafe fn _mm_maskz_mov_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] -pub unsafe fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - let mov = a.as_f64x8(); - transmute(simd_select_bitmask(k, mov, src.as_f64x8())) +pub fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov = a.as_f64x8(); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) + } } /// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -489,9 +559,11 @@ pub unsafe fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] -pub unsafe fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d { - let mov = a.as_f64x8(); - transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) +pub fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov = a.as_f64x8(); + transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) + } } /// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -501,9 +573,11 @@ pub unsafe fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] -pub unsafe fn _mm256_mask_mov_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - let mov = a.as_f64x4(); - transmute(simd_select_bitmask(k, mov, src.as_f64x4())) +pub fn _mm256_mask_mov_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = a.as_f64x4(); + transmute(simd_select_bitmask(k, mov, src.as_f64x4())) + } } /// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -513,9 +587,11 @@ pub unsafe fn _mm256_mask_mov_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] -pub unsafe fn _mm256_maskz_mov_pd(k: __mmask8, a: __m256d) -> __m256d { - let mov = a.as_f64x4(); - transmute(simd_select_bitmask(k, mov, f64x4::ZERO)) +pub fn _mm256_maskz_mov_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = a.as_f64x4(); + transmute(simd_select_bitmask(k, mov, f64x4::ZERO)) + } } /// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -525,9 +601,11 @@ pub unsafe fn _mm256_maskz_mov_pd(k: __mmask8, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] -pub unsafe fn _mm_mask_mov_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - let mov = a.as_f64x2(); - transmute(simd_select_bitmask(k, mov, src.as_f64x2())) +pub fn _mm_mask_mov_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = a.as_f64x2(); + transmute(simd_select_bitmask(k, mov, src.as_f64x2())) + } } /// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -537,9 +615,11 @@ pub unsafe fn _mm_mask_mov_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] -pub unsafe fn _mm_maskz_mov_pd(k: __mmask8, a: __m128d) -> __m128d { - let mov = a.as_f64x2(); - transmute(simd_select_bitmask(k, mov, f64x2::ZERO)) +pub fn _mm_maskz_mov_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = a.as_f64x2(); + transmute(simd_select_bitmask(k, mov, f64x2::ZERO)) + } } /// Add packed 32-bit integers in a and b, and store the results in dst. @@ -549,8 +629,8 @@ pub unsafe fn _mm_maskz_mov_pd(k: __mmask8, a: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddd))] -pub unsafe fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_add(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i32x16(), b.as_i32x16())) } } /// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
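
Every mask_*/maskz_* pair in this file follows the same shape: compute the full-width result, then select per lane on the mask bits, falling back to src (writemask) or to zero (zeromask). An illustrative scalar model of that selection, not part of the patch:

    // Scalar model of simd_select_bitmask for one lane i (illustrative only).
    fn writemask_lane(k: u16, i: usize, computed: i32, src: i32) -> i32 {
        if (k >> i) & 1 == 1 { computed } else { src }
    }

    fn zeromask_lane(k: u16, i: usize, computed: i32) -> i32 {
        if (k >> i) & 1 == 1 { computed } else { 0 }
    }

    fn main() {
        assert_eq!(writemask_lane(0b01, 0, 7, -1), 7);  // bit set: take the result
        assert_eq!(writemask_lane(0b01, 1, 7, -1), -1); // bit clear: keep src
        assert_eq!(zeromask_lane(0b01, 1, 7), 0);       // bit clear: zero the lane
    }
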
@@ -560,9 +640,11 @@ pub unsafe fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddd))] -pub unsafe fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_add_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, add, src.as_i32x16())) +pub fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, add, src.as_i32x16())) + } } /// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -572,9 +654,11 @@ pub unsafe fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddd))] -pub unsafe fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_add_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, add, i32x16::ZERO)) +pub fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, add, i32x16::ZERO)) + } } /// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -584,9 +668,11 @@ pub unsafe fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddd))] -pub unsafe fn _mm256_mask_add_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_add_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, add, src.as_i32x8())) +pub fn _mm256_mask_add_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, add, src.as_i32x8())) + } } /// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -596,9 +682,11 @@ pub unsafe fn _mm256_mask_add_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddd))] -pub unsafe fn _mm256_maskz_add_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_add_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, add, i32x8::ZERO)) +pub fn _mm256_maskz_add_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, add, i32x8::ZERO)) + } } /// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -608,9 +696,11 @@ pub unsafe fn _mm256_maskz_add_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddd))] -pub unsafe fn _mm_mask_add_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_add_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, add, src.as_i32x4())) +pub fn _mm_mask_add_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, add, src.as_i32x4())) + } } /// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -620,9 +710,11 @@ pub unsafe fn _mm_mask_add_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddd))] -pub unsafe fn _mm_maskz_add_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_add_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, add, i32x4::ZERO)) +pub fn _mm_maskz_add_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, add, i32x4::ZERO)) + } } /// Add packed 64-bit integers in a and b, and store the results in dst. @@ -632,8 +724,8 @@ pub unsafe fn _mm_maskz_add_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddq))] -pub unsafe fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_add(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_add(a.as_i64x8(), b.as_i64x8())) } } /// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -643,9 +735,11 @@ pub unsafe fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddq))] -pub unsafe fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_add_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, add, src.as_i64x8())) +pub fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, add, src.as_i64x8())) + } } /// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -655,9 +749,11 @@ pub unsafe fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddq))] -pub unsafe fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let add = _mm512_add_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, add, i64x8::ZERO)) +pub fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let add = _mm512_add_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, add, i64x8::ZERO)) + } } /// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -667,9 +763,11 @@ pub unsafe fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddq))] -pub unsafe fn _mm256_mask_add_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_add_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, add, src.as_i64x4())) +pub fn _mm256_mask_add_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, add, src.as_i64x4())) + } } /// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -679,9 +777,11 @@ pub unsafe fn _mm256_mask_add_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddq))] -pub unsafe fn _mm256_maskz_add_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let add = _mm256_add_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, add, i64x4::ZERO)) +pub fn _mm256_maskz_add_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let add = _mm256_add_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, add, i64x4::ZERO)) + } } /// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -691,9 +791,11 @@ pub unsafe fn _mm256_maskz_add_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddq))] -pub unsafe fn _mm_mask_add_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_add_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, add, src.as_i64x2())) +pub fn _mm_mask_add_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, add, src.as_i64x2())) + } } /// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -703,9 +805,11 @@ pub unsafe fn _mm_mask_add_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpaddq))] -pub unsafe fn _mm_maskz_add_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let add = _mm_add_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, add, i64x2::ZERO)) +pub fn _mm_maskz_add_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let add = _mm_add_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, add, i64x2::ZERO)) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. @@ -715,8 +819,8 @@ pub unsafe fn _mm_maskz_add_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps))] -pub unsafe fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 { - transmute(simd_add(a.as_f32x16(), b.as_f32x16())) +pub fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_add(a.as_f32x16(), b.as_f32x16())) } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -726,9 +830,11 @@ pub unsafe fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps))] -pub unsafe fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let add = _mm512_add_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, add, src.as_f32x16())) +pub fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let add = _mm512_add_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, add, src.as_f32x16())) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -738,9 +844,11 @@ pub unsafe fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps))] -pub unsafe fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let add = _mm512_add_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, add, f32x16::ZERO)) +pub fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let add = _mm512_add_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, add, f32x16::ZERO)) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
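
Callers that cannot enable avx512f at compile time still go through unsafe at the dispatch point, because the intrinsics keep their #[target_feature] attribute; the feature requirement is the only safety obligation left. A hedged runtime-dispatch sketch under the same nightly assumptions as above:

    use std::arch::x86_64::*;

    // Hypothetical helper: the runtime check discharges the avx512f requirement,
    // so the `unsafe` block stays narrow and documented.
    fn add_ps_checked(a: __m512, b: __m512) -> Option<__m512> {
        if is_x86_feature_detected!("avx512f") {
            // SAFETY: avx512f support was just verified at runtime.
            Some(unsafe { _mm512_add_ps(a, b) })
        } else {
            None
        }
    }
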
@@ -750,9 +858,11 @@ pub unsafe fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps))] -pub unsafe fn _mm256_mask_add_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let add = _mm256_add_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, add, src.as_f32x8())) +pub fn _mm256_mask_add_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let add = _mm256_add_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, add, src.as_f32x8())) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -762,9 +872,11 @@ pub unsafe fn _mm256_mask_add_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps))] -pub unsafe fn _mm256_maskz_add_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let add = _mm256_add_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, add, f32x8::ZERO)) +pub fn _mm256_maskz_add_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let add = _mm256_add_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, add, f32x8::ZERO)) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -774,9 +886,11 @@ pub unsafe fn _mm256_maskz_add_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps))] -pub unsafe fn _mm_mask_add_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let add = _mm_add_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, add, src.as_f32x4())) +pub fn _mm_mask_add_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let add = _mm_add_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, add, src.as_f32x4())) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -786,9 +900,11 @@ pub unsafe fn _mm_mask_add_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps))] -pub unsafe fn _mm_maskz_add_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let add = _mm_add_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, add, f32x4::ZERO)) +pub fn _mm_maskz_add_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let add = _mm_add_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, add, f32x4::ZERO)) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. 
@@ -798,8 +914,8 @@ pub unsafe fn _mm_maskz_add_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd))] -pub unsafe fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(simd_add(a.as_f64x8(), b.as_f64x8())) +pub fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_add(a.as_f64x8(), b.as_f64x8())) } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -809,9 +925,11 @@ pub unsafe fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd))] -pub unsafe fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let add = _mm512_add_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, add, src.as_f64x8())) +pub fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let add = _mm512_add_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, add, src.as_f64x8())) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -821,9 +939,11 @@ pub unsafe fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd))] -pub unsafe fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let add = _mm512_add_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, add, f64x8::ZERO)) +pub fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let add = _mm512_add_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, add, f64x8::ZERO)) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -833,9 +953,11 @@ pub unsafe fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd))] -pub unsafe fn _mm256_mask_add_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let add = _mm256_add_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, add, src.as_f64x4())) +pub fn _mm256_mask_add_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let add = _mm256_add_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, add, src.as_f64x4())) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -845,9 +967,11 @@ pub unsafe fn _mm256_mask_add_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd))] -pub unsafe fn _mm256_maskz_add_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let add = _mm256_add_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, add, f64x4::ZERO)) +pub fn _mm256_maskz_add_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let add = _mm256_add_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, add, f64x4::ZERO)) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -857,9 +981,11 @@ pub unsafe fn _mm256_maskz_add_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd))] -pub unsafe fn _mm_mask_add_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let add = _mm_add_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, add, src.as_f64x2())) +pub fn _mm_mask_add_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let add = _mm_add_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, add, src.as_f64x2())) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -869,9 +995,11 @@ pub unsafe fn _mm_mask_add_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd))] -pub unsafe fn _mm_maskz_add_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let add = _mm_add_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, add, f64x2::ZERO)) +pub fn _mm_maskz_add_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let add = _mm_add_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, add, f64x2::ZERO)) + } } /// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst. @@ -881,8 +1009,8 @@ pub unsafe fn _mm_maskz_add_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubd))] -pub unsafe fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_sub(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i32x16(), b.as_i32x16())) } } /// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
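
As the bodies above make explicit, each masked arithmetic intrinsic is simply the full-width operation followed by a masked move. A hypothetical equivalence sketch under the same assumptions as the earlier snippets; the compiler remains free to fold this into a single masked vaddps:

    use core::arch::x86_64::*;

    // Hypothetical: composing the unmasked add with a masked move should match
    // _mm512_mask_add_ps(src, k, a, b) lane for lane.
    #[target_feature(enable = "avx512f")]
    fn masked_add_via_mov(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
        _mm512_mask_mov_ps(src, k, _mm512_add_ps(a, b))
    }
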
@@ -892,9 +1020,11 @@ pub unsafe fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubd))] -pub unsafe fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_sub_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, sub, src.as_i32x16())) +pub fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, sub, src.as_i32x16())) + } } /// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -904,9 +1034,11 @@ pub unsafe fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubd))] -pub unsafe fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_sub_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, sub, i32x16::ZERO)) +pub fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, sub, i32x16::ZERO)) + } } /// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -916,9 +1048,11 @@ pub unsafe fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubd))] -pub unsafe fn _mm256_mask_sub_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_sub_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, sub, src.as_i32x8())) +pub fn _mm256_mask_sub_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, sub, src.as_i32x8())) + } } /// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -928,9 +1062,11 @@ pub unsafe fn _mm256_mask_sub_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubd))] -pub unsafe fn _mm256_maskz_sub_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_sub_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, sub, i32x8::ZERO)) +pub fn _mm256_maskz_sub_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, sub, i32x8::ZERO)) + } } /// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -940,9 +1076,11 @@ pub unsafe fn _mm256_maskz_sub_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubd))] -pub unsafe fn _mm_mask_sub_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_sub_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, sub, src.as_i32x4())) +pub fn _mm_mask_sub_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, sub, src.as_i32x4())) + } } /// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -952,9 +1090,11 @@ pub unsafe fn _mm_mask_sub_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubd))] -pub unsafe fn _mm_maskz_sub_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_sub_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, sub, i32x4::ZERO)) +pub fn _mm_maskz_sub_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, sub, i32x4::ZERO)) + } } /// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst. @@ -964,8 +1104,8 @@ pub unsafe fn _mm_maskz_sub_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubq))] -pub unsafe fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_sub(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_sub(a.as_i64x8(), b.as_i64x8())) } } /// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -975,9 +1115,11 @@ pub unsafe fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubq))] -pub unsafe fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_sub_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, sub, src.as_i64x8())) +pub fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, sub, src.as_i64x8())) + } } /// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -987,9 +1129,11 @@ pub unsafe fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubq))] -pub unsafe fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let sub = _mm512_sub_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, sub, i64x8::ZERO)) +pub fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let sub = _mm512_sub_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, sub, i64x8::ZERO)) + } } /// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -999,9 +1143,11 @@ pub unsafe fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubq))] -pub unsafe fn _mm256_mask_sub_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_sub_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, sub, src.as_i64x4())) +pub fn _mm256_mask_sub_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, sub, src.as_i64x4())) + } } /// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1011,9 +1157,11 @@ pub unsafe fn _mm256_mask_sub_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubq))] -pub unsafe fn _mm256_maskz_sub_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let sub = _mm256_sub_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, sub, i64x4::ZERO)) +pub fn _mm256_maskz_sub_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let sub = _mm256_sub_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, sub, i64x4::ZERO)) + } } /// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1023,9 +1171,11 @@ pub unsafe fn _mm256_maskz_sub_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubq))] -pub unsafe fn _mm_mask_sub_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_sub_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, sub, src.as_i64x2())) +pub fn _mm_mask_sub_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, sub, src.as_i64x2())) + } } /// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1035,9 +1185,11 @@ pub unsafe fn _mm_mask_sub_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsubq))] -pub unsafe fn _mm_maskz_sub_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let sub = _mm_sub_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, sub, i64x2::ZERO)) +pub fn _mm_maskz_sub_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let sub = _mm_sub_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, sub, i64x2::ZERO)) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst. @@ -1047,8 +1199,8 @@ pub unsafe fn _mm_maskz_sub_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps))] -pub unsafe fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 { - transmute(simd_sub(a.as_f32x16(), b.as_f32x16())) +pub fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_sub(a.as_f32x16(), b.as_f32x16())) } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1058,9 +1210,11 @@ pub unsafe fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps))] -pub unsafe fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let sub = _mm512_sub_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, sub, src.as_f32x16())) +pub fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let sub = _mm512_sub_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, sub, src.as_f32x16())) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1070,9 +1224,11 @@ pub unsafe fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps))] -pub unsafe fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let sub = _mm512_sub_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, sub, f32x16::ZERO)) +pub fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let sub = _mm512_sub_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, sub, f32x16::ZERO)) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1082,9 +1238,11 @@ pub unsafe fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps))] -pub unsafe fn _mm256_mask_sub_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let sub = _mm256_sub_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, sub, src.as_f32x8())) +pub fn _mm256_mask_sub_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let sub = _mm256_sub_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, sub, src.as_f32x8())) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1094,9 +1252,11 @@ pub unsafe fn _mm256_mask_sub_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps))] -pub unsafe fn _mm256_maskz_sub_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let sub = _mm256_sub_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, sub, f32x8::ZERO)) +pub fn _mm256_maskz_sub_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let sub = _mm256_sub_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, sub, f32x8::ZERO)) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1106,9 +1266,11 @@ pub unsafe fn _mm256_maskz_sub_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps))] -pub unsafe fn _mm_mask_sub_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let sub = _mm_sub_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, sub, src.as_f32x4())) +pub fn _mm_mask_sub_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let sub = _mm_sub_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, sub, src.as_f32x4())) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1118,9 +1280,11 @@ pub unsafe fn _mm_mask_sub_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps))] -pub unsafe fn _mm_maskz_sub_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let sub = _mm_sub_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, sub, f32x4::ZERO)) +pub fn _mm_maskz_sub_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let sub = _mm_sub_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, sub, f32x4::ZERO)) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst. 
@@ -1130,8 +1294,8 @@ pub unsafe fn _mm_maskz_sub_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd))] -pub unsafe fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(simd_sub(a.as_f64x8(), b.as_f64x8())) +pub fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_sub(a.as_f64x8(), b.as_f64x8())) } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1141,9 +1305,11 @@ pub unsafe fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd))] -pub unsafe fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let sub = _mm512_sub_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, sub, src.as_f64x8())) +pub fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let sub = _mm512_sub_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, sub, src.as_f64x8())) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1153,9 +1319,11 @@ pub unsafe fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd))] -pub unsafe fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let sub = _mm512_sub_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, sub, f64x8::ZERO)) +pub fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let sub = _mm512_sub_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, sub, f64x8::ZERO)) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1165,9 +1333,11 @@ pub unsafe fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd))] -pub unsafe fn _mm256_mask_sub_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let sub = _mm256_sub_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, sub, src.as_f64x4())) +pub fn _mm256_mask_sub_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let sub = _mm256_sub_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, sub, src.as_f64x4())) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1177,9 +1347,11 @@ pub unsafe fn _mm256_mask_sub_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd))] -pub unsafe fn _mm256_maskz_sub_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let sub = _mm256_sub_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, sub, f64x4::ZERO)) +pub fn _mm256_maskz_sub_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let sub = _mm256_sub_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, sub, f64x4::ZERO)) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1189,9 +1361,11 @@ pub unsafe fn _mm256_maskz_sub_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd))] -pub unsafe fn _mm_mask_sub_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let sub = _mm_sub_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, sub, src.as_f64x2())) +pub fn _mm_mask_sub_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let sub = _mm_sub_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, sub, src.as_f64x2())) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1201,9 +1375,11 @@ pub unsafe fn _mm_mask_sub_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd))] -pub unsafe fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let sub = _mm_sub_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, sub, f64x2::ZERO)) +pub fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let sub = _mm_sub_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, sub, f64x2::ZERO)) + } } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst. @@ -1213,10 +1389,12 @@ pub unsafe fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] -pub unsafe fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i { - let a = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(a.as_i64x8())); - let b = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(b.as_i64x8())); - transmute(simd_mul(a, b)) +pub fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(a.as_i64x8())); + let b = simd_cast::<_, i64x8>(simd_cast::<_, i32x8>(b.as_i64x8())); + transmute(simd_mul(a, b)) + } } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
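The nested simd_casts in _mm512_mul_epi32 are what give vpmuldq its semantics: the inner cast truncates each 64-bit lane to its low 32 bits, the outer cast sign-extends that value back to 64 bits, and only then is the full 64-bit product formed. An illustrative per-lane scalar model:

// Per-lane model of _mm512_mul_epi32: multiply the sign-extended low 32 bits
// of each 64-bit lane; the 32x32 product always fits in 64 bits.
fn mul_epi32_lane(a: i64, b: i64) -> i64 {
    let lo_a = a as i32 as i64; // truncate, then sign-extend
    let lo_b = b as i32 as i64;
    lo_a * lo_b
}

fn main() {
    // The low 32 bits of the first operand are 0xFFFF_FFFF, i.e. -1 as i32.
    assert_eq!(mul_epi32_lane(0x1_FFFF_FFFF, 3), -3);
}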
@@ -1226,9 +1404,11 @@ pub unsafe fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] -pub unsafe fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mul_epi32(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, mul, src.as_i64x8())) +pub fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epi32(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, src.as_i64x8())) + } } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1238,9 +1418,11 @@ pub unsafe fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] -pub unsafe fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mul_epi32(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, mul, i64x8::ZERO)) +pub fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epi32(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, i64x8::ZERO)) + } } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1250,9 +1432,11 @@ pub unsafe fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] -pub unsafe fn _mm256_mask_mul_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mul_epi32(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, mul, src.as_i64x4())) +pub fn _mm256_mask_mul_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epi32(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, mul, src.as_i64x4())) + } } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1262,9 +1446,11 @@ pub unsafe fn _mm256_mask_mul_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] -pub unsafe fn _mm256_maskz_mul_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mul_epi32(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, mul, i64x4::ZERO)) +pub fn _mm256_maskz_mul_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epi32(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, mul, i64x4::ZERO)) + } } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1274,9 +1460,11 @@ pub unsafe fn _mm256_maskz_mul_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] -pub unsafe fn _mm_mask_mul_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mul_epi32(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, mul, src.as_i64x2())) +pub fn _mm_mask_mul_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epi32(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, mul, src.as_i64x2())) + } } /// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1286,9 +1474,11 @@ pub unsafe fn _mm_mask_mul_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuldq))] -pub unsafe fn _mm_maskz_mul_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mul_epi32(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, mul, i64x2::ZERO)) +pub fn _mm_maskz_mul_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epi32(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, mul, i64x2::ZERO)) + } } /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst. @@ -1298,8 +1488,8 @@ pub unsafe fn _mm_maskz_mul_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_mul(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i32x16(), b.as_i32x16())) } } /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1309,14 +1499,11 @@ pub unsafe fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm512_mask_mullo_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let mul = _mm512_mullo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, mul, src.as_i32x16())) +pub fn _mm512_mask_mullo_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, mul, src.as_i32x16())) + } } /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
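_mm512_mullo_epi32 sounds more elaborate than it is: keeping only the low 32 bits of the "intermediate 64-bit" product is exactly a wrapping 32-bit multiply, which is what the lowered simd_mul on i32x16 performs per lane. A quick, illustrative check of the arithmetic:

fn main() {
    // 100_000 * 100_000 = 10_000_000_000; its low 32 bits are 1_410_065_408.
    let (a, b): (i32, i32) = (100_000, 100_000);
    assert_eq!(a.wrapping_mul(b), 1_410_065_408);
    assert_eq!(((a as i64 * b as i64) & 0xFFFF_FFFF) as i32, 1_410_065_408);
}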
@@ -1326,9 +1513,11 @@ pub unsafe fn _mm512_mask_mullo_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mullo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, mul, i32x16::ZERO)) +pub fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, mul, i32x16::ZERO)) + } } /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1338,14 +1527,11 @@ pub unsafe fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm256_mask_mullo_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let mul = _mm256_mullo_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, mul, src.as_i32x8())) +pub fn _mm256_mask_mullo_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, mul, src.as_i32x8())) + } } /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1355,9 +1541,11 @@ pub unsafe fn _mm256_mask_mullo_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm256_maskz_mullo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mullo_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, mul, i32x8::ZERO)) +pub fn _mm256_maskz_mullo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mullo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, mul, i32x8::ZERO)) + } } /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1367,9 +1555,11 @@ pub unsafe fn _mm256_maskz_mullo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm_mask_mullo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mullo_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, mul, src.as_i32x4())) +pub fn _mm_mask_mullo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mullo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, mul, src.as_i32x4())) + } } /// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1379,9 +1569,11 @@ pub unsafe fn _mm_mask_mullo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmulld))] -pub unsafe fn _mm_maskz_mullo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mullo_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, mul, i32x4::ZERO)) +pub fn _mm_maskz_mullo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mullo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, mul, i32x4::ZERO)) + } } /// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst. @@ -1392,8 +1584,8 @@ pub unsafe fn _mm_maskz_mullo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m1 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_mul(a.as_i64x8(), b.as_i64x8())) } } /// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1404,14 +1596,11 @@ pub unsafe fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_mullox_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - let mul = _mm512_mullox_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, mul, src.as_i64x8())) +pub fn _mm512_mask_mullox_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mullox_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, mul, src.as_i64x8())) + } } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst. @@ -1421,11 +1610,13 @@ pub unsafe fn _mm512_mask_mullox_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] -pub unsafe fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_u64x8(); - let b = b.as_u64x8(); - let mask = u64x8::splat(u32::MAX.into()); - transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) +pub fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let mask = u64x8::splat(u32::MAX.into()); + transmute(simd_mul(simd_and(a, mask), simd_and(b, mask))) + } } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
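_mm512_mul_epu32 gets vpmuludq semantics without an intrinsic call: ANDing each 64-bit lane with u32::MAX clears the high halves, and since the product of two 32-bit values cannot exceed 64 bits, a plain lane-wise multiply then yields the exact unsigned result. (The neighbouring _mm512_mullox_epi64, by contrast, is an ordinary wrapping 64-bit lane multiply and carries no assert_instr, since baseline AVX512F has no single instruction for it; vpmullq needs AVX512DQ.) An illustrative per-lane scalar model:

// Per-lane model of _mm512_mul_epu32: multiply the zero-extended low 32 bits
// of each 64-bit lane.
fn mul_epu32_lane(a: u64, b: u64) -> u64 {
    (a & u32::MAX as u64) * (b & u32::MAX as u64)
}

fn main() {
    // The high half of the first operand is ignored entirely.
    assert_eq!(mul_epu32_lane(0xDEAD_BEEF_FFFF_FFFF, 2), 0x1_FFFF_FFFE);
}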
@@ -1435,9 +1626,11 @@ pub unsafe fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] -pub unsafe fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mul_epu32(a, b).as_u64x8(); - transmute(simd_select_bitmask(k, mul, src.as_u64x8())) +pub fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epu32(a, b).as_u64x8(); + transmute(simd_select_bitmask(k, mul, src.as_u64x8())) + } } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1447,9 +1640,11 @@ pub unsafe fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] -pub unsafe fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let mul = _mm512_mul_epu32(a, b).as_u64x8(); - transmute(simd_select_bitmask(k, mul, u64x8::ZERO)) +pub fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let mul = _mm512_mul_epu32(a, b).as_u64x8(); + transmute(simd_select_bitmask(k, mul, u64x8::ZERO)) + } } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1459,9 +1654,11 @@ pub unsafe fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] -pub unsafe fn _mm256_mask_mul_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mul_epu32(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, mul, src.as_u64x4())) +pub fn _mm256_mask_mul_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epu32(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, mul, src.as_u64x4())) + } } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1471,9 +1668,11 @@ pub unsafe fn _mm256_mask_mul_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] -pub unsafe fn _mm256_maskz_mul_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let mul = _mm256_mul_epu32(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, mul, u64x4::ZERO)) +pub fn _mm256_maskz_mul_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let mul = _mm256_mul_epu32(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, mul, u64x4::ZERO)) + } } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1483,9 +1682,11 @@ pub unsafe fn _mm256_maskz_mul_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] -pub unsafe fn _mm_mask_mul_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mul_epu32(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, mul, src.as_u64x2())) +pub fn _mm_mask_mul_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epu32(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, mul, src.as_u64x2())) + } } /// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1495,9 +1696,11 @@ pub unsafe fn _mm_mask_mul_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmuludq))] -pub unsafe fn _mm_maskz_mul_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let mul = _mm_mul_epu32(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, mul, u64x2::ZERO)) +pub fn _mm_maskz_mul_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let mul = _mm_mul_epu32(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, mul, u64x2::ZERO)) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. @@ -1507,8 +1710,8 @@ pub unsafe fn _mm_maskz_mul_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps))] -pub unsafe fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 { - transmute(simd_mul(a.as_f32x16(), b.as_f32x16())) +pub fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_mul(a.as_f32x16(), b.as_f32x16())) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1518,9 +1721,11 @@ pub unsafe fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps))] -pub unsafe fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let mul = _mm512_mul_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, mul, src.as_f32x16())) +pub fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let mul = _mm512_mul_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, mul, src.as_f32x16())) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1530,9 +1735,11 @@ pub unsafe fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps))] -pub unsafe fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let mul = _mm512_mul_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, mul, f32x16::ZERO)) +pub fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let mul = _mm512_mul_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, mul, f32x16::ZERO)) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1542,9 +1749,11 @@ pub unsafe fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps))] -pub unsafe fn _mm256_mask_mul_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let mul = _mm256_mul_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, mul, src.as_f32x8())) +pub fn _mm256_mask_mul_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let mul = _mm256_mul_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, mul, src.as_f32x8())) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1554,9 +1763,11 @@ pub unsafe fn _mm256_mask_mul_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps))] -pub unsafe fn _mm256_maskz_mul_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let mul = _mm256_mul_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, mul, f32x8::ZERO)) +pub fn _mm256_maskz_mul_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let mul = _mm256_mul_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, mul, f32x8::ZERO)) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1566,9 +1777,11 @@ pub unsafe fn _mm256_maskz_mul_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps))] -pub unsafe fn _mm_mask_mul_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mul = _mm_mul_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, mul, src.as_f32x4())) +pub fn _mm_mask_mul_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mul = _mm_mul_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, mul, src.as_f32x4())) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1578,9 +1791,11 @@ pub unsafe fn _mm_mask_mul_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps))] -pub unsafe fn _mm_maskz_mul_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mul = _mm_mul_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, mul, f32x4::ZERO)) +pub fn _mm_maskz_mul_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mul = _mm_mul_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, mul, f32x4::ZERO)) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst. @@ -1590,8 +1805,8 @@ pub unsafe fn _mm_maskz_mul_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd))] -pub unsafe fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(simd_mul(a.as_f64x8(), b.as_f64x8())) +pub fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_mul(a.as_f64x8(), b.as_f64x8())) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1601,9 +1816,11 @@ pub unsafe fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd))] -pub unsafe fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let mul = _mm512_mul_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, mul, src.as_f64x8())) +pub fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let mul = _mm512_mul_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, mul, src.as_f64x8())) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1613,9 +1830,11 @@ pub unsafe fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd))] -pub unsafe fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let mul = _mm512_mul_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, mul, f64x8::ZERO)) +pub fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let mul = _mm512_mul_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, mul, f64x8::ZERO)) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1625,9 +1844,11 @@ pub unsafe fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd))] -pub unsafe fn _mm256_mask_mul_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let mul = _mm256_mul_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, mul, src.as_f64x4())) +pub fn _mm256_mask_mul_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let mul = _mm256_mul_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, mul, src.as_f64x4())) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1637,9 +1858,11 @@ pub unsafe fn _mm256_mask_mul_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd))] -pub unsafe fn _mm256_maskz_mul_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let mul = _mm256_mul_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, mul, f64x4::ZERO)) +pub fn _mm256_maskz_mul_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let mul = _mm256_mul_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, mul, f64x4::ZERO)) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1649,9 +1872,11 @@ pub unsafe fn _mm256_maskz_mul_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd))] -pub unsafe fn _mm_mask_mul_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mul = _mm_mul_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, mul, src.as_f64x2())) +pub fn _mm_mask_mul_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mul = _mm_mul_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, mul, src.as_f64x2())) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1661,9 +1886,11 @@ pub unsafe fn _mm_mask_mul_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd))] -pub unsafe fn _mm_maskz_mul_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mul = _mm_mul_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, mul, f64x2::ZERO)) +pub fn _mm_maskz_mul_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mul = _mm_mul_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, mul, f64x2::ZERO)) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst. 
@@ -1673,8 +1900,8 @@ pub unsafe fn _mm_maskz_mul_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps))] -pub unsafe fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 { - transmute(simd_div(a.as_f32x16(), b.as_f32x16())) +pub fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_div(a.as_f32x16(), b.as_f32x16())) } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1684,9 +1911,11 @@ pub unsafe fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps))] -pub unsafe fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let div = _mm512_div_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, div, src.as_f32x16())) +pub fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let div = _mm512_div_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, div, src.as_f32x16())) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1696,9 +1925,11 @@ pub unsafe fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps))] -pub unsafe fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let div = _mm512_div_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, div, f32x16::ZERO)) +pub fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let div = _mm512_div_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, div, f32x16::ZERO)) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1708,9 +1939,11 @@ pub unsafe fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps))] -pub unsafe fn _mm256_mask_div_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let div = _mm256_div_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, div, src.as_f32x8())) +pub fn _mm256_mask_div_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let div = _mm256_div_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, div, src.as_f32x8())) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1720,9 +1953,11 @@ pub unsafe fn _mm256_mask_div_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps))] -pub unsafe fn _mm256_maskz_div_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let div = _mm256_div_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, div, f32x8::ZERO)) +pub fn _mm256_maskz_div_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let div = _mm256_div_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, div, f32x8::ZERO)) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1732,9 +1967,11 @@ pub unsafe fn _mm256_maskz_div_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps))] -pub unsafe fn _mm_mask_div_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let div = _mm_div_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, div, src.as_f32x4())) +pub fn _mm_mask_div_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let div = _mm_div_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, div, src.as_f32x4())) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1744,9 +1981,11 @@ pub unsafe fn _mm_mask_div_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps))] -pub unsafe fn _mm_maskz_div_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let div = _mm_div_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, div, f32x4::ZERO)) +pub fn _mm_maskz_div_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let div = _mm_div_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, div, f32x4::ZERO)) + } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst. @@ -1756,8 +1995,8 @@ pub unsafe fn _mm_maskz_div_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivpd))] -pub unsafe fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(simd_div(a.as_f64x8(), b.as_f64x8())) +pub fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_div(a.as_f64x8(), b.as_f64x8())) } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1767,9 +2006,11 @@ pub unsafe fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivpd))] -pub unsafe fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let div = _mm512_div_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, div, src.as_f64x8())) +pub fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let div = _mm512_div_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, div, src.as_f64x8())) + } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1779,9 +2020,11 @@ pub unsafe fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivpd))] -pub unsafe fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let div = _mm512_div_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, div, f64x8::ZERO)) +pub fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let div = _mm512_div_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, div, f64x8::ZERO)) + } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1791,9 +2034,11 @@ pub unsafe fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivpd))] -pub unsafe fn _mm256_mask_div_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let div = _mm256_div_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, div, src.as_f64x4())) +pub fn _mm256_mask_div_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let div = _mm256_div_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, div, src.as_f64x4())) + } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1803,9 +2048,11 @@ pub unsafe fn _mm256_mask_div_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivpd))] -pub unsafe fn _mm256_maskz_div_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let div = _mm256_div_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, div, f64x4::ZERO)) +pub fn _mm256_maskz_div_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let div = _mm256_div_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, div, f64x4::ZERO)) + } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -1815,9 +2062,11 @@ pub unsafe fn _mm256_maskz_div_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vdivpd))]
-pub unsafe fn _mm_mask_div_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
-    let div = _mm_div_pd(a, b).as_f64x2();
-    transmute(simd_select_bitmask(k, div, src.as_f64x2()))
+pub fn _mm_mask_div_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let div = _mm_div_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, div, src.as_f64x2()))
+    }
 }
 
 /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1827,9 +2076,11 @@ pub unsafe fn _mm_mask_div_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vdivpd))]
-pub unsafe fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
-    let div = _mm_div_pd(a, b).as_f64x2();
-    transmute(simd_select_bitmask(k, div, f64x2::ZERO))
+pub fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let div = _mm_div_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, div, f64x2::ZERO))
+    }
 }
 
 /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst.
@@ -1839,10 +2090,12 @@ pub unsafe fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsd))]
-pub unsafe fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i {
-    let a = a.as_i32x16();
-    let b = b.as_i32x16();
-    transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b))
+pub fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1852,9 +2105,11 @@ pub unsafe fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsd))]
-pub unsafe fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let max = _mm512_max_epi32(a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, max, src.as_i32x16()))
+pub fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, max, src.as_i32x16()))
+    }
 }
 
 /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
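The unmasked integer max intrinsics are a compare followed by a select: simd_gt produces an all-ones/all-zeros lane mask and simd_select keeps the lane from a where the mask is set, from b otherwise; the masked variants then layer the usual simd_select_bitmask on top. An illustrative 16-lane scalar model of _mm512_max_epi32:

// Model of _mm512_max_epi32: per-lane greater-than compare, then select.
fn max_epi32_model(a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
    let mut dst = [0i32; 16];
    for i in 0..16 {
        let gt = a[i] > b[i];                  // simd_gt: per-lane mask
        dst[i] = if gt { a[i] } else { b[i] }; // simd_select: pick a or b
    }
    dst
}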
@@ -1864,9 +2119,11 @@ pub unsafe fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsd))] -pub unsafe fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let max = _mm512_max_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, max, i32x16::ZERO)) +pub fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let max = _mm512_max_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, max, i32x16::ZERO)) + } } /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1876,9 +2133,11 @@ pub unsafe fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsd))] -pub unsafe fn _mm256_mask_max_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, max, src.as_i32x8())) +pub fn _mm256_mask_max_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, max, src.as_i32x8())) + } } /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1888,9 +2147,11 @@ pub unsafe fn _mm256_mask_max_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsd))] -pub unsafe fn _mm256_maskz_max_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let max = _mm256_max_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, max, i32x8::ZERO)) +pub fn _mm256_maskz_max_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let max = _mm256_max_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, max, i32x8::ZERO)) + } } /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -1900,9 +2161,11 @@ pub unsafe fn _mm256_maskz_max_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsd))] -pub unsafe fn _mm_mask_max_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, max, src.as_i32x4())) +pub fn _mm_mask_max_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, max, src.as_i32x4())) + } } /// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -1912,9 +2175,11 @@ pub unsafe fn _mm_mask_max_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m12
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsd))]
-pub unsafe fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let max = _mm_max_epi32(a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, max, i32x4::ZERO))
+pub fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, max, i32x4::ZERO))
+    }
 }
 
 /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
@@ -1924,10 +2189,12 @@ pub unsafe fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsq))]
-pub unsafe fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i {
-    let a = a.as_i64x8();
-    let b = b.as_i64x8();
-    transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b))
+pub fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1937,9 +2204,11 @@ pub unsafe fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsq))]
-pub unsafe fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    let max = _mm512_max_epi64(a, b).as_i64x8();
-    transmute(simd_select_bitmask(k, max, src.as_i64x8()))
+pub fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, max, src.as_i64x8()))
+    }
 }
 
 /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1949,9 +2218,11 @@ pub unsafe fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsq))]
-pub unsafe fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    let max = _mm512_max_epi64(a, b).as_i64x8();
-    transmute(simd_select_bitmask(k, max, i64x8::ZERO))
+pub fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, max, i64x8::ZERO))
+    }
 }
 
 /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
@@ -1961,10 +2232,12 @@ pub unsafe fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsq))]
-pub unsafe fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_i64x4();
-    let b = b.as_i64x4();
-    transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b))
+pub fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i64x4();
+        let b = b.as_i64x4();
+        transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -1974,9 +2247,11 @@ pub unsafe fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsq))]
-pub unsafe fn _mm256_mask_max_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let max = _mm256_max_epi64(a, b).as_i64x4();
-    transmute(simd_select_bitmask(k, max, src.as_i64x4()))
+pub fn _mm256_mask_max_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, max, src.as_i64x4()))
+    }
 }
 
 /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1986,9 +2261,11 @@ pub unsafe fn _mm256_mask_max_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsq))]
-pub unsafe fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let max = _mm256_max_epi64(a, b).as_i64x4();
-    transmute(simd_select_bitmask(k, max, i64x4::ZERO))
+pub fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, max, i64x4::ZERO))
+    }
 }
 
 /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
@@ -1998,10 +2275,12 @@ pub unsafe fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxsq))]
-pub unsafe fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i {
-    let a = a.as_i64x2();
-    let b = b.as_i64x2();
-    transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b))
+pub fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x2();
+        let b = b.as_i64x2();
+        transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b))
+    }
 }
 
 /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
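Unlike the 32-bit case, packed signed 64-bit max has no SSE/AVX2 counterpart, so even the 128- and 256-bit _mm*_max_epi64 variants require AVX-512F plus AVX-512VL. A minimal caller-side sketch (hypothetical helper name, same nightly feature gate as above):

use core::arch::x86_64::*;

// 64-bit lane max at 128-bit width: new with AVX-512VL, now callable without
// an `unsafe` block from a context that enables the required features.
#[target_feature(enable = "avx512f,avx512vl")]
fn lane_max_i64(a: __m128i, b: __m128i) -> __m128i {
    _mm_max_epi64(a, b)
}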
@@ -2011,9 +2290,11 @@ pub unsafe fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsq))] -pub unsafe fn _mm_mask_max_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, max, src.as_i64x2())) +pub fn _mm_mask_max_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, max, src.as_i64x2())) + } } /// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2023,9 +2304,11 @@ pub unsafe fn _mm_mask_max_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmaxsq))] -pub unsafe fn _mm_maskz_max_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let max = _mm_max_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, max, i64x2::ZERO)) +pub fn _mm_maskz_max_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let max = _mm_max_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, max, i64x2::ZERO)) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst. @@ -2035,12 +2318,14 @@ pub unsafe fn _mm_maskz_max_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps))] -pub unsafe fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 { - transmute(vmaxps( - a.as_f32x16(), - b.as_f32x16(), - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vmaxps( + a.as_f32x16(), + b.as_f32x16(), + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2050,9 +2335,11 @@ pub unsafe fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps))] -pub unsafe fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let max = _mm512_max_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, max, src.as_f32x16())) +pub fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let max = _mm512_max_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, max, src.as_f32x16())) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2062,9 +2349,11 @@ pub unsafe fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps))] -pub unsafe fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let max = _mm512_max_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, max, f32x16::ZERO)) +pub fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let max = _mm512_max_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, max, f32x16::ZERO)) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2074,9 +2363,11 @@ pub unsafe fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps))] -pub unsafe fn _mm256_mask_max_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let max = _mm256_max_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, max, src.as_f32x8())) +pub fn _mm256_mask_max_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let max = _mm256_max_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, max, src.as_f32x8())) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2086,9 +2377,11 @@ pub unsafe fn _mm256_mask_max_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps))] -pub unsafe fn _mm256_maskz_max_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let max = _mm256_max_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, max, f32x8::ZERO)) +pub fn _mm256_maskz_max_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let max = _mm256_max_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, max, f32x8::ZERO)) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2098,9 +2391,11 @@ pub unsafe fn _mm256_maskz_max_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps))] -pub unsafe fn _mm_mask_max_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let max = _mm_max_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, max, src.as_f32x4())) +pub fn _mm_mask_max_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let max = _mm_max_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, max, src.as_f32x4())) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2110,9 +2405,11 @@ pub unsafe fn _mm_mask_max_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps))] -pub unsafe fn _mm_maskz_max_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let max = _mm_max_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, max, f32x4::ZERO)) +pub fn _mm_maskz_max_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let max = _mm_max_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, max, f32x4::ZERO)) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst. @@ -2122,8 +2419,8 @@ pub unsafe fn _mm_maskz_max_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd))] -pub unsafe fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(vmaxpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(vmaxpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2133,9 +2430,11 @@ pub unsafe fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd))] -pub unsafe fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let max = _mm512_max_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, max, src.as_f64x8())) +pub fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let max = _mm512_max_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, max, src.as_f64x8())) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2145,9 +2444,11 @@ pub unsafe fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd))] -pub unsafe fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let max = _mm512_max_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, max, f64x8::ZERO)) +pub fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let max = _mm512_max_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, max, f64x8::ZERO)) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -2157,9 +2458,11 @@ pub unsafe fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd))] -pub unsafe fn _mm256_mask_max_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let max = _mm256_max_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, max, src.as_f64x4())) +pub fn _mm256_mask_max_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let max = _mm256_max_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, max, src.as_f64x4())) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2169,9 +2472,11 @@ pub unsafe fn _mm256_mask_max_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd))] -pub unsafe fn _mm256_maskz_max_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let max = _mm256_max_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, max, f64x4::ZERO)) +pub fn _mm256_maskz_max_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let max = _mm256_max_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, max, f64x4::ZERO)) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2181,9 +2486,11 @@ pub unsafe fn _mm256_maskz_max_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd))] -pub unsafe fn _mm_mask_max_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let max = _mm_max_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, max, src.as_f64x2())) +pub fn _mm_mask_max_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let max = _mm_max_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, max, src.as_f64x2())) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2193,9 +2500,11 @@ pub unsafe fn _mm_mask_max_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd))] -pub unsafe fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let max = _mm_max_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, max, f64x2::ZERO)) +pub fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let max = _mm_max_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, max, f64x2::ZERO)) + } } /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst. 
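Before the unsigned integer hunks below, one note on the floating-point maxima just covered: they go through `vmaxps`/`vmaxpd` with `_MM_FROUND_CUR_DIRECTION` rather than a generic `simd_max`, which keeps the asymmetric hardware behaviour for NaN and signed-zero inputs. A rough scalar model of a single lane, stated as an assumption about the usual VMAXPS/VMAXPD semantics and not something this patch changes:

    // If either operand is NaN, or the operands compare equal (e.g. +0.0 vs -0.0),
    // the second operand is returned, so this is not a commutative max.
    fn vmax_lane(a: f32, b: f32) -> f32 {
        if a > b { a } else { b } // NaN makes `a > b` false, so `b` wins
    }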
@@ -2205,10 +2514,12 @@ pub unsafe fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxud))]
-pub unsafe fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i {
-    let a = a.as_u32x16();
-    let b = b.as_u32x16();
-    transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b))
+pub fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        transmute(simd_select::<i32x16, _>(simd_gt(a, b), a, b))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2218,9 +2529,11 @@ pub unsafe fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxud))]
-pub unsafe fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let max = _mm512_max_epu32(a, b).as_u32x16();
-    transmute(simd_select_bitmask(k, max, src.as_u32x16()))
+pub fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu32(a, b).as_u32x16();
+        transmute(simd_select_bitmask(k, max, src.as_u32x16()))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2230,9 +2543,11 @@ pub unsafe fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: _
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxud))]
-pub unsafe fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let max = _mm512_max_epu32(a, b).as_u32x16();
-    transmute(simd_select_bitmask(k, max, u32x16::ZERO))
+pub fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu32(a, b).as_u32x16();
+        transmute(simd_select_bitmask(k, max, u32x16::ZERO))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2242,9 +2557,11 @@ pub unsafe fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxud))]
-pub unsafe fn _mm256_mask_max_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let max = _mm256_max_epu32(a, b).as_u32x8();
-    transmute(simd_select_bitmask(k, max, src.as_u32x8()))
+pub fn _mm256_mask_max_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu32(a, b).as_u32x8();
+        transmute(simd_select_bitmask(k, max, src.as_u32x8()))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
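For readers skimming the mask plumbing, the writemask variants reduce to a per-lane select between the computed result and the pass-through operand. An illustrative scalar model of `_mm512_mask_max_epu32` (not taken from the patch):

    fn mask_max_epu32_model(src: [u32; 16], k: u16, a: [u32; 16], b: [u32; 16]) -> [u32; 16] {
        // Bit i of `k` chooses between max(a[i], b[i]) and the untouched src[i],
        // mirroring simd_select_bitmask over the u32x16 lanes.
        core::array::from_fn(|i| if (k >> i) & 1 == 1 { a[i].max(b[i]) } else { src[i] })
    }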
@@ -2254,9 +2571,11 @@ pub unsafe fn _mm256_mask_max_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxud))]
-pub unsafe fn _mm256_maskz_max_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let max = _mm256_max_epu32(a, b).as_u32x8();
-    transmute(simd_select_bitmask(k, max, u32x8::ZERO))
+pub fn _mm256_maskz_max_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu32(a, b).as_u32x8();
+        transmute(simd_select_bitmask(k, max, u32x8::ZERO))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2266,9 +2585,11 @@ pub unsafe fn _mm256_maskz_max_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxud))]
-pub unsafe fn _mm_mask_max_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let max = _mm_max_epu32(a, b).as_u32x4();
-    transmute(simd_select_bitmask(k, max, src.as_u32x4()))
+pub fn _mm_mask_max_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, max, src.as_u32x4()))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2278,9 +2599,11 @@ pub unsafe fn _mm_mask_max_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m12
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxud))]
-pub unsafe fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let max = _mm_max_epu32(a, b).as_u32x4();
-    transmute(simd_select_bitmask(k, max, u32x4::ZERO))
+pub fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, max, u32x4::ZERO))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
@@ -2290,10 +2613,12 @@ pub unsafe fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i {
-    let a = a.as_u64x8();
-    let b = b.as_u64x8();
-    transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b))
+pub fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u64x8();
+        let b = b.as_u64x8();
+        transmute(simd_select::<i64x8, _>(simd_gt(a, b), a, b))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2303,9 +2628,11 @@ pub unsafe fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    let max = _mm512_max_epu64(a, b).as_u64x8();
-    transmute(simd_select_bitmask(k, max, src.as_u64x8()))
+pub fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, max, src.as_u64x8()))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2315,9 +2642,11 @@ pub unsafe fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    let max = _mm512_max_epu64(a, b).as_u64x8();
-    transmute(simd_select_bitmask(k, max, u64x8::ZERO))
+pub fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let max = _mm512_max_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, max, u64x8::ZERO))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
@@ -2327,10 +2656,12 @@ pub unsafe fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_u64x4();
-    let b = b.as_u64x4();
-    transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b))
+pub fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_u64x4();
+        let b = b.as_u64x4();
+        transmute(simd_select::<i64x4, _>(simd_gt(a, b), a, b))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2340,9 +2671,11 @@ pub unsafe fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm256_mask_max_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let max = _mm256_max_epu64(a, b).as_u64x4();
-    transmute(simd_select_bitmask(k, max, src.as_u64x4()))
+pub fn _mm256_mask_max_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu64(a, b).as_u64x4();
+        transmute(simd_select_bitmask(k, max, src.as_u64x4()))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2352,9 +2685,11 @@ pub unsafe fn _mm256_mask_max_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let max = _mm256_max_epu64(a, b).as_u64x4();
-    transmute(simd_select_bitmask(k, max, u64x4::ZERO))
+pub fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let max = _mm256_max_epu64(a, b).as_u64x4();
+        transmute(simd_select_bitmask(k, max, u64x4::ZERO))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
@@ -2364,10 +2699,12 @@ pub unsafe fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i {
-    let a = a.as_u64x2();
-    let b = b.as_u64x2();
-    transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b))
+pub fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_u64x2();
+        let b = b.as_u64x2();
+        transmute(simd_select::<i64x2, _>(simd_gt(a, b), a, b))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2377,9 +2714,11 @@ pub unsafe fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i {
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm_mask_max_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let max = _mm_max_epu64(a, b).as_u64x2();
-    transmute(simd_select_bitmask(k, max, src.as_u64x2()))
+pub fn _mm_mask_max_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu64(a, b).as_u64x2();
+        transmute(simd_select_bitmask(k, max, src.as_u64x2()))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2389,9 +2728,11 @@ pub unsafe fn _mm_mask_max_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m12
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpmaxuq))]
-pub unsafe fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let max = _mm_max_epu64(a, b).as_u64x2();
-    transmute(simd_select_bitmask(k, max, u64x2::ZERO))
+pub fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let max = _mm_max_epu64(a, b).as_u64x2();
+        transmute(simd_select_bitmask(k, max, u64x2::ZERO))
+    }
 }

 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
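The zeromask forms differ only in what happens to the inactive lanes. A scalar sketch of `_mm_maskz_max_epu64` (illustrative, not from the patch); note that the 64-bit unsigned maximum only exists as an AVX-512F/VL intrinsic, so there is no plain SSE or AVX2 function filling the same role:

    fn maskz_max_epu64_model(k: u8, a: [u64; 2], b: [u64; 2]) -> [u64; 2] {
        // Inactive lanes are zeroed instead of being taken from a src operand.
        core::array::from_fn(|i| if (k >> i) & 1 == 1 { a[i].max(b[i]) } else { 0 })
    }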
@@ -2401,10 +2742,12 @@ pub unsafe fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsd))]
-pub unsafe fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i {
-    let a = a.as_i32x16();
-    let b = b.as_i32x16();
-    transmute(simd_select::<i32x16, _>(simd_lt(a, b), a, b))
+pub fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        transmute(simd_select::<i32x16, _>(simd_lt(a, b), a, b))
+    }
 }

 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2414,9 +2757,11 @@ pub unsafe fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsd))]
-pub unsafe fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let min = _mm512_min_epi32(a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, min, src.as_i32x16()))
+pub fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, min, src.as_i32x16()))
+    }
 }

 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2426,9 +2771,11 @@ pub unsafe fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: _
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsd))]
-pub unsafe fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let min = _mm512_min_epi32(a, b).as_i32x16();
-    transmute(simd_select_bitmask(k, min, i32x16::ZERO))
+pub fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi32(a, b).as_i32x16();
+        transmute(simd_select_bitmask(k, min, i32x16::ZERO))
+    }
 }

 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2438,9 +2785,11 @@ pub unsafe fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsd))]
-pub unsafe fn _mm256_mask_min_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let min = _mm256_min_epi32(a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, min, src.as_i32x8()))
+pub fn _mm256_mask_min_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi32(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, min, src.as_i32x8()))
+    }
 }

 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2450,9 +2799,11 @@ pub unsafe fn _mm256_mask_min_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsd))]
-pub unsafe fn _mm256_maskz_min_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let min = _mm256_min_epi32(a, b).as_i32x8();
-    transmute(simd_select_bitmask(k, min, i32x8::ZERO))
+pub fn _mm256_maskz_min_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi32(a, b).as_i32x8();
+        transmute(simd_select_bitmask(k, min, i32x8::ZERO))
+    }
 }

 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2462,9 +2813,11 @@ pub unsafe fn _mm256_maskz_min_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsd))]
-pub unsafe fn _mm_mask_min_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let min = _mm_min_epi32(a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, min, src.as_i32x4()))
+pub fn _mm_mask_min_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, min, src.as_i32x4()))
+    }
 }

 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2474,9 +2827,11 @@ pub unsafe fn _mm_mask_min_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m12
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsd))]
-pub unsafe fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let min = _mm_min_epi32(a, b).as_i32x4();
-    transmute(simd_select_bitmask(k, min, i32x4::ZERO))
+pub fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi32(a, b).as_i32x4();
+        transmute(simd_select_bitmask(k, min, i32x4::ZERO))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
@@ -2486,10 +2841,12 @@ pub unsafe fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i {
-    let a = a.as_i64x8();
-    let b = b.as_i64x8();
-    transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b))
+pub fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_i64x8();
+        let b = b.as_i64x8();
+        transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
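The unmasked bodies above all follow the same compare-and-select idiom: `simd_select(simd_lt(a, b), a, b)` takes `a` where `a < b` and `b` elsewhere, which is exactly a lane-wise minimum (the `simd_gt` versions earlier give the maximum). A scalar sketch of the same idea, purely for illustration:

    fn select_min(a: &[i64], b: &[i64]) -> Vec<i64> {
        // Per element: keep the left value where it is smaller, otherwise the right one.
        a.iter().zip(b).map(|(&x, &y)| if x < y { x } else { y }).collect()
    }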
@@ -2499,9 +2856,11 @@ pub unsafe fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    let min = _mm512_min_epi64(a, b).as_i64x8();
-    transmute(simd_select_bitmask(k, min, src.as_i64x8()))
+pub fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, min, src.as_i64x8()))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2511,9 +2870,11 @@ pub unsafe fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    let min = _mm512_min_epi64(a, b).as_i64x8();
-    transmute(simd_select_bitmask(k, min, i64x8::ZERO))
+pub fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epi64(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, min, i64x8::ZERO))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
@@ -2523,10 +2884,12 @@ pub unsafe fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i {
-    let a = a.as_i64x4();
-    let b = b.as_i64x4();
-    transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b))
+pub fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let a = a.as_i64x4();
+        let b = b.as_i64x4();
+        transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2536,9 +2899,11 @@ pub unsafe fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm256_mask_min_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let min = _mm256_min_epi64(a, b).as_i64x4();
-    transmute(simd_select_bitmask(k, min, src.as_i64x4()))
+pub fn _mm256_mask_min_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, min, src.as_i64x4()))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2548,9 +2913,11 @@ pub unsafe fn _mm256_mask_min_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let min = _mm256_min_epi64(a, b).as_i64x4();
-    transmute(simd_select_bitmask(k, min, i64x4::ZERO))
+pub fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, min, i64x4::ZERO))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
@@ -2560,10 +2927,12 @@ pub unsafe fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm_min_epi64(a: __m128i, b: __m128i) -> __m128i {
-    let a = a.as_i64x2();
-    let b = b.as_i64x2();
-    transmute(simd_select::<i64x2, _>(simd_lt(a, b), a, b))
+pub fn _mm_min_epi64(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let a = a.as_i64x2();
+        let b = b.as_i64x2();
+        transmute(simd_select::<i64x2, _>(simd_lt(a, b), a, b))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2573,9 +2942,11 @@ pub unsafe fn _mm_min_epi64(a: __m128i, b: __m128i) -> __m128i {
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm_mask_min_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let min = _mm_min_epi64(a, b).as_i64x2();
-    transmute(simd_select_bitmask(k, min, src.as_i64x2()))
+pub fn _mm_mask_min_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, min, src.as_i64x2()))
+    }
 }

 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2585,9 +2956,11 @@ pub unsafe fn _mm_mask_min_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m12
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminsq))]
-pub unsafe fn _mm_maskz_min_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let min = _mm_min_epi64(a, b).as_i64x2();
-    transmute(simd_select_bitmask(k, min, i64x2::ZERO))
+pub fn _mm_maskz_min_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, min, i64x2::ZERO))
+    }
 }

 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
@@ -2597,12 +2970,14 @@ pub unsafe fn _mm_maskz_min_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps))] -pub unsafe fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 { - transmute(vminps( - a.as_f32x16(), - b.as_f32x16(), - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vminps( + a.as_f32x16(), + b.as_f32x16(), + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2612,9 +2987,11 @@ pub unsafe fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps))] -pub unsafe fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let min = _mm512_min_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, min, src.as_f32x16())) +pub fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let min = _mm512_min_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, min, src.as_f32x16())) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2624,9 +3001,11 @@ pub unsafe fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps))] -pub unsafe fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let min = _mm512_min_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, min, f32x16::ZERO)) +pub fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let min = _mm512_min_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, min, f32x16::ZERO)) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2636,9 +3015,11 @@ pub unsafe fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps))] -pub unsafe fn _mm256_mask_min_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let min = _mm256_min_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, min, src.as_f32x8())) +pub fn _mm256_mask_min_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let min = _mm256_min_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, min, src.as_f32x8())) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -2648,9 +3029,11 @@ pub unsafe fn _mm256_mask_min_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps))] -pub unsafe fn _mm256_maskz_min_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let min = _mm256_min_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, min, f32x8::ZERO)) +pub fn _mm256_maskz_min_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let min = _mm256_min_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, min, f32x8::ZERO)) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2660,9 +3043,11 @@ pub unsafe fn _mm256_maskz_min_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps))] -pub unsafe fn _mm_mask_min_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let min = _mm_min_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, min, src.as_f32x4())) +pub fn _mm_mask_min_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let min = _mm_min_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, min, src.as_f32x4())) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2672,9 +3057,11 @@ pub unsafe fn _mm_mask_min_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps))] -pub unsafe fn _mm_maskz_min_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let min = _mm_min_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, min, f32x4::ZERO)) +pub fn _mm_maskz_min_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let min = _mm_min_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, min, f32x4::ZERO)) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst. @@ -2684,8 +3071,8 @@ pub unsafe fn _mm_maskz_min_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminpd))] -pub unsafe fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(vminpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) +pub fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(vminpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION)) } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -2695,9 +3082,11 @@ pub unsafe fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminpd))] -pub unsafe fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let min = _mm512_min_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, min, src.as_f64x8())) +pub fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let min = _mm512_min_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, min, src.as_f64x8())) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2707,9 +3096,11 @@ pub unsafe fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminpd))] -pub unsafe fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let min = _mm512_min_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, min, f64x8::ZERO)) +pub fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let min = _mm512_min_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, min, f64x8::ZERO)) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2719,9 +3110,11 @@ pub unsafe fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminpd))] -pub unsafe fn _mm256_mask_min_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let min = _mm256_min_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, min, src.as_f64x4())) +pub fn _mm256_mask_min_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let min = _mm256_min_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, min, src.as_f64x4())) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2731,9 +3124,11 @@ pub unsafe fn _mm256_mask_min_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminpd))] -pub unsafe fn _mm256_maskz_min_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let min = _mm256_min_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, min, f64x4::ZERO)) +pub fn _mm256_maskz_min_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let min = _mm256_min_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, min, f64x4::ZERO)) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -2743,9 +3138,11 @@ pub unsafe fn _mm256_maskz_min_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vminpd))]
-pub unsafe fn _mm_mask_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
-    let min = _mm_min_pd(a, b).as_f64x2();
-    transmute(simd_select_bitmask(k, min, src.as_f64x2()))
+pub fn _mm_mask_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let min = _mm_min_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, min, src.as_f64x2()))
+    }
 }

 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2755,9 +3152,11 @@ pub unsafe fn _mm_mask_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d)
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vminpd))]
-pub unsafe fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
-    let min = _mm_min_pd(a, b).as_f64x2();
-    transmute(simd_select_bitmask(k, min, f64x2::ZERO))
+pub fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+    unsafe {
+        let min = _mm_min_pd(a, b).as_f64x2();
+        transmute(simd_select_bitmask(k, min, f64x2::ZERO))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
@@ -2767,10 +3166,12 @@ pub unsafe fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminud))]
-pub unsafe fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i {
-    let a = a.as_u32x16();
-    let b = b.as_u32x16();
-    transmute(simd_select::<i32x16, _>(simd_lt(a, b), a, b))
+pub fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        transmute(simd_select::<i32x16, _>(simd_lt(a, b), a, b))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2780,9 +3181,11 @@ pub unsafe fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminud))]
-pub unsafe fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let min = _mm512_min_epu32(a, b).as_u32x16();
-    transmute(simd_select_bitmask(k, min, src.as_u32x16()))
+pub fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu32(a, b).as_u32x16();
+        transmute(simd_select_bitmask(k, min, src.as_u32x16()))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
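As in the earlier hunks, the 128-bit and 256-bit masked forms are gated on `avx512f` and `avx512vl` together. A short sketch of a safe wrapper under the same assumptions as before (nightly toolchain, unstable `stdarch_x86_avx512` feature, illustrative function name):

    use core::arch::x86_64::*;

    // Both features must be enabled on the caller for the call to need no `unsafe` here.
    #[target_feature(enable = "avx512f,avx512vl")]
    fn masked_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
        _mm_mask_min_pd(src, k, a, b)
    }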
@@ -2792,9 +3195,11 @@ pub unsafe fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: _
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminud))]
-pub unsafe fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
-    let min = _mm512_min_epu32(a, b).as_u32x16();
-    transmute(simd_select_bitmask(k, min, u32x16::ZERO))
+pub fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu32(a, b).as_u32x16();
+        transmute(simd_select_bitmask(k, min, u32x16::ZERO))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2804,9 +3209,11 @@ pub unsafe fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminud))]
-pub unsafe fn _mm256_mask_min_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let min = _mm256_min_epu32(a, b).as_u32x8();
-    transmute(simd_select_bitmask(k, min, src.as_u32x8()))
+pub fn _mm256_mask_min_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epu32(a, b).as_u32x8();
+        transmute(simd_select_bitmask(k, min, src.as_u32x8()))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2816,9 +3223,11 @@ pub unsafe fn _mm256_mask_min_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminud))]
-pub unsafe fn _mm256_maskz_min_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let min = _mm256_min_epu32(a, b).as_u32x8();
-    transmute(simd_select_bitmask(k, min, u32x8::ZERO))
+pub fn _mm256_maskz_min_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let min = _mm256_min_epu32(a, b).as_u32x8();
+        transmute(simd_select_bitmask(k, min, u32x8::ZERO))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2828,9 +3237,11 @@ pub unsafe fn _mm256_maskz_min_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminud))]
-pub unsafe fn _mm_mask_min_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let min = _mm_min_epu32(a, b).as_u32x4();
-    transmute(simd_select_bitmask(k, min, src.as_u32x4()))
+pub fn _mm_mask_min_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, min, src.as_u32x4()))
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2840,9 +3251,11 @@ pub unsafe fn _mm_mask_min_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m12
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminud))]
-pub unsafe fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let min = _mm_min_epu32(a, b).as_u32x4();
-    transmute(simd_select_bitmask(k, min, u32x4::ZERO))
+pub fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let min = _mm_min_epu32(a, b).as_u32x4();
+        transmute(simd_select_bitmask(k, min, u32x4::ZERO))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
@@ -2852,10 +3265,12 @@ pub unsafe fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminuq))]
-pub unsafe fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i {
-    let a = a.as_u64x8();
-    let b = b.as_u64x8();
-    transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b))
+pub fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let a = a.as_u64x8();
+        let b = b.as_u64x8();
+        transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -2865,9 +3280,11 @@ pub unsafe fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminuq))]
-pub unsafe fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    let min = _mm512_min_epu64(a, b).as_u64x8();
-    transmute(simd_select_bitmask(k, min, src.as_u64x8()))
+pub fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, min, src.as_u64x8()))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -2877,9 +3294,11 @@ pub unsafe fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpminuq))]
-pub unsafe fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
-    let min = _mm512_min_epu64(a, b).as_u64x8();
-    transmute(simd_select_bitmask(k, min, u64x8::ZERO))
+pub fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let min = _mm512_min_epu64(a, b).as_u64x8();
+        transmute(simd_select_bitmask(k, min, u64x8::ZERO))
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
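Callers that do not enable `avx512f` themselves still go through runtime detection, and as I read the `target_feature` rules the call remains `unsafe` from such a context even after this change. A hedged sketch (illustrative name; assumes std plus the unstable feature as in the earlier sketches):

    use std::arch::x86_64::*;

    fn min_epu64_or_none(a: __m512i, b: __m512i) -> Option<__m512i> {
        if is_x86_feature_detected!("avx512f") {
            // SAFETY: AVX-512F support was just verified at runtime.
            Some(unsafe { _mm512_min_epu64(a, b) })
        } else {
            None
        }
    }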
@@ -2889,10 +3308,12 @@ pub unsafe fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] -pub unsafe fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i { - let a = a.as_u64x4(); - let b = b.as_u64x4(); - transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b)) +pub fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + let a = a.as_u64x4(); + let b = b.as_u64x4(); + transmute(simd_select::<i64x4, _>(simd_lt(a, b), a, b)) + } } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2902,9 +3323,11 @@ pub unsafe fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] -pub unsafe fn _mm256_mask_min_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epu64(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, min, src.as_u64x4())) +pub fn _mm256_mask_min_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu64(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, min, src.as_u64x4())) + } } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2914,9 +3337,11 @@ pub unsafe fn _mm256_mask_min_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] -pub unsafe fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let min = _mm256_min_epu64(a, b).as_u64x4(); - transmute(simd_select_bitmask(k, min, u64x4::ZERO)) +pub fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let min = _mm256_min_epu64(a, b).as_u64x4(); + transmute(simd_select_bitmask(k, min, u64x4::ZERO)) + } } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst. @@ -2926,10 +3351,12 @@ pub unsafe fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] -pub unsafe fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i { - let a = a.as_u64x2(); - let b = b.as_u64x2(); - transmute(simd_select::<i64x2, _>(simd_lt(a, b), a, b)) +pub fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i { + unsafe { + let a = a.as_u64x2(); + let b = b.as_u64x2(); + transmute(simd_select::<i64x2, _>(simd_lt(a, b), a, b)) + } } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
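The vpminuq hunks above lower the unsigned minimum to a lane-wise simd_lt compare followed by a simd_select. As a point of reference, a scalar model of the per-lane behaviour (a hypothetical helper, not code from this patch) is:

    // Scalar model of transmute(simd_select::<i64x8, _>(simd_lt(a, b), a, b)):
    // simd_lt yields an all-ones lane wherever a[i] < b[i] (unsigned compare on
    // u64 lanes), and simd_select then picks a[i] for all-ones lanes, b[i] otherwise.
    fn min_epu64_model(a: [u64; 8], b: [u64; 8]) -> [u64; 8] {
        std::array::from_fn(|i| if a[i] < b[i] { a[i] } else { b[i] })
    }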
@@ -2939,9 +3366,11 @@ pub unsafe fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] -pub unsafe fn _mm_mask_min_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epu64(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, min, src.as_u64x2())) +pub fn _mm_mask_min_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu64(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, min, src.as_u64x2())) + } } /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -2951,9 +3380,11 @@ pub unsafe fn _mm_mask_min_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpminuq))] -pub unsafe fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let min = _mm_min_epu64(a, b).as_u64x2(); - transmute(simd_select_bitmask(k, min, u64x2::ZERO)) +pub fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let min = _mm_min_epu64(a, b).as_u64x2(); + transmute(simd_select_bitmask(k, min, u64x2::ZERO)) + } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. @@ -2963,8 +3394,8 @@ pub unsafe fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] -pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 { - simd_fsqrt(a) +pub fn _mm512_sqrt_ps(a: __m512) -> __m512 { + unsafe { simd_fsqrt(a) } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2974,8 +3405,8 @@ pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] -pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - simd_select_bitmask(k, simd_fsqrt(a), src) +pub fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
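Since the unsigned-min family above is representative of how the newly safe signatures are meant to be used, here is a usage sketch. It assumes an x86_64 target, a nightly toolchain with #![feature(stdarch_x86_avx512)] (the intrinsics are still unstable), a compiler that accepts safe #[target_feature] functions, and this patch applied; the helper name masked_min_demo is invented for the example.

    #![feature(stdarch_x86_avx512)]
    use std::arch::x86_64::*;

    // With the patch applied, the body needs no `unsafe`: every intrinsic used
    // here is a safe fn gated only by #[target_feature(enable = "avx512f")].
    #[target_feature(enable = "avx512f")]
    fn masked_min_demo() -> bool {
        let a = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
        let b = _mm512_set1_epi64(4);
        let src = _mm512_set1_epi64(-1);
        let min = _mm512_min_epu64(a, b); // per-lane unsigned minimum
        let zeroed = _mm512_maskz_min_epu64(0b0000_1111, a, b); // unselected lanes zeroed
        let merged = _mm512_mask_min_epu64(src, 0b0000_1111, a, b); // unselected lanes from src
        _mm512_cmpeq_epi64_mask(min, _mm512_setr_epi64(4, 4, 4, 4, 3, 2, 1, 0)) == 0xff
            && _mm512_cmpeq_epi64_mask(zeroed, _mm512_setr_epi64(4, 4, 4, 4, 0, 0, 0, 0)) == 0xff
            && _mm512_cmpeq_epi64_mask(merged, _mm512_setr_epi64(4, 4, 4, 4, -1, -1, -1, -1)) == 0xff
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") {
            // Calling a #[target_feature] fn from code that does not itself enable
            // the feature is still unsafe, even though the body is safe code.
            assert!(unsafe { masked_min_demo() });
        }
    }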
@@ -2985,8 +3416,8 @@ pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] -pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { - simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_ps()) +pub fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_ps()) } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -2996,8 +3427,8 @@ pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] -pub unsafe fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - simd_select_bitmask(k, simd_fsqrt(a), src) +pub fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3007,8 +3438,8 @@ pub unsafe fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] -pub unsafe fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { - simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_ps()) +pub fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_ps()) } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3018,8 +3449,8 @@ pub unsafe fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] -pub unsafe fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - simd_select_bitmask(k, simd_fsqrt(a), src) +pub fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3029,8 +3460,8 @@ pub unsafe fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps))] -pub unsafe fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { - simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_ps()) +pub fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_ps()) } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. 
@@ -3040,8 +3471,8 @@ pub unsafe fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] -pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d { - simd_fsqrt(a) +pub fn _mm512_sqrt_pd(a: __m512d) -> __m512d { + unsafe { simd_fsqrt(a) } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3051,8 +3482,8 @@ pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] -pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - simd_select_bitmask(k, simd_fsqrt(a), src) +pub fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3062,8 +3493,8 @@ pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] -pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { - simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_pd()) +pub fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm512_setzero_pd()) } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -3073,8 +3504,8 @@ pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] -pub unsafe fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - simd_select_bitmask(k, simd_fsqrt(a), src) +pub fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3084,8 +3515,8 @@ pub unsafe fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] -pub unsafe fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { - simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_pd()) +pub fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm256_setzero_pd()) } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -3095,8 +3526,8 @@ pub unsafe fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] -pub unsafe fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - simd_select_bitmask(k, simd_fsqrt(a), src) +pub fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), src) } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3106,8 +3537,8 @@ pub unsafe fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd))] -pub unsafe fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { - simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_pd()) +pub fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, simd_fsqrt(a), _mm_setzero_pd()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. @@ -3117,8 +3548,8 @@ pub unsafe fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - simd_fma(a, b, c) +pub fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(a, b, c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3128,8 +3559,8 @@ pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), a) +pub fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
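The sqrt hunks follow the same masking pattern. Under the same assumptions as the previous sketch (nightly with #![feature(stdarch_x86_avx512)] at the crate root, AVX512F detected at runtime, this patch applied), the writemask and zeromask variants behave as follows; the helper name is again invented and would be called exactly like masked_min_demo above.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn masked_sqrt_demo() -> bool {
        let a = _mm512_set1_pd(4.0);
        let src = _mm512_set1_pd(-1.0);
        // Writemask: lanes 0..=3 become sqrt(4.0) = 2.0, lanes 4..=7 are copied from src.
        let merged = _mm512_mask_sqrt_pd(src, 0b0000_1111, a);
        // Zeromask: unselected lanes become 0.0 instead.
        let zeroed = _mm512_maskz_sqrt_pd(0b0000_1111, a);
        _mm512_cmpeq_pd_mask(merged, _mm512_setr_pd(2.0, 2.0, 2.0, 2.0, -1.0, -1.0, -1.0, -1.0)) == 0xff
            && _mm512_cmpeq_pd_mask(zeroed, _mm512_setr_pd(2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0)) == 0xff
    }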
@@ -3139,8 +3570,8 @@ pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), _mm512_setzero_ps()) +pub fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), _mm512_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3150,8 +3581,8 @@ pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), c) +pub fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3161,8 +3592,8 @@ pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), a) +pub fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3172,8 +3603,8 @@ pub unsafe fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), _mm256_setzero_ps()) +pub fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), _mm256_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3183,8 +3614,8 @@ pub unsafe fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), c) +pub fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3194,8 +3625,8 @@ pub unsafe fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), a) +pub fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3205,8 +3636,8 @@ pub unsafe fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), _mm_setzero_ps()) +pub fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), _mm_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3216,8 +3647,8 @@ pub unsafe fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps -pub unsafe fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), c) +pub fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmadd_ps(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst. 
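The three masked fmadd forms differ only in where the unselected lanes come from: mask copies them from a, maskz zeroes them, and mask3 copies them from c. A sketch under the same assumptions as the earlier examples (invented helper name, crate-level #![feature(stdarch_x86_avx512)], called after runtime detection as in the first sketch):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn fmadd_mask_demo() -> bool {
        let (a, b, c) = (_mm512_set1_ps(2.0), _mm512_set1_ps(3.0), _mm512_set1_ps(1.0));
        let full = _mm512_fmadd_ps(a, b, c); // 2.0 * 3.0 + 1.0 = 7.0 in every lane
        let from_a = _mm512_mask_fmadd_ps(a, 0x00ff, b, c); // high lanes copied from a
        let zeroed = _mm512_maskz_fmadd_ps(0x00ff, a, b, c); // high lanes zeroed
        let from_c = _mm512_mask3_fmadd_ps(a, b, c, 0x00ff); // high lanes copied from c
        _mm512_cmpeq_ps_mask(full, _mm512_set1_ps(7.0)) == 0xffff
            && _mm512_cmpeq_ps_mask(from_a, _mm512_set1_ps(2.0)) == 0xff00
            && _mm512_cmpeq_ps_mask(zeroed, _mm512_set1_ps(0.0)) == 0xff00
            && _mm512_cmpeq_ps_mask(from_c, _mm512_set1_ps(1.0)) == 0xff00
    }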
@@ -3227,8 +3658,8 @@ pub unsafe fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_fma(a, b, c) +pub fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(a, b, c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3238,8 +3669,8 @@ pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), a) +pub fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3249,8 +3680,8 @@ pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), _mm512_setzero_pd()) +pub fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), _mm512_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3260,8 +3691,8 @@ pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), c) +pub fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmadd_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3271,8 +3702,8 @@ pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mma #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), a) +pub fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3282,8 +3713,8 @@ pub unsafe fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), _mm256_setzero_pd()) +pub fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), _mm256_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3293,8 +3724,8 @@ pub unsafe fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), c) +pub fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmadd_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3304,8 +3735,8 @@ pub unsafe fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mma #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), a) +pub fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3315,8 +3746,8 @@ pub unsafe fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), _mm_setzero_pd()) +pub fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), _mm_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3326,8 +3757,8 @@ pub unsafe fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd -pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), c) +pub fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmadd_pd(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. @@ -3337,8 +3768,8 @@ pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - simd_fma(a, b, simd_neg(c)) +pub fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(a, b, simd_neg(c)) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3348,8 +3779,8 @@ pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), a) +pub fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3359,8 +3790,8 @@ pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), _mm512_setzero_ps()) +pub fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), _mm512_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3370,8 +3801,8 @@ pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), c) +pub fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3381,8 +3812,8 @@ pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), a) +pub fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3392,8 +3823,8 @@ pub unsafe fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), _mm256_setzero_ps()) +pub fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), _mm256_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3403,8 +3834,8 @@ pub unsafe fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), c) +pub fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3414,8 +3845,8 @@ pub unsafe fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), a) +pub fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3425,8 +3856,8 @@ pub unsafe fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), _mm_setzero_ps()) +pub fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), _mm_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3436,8 +3867,8 @@ pub unsafe fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub -pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), c) +pub fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsub_ps(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst. @@ -3447,8 +3878,8 @@ pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_fma(a, b, simd_neg(c)) +pub fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(a, b, simd_neg(c)) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3458,8 +3889,8 @@ pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), a) +pub fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
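The fmsub bodies above are simd_fma(a, b, simd_neg(c)); that is, fmsub(a, b, c) equals fmadd(a, b, -c) with a single rounding. A small check under the same assumptions as the earlier sketches (invented helper name):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn fmsub_demo() -> bool {
        let (a, b, c) = (_mm512_set1_ps(1.5), _mm512_set1_ps(4.0), _mm512_set1_ps(1.0));
        // fmsub(a, b, c) = a * b - c, fused; here 1.5 * 4.0 - 1.0 = 5.0 in every lane.
        let r = _mm512_fmsub_ps(a, b, c);
        let via_fmadd = _mm512_fmadd_ps(a, b, _mm512_set1_ps(-1.0));
        _mm512_cmpeq_ps_mask(r, via_fmadd) == 0xffff
            && _mm512_cmpeq_ps_mask(r, _mm512_set1_ps(5.0)) == 0xffff
    }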
@@ -3469,8 +3900,8 @@ pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), _mm512_setzero_pd()) +pub fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), _mm512_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3480,8 +3911,8 @@ pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), c) +pub fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsub_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3491,8 +3922,8 @@ pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mma #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), a) +pub fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3502,8 +3933,8 @@ pub unsafe fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang fmadd, gcc fmsub -pub unsafe fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), _mm256_setzero_pd()) +pub fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), _mm256_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3513,8 +3944,8 @@ pub unsafe fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), c) +pub fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsub_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3524,8 +3955,8 @@ pub unsafe fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mma #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), a) +pub fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3535,8 +3966,8 @@ pub unsafe fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), _mm_setzero_pd()) +pub fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), _mm_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3546,8 +3977,8 @@ pub unsafe fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub -pub unsafe fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), c) +pub fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsub_pd(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. @@ -3557,14 +3988,16 @@ pub unsafe fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!( - add, - sub, - [16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15] - ) +pub fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, + sub, + [16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15] + ) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3574,8 +4007,8 @@ pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), a) +pub fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3585,8 +4018,8 @@ pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), _mm512_setzero_ps()) +pub fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), _mm512_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3596,8 +4029,8 @@ pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), c) +pub fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3607,8 +4040,8 @@ pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mma #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), a) +pub fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3618,8 +4051,8 @@ pub unsafe fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), _mm256_setzero_ps()) +pub fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), _mm256_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3629,8 +4062,8 @@ pub unsafe fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), c) +pub fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3640,8 +4073,8 @@ pub unsafe fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mma #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), a) +pub fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -3651,8 +4084,8 @@ pub unsafe fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), _mm_setzero_ps()) +pub fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), _mm_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3662,8 +4095,8 @@ pub unsafe fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps -pub unsafe fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), c) +pub fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ps(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst. @@ -3673,10 +4106,12 @@ pub unsafe fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) +pub fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7]) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
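The fmaddsub shuffle above interleaves the two fused results: even lanes take a*b - c and odd lanes take a*b + c. A sketch of that lane pattern, same assumptions as the earlier examples, helper name invented:

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn fmaddsub_demo() -> bool {
        let (a, b, c) = (_mm512_set1_ps(2.0), _mm512_set1_ps(3.0), _mm512_set1_ps(1.0));
        // Even lanes: 2.0 * 3.0 - 1.0 = 5.0; odd lanes: 2.0 * 3.0 + 1.0 = 7.0.
        let r = _mm512_fmaddsub_ps(a, b, c);
        _mm512_cmpeq_ps_mask(r, _mm512_set1_ps(5.0)) == 0x5555
            && _mm512_cmpeq_ps_mask(r, _mm512_set1_ps(7.0)) == 0xaaaa
    }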
@@ -3686,8 +4121,8 @@ pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), a) +pub fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3697,8 +4132,8 @@ pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), _mm512_setzero_pd()) +pub fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), _mm512_setzero_pd()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3708,8 +4143,8 @@ pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), c) +pub fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3719,8 +4154,8 @@ pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), a) +pub fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3730,8 +4165,8 @@ pub unsafe fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), _mm256_setzero_pd()) +pub fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), _mm256_setzero_pd()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3741,8 +4176,8 @@ pub unsafe fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), c) +pub fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -3752,8 +4187,8 @@ pub unsafe fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), a) +pub fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3763,8 +4198,8 @@ pub unsafe fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), _mm_setzero_pd()) +pub fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), _mm_setzero_pd()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3774,8 +4209,8 @@ pub unsafe fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd -pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), c) +pub fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_pd(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. 
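For reference (not part of the patch), the even/odd interleave that distinguishes fmaddsub from fmsubadd, written as a hypothetical per-lane scalar model; the vector forms fuse the multiply and add into a single rounding, which these plain expressions do not:

    // fmaddsub: even lanes compute a*b - c, odd lanes compute a*b + c.
    fn fmaddsub_lane(i: usize, a: f32, b: f32, c: f32) -> f32 {
        if i % 2 == 0 { a * b - c } else { a * b + c }
    }

    // fmsubadd: the opposite interleave (even lanes add, odd lanes subtract).
    fn fmsubadd_lane(i: usize, a: f32, b: f32, c: f32) -> f32 {
        if i % 2 == 0 { a * b + c } else { a * b - c }
    }
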
@@ -3785,14 +4220,16 @@ pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!( - add, - sub, - [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31] - ) +pub fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!( + add, + sub, + [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31] + ) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3802,8 +4239,8 @@ pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), a) +pub fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3813,8 +4250,8 @@ pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), _mm512_setzero_ps()) +pub fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), _mm512_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
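The mask, maskz and mask3 variants throughout this file differ only in which value a lane falls back to when its mask bit is clear. A hypothetical per-lane model of the `simd_select_bitmask` selection, for illustration:

    // _mm*_mask_*:  lanes with a clear bit keep the first operand (or src).
    fn mask_lane(computed: f32, src: f32, bit_set: bool) -> f32 {
        if bit_set { computed } else { src }
    }

    // _mm*_maskz_*: lanes with a clear bit are zeroed.
    fn maskz_lane(computed: f32, bit_set: bool) -> f32 {
        if bit_set { computed } else { 0.0 }
    }

    // _mm*_mask3_*: lanes with a clear bit keep the c operand.
    fn mask3_lane(computed: f32, c: f32, bit_set: bool) -> f32 {
        if bit_set { computed } else { c }
    }
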
@@ -3824,8 +4261,8 @@ pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), c) +pub fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3835,8 +4272,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mma #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), a) +pub fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3846,8 +4283,8 @@ pub unsafe fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), _mm256_setzero_ps()) +pub fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), _mm256_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3857,8 +4294,8 @@ pub unsafe fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), c) +pub fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3868,8 +4305,8 @@ pub unsafe fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mma #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), a) +pub fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3879,8 +4316,8 @@ pub unsafe fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), _mm_setzero_ps()) +pub fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), _mm_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -3890,8 +4327,8 @@ pub unsafe fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps -pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), c) +pub fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ps(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst. 
@@ -3901,10 +4338,12 @@ pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - let add = simd_fma(a, b, c); - let sub = simd_fma(a, b, simd_neg(c)); - simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) +pub fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + let add = simd_fma(a, b, c); + let sub = simd_fma(a, b, simd_neg(c)); + simd_shuffle!(add, sub, [0, 9, 2, 11, 4, 13, 6, 15]) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3914,8 +4353,8 @@ pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), a) +pub fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3925,8 +4364,8 @@ pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), _mm512_setzero_pd()) +pub fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), _mm512_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3936,8 +4375,8 @@ pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), c) +pub fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3947,8 +4386,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), a) +pub fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3958,8 +4397,8 @@ pub unsafe fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), _mm256_setzero_pd()) +pub fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), _mm256_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -3969,8 +4408,8 @@ pub unsafe fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), c) +pub fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -3980,8 +4419,8 @@ pub unsafe fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), a) +pub fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -3991,8 +4430,8 @@ pub unsafe fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), _mm_setzero_pd()) +pub fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), _mm_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4002,8 +4441,8 @@ pub unsafe fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd -pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), c) +pub fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_pd(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. 
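For reference (not part of the patch), the per-lane arithmetic of fnmadd as a hypothetical scalar sketch; the intrinsic lowers to a fused negate-multiply-add with a single rounding, which the plain expression below does not model:

    // fnmadd: negate the product, then add c (i.e. c - a*b).
    fn fnmadd_lane(a: f32, b: f32, c: f32) -> f32 {
        -(a * b) + c
    }
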
@@ -4013,8 +4452,8 @@ pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mma #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - simd_fma(simd_neg(a), b, c) +pub fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(simd_neg(a), b, c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4024,8 +4463,8 @@ pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), a) +pub fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4035,8 +4474,8 @@ pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), _mm512_setzero_ps()) +pub fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), _mm512_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4046,8 +4485,8 @@ pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), c) +pub fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4057,8 +4496,8 @@ pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), a) +pub fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4068,8 +4507,8 @@ pub unsafe fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), _mm256_setzero_ps()) +pub fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), _mm256_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4079,8 +4518,8 @@ pub unsafe fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), c) +pub fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4090,8 +4529,8 @@ pub unsafe fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), a) +pub fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4101,8 +4540,8 @@ pub unsafe fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), _mm_setzero_ps()) +pub fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), _mm_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4112,8 +4551,8 @@ pub unsafe fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps -pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), c) +pub fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ps(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst. @@ -4123,8 +4562,8 @@ pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_fma(simd_neg(a), b, c) +pub fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(simd_neg(a), b, c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4134,8 +4573,8 @@ pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), a) +pub fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4145,8 +4584,8 @@ pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), _mm512_setzero_pd()) +pub fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), _mm512_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4156,8 +4595,8 @@ pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), c) +pub fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4167,8 +4606,8 @@ pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mm #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), a) +pub fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4178,8 +4617,8 @@ pub unsafe fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), _mm256_setzero_pd()) +pub fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), _mm256_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4189,8 +4628,8 @@ pub unsafe fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), c) +pub fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4200,8 +4639,8 @@ pub unsafe fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mm #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), a) +pub fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4211,8 +4650,8 @@ pub unsafe fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), _mm_setzero_pd()) +pub fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), _mm_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
@@ -4222,8 +4661,8 @@ pub unsafe fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd -pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), c) +pub fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmadd_pd(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. @@ -4233,8 +4672,8 @@ pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { - simd_fma(simd_neg(a), b, simd_neg(c)) +pub fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4244,8 +4683,8 @@ pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), a) +pub fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4255,8 +4694,8 @@ pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { - simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), _mm512_setzero_ps()) +pub fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), _mm512_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). 
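Likewise for fnmsub, a hypothetical per-lane sketch (again ignoring the single rounding of the fused form):

    // fnmsub: negate the product, then subtract c (mathematically -(a*b + c)).
    fn fnmsub_lane(a: f32, b: f32, c: f32) -> f32 {
        -(a * b) - c
    }
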
@@ -4266,8 +4705,8 @@ pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { - simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), c) +pub fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4277,8 +4716,8 @@ pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), a) +pub fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4288,8 +4727,8 @@ pub unsafe fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { - simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), _mm256_setzero_ps()) +pub fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), _mm256_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4299,8 +4738,8 @@ pub unsafe fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { - simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), c) +pub fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ps(a, b, c), c) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4310,8 +4749,8 @@ pub unsafe fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), a) +pub fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), a) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4321,8 +4760,8 @@ pub unsafe fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), _mm_setzero_ps()) +pub fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), _mm_setzero_ps()) } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4332,8 +4771,8 @@ pub unsafe fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps -pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), c) +pub fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ps(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst. @@ -4343,8 +4782,8 @@ pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_fma(simd_neg(a), b, simd_neg(c)) +pub fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -4354,8 +4793,8 @@ pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), a) +pub fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -4365,8 +4804,8 @@ pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { - simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), _mm512_setzero_pd()) +pub fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), _mm512_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4376,8 +4815,8 @@ pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { - simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), c) +pub fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4387,8 +4826,8 @@ pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mm #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), a) +pub fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4398,8 +4837,8 @@ pub unsafe fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { - simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), _mm256_setzero_pd()) +pub fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), _mm256_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4409,8 +4848,8 @@ pub unsafe fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { - simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), c) +pub fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_pd(a, b, c), c) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -4420,8 +4859,8 @@ pub unsafe fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mm #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), a) +pub fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), a) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -4431,8 +4870,8 @@ pub unsafe fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), _mm_setzero_pd()) +pub fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), _mm_setzero_pd()) } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set). @@ -4442,8 +4881,8 @@ pub unsafe fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd -pub unsafe fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), c) +pub fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { simd_select_bitmask(k, _mm_fnmsub_pd(a, b, c), c) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4453,8 +4892,8 @@ pub unsafe fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm512_rcp14_ps(a: __m512) -> __m512 { - transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) +pub fn _mm512_rcp14_ps(a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4464,8 +4903,8 @@ pub unsafe fn _mm512_rcp14_ps(a: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - transmute(vrcp14ps(a.as_f32x16(), src.as_f32x16(), k)) +pub fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), src.as_f32x16(), k)) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
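For illustration, a minimal sketch of how the masked FNMSUB variants above behave once they are safe: the `mask` form keeps unselected lanes from `a`, `maskz` zeroes them, and `mask3` keeps them from `c`. This assumes a nightly toolchain with `feature(stdarch_x86_avx512)`, this patch applied, and AVX-512F enabled at compile time (for example `RUSTFLAGS="-C target-feature=+avx512f"`), so no `unsafe` block is needed at the call sites.

    #![feature(stdarch_x86_avx512)] // the AVX-512 intrinsics are still unstable (issue #111137)

    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
    fn fnmsub_masks() {
        use std::arch::x86_64::*;
        let a = _mm512_set1_pd(1.0);
        let b = _mm512_set1_pd(2.0);
        let c = _mm512_set1_pd(3.0);
        let k: __mmask8 = 0b0000_1111; // operate on lanes 0..4 only

        // Each selected lane computes -(a * b) - c = -5.0.
        let merge_a = _mm512_mask_fnmsub_pd(a, k, b, c); // unselected lanes keep a (1.0)
        let zeroed = _mm512_maskz_fnmsub_pd(k, a, b, c); // unselected lanes become 0.0
        let merge_c = _mm512_mask3_fnmsub_pd(a, b, c, k); // unselected lanes keep c (3.0)

        assert_eq!(_mm512_reduce_add_pd(merge_a), 4.0 * -5.0 + 4.0 * 1.0);
        assert_eq!(_mm512_reduce_add_pd(zeroed), 4.0 * -5.0);
        assert_eq!(_mm512_reduce_add_pd(merge_c), 4.0 * -5.0 + 4.0 * 3.0);
    }

    fn main() {
        #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
        fnmsub_masks();
    }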
@@ -4475,8 +4914,8 @@ pub unsafe fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 { - transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, k)) +pub fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrcp14ps(a.as_f32x16(), f32x16::ZERO, k)) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4486,8 +4925,8 @@ pub unsafe fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm256_rcp14_ps(a: __m256) -> __m256 { - transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) +pub fn _mm256_rcp14_ps(a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4497,8 +4936,8 @@ pub unsafe fn _mm256_rcp14_ps(a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm256_mask_rcp14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - transmute(vrcp14ps256(a.as_f32x8(), src.as_f32x8(), k)) +pub fn _mm256_mask_rcp14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), src.as_f32x8(), k)) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4508,8 +4947,8 @@ pub unsafe fn _mm256_mask_rcp14_ps(src: __m256, k: __mmask8, a: __m256) -> __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm256_maskz_rcp14_ps(k: __mmask8, a: __m256) -> __m256 { - transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, k)) +pub fn _mm256_maskz_rcp14_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrcp14ps256(a.as_f32x8(), f32x8::ZERO, k)) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. 
@@ -4519,8 +4958,8 @@ pub unsafe fn _mm256_maskz_rcp14_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm_rcp14_ps(a: __m128) -> __m128 { - transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) +pub fn _mm_rcp14_ps(a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4530,8 +4969,8 @@ pub unsafe fn _mm_rcp14_ps(a: __m128) -> __m128 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm_mask_rcp14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - transmute(vrcp14ps128(a.as_f32x4(), src.as_f32x4(), k)) +pub fn _mm_mask_rcp14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), src.as_f32x4(), k)) } } /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4541,8 +4980,8 @@ pub unsafe fn _mm_mask_rcp14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ps))] -pub unsafe fn _mm_maskz_rcp14_ps(k: __mmask8, a: __m128) -> __m128 { - transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, k)) +pub fn _mm_maskz_rcp14_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrcp14ps128(a.as_f32x4(), f32x4::ZERO, k)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4552,8 +4991,8 @@ pub unsafe fn _mm_maskz_rcp14_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm512_rcp14_pd(a: __m512d) -> __m512d { - transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) +pub fn _mm512_rcp14_pd(a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
@@ -4563,8 +5002,8 @@ pub unsafe fn _mm512_rcp14_pd(a: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - transmute(vrcp14pd(a.as_f64x8(), src.as_f64x8(), k)) +pub fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), src.as_f64x8(), k)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4574,8 +5013,8 @@ pub unsafe fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d { - transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, k)) +pub fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrcp14pd(a.as_f64x8(), f64x8::ZERO, k)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4585,8 +5024,8 @@ pub unsafe fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm256_rcp14_pd(a: __m256d) -> __m256d { - transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) +pub fn _mm256_rcp14_pd(a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4596,8 +5035,8 @@ pub unsafe fn _mm256_rcp14_pd(a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm256_mask_rcp14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - transmute(vrcp14pd256(a.as_f64x4(), src.as_f64x4(), k)) +pub fn _mm256_mask_rcp14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), src.as_f64x4(), k)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
@@ -4607,8 +5046,8 @@ pub unsafe fn _mm256_mask_rcp14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm256_maskz_rcp14_pd(k: __mmask8, a: __m256d) -> __m256d { - transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, k)) +pub fn _mm256_maskz_rcp14_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrcp14pd256(a.as_f64x4(), f64x4::ZERO, k)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4618,8 +5057,8 @@ pub unsafe fn _mm256_maskz_rcp14_pd(k: __mmask8, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm_rcp14_pd(a: __m128d) -> __m128d { - transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) +pub fn _mm_rcp14_pd(a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4629,8 +5068,8 @@ pub unsafe fn _mm_rcp14_pd(a: __m128d) -> __m128d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm_mask_rcp14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - transmute(vrcp14pd128(a.as_f64x2(), src.as_f64x2(), k)) +pub fn _mm_mask_rcp14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), src.as_f64x2(), k)) } } /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4640,8 +5079,8 @@ pub unsafe fn _mm_mask_rcp14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14pd))] -pub unsafe fn _mm_maskz_rcp14_pd(k: __mmask8, a: __m128d) -> __m128d { - transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, k)) +pub fn _mm_maskz_rcp14_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrcp14pd128(a.as_f64x2(), f64x2::ZERO, k)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. 
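A short sketch of the `rcp14` family above, under the same assumptions as the earlier example (nightly, `feature(stdarch_x86_avx512)`, AVX-512F enabled at compile time): each lane gets an approximate reciprocal with relative error below 2^-14, and the `maskz` form zeroes lanes whose mask bit is clear.

    #![feature(stdarch_x86_avx512)] // unstable intrinsics, issue #111137

    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
    fn rcp14_demo() {
        use std::arch::x86_64::*;
        let a = _mm512_set1_ps(4.0);

        // Full-width approximate reciprocal: every lane is ~0.25 (relative error < 2^-14).
        let r = _mm512_rcp14_ps(a);
        assert!((_mm512_cvtss_f32(r) - 0.25).abs() < 0.25 * (1.0 / 16384.0));

        // Zero-masked form: only the low 8 lanes are computed, the rest are 0.0.
        let rz = _mm512_maskz_rcp14_ps(0x00ff, a);
        let sum = _mm512_reduce_add_ps(rz); // ~8 * 0.25 = 2.0
        assert!((sum - 2.0).abs() < 1e-3);
    }

    fn main() {
        #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
        rcp14_demo();
    }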
@@ -4651,8 +5090,8 @@ pub unsafe fn _mm_maskz_rcp14_pd(k: __mmask8, a: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm512_rsqrt14_ps(a: __m512) -> __m512 { - transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) +pub fn _mm512_rsqrt14_ps(a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, 0b11111111_11111111)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4662,8 +5101,8 @@ pub unsafe fn _mm512_rsqrt14_ps(a: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - transmute(vrsqrt14ps(a.as_f32x16(), src.as_f32x16(), k)) +pub fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), src.as_f32x16(), k)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4673,8 +5112,8 @@ pub unsafe fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 { - transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, k)) +pub fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vrsqrt14ps(a.as_f32x16(), f32x16::ZERO, k)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4684,8 +5123,8 @@ pub unsafe fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm256_rsqrt14_ps(a: __m256) -> __m256 { - transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) +pub fn _mm256_rsqrt14_ps(a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
@@ -4695,8 +5134,8 @@ pub unsafe fn _mm256_rsqrt14_ps(a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm256_mask_rsqrt14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - transmute(vrsqrt14ps256(a.as_f32x8(), src.as_f32x8(), k)) +pub fn _mm256_mask_rsqrt14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), src.as_f32x8(), k)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4706,8 +5145,8 @@ pub unsafe fn _mm256_mask_rsqrt14_ps(src: __m256, k: __mmask8, a: __m256) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 { - transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, k)) +pub fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vrsqrt14ps256(a.as_f32x8(), f32x8::ZERO, k)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4717,8 +5156,8 @@ pub unsafe fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm_rsqrt14_ps(a: __m128) -> __m128 { - transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) +pub fn _mm_rsqrt14_ps(a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4728,8 +5167,8 @@ pub unsafe fn _mm_rsqrt14_ps(a: __m128) -> __m128 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm_mask_rsqrt14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - transmute(vrsqrt14ps128(a.as_f32x4(), src.as_f32x4(), k)) +pub fn _mm_mask_rsqrt14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), src.as_f32x4(), k)) } } /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
@@ -4739,8 +5178,8 @@ pub unsafe fn _mm_mask_rsqrt14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ps))] -pub unsafe fn _mm_maskz_rsqrt14_ps(k: __mmask8, a: __m128) -> __m128 { - transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, k)) +pub fn _mm_maskz_rsqrt14_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ps128(a.as_f32x4(), f32x4::ZERO, k)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4750,8 +5189,8 @@ pub unsafe fn _mm_maskz_rsqrt14_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d { - transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) +pub fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, 0b11111111)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4761,8 +5200,8 @@ pub unsafe fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - transmute(vrsqrt14pd(a.as_f64x8(), src.as_f64x8(), k)) +pub fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), src.as_f64x8(), k)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4772,8 +5211,8 @@ pub unsafe fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d { - transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, k)) +pub fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vrsqrt14pd(a.as_f64x8(), f64x8::ZERO, k)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. 
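The writemask (`mask`) form of `rsqrt14` merges from `src` rather than zeroing, which the following sketch makes concrete under the same assumptions as the examples above (nightly, `feature(stdarch_x86_avx512)`, AVX-512F enabled at compile time).

    #![feature(stdarch_x86_avx512)] // unstable intrinsics, issue #111137

    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
    fn rsqrt14_demo() {
        use std::arch::x86_64::*;
        let a = _mm512_set1_ps(16.0);
        let src = _mm512_set1_ps(-1.0);

        // Lanes 0..8 get ~1/sqrt(16) = 0.25; lanes 8..16 are copied from src (-1.0).
        let r = _mm512_mask_rsqrt14_ps(src, 0x00ff, a);
        let sum = _mm512_reduce_add_ps(r); // ~8 * 0.25 + 8 * (-1.0) = -6.0
        assert!((sum + 6.0).abs() < 1e-3);
    }

    fn main() {
        #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
        rsqrt14_demo();
    }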
@@ -4783,8 +5222,8 @@ pub unsafe fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm256_rsqrt14_pd(a: __m256d) -> __m256d { - transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) +pub fn _mm256_rsqrt14_pd(a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4794,8 +5233,8 @@ pub unsafe fn _mm256_rsqrt14_pd(a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm256_mask_rsqrt14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - transmute(vrsqrt14pd256(a.as_f64x4(), src.as_f64x4(), k)) +pub fn _mm256_mask_rsqrt14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), src.as_f64x4(), k)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4805,8 +5244,8 @@ pub unsafe fn _mm256_mask_rsqrt14_pd(src: __m256d, k: __mmask8, a: __m256d) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d { - transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, k)) +pub fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vrsqrt14pd256(a.as_f64x4(), f64x4::ZERO, k)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14. @@ -4816,8 +5255,8 @@ pub unsafe fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm_rsqrt14_pd(a: __m128d) -> __m128d { - transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) +pub fn _mm_rsqrt14_pd(a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. 
@@ -4827,8 +5266,8 @@ pub unsafe fn _mm_rsqrt14_pd(a: __m128d) -> __m128d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm_mask_rsqrt14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - transmute(vrsqrt14pd128(a.as_f64x2(), src.as_f64x2(), k)) +pub fn _mm_mask_rsqrt14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), src.as_f64x2(), k)) } } /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14. @@ -4838,8 +5277,8 @@ pub unsafe fn _mm_mask_rsqrt14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14pd))] -pub unsafe fn _mm_maskz_rsqrt14_pd(k: __mmask8, a: __m128d) -> __m128d { - transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, k)) +pub fn _mm_maskz_rsqrt14_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14pd128(a.as_f64x2(), f64x2::ZERO, k)) } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4849,14 +5288,16 @@ pub unsafe fn _mm_maskz_rsqrt14_pd(k: __mmask8, a: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm512_getexp_ps(a: __m512) -> __m512 { - transmute(vgetexpps( - a.as_f32x16(), - f32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) -} +pub fn _mm512_getexp_ps(a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } +} /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. /// @@ -4865,13 +5306,15 @@ pub unsafe fn _mm512_getexp_ps(a: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - transmute(vgetexpps( - a.as_f32x16(), - src.as_f32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4881,13 +5324,15 @@ pub unsafe fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 { - transmute(vgetexpps( - a.as_f32x16(), - f32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + transmute(vgetexpps( + a.as_f32x16(), + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4897,8 +5342,8 @@ pub unsafe fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm256_getexp_ps(a: __m256) -> __m256 { - transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) +pub fn _mm256_getexp_ps(a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, 0b11111111)) } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4908,8 +5353,8 @@ pub unsafe fn _mm256_getexp_ps(a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm256_mask_getexp_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - transmute(vgetexpps256(a.as_f32x8(), src.as_f32x8(), k)) +pub fn _mm256_mask_getexp_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), src.as_f32x8(), k)) } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4919,8 +5364,8 @@ pub unsafe fn _mm256_mask_getexp_ps(src: __m256, k: __mmask8, a: __m256) -> __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm256_maskz_getexp_ps(k: __mmask8, a: __m256) -> __m256 { - transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, k)) +pub fn _mm256_maskz_getexp_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vgetexpps256(a.as_f32x8(), f32x8::ZERO, k)) } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. 
@@ -4930,8 +5375,8 @@ pub unsafe fn _mm256_maskz_getexp_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm_getexp_ps(a: __m128) -> __m128 { - transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) +pub fn _mm_getexp_ps(a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, 0b00001111)) } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4941,8 +5386,8 @@ pub unsafe fn _mm_getexp_ps(a: __m128) -> __m128 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm_mask_getexp_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - transmute(vgetexpps128(a.as_f32x4(), src.as_f32x4(), k)) +pub fn _mm_mask_getexp_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), src.as_f32x4(), k)) } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4952,8 +5397,8 @@ pub unsafe fn _mm_mask_getexp_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps))] -pub unsafe fn _mm_maskz_getexp_ps(k: __mmask8, a: __m128) -> __m128 { - transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, k)) +pub fn _mm_maskz_getexp_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vgetexpps128(a.as_f32x4(), f32x4::ZERO, k)) } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4963,13 +5408,15 @@ pub unsafe fn _mm_maskz_getexp_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm512_getexp_pd(a: __m512d) -> __m512d { - transmute(vgetexppd( - a.as_f64x8(), - f64x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_getexp_pd(a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
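Since `getexp` returns floor(log2(|x|)) as a floating-point value, the result is an exact small integer for ordinary inputs, which makes it easy to sanity-check. A minimal sketch under the same assumptions as the examples above (nightly, `feature(stdarch_x86_avx512)`, AVX-512F enabled at compile time):

    #![feature(stdarch_x86_avx512)] // unstable intrinsics, issue #111137

    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
    fn getexp_demo() {
        use std::arch::x86_64::*;
        // getexp extracts the unbiased exponent as a float, i.e. floor(log2(|x|)).
        let e = _mm512_getexp_ps(_mm512_set1_ps(10.0));
        assert_eq!(_mm512_cvtss_f32(e), 3.0); // 2^3 <= 10 < 2^4

        let e = _mm512_getexp_ps(_mm512_set1_ps(0.75));
        assert_eq!(_mm512_cvtss_f32(e), -1.0); // 2^-1 <= 0.75 < 2^0
    }

    fn main() {
        #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
        getexp_demo();
    }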
@@ -4979,13 +5426,15 @@ pub unsafe fn _mm512_getexp_pd(a: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - transmute(vgetexppd( - a.as_f64x8(), - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. @@ -4995,13 +5444,15 @@ pub unsafe fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d { - transmute(vgetexppd( - a.as_f64x8(), - f64x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + transmute(vgetexppd( + a.as_f64x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. @@ -5011,8 +5462,8 @@ pub unsafe fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm256_getexp_pd(a: __m256d) -> __m256d { - transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) +pub fn _mm256_getexp_pd(a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, 0b00001111)) } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. @@ -5022,8 +5473,8 @@ pub unsafe fn _mm256_getexp_pd(a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm256_mask_getexp_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - transmute(vgetexppd256(a.as_f64x4(), src.as_f64x4(), k)) +pub fn _mm256_mask_getexp_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), src.as_f64x4(), k)) } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates floor(log2(x)) for each element. @@ -5033,8 +5484,8 @@ pub unsafe fn _mm256_mask_getexp_pd(src: __m256d, k: __mmask8, a: __m256d) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm256_maskz_getexp_pd(k: __mmask8, a: __m256d) -> __m256d { - transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, k)) +pub fn _mm256_maskz_getexp_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vgetexppd256(a.as_f64x4(), f64x4::ZERO, k)) } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element. @@ -5044,8 +5495,8 @@ pub unsafe fn _mm256_maskz_getexp_pd(k: __mmask8, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm_getexp_pd(a: __m128d) -> __m128d { - transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) +pub fn _mm_getexp_pd(a: __m128d) -> __m128d { + unsafe { transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, 0b00000011)) } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. @@ -5055,8 +5506,8 @@ pub unsafe fn _mm_getexp_pd(a: __m128d) -> __m128d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm_mask_getexp_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - transmute(vgetexppd128(a.as_f64x2(), src.as_f64x2(), k)) +pub fn _mm_mask_getexp_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vgetexppd128(a.as_f64x2(), src.as_f64x2(), k)) } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element. 
@@ -5066,8 +5517,8 @@ pub unsafe fn _mm_mask_getexp_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd))] -pub unsafe fn _mm_maskz_getexp_pd(k: __mmask8, a: __m128d) -> __m128d { - transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, k)) +pub fn _mm_maskz_getexp_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vgetexppd128(a.as_f64x2(), f64x2::ZERO, k)) } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ @@ -5084,17 +5535,19 @@ pub unsafe fn _mm_maskz_getexp_pd(k: __mmask8, a: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_roundscale_ps(a: __m512) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let r = vrndscaleps( - a, - IMM8, - f32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) +pub fn _mm512_roundscale_ps(a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let r = vrndscaleps( + a, + IMM8, + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -5111,16 +5564,14 @@ pub unsafe fn _mm512_roundscale_ps(a: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_roundscale_ps( - src: __m512, - k: __mmask16, - a: __m512, -) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vrndscaleps(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub fn _mm512_mask_roundscale_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vrndscaleps(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -5137,11 +5588,13 @@ pub unsafe fn _mm512_mask_roundscale_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m512) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ @@ -5158,11 +5611,13 @@ pub 
unsafe fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_roundscale_ps(a: __m256) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let r = vrndscaleps256(a, IMM8, f32x8::ZERO, 0b11111111); - transmute(r) +pub fn _mm256_roundscale_ps(a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vrndscaleps256(a, IMM8, f32x8::ZERO, 0b11111111); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -5179,16 +5634,14 @@ pub unsafe fn _mm256_roundscale_ps(a: __m256) -> __m256 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_roundscale_ps( - src: __m256, - k: __mmask8, - a: __m256, -) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let src = src.as_f32x8(); - let r = vrndscaleps256(a, IMM8, src, k); - transmute(r) +pub fn _mm256_mask_roundscale_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let src = src.as_f32x8(); + let r = vrndscaleps256(a, IMM8, src, k); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -5205,11 +5658,13 @@ pub unsafe fn _mm256_mask_roundscale_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_roundscale_ps(k: __mmask8, a: __m256) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let r = vrndscaleps256(a, IMM8, f32x8::ZERO, k); - transmute(r) +pub fn _mm256_maskz_roundscale_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vrndscaleps256(a, IMM8, f32x8::ZERO, k); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ @@ -5226,11 +5681,13 @@ pub unsafe fn _mm256_maskz_roundscale_ps(k: __mmask8, a: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_roundscale_ps(a: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let r = vrndscaleps128(a, IMM8, f32x4::ZERO, 0b00001111); - transmute(r) +pub fn _mm_roundscale_ps(a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vrndscaleps128(a, IMM8, f32x4::ZERO, 0b00001111); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -5247,16 +5704,14 @@ pub unsafe fn _mm_roundscale_ps(a: 
__m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_roundscale_ps( - src: __m128, - k: __mmask8, - a: __m128, -) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let src = src.as_f32x4(); - let r = vrndscaleps128(a, IMM8, src, k); - transmute(r) +pub fn _mm_mask_roundscale_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let src = src.as_f32x4(); + let r = vrndscaleps128(a, IMM8, src, k); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -5273,11 +5728,13 @@ pub unsafe fn _mm_mask_roundscale_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_roundscale_ps(k: __mmask8, a: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let r = vrndscaleps128(a, IMM8, f32x4::ZERO, k); - transmute(r) +pub fn _mm_maskz_roundscale_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vrndscaleps128(a, IMM8, f32x4::ZERO, k); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ @@ -5294,11 +5751,13 @@ pub unsafe fn _mm_maskz_roundscale_ps(k: __mmask8, a: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_roundscale_pd(a: __m512d) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub fn _mm512_roundscale_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -5315,16 +5774,18 @@ pub unsafe fn _mm512_roundscale_pd(a: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_roundscale_pd( +pub fn _mm512_mask_roundscale_pd( src: __m512d, k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vrndscalepd(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vrndscalepd(a, IMM8, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding 
mask bit is not set).\ @@ -5341,11 +5802,13 @@ pub unsafe fn _mm512_mask_roundscale_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ @@ -5362,11 +5825,13 @@ pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_roundscale_pd(a: __m256d) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let r = vrndscalepd256(a, IMM8, f64x4::ZERO, 0b00001111); - transmute(r) +pub fn _mm256_roundscale_pd(a: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let r = vrndscalepd256(a, IMM8, f64x4::ZERO, 0b00001111); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -5383,16 +5848,18 @@ pub unsafe fn _mm256_roundscale_pd(a: __m256d) -> __m256d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_roundscale_pd( +pub fn _mm256_mask_roundscale_pd( src: __m256d, k: __mmask8, a: __m256d, ) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let src = src.as_f64x4(); - let r = vrndscalepd256(a, IMM8, src, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let src = src.as_f64x4(); + let r = vrndscalepd256(a, IMM8, src, k); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -5409,11 +5876,13 @@ pub unsafe fn _mm256_mask_roundscale_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_roundscale_pd(k: __mmask8, a: __m256d) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let r = vrndscalepd256(a, IMM8, f64x4::ZERO, k); - transmute(r) +pub fn _mm256_maskz_roundscale_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let r = vrndscalepd256(a, IMM8, f64x4::ZERO, k); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ @@ -5430,11 +5899,13 @@ pub unsafe fn _mm256_maskz_roundscale_pd(k: __mmask8, a: __m256 
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_roundscale_pd(a: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let r = vrndscalepd128(a, IMM8, f64x2::ZERO, 0b00000011); - transmute(r) +pub fn _mm_roundscale_pd(a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let r = vrndscalepd128(a, IMM8, f64x2::ZERO, 0b00000011); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -5451,16 +5922,14 @@ pub unsafe fn _mm_roundscale_pd(a: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_roundscale_pd( - src: __m128d, - k: __mmask8, - a: __m128d, -) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let src = src.as_f64x2(); - let r = vrndscalepd128(a, IMM8, src, k); - transmute(r) +pub fn _mm_mask_roundscale_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let src = src.as_f64x2(); + let r = vrndscalepd128(a, IMM8, src, k); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -5477,11 +5946,13 @@ pub unsafe fn _mm_mask_roundscale_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_roundscale_pd(k: __mmask8, a: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let r = vrndscalepd128(a, IMM8, f64x2::ZERO, k); - transmute(r) +pub fn _mm_maskz_roundscale_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let r = vrndscalepd128(a, IMM8, f64x2::ZERO, k); + transmute(r) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. @@ -5491,14 +5962,16 @@ pub unsafe fn _mm_maskz_roundscale_pd(k: __mmask8, a: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 { - transmute(vscalefps( - a.as_f32x16(), - b.as_f32x16(), - f32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
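Note: the roundscale hunks above only move the `unsafe` boundary; the computation itself is untouched. As a reminder of what vrndscaleps/vrndscalepd produce, here is a minimal scalar sketch (an illustration, not part of the patch), assuming IMM8[7:4] carries the fraction-bit count M and the rounding mode is round-to-nearest; Rust's `round` breaks ties away from zero rather than to even, so ties differ slightly from the hardware behaviour.

    // Scalar sketch of roundscale: dst = 2^-M * round(2^M * x).
    fn roundscale_ref(x: f64, m: u32) -> f64 {
        let scale = (1u64 << m) as f64; // 2^M
        (x * scale).round() / scale // ties rounded away from zero here, not to even
    }

    fn main() {
        // Nearest multiple of 2^-4 = 0.0625:
        assert_eq!(roundscale_ref(1.2345, 4), 1.25);
    }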
@@ -5508,14 +5981,16 @@ pub unsafe fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - transmute(vscalefps( - a.as_f32x16(), - b.as_f32x16(), - src.as_f32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5525,14 +6000,16 @@ pub unsafe fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - transmute(vscalefps( - a.as_f32x16(), - b.as_f32x16(), - f32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. @@ -5542,13 +6019,15 @@ pub unsafe fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m5 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm256_scalef_ps(a: __m256, b: __m256) -> __m256 { - transmute(vscalefps256( - a.as_f32x8(), - b.as_f32x8(), - f32x8::ZERO, - 0b11111111, - )) +pub fn _mm256_scalef_ps(a: __m256, b: __m256) -> __m256 { + unsafe { + transmute(vscalefps256( + a.as_f32x8(), + b.as_f32x8(), + f32x8::ZERO, + 0b11111111, + )) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5558,8 +6037,8 @@ pub unsafe fn _mm256_scalef_ps(a: __m256, b: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm256_mask_scalef_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), src.as_f32x8(), k)) +pub fn _mm256_mask_scalef_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), src.as_f32x8(), k)) } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -5569,8 +6048,8 @@ pub unsafe fn _mm256_mask_scalef_ps(src: __m256, k: __mmask8, a: __m256, b: __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm256_maskz_scalef_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), f32x8::ZERO, k)) +pub fn _mm256_maskz_scalef_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(vscalefps256(a.as_f32x8(), b.as_f32x8(), f32x8::ZERO, k)) } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. @@ -5580,13 +6059,15 @@ pub unsafe fn _mm256_maskz_scalef_ps(k: __mmask8, a: __m256, b: __m256) -> __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm_scalef_ps(a: __m128, b: __m128) -> __m128 { - transmute(vscalefps128( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - 0b00001111, - )) +pub fn _mm_scalef_ps(a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vscalefps128( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + 0b00001111, + )) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5596,8 +6077,8 @@ pub unsafe fn _mm_scalef_ps(a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm_mask_scalef_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +pub fn _mm_mask_scalef_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5607,8 +6088,8 @@ pub unsafe fn _mm_mask_scalef_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps))] -pub unsafe fn _mm_maskz_scalef_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) +pub fn _mm_maskz_scalef_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vscalefps128(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. 
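Note: for context on the vscalef* calls being wrapped here (again unchanged by this patch), per lane they compute a * 2^floor(b). A scalar sketch, with the special-value handling (NaN, infinities, overflow) deliberately left out:

    fn scalef_ref(a: f64, b: f64) -> f64 {
        a * 2f64.powi(b.floor() as i32)
    }

    fn main() {
        assert_eq!(scalef_ref(3.0, 2.7), 12.0); // 3 * 2^floor(2.7) = 3 * 4
    }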
@@ -5618,14 +6099,16 @@ pub unsafe fn _mm_maskz_scalef_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d { - transmute(vscalefpd( - a.as_f64x8(), - b.as_f64x8(), - f64x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5635,14 +6118,16 @@ pub unsafe fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - transmute(vscalefpd( - a.as_f64x8(), - b.as_f64x8(), - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5652,14 +6137,16 @@ pub unsafe fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - transmute(vscalefpd( - a.as_f64x8(), - b.as_f64x8(), - f64x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. @@ -5669,13 +6156,15 @@ pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm256_scalef_pd(a: __m256d, b: __m256d) -> __m256d { - transmute(vscalefpd256( - a.as_f64x4(), - b.as_f64x4(), - f64x4::ZERO, - 0b00001111, - )) +pub fn _mm256_scalef_pd(a: __m256d, b: __m256d) -> __m256d { + unsafe { + transmute(vscalefpd256( + a.as_f64x4(), + b.as_f64x4(), + f64x4::ZERO, + 0b00001111, + )) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -5685,8 +6174,8 @@ pub unsafe fn _mm256_scalef_pd(a: __m256d, b: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm256_mask_scalef_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), src.as_f64x4(), k)) +pub fn _mm256_mask_scalef_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), src.as_f64x4(), k)) } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -5696,8 +6185,8 @@ pub unsafe fn _mm256_mask_scalef_pd(src: __m256d, k: __mmask8, a: __m256d, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm256_maskz_scalef_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), f64x4::ZERO, k)) +pub fn _mm256_maskz_scalef_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { transmute(vscalefpd256(a.as_f64x4(), b.as_f64x4(), f64x4::ZERO, k)) } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. @@ -5707,13 +6196,15 @@ pub unsafe fn _mm256_maskz_scalef_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm_scalef_pd(a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefpd128( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - 0b00000011, - )) +pub fn _mm_scalef_pd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefpd128( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + 0b00000011, + )) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -5723,8 +6214,8 @@ pub unsafe fn _mm_scalef_pd(a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm_mask_scalef_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +pub fn _mm_mask_scalef_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
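Note: what the signature change buys callers, sketched below. This is an illustration rather than part of the diff; it assumes a nightly toolchain with the unstable `stdarch_x86_avx512` feature and the target-feature 1.1 rules that let one `#[target_feature]` function call another safely. With those in place the call no longer needs an `unsafe` block, only the feature-availability obligation that `#[target_feature]` already expresses.

    #![feature(stdarch_x86_avx512)] // hypothetical caller crate, nightly only
    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn scale(a: __m512d, b: __m512d) -> __m512d {
        // Before this patch the intrinsic was an `unsafe fn`, so this call
        // needed an `unsafe { ... }` block even inside an avx512f context.
        _mm512_scalef_pd(a, b)
    }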
@@ -5734,8 +6225,8 @@ pub unsafe fn _mm_mask_scalef_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd))] -pub unsafe fn _mm_maskz_scalef_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) +pub fn _mm_maskz_scalef_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vscalefpd128(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. @@ -5746,13 +6237,15 @@ pub unsafe fn _mm_maskz_scalef_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m512i) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m512i) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. @@ -5763,18 +6256,20 @@ pub unsafe fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fixupimm_ps( +pub fn _mm512_mask_fixupimm_ps( a: __m512, k: __mmask16, b: __m512, c: __m512i, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmps(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmps(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
@@ -5785,18 +6280,20 @@ pub unsafe fn _mm512_mask_fixupimm_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fixupimm_ps( +pub fn _mm512_maskz_fixupimm_ps( k: __mmask16, a: __m512, b: __m512, c: __m512i, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmpsz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmpsz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. @@ -5807,13 +6304,15 @@ pub unsafe fn _mm512_maskz_fixupimm_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_fixupimm_ps(a: __m256, b: __m256, c: __m256i) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let c = c.as_i32x8(); - let r = vfixupimmps256(a, b, c, IMM8, 0b11111111); - transmute(r) +pub fn _mm256_fixupimm_ps(a: __m256, b: __m256, c: __m256i) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let c = c.as_i32x8(); + let r = vfixupimmps256(a, b, c, IMM8, 0b11111111); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. @@ -5824,18 +6323,20 @@ pub unsafe fn _mm256_fixupimm_ps(a: __m256, b: __m256, c: __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_fixupimm_ps( +pub fn _mm256_mask_fixupimm_ps( a: __m256, k: __mmask8, b: __m256, c: __m256i, ) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let c = c.as_i32x8(); - let r = vfixupimmps256(a, b, c, IMM8, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let c = c.as_i32x8(); + let r = vfixupimmps256(a, b, c, IMM8, k); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
@@ -5846,18 +6347,20 @@ pub unsafe fn _mm256_mask_fixupimm_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_maskz_fixupimm_ps( +pub fn _mm256_maskz_fixupimm_ps( k: __mmask8, a: __m256, b: __m256, c: __m256i, ) -> __m256 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let c = c.as_i32x8(); - let r = vfixupimmpsz256(a, b, c, IMM8, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let c = c.as_i32x8(); + let r = vfixupimmpsz256(a, b, c, IMM8, k); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. @@ -5868,13 +6371,15 @@ pub unsafe fn _mm256_maskz_fixupimm_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fixupimm_ps(a: __m128, b: __m128, c: __m128i) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmps128(a, b, c, IMM8, 0b00001111); - transmute(r) +pub fn _mm_fixupimm_ps(a: __m128, b: __m128, c: __m128i) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmps128(a, b, c, IMM8, 0b00001111); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. @@ -5885,18 +6390,20 @@ pub unsafe fn _mm_fixupimm_ps(a: __m128, b: __m128, c: __m128i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fixupimm_ps( +pub fn _mm_mask_fixupimm_ps( a: __m128, k: __mmask8, b: __m128, c: __m128i, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmps128(a, b, c, IMM8, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmps128(a, b, c, IMM8, k); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
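Note: the fixupimm conversions follow the same shape; the per-class lookup words travel in `c` and the exception-reporting bits in `IMM8`, so only the `unsafe` boundary moves. A hypothetical caller (same nightly/feature assumptions as the sketch above), with the immediate supplied through the turbofish that `rustc_legacy_const_generics` maps onto the C-style argument position:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn fixup(a: __m512, b: __m512, table: __m512i) -> __m512 {
        // `table` holds application-specific per-class actions; the
        // exception-reporting immediate is left at 0 in this sketch.
        _mm512_fixupimm_ps::<0>(a, b, table)
    }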
@@ -5907,18 +6414,20 @@ pub unsafe fn _mm_mask_fixupimm_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fixupimm_ps( +pub fn _mm_maskz_fixupimm_ps( k: __mmask8, a: __m128, b: __m128, c: __m128i, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmpsz128(a, b, c, IMM8, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmpsz128(a, b, c, IMM8, k); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. @@ -5929,13 +6438,15 @@ pub unsafe fn _mm_maskz_fixupimm_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. @@ -5946,18 +6457,20 @@ pub unsafe fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fixupimm_pd( +pub fn _mm512_mask_fixupimm_pd( a: __m512d, k: __mmask8, b: __m512d, c: __m512i, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
@@ -5968,18 +6481,20 @@ pub unsafe fn _mm512_mask_fixupimm_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fixupimm_pd( +pub fn _mm512_maskz_fixupimm_pd( k: __mmask8, a: __m512d, b: __m512d, c: __m512i, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. @@ -5990,13 +6505,15 @@ pub unsafe fn _mm512_maskz_fixupimm_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_fixupimm_pd(a: __m256d, b: __m256d, c: __m256i) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let c = c.as_i64x4(); - let r = vfixupimmpd256(a, b, c, IMM8, 0b00001111); - transmute(r) +pub fn _mm256_fixupimm_pd(a: __m256d, b: __m256d, c: __m256i) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpd256(a, b, c, IMM8, 0b00001111); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. @@ -6007,18 +6524,20 @@ pub unsafe fn _mm256_fixupimm_pd(a: __m256d, b: __m256d, c: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_fixupimm_pd( +pub fn _mm256_mask_fixupimm_pd( a: __m256d, k: __mmask8, b: __m256d, c: __m256i, ) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let c = c.as_i64x4(); - let r = vfixupimmpd256(a, b, c, IMM8, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpd256(a, b, c, IMM8, k); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
@@ -6029,18 +6548,20 @@ pub unsafe fn _mm256_mask_fixupimm_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_maskz_fixupimm_pd( +pub fn _mm256_maskz_fixupimm_pd( k: __mmask8, a: __m256d, b: __m256d, c: __m256i, ) -> __m256d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let c = c.as_i64x4(); - let r = vfixupimmpdz256(a, b, c, IMM8, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let c = c.as_i64x4(); + let r = vfixupimmpdz256(a, b, c, IMM8, k); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. @@ -6051,13 +6572,15 @@ pub unsafe fn _mm256_maskz_fixupimm_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fixupimm_pd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmpd128(a, b, c, IMM8, 0b00000011); - transmute(r) +pub fn _mm_fixupimm_pd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpd128(a, b, c, IMM8, 0b00000011); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. @@ -6068,18 +6591,20 @@ pub unsafe fn _mm_fixupimm_pd(a: __m128d, b: __m128d, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fixupimm_pd( +pub fn _mm_mask_fixupimm_pd( a: __m128d, k: __mmask8, b: __m128d, c: __m128i, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmpd128(a, b, c, IMM8, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpd128(a, b, c, IMM8, k); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. 
@@ -6090,18 +6615,20 @@ pub unsafe fn _mm_mask_fixupimm_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fixupimm_pd( +pub fn _mm_maskz_fixupimm_pd( k: __mmask8, a: __m128d, b: __m128d, c: __m128i, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmpdz128(a, b, c, IMM8, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmpdz128(a, b, c, IMM8, k); + transmute(r) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. @@ -6112,17 +6639,15 @@ pub unsafe fn _mm_maskz_fixupimm_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_ternarylogic_epi32( - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let c = c.as_i32x16(); - let r = vpternlogd(a, b, c, IMM8); - transmute(r) +pub fn _mm512_ternarylogic_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let c = c.as_i32x16(); + let r = vpternlogd(a, b, c, IMM8); + transmute(r) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). @@ -6133,18 +6658,20 @@ pub unsafe fn _mm512_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_ternarylogic_epi32( +pub fn _mm512_mask_ternarylogic_epi32( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i32x16(); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let r = vpternlogd(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i32x16(); + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let r = vpternlogd(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. 
For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). @@ -6155,18 +6682,20 @@ pub unsafe fn _mm512_mask_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_ternarylogic_epi32( +pub fn _mm512_maskz_ternarylogic_epi32( k: __mmask16, a: __m512i, b: __m512i, c: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let c = c.as_i32x16(); - let r = vpternlogd(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let c = c.as_i32x16(); + let r = vpternlogd(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. @@ -6177,17 +6706,15 @@ pub unsafe fn _mm512_maskz_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_ternarylogic_epi32( - a: __m256i, - b: __m256i, - c: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let c = c.as_i32x8(); - let r = vpternlogd256(a, b, c, IMM8); - transmute(r) +pub fn _mm256_ternarylogic_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let c = c.as_i32x8(); + let r = vpternlogd256(a, b, c, IMM8); + transmute(r) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). 
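Note: a scalar reference for the vpternlog truth-table lookup used by these ternarylogic hunks (the 64-bit variants below work identically, just on wider lanes). This is an illustration only: bit i of the result is the IMM8 bit selected by the 3-bit index built from the i-th bits of a, b and c.

    fn ternlog32(a: u32, b: u32, c: u32, imm8: u8) -> u32 {
        let mut r = 0u32;
        for bit in 0..32 {
            let idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
            r |= (((imm8 as u32) >> idx) & 1) << bit;
        }
        r
    }

    fn main() {
        // Well-known constants: 0xCA = bitwise select (a ? b : c),
        // 0x96 = a ^ b ^ c, 0xE8 = majority of three.
        assert_eq!(ternlog32(0xFFFF_0000, 0x1234_5678, 0x9ABC_DEF0, 0xCA), 0x1234_DEF0);
        assert_eq!(ternlog32(0b1, 0b1, 0b0, 0x96), 0);
    }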
@@ -6198,18 +6725,20 @@ pub unsafe fn _mm256_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_ternarylogic_epi32( +pub fn _mm256_mask_ternarylogic_epi32( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i32x8(); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r = vpternlogd256(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i32x8(); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r = vpternlogd256(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). @@ -6220,18 +6749,20 @@ pub unsafe fn _mm256_mask_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_maskz_ternarylogic_epi32( +pub fn _mm256_maskz_ternarylogic_epi32( k: __mmask8, a: __m256i, b: __m256i, c: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let c = c.as_i32x8(); - let r = vpternlogd256(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let c = c.as_i32x8(); + let r = vpternlogd256(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. @@ -6242,17 +6773,15 @@ pub unsafe fn _mm256_maskz_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_ternarylogic_epi32( - a: __m128i, - b: __m128i, - c: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let c = c.as_i32x4(); - let r = vpternlogd128(a, b, c, IMM8); - transmute(r) +pub fn _mm_ternarylogic_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let c = c.as_i32x4(); + let r = vpternlogd128(a, b, c, IMM8); + transmute(r) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. 
For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). @@ -6263,18 +6792,20 @@ pub unsafe fn _mm_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_ternarylogic_epi32( +pub fn _mm_mask_ternarylogic_epi32( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i32x4(); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let r = vpternlogd128(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i32x4(); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let r = vpternlogd128(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). @@ -6285,18 +6816,20 @@ pub unsafe fn _mm_mask_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_ternarylogic_epi32( +pub fn _mm_maskz_ternarylogic_epi32( k: __mmask8, a: __m128i, b: __m128i, c: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let c = c.as_i32x4(); - let r = vpternlogd128(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let c = c.as_i32x4(); + let r = vpternlogd128(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
@@ -6307,17 +6840,15 @@ pub unsafe fn _mm_maskz_ternarylogic_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_ternarylogic_epi64( - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let c = c.as_i64x8(); - let r = vpternlogq(a, b, c, IMM8); - transmute(r) +pub fn _mm512_ternarylogic_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let c = c.as_i64x8(); + let r = vpternlogq(a, b, c, IMM8); + transmute(r) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). @@ -6328,18 +6859,20 @@ pub unsafe fn _mm512_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_ternarylogic_epi64( +pub fn _mm512_mask_ternarylogic_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i64x8(); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r = vpternlogq(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i64x8(); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let r = vpternlogq(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). @@ -6350,18 +6883,20 @@ pub unsafe fn _mm512_mask_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_ternarylogic_epi64( +pub fn _mm512_maskz_ternarylogic_epi64( k: __mmask8, a: __m512i, b: __m512i, c: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let c = c.as_i64x8(); - let r = vpternlogq(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let c = c.as_i64x8(); + let r = vpternlogq(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. 
For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. @@ -6372,17 +6907,15 @@ pub unsafe fn _mm512_maskz_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_ternarylogic_epi64( - a: __m256i, - b: __m256i, - c: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let c = c.as_i64x4(); - let r = vpternlogq256(a, b, c, IMM8); - transmute(r) +pub fn _mm256_ternarylogic_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let c = c.as_i64x4(); + let r = vpternlogq256(a, b, c, IMM8); + transmute(r) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). @@ -6393,18 +6926,20 @@ pub unsafe fn _mm256_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_ternarylogic_epi64( +pub fn _mm256_mask_ternarylogic_epi64( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i64x4(); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r = vpternlogq256(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i64x4(); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let r = vpternlogq256(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). 
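Note: the mask and maskz variants throughout this file all reduce to the same `simd_select_bitmask` step, sketched below in scalar form (illustration only): a writemask keeps the `src` lane where the mask bit is clear, while a zeromask writes zero there.

    fn mask_ref(k: u8, computed: [i64; 4], src: [i64; 4]) -> [i64; 4] {
        let mut out = src;
        for i in 0..4 {
            if (k >> i) & 1 == 1 {
                out[i] = computed[i];
            }
        }
        out
    }

    fn maskz_ref(k: u8, computed: [i64; 4]) -> [i64; 4] {
        mask_ref(k, computed, [0; 4])
    }

    fn main() {
        assert_eq!(mask_ref(0b0101, [9, 9, 9, 9], [1, 2, 3, 4]), [9, 2, 9, 4]);
        assert_eq!(maskz_ref(0b0101, [9, 9, 9, 9]), [9, 0, 9, 0]);
    }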
@@ -6415,18 +6950,20 @@ pub unsafe fn _mm256_mask_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_maskz_ternarylogic_epi64( +pub fn _mm256_maskz_ternarylogic_epi64( k: __mmask8, a: __m256i, b: __m256i, c: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let c = c.as_i64x4(); - let r = vpternlogq256(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let c = c.as_i64x4(); + let r = vpternlogq256(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. @@ -6437,17 +6974,15 @@ pub unsafe fn _mm256_maskz_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_ternarylogic_epi64( - a: __m128i, - b: __m128i, - c: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let c = c.as_i64x2(); - let r = vpternlogq128(a, b, c, IMM8); - transmute(r) +pub fn _mm_ternarylogic_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let c = c.as_i64x2(); + let r = vpternlogq128(a, b, c, IMM8); + transmute(r) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). @@ -6458,18 +6993,20 @@ pub unsafe fn _mm_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_ternarylogic_epi64( +pub fn _mm_mask_ternarylogic_epi64( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let src = src.as_i64x2(); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let r = vpternlogq128(src, a, b, IMM8); - transmute(simd_select_bitmask(k, r, src)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let src = src.as_i64x2(); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let r = vpternlogq128(src, a, b, IMM8); + transmute(simd_select_bitmask(k, r, src)) + } } /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. 
For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). @@ -6480,18 +7017,20 @@ pub unsafe fn _mm_mask_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_ternarylogic_epi64( +pub fn _mm_maskz_ternarylogic_epi64( k: __mmask8, a: __m128i, b: __m128i, c: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let c = c.as_i64x2(); - let r = vpternlogq128(a, b, c, IMM8); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let c = c.as_i64x2(); + let r = vpternlogq128(a, b, c, IMM8); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. @@ -6511,24 +7050,23 @@ pub unsafe fn _mm_maskz_ternarylogic_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] -pub unsafe fn _mm512_getmant_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm512_getmant_ps( a: __m512, ) -> __m512 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x16(); - let zero = f32x16::ZERO; - let r = vgetmantps( - a, - SIGN << 2 | NORM, - zero, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let zero = f32x16::ZERO; + let r = vgetmantps( + a, + SIGN << 2 | NORM, + zero, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6548,7 +7086,7 @@ pub unsafe fn _mm512_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm512_mask_getmant_ps< +pub fn _mm512_mask_getmant_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -6556,12 +7094,14 @@ pub unsafe fn _mm512_mask_getmant_ps< k: __mmask16, a: __m512, ) -> __m512 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vgetmantps(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6581,24 +7121,26 @@ pub unsafe fn _mm512_mask_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm512_maskz_getmant_ps< +pub fn _mm512_maskz_getmant_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( k: __mmask16, a: __m512, ) -> __m512 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x16(); - let r = vgetmantps( - a, - SIGN << 2 | NORM, - f32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x16(); + let r = vgetmantps( + a, + SIGN << 2 | NORM, + f32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
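(Illustrative sketch, not part of the patch.) The getmant family computes ±(2^k)*|x.significand|: the significand is renormalized into the interval selected by NORM, and the sign is taken according to SIGN. After this change the only remaining obligation at a call site is the target feature, so a caller on a compiler that accepts safe `#[target_feature]` functions might look like the sketch below. The function name is hypothetical; the `_MM_MANT_*` constants are assumed to be the existing ones from `core::arch::x86_64`.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn normalized_mantissas(a: __m512) -> __m512 {
    // Significand normalized into [1, 2), sign forced to zero:
    // e.g. -24.0 = -1.5 * 2^4 yields 1.5 in the corresponding lane.
    // No `unsafe` block is needed; the feature gate on the enclosing
    // function is the only requirement.
    _mm512_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_ZERO>(a)
}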
@@ -6618,17 +7160,16 @@ pub unsafe fn _mm512_maskz_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] -pub unsafe fn _mm256_getmant_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm256_getmant_ps( a: __m256, ) -> __m256 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x8(); - let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, 0b11111111); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, 0b11111111); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6648,7 +7189,7 @@ pub unsafe fn _mm256_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm256_mask_getmant_ps< +pub fn _mm256_mask_getmant_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -6656,12 +7197,14 @@ pub unsafe fn _mm256_mask_getmant_ps< k: __mmask8, a: __m256, ) -> __m256 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x8(); - let src = src.as_f32x8(); - let r = vgetmantps256(a, SIGN << 2 | NORM, src, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let src = src.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, src, k); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6681,18 +7224,20 @@ pub unsafe fn _mm256_mask_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm256_maskz_getmant_ps< +pub fn _mm256_maskz_getmant_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( k: __mmask8, a: __m256, ) -> __m256 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x8(); - let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x8(); + let r = vgetmantps256(a, SIGN << 2 | NORM, f32x8::ZERO, k); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
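(Illustrative sketch, not part of the patch.) Each intrinsic in this file comes in the same three flavors: the plain form, a `mask` form that keeps lanes from `src` where the corresponding mask bit is clear, and a `maskz` form that zeroes those lanes instead. A minimal sketch of the 256-bit getmant variants, assuming `avx512f,avx512vl` are enabled on the caller and using the existing `_MM_MANT_*` constants; the function name is hypothetical.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
fn masked_mantissas(src: __m256, k: __mmask8, a: __m256) -> (__m256, __m256) {
    // Writemask: lanes with a 0 bit in `k` are copied from `src`.
    let merged = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(src, k, a);
    // Zeromask: lanes with a 0 bit in `k` are set to 0.0.
    let zeroed = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(k, a);
    (merged, zeroed)
}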
@@ -6712,17 +7257,16 @@ pub unsafe fn _mm256_maskz_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] -pub unsafe fn _mm_getmant_ps< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm_getmant_ps( a: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, 0b00001111); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, 0b00001111); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6742,7 +7286,7 @@ pub unsafe fn _mm_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_mask_getmant_ps< +pub fn _mm_mask_getmant_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -6750,12 +7294,14 @@ pub unsafe fn _mm_mask_getmant_ps< k: __mmask8, a: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetmantps128(a, SIGN << 2 | NORM, src, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, src, k); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6775,18 +7321,20 @@ pub unsafe fn _mm_mask_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_maskz_getmant_ps< +pub fn _mm_maskz_getmant_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( k: __mmask8, a: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let r = vgetmantps128(a, SIGN << 2 | NORM, f32x4::ZERO, k); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6806,24 +7354,23 @@ pub unsafe fn _mm_maskz_getmant_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] -pub unsafe fn _mm512_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm512_getmant_pd( a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x8(); - let zero = f64x8::ZERO; - let r = vgetmantpd( - a, - SIGN << 2 | NORM, - zero, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let zero = f64x8::ZERO; + let r = vgetmantpd( + a, + SIGN << 2 | NORM, + zero, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6843,7 +7390,7 @@ pub unsafe fn _mm512_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm512_mask_getmant_pd< +pub fn _mm512_mask_getmant_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -6851,12 +7398,14 @@ pub unsafe fn _mm512_mask_getmant_pd< k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6876,24 +7425,26 @@ pub unsafe fn _mm512_mask_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm512_maskz_getmant_pd< +pub fn _mm512_maskz_getmant_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x8(); - let r = vgetmantpd( - a, - SIGN << 2 | NORM, - f64x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x8(); + let r = vgetmantpd( + a, + SIGN << 2 | NORM, + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6913,17 +7464,16 @@ pub unsafe fn _mm512_maskz_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] -pub unsafe fn _mm256_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm256_getmant_pd( a: __m256d, ) -> __m256d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x4(); - let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, 0b00001111); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, 0b00001111); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6943,7 +7493,7 @@ pub unsafe fn _mm256_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm256_mask_getmant_pd< +pub fn _mm256_mask_getmant_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -6951,12 +7501,14 @@ pub unsafe fn _mm256_mask_getmant_pd< k: __mmask8, a: __m256d, ) -> __m256d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x4(); - let src = src.as_f64x4(); - let r = vgetmantpd256(a, SIGN << 2 | NORM, src, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let src = src.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, src, k); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -6976,18 +7528,20 @@ pub unsafe fn _mm256_mask_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm256_maskz_getmant_pd< +pub fn _mm256_maskz_getmant_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( k: __mmask8, a: __m256d, ) -> __m256d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x4(); - let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x4(); + let r = vgetmantpd256(a, SIGN << 2 | NORM, f64x4::ZERO, k); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -7007,17 +7561,16 @@ pub unsafe fn _mm256_maskz_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] -pub unsafe fn _mm_getmant_pd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm_getmant_pd( a: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, 0b00000011); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, 0b00000011); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -7037,7 +7590,7 @@ pub unsafe fn _mm_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_mask_getmant_pd< +pub fn _mm_mask_getmant_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -7045,12 +7598,14 @@ pub unsafe fn _mm_mask_getmant_pd< k: __mmask8, a: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetmantpd128(a, SIGN << 2 | NORM, src, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, src, k); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -7070,18 +7625,20 @@ pub unsafe fn _mm_mask_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_maskz_getmant_pd< +pub fn _mm_maskz_getmant_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( k: __mmask8, a: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let r = vgetmantpd128(a, SIGN << 2 | NORM, f64x2::ZERO, k); + transmute(r) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\ @@ -7099,12 +7656,14 @@ pub unsafe fn _mm_maskz_getmant_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_add_round_ps(a: __m512, b: __m512) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vaddps(a, b, ROUNDING); - transmute(r) +pub fn _mm512_add_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(r) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7122,17 +7681,19 @@ pub unsafe fn _mm512_add_round_ps(a: __m512, b: __m512) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_add_round_ps( +pub fn _mm512_mask_add_round_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b 
= b.as_f32x16(); - let r = vaddps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7150,16 +7711,18 @@ pub unsafe fn _mm512_mask_add_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_add_round_ps( +pub fn _mm512_maskz_add_round_ps( k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vaddps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vaddps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\ @@ -7177,12 +7740,14 @@ pub unsafe fn _mm512_maskz_add_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_add_round_pd(a: __m512d, b: __m512d) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vaddpd(a, b, ROUNDING); - transmute(r) +pub fn _mm512_add_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(r) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7200,17 +7765,19 @@ pub unsafe fn _mm512_add_round_pd(a: __m512d, b: __m512d) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_add_round_pd( +pub fn _mm512_mask_add_round_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vaddpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } } /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7228,16 +7795,18 @@ pub unsafe fn _mm512_mask_add_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_add_round_pd( +pub fn _mm512_maskz_add_round_pd( k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vaddpd(a, b, ROUNDING); - 
transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vaddpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\ @@ -7255,12 +7824,14 @@ pub unsafe fn _mm512_maskz_add_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_sub_round_ps(a: __m512, b: __m512) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vsubps(a, b, ROUNDING); - transmute(r) +pub fn _mm512_sub_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(r) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7278,17 +7849,19 @@ pub unsafe fn _mm512_sub_round_ps(a: __m512, b: __m512) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_sub_round_ps( +pub fn _mm512_mask_sub_round_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vsubps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7306,16 +7879,18 @@ pub unsafe fn _mm512_mask_sub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_sub_round_ps( +pub fn _mm512_maskz_sub_round_ps( k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vsubps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vsubps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\ @@ -7333,12 +7908,14 @@ pub unsafe fn _mm512_maskz_sub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_sub_round_pd(a: __m512d, b: __m512d) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = 
b.as_f64x8(); - let r = vsubpd(a, b, ROUNDING); - transmute(r) +pub fn _mm512_sub_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(r) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7356,17 +7933,19 @@ pub unsafe fn _mm512_sub_round_pd(a: __m512d, b: __m512d) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_sub_round_pd( +pub fn _mm512_mask_sub_round_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vsubpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } } /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7384,16 +7963,18 @@ pub unsafe fn _mm512_mask_sub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_sub_round_pd( +pub fn _mm512_maskz_sub_round_pd( k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vsubpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vsubpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\ @@ -7411,12 +7992,14 @@ pub unsafe fn _mm512_maskz_sub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_mul_round_ps(a: __m512, b: __m512) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmulps(a, b, ROUNDING); - transmute(r) +pub fn _mm512_mul_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(r) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7434,17 +8017,19 @@ pub unsafe fn _mm512_mul_round_ps(a: __m512, b: __m512) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_mul_round_ps( +pub fn _mm512_mask_mul_round_ps( src: __m512, k: __mmask16, a: __m512, b: 
__m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmulps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7462,16 +8047,18 @@ pub unsafe fn _mm512_mask_mul_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_mul_round_ps( +pub fn _mm512_maskz_mul_round_ps( k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmulps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmulps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\ @@ -7489,12 +8076,14 @@ pub unsafe fn _mm512_maskz_mul_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_mul_round_pd(a: __m512d, b: __m512d) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmulpd(a, b, ROUNDING); - transmute(r) +pub fn _mm512_mul_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(r) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7512,17 +8101,19 @@ pub unsafe fn _mm512_mul_round_pd(a: __m512d, b: __m512d) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_mul_round_pd( +pub fn _mm512_mask_mul_round_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmulpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7540,16 +8131,18 @@ pub unsafe fn _mm512_mask_mul_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_mul_round_pd( +pub fn _mm512_maskz_mul_round_pd( k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - 
static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmulpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmulpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.\ @@ -7567,12 +8160,14 @@ pub unsafe fn _mm512_maskz_mul_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_div_round_ps(a: __m512, b: __m512) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vdivps(a, b, ROUNDING); - transmute(r) +pub fn _mm512_div_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(r) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7590,17 +8185,19 @@ pub unsafe fn _mm512_div_round_ps(a: __m512, b: __m512) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_div_round_ps( +pub fn _mm512_mask_div_round_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vdivps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7618,16 +8215,18 @@ pub unsafe fn _mm512_mask_div_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_div_round_ps( +pub fn _mm512_maskz_div_round_ps( k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vdivps(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vdivps(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, =and store the results in dst.\ @@ -7645,12 +8244,14 @@ pub unsafe fn _mm512_maskz_div_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_div_round_pd(a: __m512d, b: __m512d) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vdivpd(a, b, ROUNDING); - 
transmute(r) +pub fn _mm512_div_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(r) + } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7668,17 +8269,19 @@ pub unsafe fn _mm512_div_round_pd(a: __m512d, b: __m512d) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_div_round_pd( +pub fn _mm512_mask_div_round_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vdivpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } } /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7696,16 +8299,18 @@ pub unsafe fn _mm512_mask_div_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_div_round_pd( +pub fn _mm512_maskz_div_round_pd( k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vdivpd(a, b, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vdivpd(a, b, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\ @@ -7723,11 +8328,13 @@ pub unsafe fn _mm512_maskz_div_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_sqrt_round_ps(a: __m512) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vsqrtps(a, ROUNDING); - transmute(r) +pub fn _mm512_sqrt_round_ps(a: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(r) + } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7745,15 +8352,17 @@ pub unsafe fn _mm512_sqrt_round_ps(a: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_sqrt_round_ps( +pub fn _mm512_mask_sqrt_round_ps( src: __m512, k: __mmask16, a: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vsqrtps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + 
static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7771,11 +8380,13 @@ pub unsafe fn _mm512_mask_sqrt_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_sqrt_round_ps(k: __mmask16, a: __m512) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let r = vsqrtps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) +pub fn _mm512_maskz_sqrt_round_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let r = vsqrtps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\ @@ -7793,11 +8404,13 @@ pub unsafe fn _mm512_maskz_sqrt_round_ps(k: __mmask16, a: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_sqrt_round_pd(a: __m512d) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vsqrtpd(a, ROUNDING); - transmute(r) +pub fn _mm512_sqrt_round_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(r) + } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -7815,15 +8428,17 @@ pub unsafe fn _mm512_sqrt_round_pd(a: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_sqrt_round_pd( +pub fn _mm512_mask_sqrt_round_pd( src: __m512d, k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vsqrtpd(a, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } } /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7841,11 +8456,13 @@ pub unsafe fn _mm512_mask_sqrt_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_sqrt_round_pd(k: __mmask8, a: __m512d) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vsqrtpd(a, ROUNDING); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) +pub fn _mm512_maskz_sqrt_round_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vsqrtpd(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } } /// Multiply packed 
single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\ @@ -7863,13 +8480,11 @@ pub unsafe fn _mm512_maskz_sqrt_round_pd(k: __mmask8, a: __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fmadd_round_ps( - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - static_assert_rounding!(ROUNDING); - vfmadd132psround(a, b, c, ROUNDING) +pub fn _mm512_fmadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(a, b, c, ROUNDING) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -7887,14 +8502,16 @@ pub unsafe fn _mm512_fmadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fmadd_round_ps( +pub fn _mm512_mask_fmadd_round_ps( a: __m512, k: __mmask16, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), a) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in a using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -7912,14 +8529,16 @@ pub unsafe fn _mm512_mask_fmadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fmadd_round_ps( +pub fn _mm512_maskz_fmadd_round_ps( k: __mmask16, a: __m512, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), _mm512_setzero_ps()) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), _mm512_setzero_ps()) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -7937,14 +8556,16 @@ pub unsafe fn _mm512_maskz_fmadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fmadd_round_ps( +pub fn _mm512_mask3_fmadd_round_ps( a: __m512, b: __m512, c: __m512, k: __mmask16, ) -> __m512 { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), c) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132psround(a, b, c, ROUNDING), c) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, 
and store the results in dst.\ @@ -7962,13 +8583,11 @@ pub unsafe fn _mm512_mask3_fmadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fmadd_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - static_assert_rounding!(ROUNDING); - vfmadd132pdround(a, b, c, ROUNDING) +pub fn _mm512_fmadd_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(a, b, c, ROUNDING) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -7986,14 +8605,16 @@ pub unsafe fn _mm512_fmadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fmadd_round_pd( +pub fn _mm512_mask_fmadd_round_pd( a: __m512d, k: __mmask8, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), a) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8011,14 +8632,16 @@ pub unsafe fn _mm512_mask_fmadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fmadd_round_pd( +pub fn _mm512_maskz_fmadd_round_pd( k: __mmask8, a: __m512d, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), _mm512_setzero_pd()) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), _mm512_setzero_pd()) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8036,14 +8659,16 @@ pub unsafe fn _mm512_maskz_fmadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fmadd_round_pd( +pub fn _mm512_mask3_fmadd_round_pd( a: __m512d, b: __m512d, c: __m512d, k: __mmask8, ) -> __m512d { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), c) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmadd132pdround(a, b, c, ROUNDING), c) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ @@ -8061,13 +8686,11 @@ pub unsafe fn _mm512_mask3_fmadd_round_pd( 
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fmsub_round_ps( - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - static_assert_rounding!(ROUNDING); - vfmadd132psround(a, b, simd_neg(c), ROUNDING) +pub fn _mm512_fmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(a, b, simd_neg(c), ROUNDING) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8085,15 +8708,17 @@ pub unsafe fn _mm512_fmsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fmsub_round_ps( +pub fn _mm512_mask_fmsub_round_ps( a: __m512, k: __mmask16, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8111,15 +8736,17 @@ pub unsafe fn _mm512_mask_fmsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fmsub_round_ps( +pub fn _mm512_maskz_fmsub_round_ps( k: __mmask16, a: __m512, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_ps()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8137,15 +8764,17 @@ pub unsafe fn _mm512_maskz_fmsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fmsub_round_ps( +pub fn _mm512_mask3_fmsub_round_ps( a: __m512, b: __m512, c: __m512, k: __mmask16, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } } 
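(Illustrative sketch, not part of the patch.) The `*_round_*` intrinsics take the rounding mode as a const generic; per the documentation in this file it must be a rounding direction combined with `_MM_FROUND_NO_EXC`, or `_MM_FROUND_CUR_DIRECTION`. With the change above, a call site needs only the target feature rather than an `unsafe` block. The function name below is hypothetical.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn fused_mul_sub_toward_zero(a: __m512, b: __m512, c: __m512) -> __m512 {
    // Computes (a * b) - c per lane, rounding toward zero with exceptions suppressed.
    _mm512_fmsub_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c)
}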
/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\ @@ -8163,13 +8792,11 @@ pub unsafe fn _mm512_mask3_fmsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fmsub_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - static_assert_rounding!(ROUNDING); - vfmadd132pdround(a, b, simd_neg(c), ROUNDING) +pub fn _mm512_fmsub_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(a, b, simd_neg(c), ROUNDING) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8187,15 +8814,17 @@ pub unsafe fn _mm512_fmsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fmsub_round_pd( +pub fn _mm512_mask_fmsub_round_pd( a: __m512d, k: __mmask8, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8213,15 +8842,17 @@ pub unsafe fn _mm512_mask_fmsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fmsub_round_pd( +pub fn _mm512_maskz_fmsub_round_pd( k: __mmask8, a: __m512d, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_pd()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8239,15 +8870,17 @@ pub unsafe fn _mm512_maskz_fmsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. 
clang generates fmadd, gcc generates fmsub #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fmsub_round_pd( +pub fn _mm512_mask3_fmsub_round_pd( a: __m512d, b: __m512d, c: __m512d, k: __mmask8, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ @@ -8265,13 +8898,11 @@ pub unsafe fn _mm512_mask3_fmsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fmaddsub_round_ps( - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - static_assert_rounding!(ROUNDING); - vfmaddsubpsround(a, b, c, ROUNDING) +pub fn _mm512_fmaddsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpsround(a, b, c, ROUNDING) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8289,14 +8920,16 @@ pub unsafe fn _mm512_fmaddsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fmaddsub_round_ps( +pub fn _mm512_mask_fmaddsub_round_ps( a: __m512, k: __mmask16, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), a) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8314,14 +8947,16 @@ pub unsafe fn _mm512_mask_fmaddsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fmaddsub_round_ps( +pub fn _mm512_maskz_fmaddsub_round_ps( k: __mmask16, a: __m512, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), _mm512_setzero_ps()) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), _mm512_setzero_ps()) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8339,14 +8974,16 @@ pub unsafe fn _mm512_maskz_fmaddsub_round_ps( 
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fmaddsub_round_ps( +pub fn _mm512_mask3_fmaddsub_round_ps( a: __m512, b: __m512, c: __m512, k: __mmask16, ) -> __m512 { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), c) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpsround(a, b, c, ROUNDING), c) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\ @@ -8364,13 +9001,15 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fmaddsub_round_pd( +pub fn _mm512_fmaddsub_round_pd( a: __m512d, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - vfmaddsubpdround(a, b, c, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpdround(a, b, c, ROUNDING) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8388,14 +9027,16 @@ pub unsafe fn _mm512_fmaddsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fmaddsub_round_pd( +pub fn _mm512_mask_fmaddsub_round_pd( a: __m512d, k: __mmask8, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), a) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8413,14 +9054,16 @@ pub unsafe fn _mm512_mask_fmaddsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fmaddsub_round_pd( +pub fn _mm512_maskz_fmaddsub_round_pd( k: __mmask8, a: __m512d, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), _mm512_setzero_pd()) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), _mm512_setzero_pd()) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8438,14 +9081,16 @@ pub 
unsafe fn _mm512_maskz_fmaddsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fmaddsub_round_pd( +pub fn _mm512_mask3_fmaddsub_round_pd( a: __m512d, b: __m512d, c: __m512d, k: __mmask8, ) -> __m512d { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), c) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, vfmaddsubpdround(a, b, c, ROUNDING), c) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ @@ -8463,13 +9108,11 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fmsubadd_round_ps( - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - static_assert_rounding!(ROUNDING); - vfmaddsubpsround(a, b, simd_neg(c), ROUNDING) +pub fn _mm512_fmsubadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpsround(a, b, simd_neg(c), ROUNDING) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8487,15 +9130,17 @@ pub unsafe fn _mm512_fmsubadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fmsubadd_round_ps( +pub fn _mm512_mask_fmsubadd_round_ps( a: __m512, k: __mmask16, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8513,15 +9158,17 @@ pub unsafe fn _mm512_mask_fmsubadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fmsubadd_round_ps( +pub fn _mm512_maskz_fmsubadd_round_ps( k: __mmask16, a: __m512, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_ps()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed 
elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8539,15 +9186,17 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fmsubadd_round_ps( +pub fn _mm512_mask3_fmsubadd_round_ps( a: __m512, b: __m512, c: __m512, k: __mmask16, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpsround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\ @@ -8565,13 +9214,15 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fmsubadd_round_pd( +pub fn _mm512_fmsubadd_round_pd( a: __m512d, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - vfmaddsubpdround(a, b, simd_neg(c), ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubpdround(a, b, simd_neg(c), ROUNDING) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8589,15 +9240,17 @@ pub unsafe fn _mm512_fmsubadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fmsubadd_round_pd( +pub fn _mm512_mask_fmsubadd_round_pd( a: __m512d, k: __mmask8, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8615,15 +9268,17 @@ pub unsafe fn _mm512_mask_fmsubadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fmsubadd_round_pd( +pub fn _mm512_maskz_fmsubadd_round_pd( k: __mmask8, a: __m512d, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_pd()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, 
simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8641,15 +9296,17 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fmsubadd_round_pd( +pub fn _mm512_mask3_fmsubadd_round_pd( a: __m512d, b: __m512d, c: __m512d, k: __mmask8, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmaddsubpdround(a, b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ @@ -8667,13 +9324,11 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fnmadd_round_ps( - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - static_assert_rounding!(ROUNDING); - vfmadd132psround(simd_neg(a), b, c, ROUNDING) +pub fn _mm512_fnmadd_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(simd_neg(a), b, c, ROUNDING) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8691,15 +9346,17 @@ pub unsafe fn _mm512_fnmadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fnmadd_round_ps( +pub fn _mm512_mask_fnmadd_round_ps( a: __m512, k: __mmask16, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, a) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8717,15 +9374,17 @@ pub unsafe fn _mm512_mask_fnmadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fnmadd_round_ps( +pub fn _mm512_maskz_fnmadd_round_ps( k: __mmask16, a: __m512, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); 
- simd_select_bitmask(k, r, _mm512_setzero_ps()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8743,15 +9402,17 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fnmadd_round_ps( +pub fn _mm512_mask3_fnmadd_round_ps( a: __m512, b: __m512, c: __m512, k: __mmask16, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, c) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\ @@ -8769,13 +9430,11 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fnmadd_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - static_assert_rounding!(ROUNDING); - vfmadd132pdround(simd_neg(a), b, c, ROUNDING) +pub fn _mm512_fnmadd_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(simd_neg(a), b, c, ROUNDING) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8793,15 +9452,17 @@ pub unsafe fn _mm512_fnmadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fnmadd_round_pd( +pub fn _mm512_mask_fnmadd_round_pd( a: __m512d, k: __mmask8, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, a) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, a) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8819,15 +9480,17 @@ pub unsafe fn _mm512_mask_fnmadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fnmadd_round_pd( +pub fn _mm512_maskz_fnmadd_round_pd( k: __mmask8, a: __m512d, b: __m512d, c: __m512d, ) -> 
__m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_pd()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8845,15 +9508,17 @@ pub unsafe fn _mm512_maskz_fnmadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fnmadd_round_pd( +pub fn _mm512_mask3_fnmadd_round_pd( a: __m512d, b: __m512d, c: __m512d, k: __mmask8, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); - simd_select_bitmask(k, r, c) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, c, ROUNDING); + simd_select_bitmask(k, r, c) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ @@ -8871,13 +9536,11 @@ pub unsafe fn _mm512_mask3_fnmadd_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fnmsub_round_ps( - a: __m512, - b: __m512, - c: __m512, -) -> __m512 { - static_assert_rounding!(ROUNDING); - vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING) +pub fn _mm512_fnmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8895,15 +9558,17 @@ pub unsafe fn _mm512_fnmsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fnmsub_round_ps( +pub fn _mm512_mask_fnmsub_round_ps( a: __m512, k: __mmask16, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -8921,15 +9586,17 @@ pub unsafe fn _mm512_mask_fnmsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps 
#[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fnmsub_round_ps( +pub fn _mm512_maskz_fnmsub_round_ps( k: __mmask16, a: __m512, b: __m512, c: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_ps()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_ps()) + } } /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -8947,15 +9614,17 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fnmsub_round_ps( +pub fn _mm512_mask3_fnmsub_round_ps( a: __m512, b: __m512, c: __m512, k: __mmask16, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132psround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\ @@ -8973,13 +9642,11 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_fnmsub_round_pd( - a: __m512d, - b: __m512d, - c: __m512d, -) -> __m512d { - static_assert_rounding!(ROUNDING); - vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING) +pub fn _mm512_fnmsub_round_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\ @@ -8997,15 +9664,17 @@ pub unsafe fn _mm512_fnmsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_fnmsub_round_pd( +pub fn _mm512_mask_fnmsub_round_pd( a: __m512d, k: __mmask8, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, a) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, a) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9023,15 +9692,17 @@ 
pub unsafe fn _mm512_mask_fnmsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_maskz_fnmsub_round_pd( +pub fn _mm512_maskz_fnmsub_round_pd( k: __mmask8, a: __m512d, b: __m512d, c: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, _mm512_setzero_pd()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, _mm512_setzero_pd()) + } } /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\ @@ -9049,15 +9720,17 @@ pub unsafe fn _mm512_maskz_fnmsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask3_fnmsub_round_pd( +pub fn _mm512_mask3_fnmsub_round_pd( a: __m512d, b: __m512d, c: __m512d, k: __mmask8, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); - simd_select_bitmask(k, r, c) + unsafe { + static_assert_rounding!(ROUNDING); + let r = vfmadd132pdround(simd_neg(a), b, simd_neg(c), ROUNDING); + simd_select_bitmask(k, r, c) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.\ @@ -9069,12 +9742,14 @@ pub unsafe fn _mm512_mask3_fnmsub_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_max_round_ps(a: __m512, b: __m512) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmaxps(a, b, SAE); - transmute(r) +pub fn _mm512_max_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(r) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -9086,17 +9761,19 @@ pub unsafe fn _mm512_max_round_ps(a: __m512, b: __m512) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_max_round_ps( +pub fn _mm512_mask_max_round_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmaxps(a, b, SAE); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9108,16 
+9785,14 @@ pub unsafe fn _mm512_mask_max_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxps, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_max_round_ps( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vmaxps(a, b, SAE); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) +pub fn _mm512_maskz_max_round_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vmaxps(a, b, SAE); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.\ @@ -9129,12 +9804,14 @@ pub unsafe fn _mm512_maskz_max_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_max_round_pd(a: __m512d, b: __m512d) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmaxpd(a, b, SAE); - transmute(r) +pub fn _mm512_max_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(r) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -9146,17 +9823,19 @@ pub unsafe fn _mm512_max_round_pd(a: __m512d, b: __m512d) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_max_round_pd( +pub fn _mm512_mask_max_round_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmaxpd(a, b, SAE); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9168,16 +9847,14 @@ pub unsafe fn _mm512_mask_max_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_max_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vmaxpd(a, b, SAE); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) +pub fn _mm512_maskz_max_round_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vmaxpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.\ @@ -9189,12 +9866,14 @@ pub unsafe fn _mm512_maskz_max_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] 
#[cfg_attr(test, assert_instr(vminps, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_min_round_ps(a: __m512, b: __m512) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vminps(a, b, SAE); - transmute(r) +pub fn _mm512_min_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vminps(a, b, SAE); + transmute(r) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -9206,17 +9885,19 @@ pub unsafe fn _mm512_min_round_ps(a: __m512, b: __m512) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_min_round_ps( +pub fn _mm512_mask_min_round_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vminps(a, b, SAE); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vminps(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9228,16 +9909,14 @@ pub unsafe fn _mm512_mask_min_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminps, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_min_round_ps( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vminps(a, b, SAE); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) +pub fn _mm512_maskz_min_round_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vminps(a, b, SAE); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.\ @@ -9249,12 +9928,14 @@ pub unsafe fn _mm512_maskz_min_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminpd, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_min_round_pd(a: __m512d, b: __m512d) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vminpd(a, b, SAE); - transmute(r) +pub fn _mm512_min_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vminpd(a, b, SAE); + transmute(r) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -9266,17 +9947,19 @@ pub unsafe fn _mm512_min_round_pd(a: __m512d, b: __m512d) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminpd, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_min_round_pd( +pub fn 
_mm512_mask_min_round_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vminpd(a, b, SAE); - transmute(simd_select_bitmask(k, r, src.as_f64x8())) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vminpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, src.as_f64x8())) + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9288,16 +9971,14 @@ pub unsafe fn _mm512_mask_min_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminpd, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_min_round_pd( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vminpd(a, b, SAE); - transmute(simd_select_bitmask(k, r, f64x8::ZERO)) +pub fn _mm512_maskz_min_round_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vminpd(a, b, SAE); + transmute(simd_select_bitmask(k, r, f64x8::ZERO)) + } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\ @@ -9309,11 +9990,13 @@ pub unsafe fn _mm512_maskz_min_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_getexp_round_ps(a: __m512) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vgetexpps(a, f32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) +pub fn _mm512_getexp_round_ps(a: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetexpps(a, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates floor(log2(x)) for each element.\ @@ -9325,16 +10008,14 @@ pub unsafe fn _mm512_getexp_round_ps(a: __m512) -> __m512 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_getexp_round_ps( - src: __m512, - k: __mmask16, - a: __m512, -) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vgetexpps(a, src, k, SAE); - transmute(r) +pub fn _mm512_mask_getexp_round_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vgetexpps(a, src, k, SAE); + transmute(r) + } } /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ @@ -9346,11 +10027,13 @@ pub unsafe fn _mm512_mask_getexp_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_getexp_round_ps(k: __mmask16, a: __m512) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vgetexpps(a, f32x16::ZERO, k, SAE); - transmute(r) +pub fn _mm512_maskz_getexp_round_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetexpps(a, f32x16::ZERO, k, SAE); + transmute(r) + } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\ @@ -9362,11 +10045,13 @@ pub unsafe fn _mm512_maskz_getexp_round_ps(k: __mmask16, a: __m5 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_getexp_round_pd(a: __m512d) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vgetexppd(a, f64x8::ZERO, 0b11111111, SAE); - transmute(r) +pub fn _mm512_getexp_round_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetexppd(a, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates floor(log2(x)) for each element.\ @@ -9378,16 +10063,18 @@ pub unsafe fn _mm512_getexp_round_pd(a: __m512d) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_getexp_round_pd( +pub fn _mm512_mask_getexp_round_pd( src: __m512d, k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vgetexppd(a, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vgetexppd(a, src, k, SAE); + transmute(r) + } } /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\ @@ -9399,11 +10086,13 @@ pub unsafe fn _mm512_mask_getexp_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m512d) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vgetexppd(a, f64x8::ZERO, k, SAE); - transmute(r) +pub fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetexppd(a, f64x8::ZERO, k, SAE); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ @@ -9421,12 +10110,14 @@ pub unsafe fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(1, 2)] -pub unsafe fn _mm512_roundscale_round_ps(a: __m512) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let r = vrndscaleps(a, IMM8, f32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) +pub fn _mm512_roundscale_round_ps(a: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vrndscaleps(a, IMM8, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -9444,17 +10135,19 @@ pub unsafe fn _mm512_roundscale_round_ps(a: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm512_mask_roundscale_round_ps( +pub fn _mm512_mask_roundscale_round_ps( src: __m512, k: __mmask16, a: __m512, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vrndscaleps(a, IMM8, src, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let src = 
src.as_f32x16(); + let r = vrndscaleps(a, IMM8, src, k, SAE); + transmute(r) + } } /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9472,15 +10165,17 @@ pub unsafe fn _mm512_mask_roundscale_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm512_maskz_roundscale_round_ps( +pub fn _mm512_maskz_roundscale_round_ps( k: __mmask16, a: __m512, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vrndscaleps(a, IMM8, f32x16::ZERO, k, SAE); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\ @@ -9498,12 +10193,14 @@ pub unsafe fn _mm512_maskz_roundscale_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(1, 2)] -pub unsafe fn _mm512_roundscale_round_pd(a: __m512d) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, SAE); - transmute(r) +pub fn _mm512_roundscale_round_pd(a: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vrndscalepd(a, IMM8, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -9521,17 +10218,19 @@ pub unsafe fn _mm512_roundscale_round_pd(a: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm512_mask_roundscale_round_pd( +pub fn _mm512_mask_roundscale_round_pd( src: __m512d, k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vrndscalepd(a, IMM8, src, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vrndscalepd(a, IMM8, src, k, SAE); + transmute(r) + } } /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9549,15 +10248,17 @@ pub unsafe fn _mm512_mask_roundscale_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm512_maskz_roundscale_round_pd( +pub fn _mm512_maskz_roundscale_round_pd( k: __mmask8, a: __m512d, ) -> __m512d { - 
static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vrndscalepd(a, IMM8, f64x8::ZERO, k, SAE); + transmute(r) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.\ @@ -9575,12 +10276,14 @@ pub unsafe fn _mm512_maskz_roundscale_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_scalef_round_ps(a: __m512, b: __m512) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vscalefps(a, b, f32x16::ZERO, 0b11111111_11111111, ROUNDING); - transmute(r) +pub fn _mm512_scalef_round_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vscalefps(a, b, f32x16::ZERO, 0b11111111_11111111, ROUNDING); + transmute(r) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -9598,18 +10301,20 @@ pub unsafe fn _mm512_scalef_round_ps(a: __m512, b: __m512) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_scalef_round_ps( +pub fn _mm512_mask_scalef_round_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let src = src.as_f32x16(); - let r = vscalefps(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let src = src.as_f32x16(); + let r = vscalefps(a, b, src, k, ROUNDING); + transmute(r) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9627,16 +10332,18 @@ pub unsafe fn _mm512_mask_scalef_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_scalef_round_ps( +pub fn _mm512_maskz_scalef_round_ps( k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vscalefps(a, b, f32x16::ZERO, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vscalefps(a, b, f32x16::ZERO, k, ROUNDING); + transmute(r) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.\ @@ -9654,12 +10361,14 @@ pub unsafe fn _mm512_maskz_scalef_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_scalef_round_pd(a: __m512d, b: __m512d) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = 
a.as_f64x8(); - let b = b.as_f64x8(); - let r = vscalefpd(a, b, f64x8::ZERO, 0b11111111, ROUNDING); - transmute(r) +pub fn _mm512_scalef_round_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vscalefpd(a, b, f64x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -9677,18 +10386,20 @@ pub unsafe fn _mm512_scalef_round_pd(a: __m512d, b: __m512d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_scalef_round_pd( +pub fn _mm512_mask_scalef_round_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let src = src.as_f64x8(); - let r = vscalefpd(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let src = src.as_f64x8(); + let r = vscalefpd(a, b, src, k, ROUNDING); + transmute(r) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -9706,16 +10417,18 @@ pub unsafe fn _mm512_mask_scalef_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_scalef_round_pd( +pub fn _mm512_maskz_scalef_round_pd( k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vscalefpd(a, b, f64x8::ZERO, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vscalefpd(a, b, f64x8::ZERO, k, ROUNDING); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\ @@ -9727,18 +10440,20 @@ pub unsafe fn _mm512_maskz_scalef_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm512_fixupimm_round_ps( +pub fn _mm512_fixupimm_round_ps( a: __m512, b: __m512, c: __m512i, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmps(a, b, c, IMM8, 0b11111111_11111111, SAE); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
imm8 is used to set the required flags reporting.\ @@ -9750,19 +10465,21 @@ pub unsafe fn _mm512_fixupimm_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm512_mask_fixupimm_round_ps( +pub fn _mm512_mask_fixupimm_round_ps( a: __m512, k: __mmask16, b: __m512, c: __m512i, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmps(a, b, c, IMM8, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmps(a, b, c, IMM8, k, SAE); + transmute(r) + } } /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\ @@ -9774,19 +10491,21 @@ pub unsafe fn _mm512_mask_fixupimm_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm512_maskz_fixupimm_round_ps( +pub fn _mm512_maskz_fixupimm_round_ps( k: __mmask16, a: __m512, b: __m512, c: __m512i, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let c = c.as_i32x16(); - let r = vfixupimmpsz(a, b, c, IMM8, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let c = c.as_i32x16(); + let r = vfixupimmpsz(a, b, c, IMM8, k, SAE); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\ @@ -9798,18 +10517,20 @@ pub unsafe fn _mm512_maskz_fixupimm_round_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm512_fixupimm_round_pd( +pub fn _mm512_fixupimm_round_pd( a: __m512d, b: __m512d, c: __m512i, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpd(a, b, c, IMM8, 0b11111111, SAE); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
imm8 is used to set the required flags reporting.\ @@ -9821,19 +10542,21 @@ pub unsafe fn _mm512_fixupimm_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm512_mask_fixupimm_round_pd( +pub fn _mm512_mask_fixupimm_round_pd( a: __m512d, k: __mmask8, b: __m512d, c: __m512i, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpd(a, b, c, IMM8, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpd(a, b, c, IMM8, k, SAE); + transmute(r) + } } /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\ @@ -9845,19 +10568,21 @@ pub unsafe fn _mm512_mask_fixupimm_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm512_maskz_fixupimm_round_pd( +pub fn _mm512_maskz_fixupimm_round_pd( k: __mmask8, a: __m512d, b: __m512d, c: __m512i, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let c = c.as_i64x8(); - let r = vfixupimmpdz(a, b, c, IMM8, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let c = c.as_i64x8(); + let r = vfixupimmpdz(a, b, c, IMM8, k, SAE); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -9878,19 +10603,21 @@ pub unsafe fn _mm512_maskz_fixupimm_round_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(1, 2, 3)] -pub unsafe fn _mm512_getmant_round_ps< +pub fn _mm512_getmant_round_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, >( a: __m512, ) -> __m512 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -9911,7 +10638,7 @@ pub unsafe fn _mm512_getmant_round_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(3, 4, 5)] -pub unsafe fn _mm512_mask_getmant_round_ps< +pub fn _mm512_mask_getmant_round_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -9920,13 +10647,15 @@ pub unsafe fn _mm512_mask_getmant_round_ps< k: __mmask16, a: __m512, ) -> __m512 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_f32x16(); - let r = vgetmantps(a, SIGN << 2 | NORM, src, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -9947,7 +10676,7 @@ pub unsafe fn _mm512_mask_getmant_round_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(2, 3, 4)] -pub unsafe fn _mm512_maskz_getmant_round_ps< +pub fn _mm512_maskz_getmant_round_ps< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -9955,12 +10684,14 @@ pub unsafe fn _mm512_maskz_getmant_round_ps< k: __mmask16, a: __m512, ) -> __m512 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let r = vgetmantps(a, SIGN << 2 | NORM, f32x16::ZERO, k, SAE); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -9981,19 +10712,21 @@ pub unsafe fn _mm512_maskz_getmant_round_ps< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(1, 2, 3)] -pub unsafe fn _mm512_getmant_round_pd< +pub fn _mm512_getmant_round_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, >( a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, 0b11111111, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, 0b11111111, SAE); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -10014,7 +10747,7 @@ pub unsafe fn _mm512_getmant_round_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(3, 4, 5)] -pub unsafe fn _mm512_mask_getmant_round_pd< +pub fn _mm512_mask_getmant_round_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -10023,13 +10756,15 @@ pub unsafe fn _mm512_mask_getmant_round_pd< k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_f64x8(); - let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } } /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -10050,7 +10785,7 @@ pub unsafe fn _mm512_mask_getmant_round_pd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(2, 3, 4)] -pub unsafe fn _mm512_maskz_getmant_round_pd< +pub fn _mm512_maskz_getmant_round_pd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -10058,28 +10793,32 @@ pub unsafe fn _mm512_maskz_getmant_round_pd< k: __mmask8, a: __m512d, ) -> __m512d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let r = vgetmantpd(a, SIGN << 2 | NORM, f64x8::ZERO, k, SAE); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. /// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epi32&expand=1737) +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epi32&expand=1737) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2dq))] -pub unsafe fn _mm512_cvtps_epi32(a: __m512) -> __m512i { - transmute(vcvtps2dq( - a.as_f32x16(), - i32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvtps_epi32(a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + i32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10089,13 +10828,15 @@ pub unsafe fn _mm512_cvtps_epi32(a: __m512) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2dq))] -pub unsafe fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { - transmute(vcvtps2dq( - a.as_f32x16(), - src.as_i32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
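A brief usage sketch (not part of the patch) for the rounding-controlled intrinsics earlier in this hunk (_mm512_scalef_round_ps and, with the same const-generic pattern, the fixupimm/getmant variants): the rounding mode is passed as a const generic built from the `_MM_FROUND_*` constants. Assumes a nightly toolchain with `stdarch_x86_avx512` and an AVX-512F CPU; the call sits in an `unsafe` block behind a runtime check because the caller does not itself enable the target feature.

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

fn main() {
    if is_x86_feature_detected!("avx512f") {
        // SAFETY: AVX-512F support was verified at runtime.
        unsafe {
            let a = _mm512_set1_ps(3.0);
            let b = _mm512_set1_ps(4.0);
            // scalef computes a * 2^floor(b) per lane: every lane is 3.0 * 16.0 = 48.0.
            let r = _mm512_scalef_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
            let out: [f32; 16] = core::mem::transmute(r);
            assert!(out.iter().all(|&x| x == 48.0));
        }
    }
}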
@@ -10105,13 +10846,15 @@ pub unsafe fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2dq))] -pub unsafe fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i { - transmute(vcvtps2dq( - a.as_f32x16(), - i32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2dq( + a.as_f32x16(), + i32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10121,9 +10864,11 @@ pub unsafe fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2dq))] -pub unsafe fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { - let convert = _mm256_cvtps_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x8(), src.as_i32x8())) +pub fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { + let convert = _mm256_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x8(), src.as_i32x8())) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10133,9 +10878,11 @@ pub unsafe fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2dq))] -pub unsafe fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i { - let convert = _mm256_cvtps_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x8(), i32x8::ZERO)) +pub fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i { + unsafe { + let convert = _mm256_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x8(), i32x8::ZERO)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10145,9 +10892,11 @@ pub unsafe fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2dq))] -pub unsafe fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - let convert = _mm_cvtps_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) +pub fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + let convert = _mm_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
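A sketch (not part of the patch) contrasting the writemask and zeromask forms of the f32-to-i32 conversion shown above. Assumes nightly with `stdarch_x86_avx512` and a runtime AVX-512F check.

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

fn main() {
    if is_x86_feature_detected!("avx512f") {
        // SAFETY: AVX-512F support was verified at runtime.
        unsafe {
            let a = _mm512_set1_ps(2.5);
            let src = _mm512_set1_epi32(-1);
            let k: __mmask16 = 0b0000_0000_1111_1111; // convert only the low 8 lanes
            // Writemask: unselected lanes keep the value from `src`.
            let merged: [i32; 16] = core::mem::transmute(_mm512_mask_cvtps_epi32(src, k, a));
            // Zeromask: unselected lanes become 0.
            let zeroed: [i32; 16] = core::mem::transmute(_mm512_maskz_cvtps_epi32(k, a));
            assert_eq!(&merged[..8], &[2; 8]); // 2.5 rounds to nearest even -> 2
            assert_eq!(&merged[8..], &[-1; 8]);
            assert_eq!(&zeroed[8..], &[0; 8]);
        }
    }
}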
@@ -10157,123 +10906,131 @@ pub unsafe fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2dq))] -pub unsafe fn _mm_maskz_cvtps_epi32(k: __mmask8, a: __m128) -> __m128i { - let convert = _mm_cvtps_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) +pub fn _mm_maskz_cvtps_epi32(k: __mmask8, a: __m128) -> __m128i { + unsafe { + let convert = _mm_cvtps_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_epu32&expand=1755) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm512_cvtps_epu32(a: __m512) -> __m512i { - transmute(vcvtps2udq( - a.as_f32x16(), - u32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvtps_epu32(a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + u32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_epu32&expand=1756) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm512_mask_cvtps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { - transmute(vcvtps2udq( - a.as_f32x16(), - src.as_u32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_epu32&expand=1343) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i { - transmute(vcvtps2udq( - a.as_f32x16(), - u32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvtps2udq( + a.as_f32x16(), + u32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epu32&expand=1752) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm256_cvtps_epu32(a: __m256) -> __m256i { - transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) +pub fn _mm256_cvtps_epu32(a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_epu32&expand=1753) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm256_mask_cvtps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { - transmute(vcvtps2udq256(a.as_f32x8(), src.as_u32x8(), k)) +pub fn _mm256_mask_cvtps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), src.as_u32x8(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_epu32&expand=1754) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm256_maskz_cvtps_epu32(k: __mmask8, a: __m256) -> __m256i { - transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, k)) +pub fn _mm256_maskz_cvtps_epu32(k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvtps2udq256(a.as_f32x8(), u32x8::ZERO, k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epu32&expand=1749) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm_cvtps_epu32(a: __m128) -> __m128i { - transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) +pub fn _mm_cvtps_epu32(a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_epu32&expand=1750) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm_mask_cvtps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - transmute(vcvtps2udq128(a.as_f32x4(), src.as_u32x4(), k)) +pub fn _mm_mask_cvtps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), src.as_u32x4(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_epu32&expand=1751) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2udq))] -pub unsafe fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i { - transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, k)) +pub fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvtps2udq128(a.as_f32x4(), u32x4::ZERO, k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. @@ -10283,13 +11040,15 @@ pub unsafe fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2pd))] -pub unsafe fn _mm512_cvtps_pd(a: __m256) -> __m512d { - transmute(vcvtps2pd( - a.as_f32x8(), - f64x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvtps_pd(a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10299,13 +11058,15 @@ pub unsafe fn _mm512_cvtps_pd(a: __m256) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2pd))] -pub unsafe fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d { - transmute(vcvtps2pd( - a.as_f32x8(), - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
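A sketch (not part of the patch) of the unsigned conversions above, using the 128-bit form, which additionally requires AVX-512VL. Nightly-only (`stdarch_x86_avx512`); both features are checked at runtime.

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        // SAFETY: the required features were verified at runtime.
        unsafe {
            let a = _mm_setr_ps(0.0, 1.5, 3_000_000_000.0, -1.0);
            let r: [u32; 4] = core::mem::transmute(_mm_cvtps_epu32(a));
            // 1.5 rounds to nearest even (2); 3e9 is exactly representable and fits in u32;
            // the negative input is out of range for an unsigned conversion.
            assert_eq!(r[1], 2);
            assert_eq!(r[2], 3_000_000_000);
        }
    }
}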
@@ -10315,13 +11076,15 @@ pub unsafe fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2pd))] -pub unsafe fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d { - transmute(vcvtps2pd( - a.as_f32x8(), - f64x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d { + unsafe { + transmute(vcvtps2pd( + a.as_f32x8(), + f64x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. @@ -10331,13 +11094,15 @@ pub unsafe fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2pd))] -pub unsafe fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d { - transmute(vcvtps2pd( - _mm512_castps512_ps256(v2).as_f32x8(), - f64x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d { + unsafe { + transmute(vcvtps2pd( + _mm512_castps512_ps256(v2).as_f32x8(), + f64x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10347,13 +11112,15 @@ pub unsafe fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2pd))] -pub unsafe fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> __m512d { - transmute(vcvtps2pd( - _mm512_castps512_ps256(v2).as_f32x8(), - src.as_f64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> __m512d { + unsafe { + transmute(vcvtps2pd( + _mm512_castps512_ps256(v2).as_f32x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. @@ -10363,13 +11130,15 @@ pub unsafe fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm512_cvtpd_ps(a: __m512d) -> __m256 { - transmute(vcvtpd2ps( - a.as_f64x8(), - f32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvtpd_ps(a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + f32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -10379,13 +11148,15 @@ pub unsafe fn _mm512_cvtpd_ps(a: __m512d) -> __m256 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m256 { - transmute(vcvtpd2ps( - a.as_f64x8(), - src.as_f32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + src.as_f32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10395,13 +11166,15 @@ pub unsafe fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m2 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 { - transmute(vcvtpd2ps( - a.as_f64x8(), - f32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 { + unsafe { + transmute(vcvtpd2ps( + a.as_f64x8(), + f32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10411,9 +11184,11 @@ pub unsafe fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m128 { - let convert = _mm256_cvtpd_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) +pub fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m128 { + unsafe { + let convert = _mm256_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10423,9 +11198,11 @@ pub unsafe fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 { - let convert = _mm256_cvtpd_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) +pub fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 { + unsafe { + let convert = _mm256_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
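A sketch (not part of the patch) round-tripping through the widening and narrowing conversions above (f32 -> f64 -> f32). Nightly-only (`stdarch_x86_avx512`), with a runtime AVX-512F check.

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

fn main() {
    if is_x86_feature_detected!("avx512f") {
        // SAFETY: AVX-512F support was verified at runtime.
        unsafe {
            let a: __m256 = core::mem::transmute([1.25f32; 8]);
            let wide = _mm512_cvtps_pd(a);      // 8 x f32 -> 8 x f64, exact for 1.25
            let narrow = _mm512_cvtpd_ps(wide); // back to 8 x f32
            let out: [f32; 8] = core::mem::transmute(narrow);
            assert_eq!(out, [1.25f32; 8]);
        }
    }
}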
@@ -10435,9 +11212,11 @@ pub unsafe fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 { - let convert = _mm_cvtpd_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) +pub fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 { + unsafe { + let convert = _mm_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10447,9 +11226,11 @@ pub unsafe fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 { - let convert = _mm_cvtpd_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) +pub fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 { + unsafe { + let convert = _mm_cvtpd_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. @@ -10459,13 +11240,15 @@ pub unsafe fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub unsafe fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i { - transmute(vcvtpd2dq( - a.as_f64x8(), - i32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + i32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10475,13 +11258,15 @@ pub unsafe fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub unsafe fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { - transmute(vcvtpd2dq( - a.as_f64x8(), - src.as_i32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -10491,13 +11276,15 @@ pub unsafe fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub unsafe fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i { - transmute(vcvtpd2dq( - a.as_f64x8(), - i32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2dq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10507,9 +11294,11 @@ pub unsafe fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub unsafe fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { - let convert = _mm256_cvtpd_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) +pub fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { + let convert = _mm256_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10519,9 +11308,11 @@ pub unsafe fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub unsafe fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i { - let convert = _mm256_cvtpd_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) +pub fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { + let convert = _mm256_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10531,9 +11322,11 @@ pub unsafe fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub unsafe fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - let convert = _mm_cvtpd_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) +pub fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { + let convert = _mm_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), src.as_i32x4())) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -10543,9 +11336,11 @@ pub unsafe fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] -pub unsafe fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i { - let convert = _mm_cvtpd_epi32(a); - transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) +pub fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { + let convert = _mm_cvtpd_epi32(a); + transmute(simd_select_bitmask(k, convert.as_i32x4(), i32x4::ZERO)) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. @@ -10555,13 +11350,15 @@ pub unsafe fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i { - transmute(vcvtpd2udq( - a.as_f64x8(), - u32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + u32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10571,13 +11368,15 @@ pub unsafe fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { - transmute(vcvtpd2udq( - a.as_f64x8(), - src.as_u32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + src.as_u32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10587,13 +11386,15 @@ pub unsafe fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i { - transmute(vcvtpd2udq( - a.as_f64x8(), - u32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvtpd2udq( + a.as_f64x8(), + u32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. 
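A sketch (not part of the patch) of the masked f64-to-i32 narrowing above: eight doubles become a 256-bit integer vector, and lanes whose mask bit is clear keep the values from `src`. Nightly-only (`stdarch_x86_avx512`).

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

fn main() {
    if is_x86_feature_detected!("avx512f") {
        // SAFETY: AVX-512F support was verified at runtime.
        unsafe {
            let a = _mm512_set1_pd(-7.5);
            let src: __m256i = core::mem::transmute([100i32; 8]);
            let k: __mmask8 = 0b0000_1111; // convert only the low 4 lanes
            let r: [i32; 8] = core::mem::transmute(_mm512_mask_cvtpd_epi32(src, k, a));
            assert_eq!(&r[..4], &[-8; 4]); // -7.5 rounds to nearest even -> -8
            assert_eq!(&r[4..], &[100; 4]);
        }
    }
}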
@@ -10603,8 +11404,8 @@ pub unsafe fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i { - transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, 0b11111111)) +pub fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, 0b11111111)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10614,8 +11415,8 @@ pub unsafe fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { - transmute(vcvtpd2udq256(a.as_f64x4(), src.as_u32x4(), k)) +pub fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), src.as_u32x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10625,8 +11426,8 @@ pub unsafe fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i { - transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, k)) +pub fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvtpd2udq256(a.as_f64x4(), u32x4::ZERO, k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst. @@ -10636,8 +11437,8 @@ pub unsafe fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm_cvtpd_epu32(a: __m128d) -> __m128i { - transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, 0b11111111)) +pub fn _mm_cvtpd_epu32(a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, 0b11111111)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -10647,8 +11448,8 @@ pub unsafe fn _mm_cvtpd_epu32(a: __m128d) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvtpd2udq128(a.as_f64x2(), src.as_u32x4(), k)) +pub fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), src.as_u32x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10658,8 +11459,8 @@ pub unsafe fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq))] -pub unsafe fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, k)) +pub fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvtpd2udq128(a.as_f64x2(), u32x4::ZERO, k)) } } /// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. @@ -10669,18 +11470,20 @@ pub unsafe fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 { - let r: f32x8 = vcvtpd2ps( - v2.as_f64x8(), - f32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - ); - simd_shuffle!( - r, - f32x8::ZERO, - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], - ) +pub fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 { + unsafe { + let r: f32x8 = vcvtpd2ps( + v2.as_f64x8(), + f32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ); + simd_shuffle!( + r, + f32x8::ZERO, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } } /// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0. 
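A sketch (not part of the patch) of the 128-bit unsigned f64-to-u32 conversion above with a zeromask; the two converted lanes land in the low half of the result and the upper half is zeroed. Requires AVX-512F and AVX-512VL; nightly-only (`stdarch_x86_avx512`).

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

fn main() {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        // SAFETY: the required features were verified at runtime.
        unsafe {
            let a: __m128d = core::mem::transmute([4.0f64, 9.0]);
            let k: __mmask8 = 0b01; // keep lane 0, zero lane 1
            let r: [u32; 4] = core::mem::transmute(_mm_maskz_cvtpd_epu32(k, a));
            assert_eq!(r, [4, 0, 0, 0]);
        }
    }
}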
@@ -10690,18 +11493,20 @@ pub unsafe fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] -pub unsafe fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> __m512 { - let r: f32x8 = vcvtpd2ps( - v2.as_f64x8(), - _mm512_castps512_ps256(src).as_f32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - ); - simd_shuffle!( - r, - f32x8::ZERO, - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], - ) +pub fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> __m512 { + unsafe { + let r: f32x8 = vcvtpd2ps( + v2.as_f64x8(), + _mm512_castps512_ps256(src).as_f32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + ); + simd_shuffle!( + r, + f32x8::ZERO, + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } } /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst. @@ -10711,9 +11516,11 @@ pub unsafe fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbd))] -pub unsafe fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i { - let a = a.as_i8x16(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i8x16(); + transmute::(simd_cast(a)) + } } /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10723,9 +11530,11 @@ pub unsafe fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbd))] -pub unsafe fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - let convert = _mm512_cvtepi8_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, src.as_i32x16())) +pub fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, src.as_i32x16())) + } } /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10735,9 +11544,11 @@ pub unsafe fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbd))] -pub unsafe fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i { - let convert = _mm512_cvtepi8_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) +pub fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) + } } /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
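A sketch (not part of the patch) of `_mm512_cvtpd_pslo` above: eight f64 values narrow into the lower eight f32 lanes of a 512-bit result and the upper half is zeroed. Nightly-only (`stdarch_x86_avx512`).

#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

fn main() {
    if is_x86_feature_detected!("avx512f") {
        // SAFETY: AVX-512F support was verified at runtime.
        unsafe {
            let v2 = _mm512_set1_pd(0.5);
            let out: [f32; 16] = core::mem::transmute(_mm512_cvtpd_pslo(v2));
            assert_eq!(&out[..8], &[0.5f32; 8]);
            assert_eq!(&out[8..], &[0.0f32; 8]);
        }
    }
}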
@@ -10747,9 +11558,11 @@ pub unsafe fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbd))] -pub unsafe fn _mm256_mask_cvtepi8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi8_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) +pub fn _mm256_mask_cvtepi8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) + } } /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10759,9 +11572,11 @@ pub unsafe fn _mm256_mask_cvtepi8_epi32(src: __m256i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbd))] -pub unsafe fn _mm256_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi8_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) +pub fn _mm256_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) + } } /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10771,9 +11586,11 @@ pub unsafe fn _mm256_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbd))] -pub unsafe fn _mm_mask_cvtepi8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi8_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) +pub fn _mm_mask_cvtepi8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi8_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, src.as_i32x4())) + } } /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10783,9 +11600,11 @@ pub unsafe fn _mm_mask_cvtepi8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbd))] -pub unsafe fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi8_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) +pub fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi8_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) + } } /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst. 
@@ -10795,10 +11614,12 @@ pub unsafe fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbq))] -pub unsafe fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i { - let a = a.as_i8x16(); - let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute::(simd_cast(v64)) +pub fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i8x16(); + let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v64)) + } } /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10808,9 +11629,11 @@ pub unsafe fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbq))] -pub unsafe fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - let convert = _mm512_cvtepi8_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) +pub fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, src.as_i64x8())) + } } /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10820,9 +11643,11 @@ pub unsafe fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbq))] -pub unsafe fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i { - let convert = _mm512_cvtepi8_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) +pub fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi8_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } } /// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10832,9 +11657,11 @@ pub unsafe fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbq))] -pub unsafe fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi8_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +pub fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } } /// Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -10844,9 +11671,11 @@ pub unsafe fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbq))] -pub unsafe fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi8_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) +pub fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } } /// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10856,9 +11685,11 @@ pub unsafe fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbq))] -pub unsafe fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi8_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +pub fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi8_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } } /// Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10868,9 +11699,11 @@ pub unsafe fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxbq))] -pub unsafe fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi8_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) +pub fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi8_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst. @@ -10880,9 +11713,11 @@ pub unsafe fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbd))] -pub unsafe fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i { - let a = a.as_u8x16(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i { + unsafe { + let a = a.as_u8x16(); + transmute::(simd_cast(a)) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -10892,9 +11727,11 @@ pub unsafe fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbd))] -pub unsafe fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, src.as_i32x16())) +pub fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, src.as_i32x16())) + } } /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10904,9 +11741,11 @@ pub unsafe fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbd))] -pub unsafe fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i { - let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) +pub fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10916,9 +11755,11 @@ pub unsafe fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbd))] -pub unsafe fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) +pub fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) + } } /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10928,9 +11769,11 @@ pub unsafe fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbd))] -pub unsafe fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) +pub fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -10940,9 +11783,11 @@ pub unsafe fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbd))] -pub unsafe fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu8_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) +pub fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, src.as_i32x4())) + } } /// Zero extend packed unsigned 8-bit integers in th elow 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -10952,9 +11797,11 @@ pub unsafe fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbd))] -pub unsafe fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu8_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) +pub fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in the low 8 byte sof a to packed 64-bit integers, and store the results in dst. @@ -10964,10 +11811,12 @@ pub unsafe fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbq))] -pub unsafe fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i { - let a = a.as_u8x16(); - let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute::(simd_cast(v64)) +pub fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i { + unsafe { + let a = a.as_u8x16(); + let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v64)) + } } /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -10977,9 +11826,11 @@ pub unsafe fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbq))] -pub unsafe fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - let convert = _mm512_cvtepu8_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) +pub fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, src.as_i64x8())) + } } /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -10989,9 +11840,11 @@ pub unsafe fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbq))] -pub unsafe fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i { - let convert = _mm512_cvtepu8_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) +pub fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu8_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11001,9 +11854,11 @@ pub unsafe fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbq))] -pub unsafe fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +pub fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } } /// Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11013,9 +11868,11 @@ pub unsafe fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbq))] -pub unsafe fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) +pub fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu8_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } } /// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11025,9 +11882,11 @@ pub unsafe fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbq))] -pub unsafe fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu8_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +pub fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } } /// Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -11037,9 +11896,11 @@ pub unsafe fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxbq))] -pub unsafe fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu8_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) +pub fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu8_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) + } } /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst. @@ -11049,9 +11910,11 @@ pub unsafe fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwd))] -pub unsafe fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i { - let a = a.as_i16x16(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i { + unsafe { + let a = a.as_i16x16(); + transmute::(simd_cast(a)) + } } /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11061,9 +11924,11 @@ pub unsafe fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwd))] -pub unsafe fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { - let convert = _mm512_cvtepi16_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, src.as_i32x16())) +pub fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi16_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, src.as_i32x16())) + } } /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11073,9 +11938,11 @@ pub unsafe fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwd))] -pub unsafe fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i { - let convert = _mm512_cvtepi16_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) +pub fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi16_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) + } } /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -11085,9 +11952,11 @@ pub unsafe fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwd))] -pub unsafe fn _mm256_mask_cvtepi16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi16_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) +pub fn _mm256_mask_cvtepi16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi16_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) + } } /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11097,9 +11966,11 @@ pub unsafe fn _mm256_mask_cvtepi16_epi32(src: __m256i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwd))] -pub unsafe fn _mm256_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi16_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) +pub fn _mm256_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi16_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) + } } /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11109,9 +11980,11 @@ pub unsafe fn _mm256_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwd))] -pub unsafe fn _mm_mask_cvtepi16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi16_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) +pub fn _mm_mask_cvtepi16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, src.as_i32x4())) + } } /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11121,9 +11994,11 @@ pub unsafe fn _mm_mask_cvtepi16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwd))] -pub unsafe fn _mm_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi16_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) +pub fn _mm_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) + } } /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst. 
@@ -11133,9 +12008,11 @@ pub unsafe fn _mm_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwq))] -pub unsafe fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i { - let a = a.as_i16x8(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i16x8(); + transmute::(simd_cast(a)) + } } /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11145,9 +12022,11 @@ pub unsafe fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwq))] -pub unsafe fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - let convert = _mm512_cvtepi16_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) +pub fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi16_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, src.as_i64x8())) + } } /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11157,9 +12036,11 @@ pub unsafe fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwq))] -pub unsafe fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i { - let convert = _mm512_cvtepi16_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) +pub fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi16_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } } /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11169,9 +12050,11 @@ pub unsafe fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwq))] -pub unsafe fn _mm256_mask_cvtepi16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +pub fn _mm256_mask_cvtepi16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } } /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -11181,9 +12064,11 @@ pub unsafe fn _mm256_mask_cvtepi16_epi64(src: __m256i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwq))] -pub unsafe fn _mm256_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) +pub fn _mm256_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } } /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11193,9 +12078,11 @@ pub unsafe fn _mm256_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwq))] -pub unsafe fn _mm_mask_cvtepi16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi16_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +pub fn _mm_mask_cvtepi16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } } /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11205,9 +12092,11 @@ pub unsafe fn _mm_mask_cvtepi16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxwq))] -pub unsafe fn _mm_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi16_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) +pub fn _mm_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst. @@ -11217,9 +12106,11 @@ pub unsafe fn _mm_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwd))] -pub unsafe fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i { - let a = a.as_u16x16(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i { + unsafe { + let a = a.as_u16x16(); + transmute::(simd_cast(a)) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -11229,9 +12120,11 @@ pub unsafe fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwd))] -pub unsafe fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { - let convert = _mm512_cvtepu16_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, src.as_i32x16())) +pub fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu16_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, src.as_i32x16())) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11241,9 +12134,11 @@ pub unsafe fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwd))] -pub unsafe fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i { - let convert = _mm512_cvtepu16_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) +pub fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu16_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, convert, i32x16::ZERO)) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11253,9 +12148,11 @@ pub unsafe fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwd))] -pub unsafe fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu16_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) +pub fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu16_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11265,9 +12162,11 @@ pub unsafe fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwd))] -pub unsafe fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu16_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) +pub fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu16_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -11277,9 +12176,11 @@ pub unsafe fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwd))] -pub unsafe fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu16_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) +pub fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu16_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, src.as_i32x4())) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11289,9 +12190,11 @@ pub unsafe fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwd))] -pub unsafe fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu16_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) +pub fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu16_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst. @@ -11301,9 +12204,11 @@ pub unsafe fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwq))] -pub unsafe fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i { - let a = a.as_u16x8(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i { + unsafe { + let a = a.as_u16x8(); + transmute::(simd_cast(a)) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11313,9 +12218,11 @@ pub unsafe fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwq))] -pub unsafe fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - let convert = _mm512_cvtepu16_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) +pub fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu16_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, src.as_i64x8())) + } } /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -11325,9 +12232,11 @@ pub unsafe fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwq))] -pub unsafe fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i { - let convert = _mm512_cvtepu16_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) +pub fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu16_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } } /// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11337,9 +12246,11 @@ pub unsafe fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwq))] -pub unsafe fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu16_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +pub fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } } /// Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11349,9 +12260,11 @@ pub unsafe fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwq))] -pub unsafe fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu16_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) +pub fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu16_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } } /// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11361,9 +12274,11 @@ pub unsafe fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwq))] -pub unsafe fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu16_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +pub fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } } /// Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -11373,9 +12288,11 @@ pub unsafe fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxwq))] -pub unsafe fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu16_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) +pub fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu16_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) + } } /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst. @@ -11385,9 +12302,11 @@ pub unsafe fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxdq))] -pub unsafe fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i { - let a = a.as_i32x8(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i { + unsafe { + let a = a.as_i32x8(); + transmute::(simd_cast(a)) + } } /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11397,9 +12316,11 @@ pub unsafe fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxdq))] -pub unsafe fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { - let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) +pub fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, src.as_i64x8())) + } } /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11409,9 +12330,11 @@ pub unsafe fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxdq))] -pub unsafe fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i { - let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) +pub fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepi32_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } } /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -11421,9 +12344,11 @@ pub unsafe fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxdq))] -pub unsafe fn _mm256_mask_cvtepi32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +pub fn _mm256_mask_cvtepi32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } } /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11433,9 +12358,11 @@ pub unsafe fn _mm256_mask_cvtepi32_epi64(src: __m256i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxdq))] -pub unsafe fn _mm256_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) +pub fn _mm256_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepi32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } } /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11445,9 +12372,11 @@ pub unsafe fn _mm256_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxdq))] -pub unsafe fn _mm_mask_cvtepi32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi32_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +pub fn _mm_mask_cvtepi32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi32_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } } /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11457,9 +12386,11 @@ pub unsafe fn _mm_mask_cvtepi32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsxdq))] -pub unsafe fn _mm_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepi32_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) +pub fn _mm_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepi32_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) + } } /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst. 
@@ -11469,9 +12400,11 @@ pub unsafe fn _mm_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxdq))] -pub unsafe fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i { - let a = a.as_u32x8(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i { + unsafe { + let a = a.as_u32x8(); + transmute::(simd_cast(a)) + } } /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11481,9 +12414,11 @@ pub unsafe fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxdq))] -pub unsafe fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { - let convert = _mm512_cvtepu32_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, src.as_i64x8())) +pub fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu32_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, src.as_i64x8())) + } } /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11493,9 +12428,11 @@ pub unsafe fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxdq))] -pub unsafe fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i { - let convert = _mm512_cvtepu32_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) +pub fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let convert = _mm512_cvtepu32_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, convert, i64x8::ZERO)) + } } /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11505,9 +12442,11 @@ pub unsafe fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxdq))] -pub unsafe fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu32_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, src.as_i64x4())) +pub fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, src.as_i64x4())) + } } /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -11517,9 +12456,11 @@ pub unsafe fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxdq))] -pub unsafe fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i { - let convert = _mm256_cvtepu32_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) +pub fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let convert = _mm256_cvtepu32_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, convert, i64x4::ZERO)) + } } /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11529,9 +12470,11 @@ pub unsafe fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxdq))] -pub unsafe fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu32_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, src.as_i64x2())) +pub fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu32_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, src.as_i64x2())) + } } /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11541,9 +12484,11 @@ pub unsafe fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovzxdq))] -pub unsafe fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i { - let convert = _mm_cvtepu32_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) +pub fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let convert = _mm_cvtepu32_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, convert, i64x2::ZERO)) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. @@ -11553,9 +12498,11 @@ pub unsafe fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub unsafe fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 { - let a = a.as_i32x16(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 { + unsafe { + let a = a.as_i32x16(); + transmute::(simd_cast(a)) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -11565,9 +12512,11 @@ pub unsafe fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub unsafe fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 { - let convert = _mm512_cvtepi32_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, convert, src.as_f32x16())) +pub fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 { + unsafe { + let convert = _mm512_cvtepi32_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, convert, src.as_f32x16())) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11577,9 +12526,11 @@ pub unsafe fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub unsafe fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 { - let convert = _mm512_cvtepi32_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, convert, f32x16::ZERO)) +pub fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + let convert = _mm512_cvtepi32_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, convert, f32x16::ZERO)) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11589,9 +12540,11 @@ pub unsafe fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub unsafe fn _mm256_mask_cvtepi32_ps(src: __m256, k: __mmask8, a: __m256i) -> __m256 { - let convert = _mm256_cvtepi32_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, convert, src.as_f32x8())) +pub fn _mm256_mask_cvtepi32_ps(src: __m256, k: __mmask8, a: __m256i) -> __m256 { + unsafe { + let convert = _mm256_cvtepi32_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, convert, src.as_f32x8())) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11601,9 +12554,11 @@ pub unsafe fn _mm256_mask_cvtepi32_ps(src: __m256, k: __mmask8, a: __m256i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub unsafe fn _mm256_maskz_cvtepi32_ps(k: __mmask8, a: __m256i) -> __m256 { - let convert = _mm256_cvtepi32_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, convert, f32x8::ZERO)) +pub fn _mm256_maskz_cvtepi32_ps(k: __mmask8, a: __m256i) -> __m256 { + unsafe { + let convert = _mm256_cvtepi32_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, convert, f32x8::ZERO)) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -11613,9 +12568,11 @@ pub unsafe fn _mm256_maskz_cvtepi32_ps(k: __mmask8, a: __m256i) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub unsafe fn _mm_mask_cvtepi32_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { - let convert = _mm_cvtepi32_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, convert, src.as_f32x4())) +pub fn _mm_mask_cvtepi32_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { + let convert = _mm_cvtepi32_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, convert, src.as_f32x4())) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11625,9 +12582,11 @@ pub unsafe fn _mm_mask_cvtepi32_ps(src: __m128, k: __mmask8, a: __m128i) -> __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] -pub unsafe fn _mm_maskz_cvtepi32_ps(k: __mmask8, a: __m128i) -> __m128 { - let convert = _mm_cvtepi32_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, convert, f32x4::ZERO)) +pub fn _mm_maskz_cvtepi32_ps(k: __mmask8, a: __m128i) -> __m128 { + unsafe { + let convert = _mm_cvtepi32_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, convert, f32x4::ZERO)) + } } /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. @@ -11637,9 +12596,11 @@ pub unsafe fn _mm_maskz_cvtepi32_ps(k: __mmask8, a: __m128i) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d { - let a = a.as_i32x8(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d { + unsafe { + let a = a.as_i32x8(); + transmute::(simd_cast(a)) + } } /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11649,9 +12610,11 @@ pub unsafe fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d { - let convert = _mm512_cvtepi32_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, convert, src.as_f64x8())) +pub fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d { + unsafe { + let convert = _mm512_cvtepi32_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, convert, src.as_f64x8())) + } } /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -11661,9 +12624,11 @@ pub unsafe fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d { - let convert = _mm512_cvtepi32_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, convert, f64x8::ZERO)) +pub fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d { + unsafe { + let convert = _mm512_cvtepi32_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, convert, f64x8::ZERO)) + } } /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11673,9 +12638,11 @@ pub unsafe fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm256_mask_cvtepi32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d { - let convert = _mm256_cvtepi32_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, convert, src.as_f64x4())) +pub fn _mm256_mask_cvtepi32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d { + unsafe { + let convert = _mm256_cvtepi32_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, convert, src.as_f64x4())) + } } /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11685,9 +12652,11 @@ pub unsafe fn _mm256_mask_cvtepi32_pd(src: __m256d, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm256_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m256d { - let convert = _mm256_cvtepi32_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, convert, f64x4::ZERO)) +pub fn _mm256_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m256d { + unsafe { + let convert = _mm256_cvtepi32_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, convert, f64x4::ZERO)) + } } /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11697,9 +12666,11 @@ pub unsafe fn _mm256_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm_mask_cvtepi32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { - let convert = _mm_cvtepi32_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, convert, src.as_f64x2())) +pub fn _mm_mask_cvtepi32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let convert = _mm_cvtepi32_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, convert, src.as_f64x2())) + } } /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -11709,9 +12680,11 @@ pub unsafe fn _mm_mask_cvtepi32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m128d { - let convert = _mm_cvtepi32_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, convert, f64x2::ZERO)) +pub fn _mm_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let convert = _mm_cvtepi32_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, convert, f64x2::ZERO)) + } } /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. @@ -11721,9 +12694,11 @@ pub unsafe fn _mm_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2ps))] -pub unsafe fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 { - let a = a.as_u32x16(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 { + unsafe { + let a = a.as_u32x16(); + transmute::(simd_cast(a)) + } } /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11733,9 +12708,11 @@ pub unsafe fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2ps))] -pub unsafe fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 { - let convert = _mm512_cvtepu32_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, convert, src.as_f32x16())) +pub fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 { + unsafe { + let convert = _mm512_cvtepu32_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, convert, src.as_f32x16())) + } } /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11745,9 +12722,11 @@ pub unsafe fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2ps))] -pub unsafe fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 { - let convert = _mm512_cvtepu32_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, convert, f32x16::ZERO)) +pub fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + let convert = _mm512_cvtepu32_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, convert, f32x16::ZERO)) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. 
@@ -11757,9 +12736,11 @@ pub unsafe fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d { - let a = a.as_u32x8(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d { + unsafe { + let a = a.as_u32x8(); + transmute::(simd_cast(a)) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11769,9 +12750,11 @@ pub unsafe fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d { - let convert = _mm512_cvtepu32_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, convert, src.as_f64x8())) +pub fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d { + unsafe { + let convert = _mm512_cvtepu32_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, convert, src.as_f64x8())) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11781,9 +12764,11 @@ pub unsafe fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d { - let convert = _mm512_cvtepu32_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, convert, f64x8::ZERO)) +pub fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d { + unsafe { + let convert = _mm512_cvtepu32_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, convert, f64x8::ZERO)) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. @@ -11793,9 +12778,11 @@ pub unsafe fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d { - let a = a.as_u32x4(); - transmute::(simd_cast(a)) +pub fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d { + unsafe { + let a = a.as_u32x4(); + transmute::(simd_cast(a)) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
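The `vcvtudq2ps`/`vcvtudq2pd` forms interpret the source lanes as unsigned, so large values keep their magnitude instead of wrapping negative. A small illustrative sketch under the same assumptions as above (hypothetical helper name):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn unsigned_vs_signed(a: __m512i) -> (__m512, __m512) {
    // For a lane holding 0xFFFF_FFFF, the unsigned form yields about 4.29e9,
    // while the signed form yields -1.0.
    (_mm512_cvtepu32_ps(a), _mm512_cvtepi32_ps(a))
}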
@@ -11805,9 +12792,11 @@ pub unsafe fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d { - let convert = _mm256_cvtepu32_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, convert, src.as_f64x4())) +pub fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d { + unsafe { + let convert = _mm256_cvtepu32_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, convert, src.as_f64x4())) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11817,9 +12806,11 @@ pub unsafe fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d { - let convert = _mm256_cvtepu32_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, convert, f64x4::ZERO)) +pub fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d { + unsafe { + let convert = _mm256_cvtepu32_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, convert, f64x4::ZERO)) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst. @@ -11829,10 +12820,12 @@ pub unsafe fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm_cvtepu32_pd(a: __m128i) -> __m128d { - let a = a.as_u32x4(); - let u64: u32x2 = simd_shuffle!(a, a, [0, 1]); - transmute::(simd_cast(u64)) +pub fn _mm_cvtepu32_pd(a: __m128i) -> __m128d { + unsafe { + let a = a.as_u32x4(); + let u64: u32x2 = simd_shuffle!(a, a, [0, 1]); + transmute::(simd_cast(u64)) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11842,9 +12835,11 @@ pub unsafe fn _mm_cvtepu32_pd(a: __m128i) -> __m128d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { - let convert = _mm_cvtepu32_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, convert, src.as_f64x2())) +pub fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let convert = _mm_cvtepu32_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, convert, src.as_f64x2())) + } } /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -11854,9 +12849,11 @@ pub unsafe fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d { - let convert = _mm_cvtepu32_pd(a).as_f64x2(); - transmute(simd_select_bitmask(k, convert, f64x2::ZERO)) +pub fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d { + unsafe { + let convert = _mm_cvtepu32_pd(a).as_f64x2(); + transmute(simd_select_bitmask(k, convert, f64x2::ZERO)) + } } /// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. @@ -11866,10 +12863,12 @@ pub unsafe fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d { - let v2 = v2.as_i32x16(); - let v256: i32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute::(simd_cast(v256)) +pub fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d { + unsafe { + let v2 = v2.as_i32x16(); + let v256: i32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v256)) + } } /// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11879,9 +12878,11 @@ pub unsafe fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] -pub unsafe fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d { - let convert = _mm512_cvtepi32lo_pd(v2).as_f64x8(); - transmute(simd_select_bitmask(k, convert, src.as_f64x8())) +pub fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d { + unsafe { + let convert = _mm512_cvtepi32lo_pd(v2).as_f64x8(); + transmute(simd_select_bitmask(k, convert, src.as_f64x8())) + } } /// Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst. @@ -11891,10 +12892,12 @@ pub unsafe fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d { - let v2 = v2.as_u32x16(); - let v256: u32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]); - transmute::(simd_cast(v256)) +pub fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d { + unsafe { + let v2 = v2.as_u32x16(); + let v256: u32x8 = simd_shuffle!(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]); + transmute::(simd_cast(v256)) + } } /// Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
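The `*lo_pd` helpers consume only the low eight 32-bit lanes of the 512-bit source, since eight doubles already fill a ZMM register; the masked form follows below. Roughly the same values can be obtained by casting down to 256 bits first, as in this sketch (hypothetical helper name; assumes the cast intrinsic is likewise safe after this change):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn lower_half_to_pd(v2: __m512i) -> (__m512d, __m512d) {
    // Direct lower-half conversion ...
    let direct = _mm512_cvtepi32lo_pd(v2);
    // ... and the same result via an explicit 512-bit -> 256-bit cast.
    let via_cast = _mm512_cvtepi32_pd(_mm512_castsi512_si256(v2));
    (direct, via_cast)
}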
@@ -11904,9 +12907,11 @@ pub unsafe fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2pd))] -pub unsafe fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d { - let convert = _mm512_cvtepu32lo_pd(v2).as_f64x8(); - transmute(simd_select_bitmask(k, convert, src.as_f64x8())) +pub fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d { + unsafe { + let convert = _mm512_cvtepu32lo_pd(v2).as_f64x8(); + transmute(simd_select_bitmask(k, convert, src.as_f64x8())) + } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. @@ -11916,9 +12921,11 @@ pub unsafe fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i { - let a = a.as_i32x16(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i { + unsafe { + let a = a.as_i32x16(); + transmute::(simd_cast(a)) + } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11928,9 +12935,11 @@ pub unsafe fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { - let convert = _mm512_cvtepi32_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, src.as_i16x16())) +pub fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi32_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, src.as_i16x16())) + } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11940,9 +12949,11 @@ pub unsafe fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { - let convert = _mm512_cvtepi32_epi16(a).as_i16x16(); - transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) +pub fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi32_epi16(a).as_i16x16(); + transmute(simd_select_bitmask(k, convert, i16x16::ZERO)) + } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. 
@@ -11952,9 +12963,11 @@ pub unsafe fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm256_cvtepi32_epi16(a: __m256i) -> __m128i { - let a = a.as_i32x8(); - transmute::(simd_cast(a)) +pub fn _mm256_cvtepi32_epi16(a: __m256i) -> __m128i { + unsafe { + let a = a.as_i32x8(); + transmute::(simd_cast(a)) + } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -11964,9 +12977,11 @@ pub unsafe fn _mm256_cvtepi32_epi16(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm256_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, src.as_i16x8())) +pub fn _mm256_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -11976,9 +12991,11 @@ pub unsafe fn _mm256_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm256_maskz_cvtepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { - let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) +pub fn _mm256_maskz_cvtepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi32_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. @@ -11988,8 +13005,8 @@ pub unsafe fn _mm256_maskz_cvtepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm_cvtepi32_epi16(a: __m128i) -> __m128i { - transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) +pub fn _mm_cvtepi32_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
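These `vpmovdw` conversions truncate each 32-bit lane to its low 16 bits; the saturating `vpmovsdw` counterparts appear further down. A sketch under the same assumptions (hypothetical helper name):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn narrow_32_to_16(a: __m512i) -> (__m256i, __m256i) {
    // 0x0001_2345 becomes 0x2345: only the low 16 bits of each lane survive.
    let all = _mm512_cvtepi32_epi16(a);
    // With a zeromask, only the low eight results are kept; the rest are zeroed.
    let low_half = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a);
    (all, low_half)
}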
@@ -11999,8 +13016,8 @@ pub unsafe fn _mm_cvtepi32_epi16(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovdw128(a.as_i32x4(), src.as_i16x8(), k)) +pub fn _mm_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), src.as_i16x8(), k)) } } /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12010,8 +13027,8 @@ pub unsafe fn _mm_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdw))] -pub unsafe fn _mm_maskz_cvtepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, k)) +pub fn _mm_maskz_cvtepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovdw128(a.as_i32x4(), i16x8::ZERO, k)) } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. @@ -12021,9 +13038,11 @@ pub unsafe fn _mm_maskz_cvtepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i { - let a = a.as_i32x16(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i { + unsafe { + let a = a.as_i32x16(); + transmute::(simd_cast(a)) + } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12033,9 +13052,11 @@ pub unsafe fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { - let convert = _mm512_cvtepi32_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, convert, src.as_i8x16())) +pub fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { + unsafe { + let convert = _mm512_cvtepi32_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, convert, src.as_i8x16())) + } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -12045,9 +13066,11 @@ pub unsafe fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { - let convert = _mm512_cvtepi32_epi8(a).as_i8x16(); - transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) +pub fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { + unsafe { + let convert = _mm512_cvtepi32_epi8(a).as_i8x16(); + transmute(simd_select_bitmask(k, convert, i8x16::ZERO)) + } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. @@ -12057,8 +13080,8 @@ pub unsafe fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm256_cvtepi32_epi8(a: __m256i) -> __m128i { - transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) +pub fn _mm256_cvtepi32_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12068,8 +13091,8 @@ pub unsafe fn _mm256_cvtepi32_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm256_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovdb256(a.as_i32x8(), src.as_i8x16(), k)) +pub fn _mm256_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovdb256(a.as_i32x8(), src.as_i8x16(), k)) } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12079,8 +13102,8 @@ pub unsafe fn _mm256_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm256_maskz_cvtepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, k)) +pub fn _mm256_maskz_cvtepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovdb256(a.as_i32x8(), i8x16::ZERO, k)) } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. @@ -12090,8 +13113,8 @@ pub unsafe fn _mm256_maskz_cvtepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm_cvtepi32_epi8(a: __m128i) -> __m128i { - transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) +pub fn _mm_cvtepi32_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -12101,8 +13124,8 @@ pub unsafe fn _mm_cvtepi32_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovdb128(a.as_i32x4(), src.as_i8x16(), k)) +pub fn _mm_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovdb128(a.as_i32x4(), src.as_i8x16(), k)) } } /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12112,8 +13135,8 @@ pub unsafe fn _mm_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovdb))] -pub unsafe fn _mm_maskz_cvtepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, k)) +pub fn _mm_maskz_cvtepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovdb128(a.as_i32x4(), i8x16::ZERO, k)) } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst. @@ -12123,9 +13146,11 @@ pub unsafe fn _mm_maskz_cvtepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i { - let a = a.as_i64x8(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i { + unsafe { + let a = a.as_i64x8(); + transmute::(simd_cast(a)) + } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12135,9 +13160,11 @@ pub unsafe fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { - let convert = _mm512_cvtepi64_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, src.as_i32x8())) +pub fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi64_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, src.as_i32x8())) + } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -12147,9 +13174,11 @@ pub unsafe fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { - let convert = _mm512_cvtepi64_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) +pub fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { + let convert = _mm512_cvtepi64_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, convert, i32x8::ZERO)) + } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst. @@ -12159,9 +13188,11 @@ pub unsafe fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm256_cvtepi64_epi32(a: __m256i) -> __m128i { - let a = a.as_i64x4(); - transmute::(simd_cast(a)) +pub fn _mm256_cvtepi64_epi32(a: __m256i) -> __m128i { + unsafe { + let a = a.as_i64x4(); + transmute::(simd_cast(a)) + } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12171,9 +13202,11 @@ pub unsafe fn _mm256_cvtepi64_epi32(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm256_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - let convert = _mm256_cvtepi64_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, src.as_i32x4())) +pub fn _mm256_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi64_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, src.as_i32x4())) + } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12183,9 +13216,11 @@ pub unsafe fn _mm256_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm256_maskz_cvtepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { - let convert = _mm256_cvtepi64_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) +pub fn _mm256_maskz_cvtepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + let convert = _mm256_cvtepi64_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, convert, i32x4::ZERO)) + } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst. 
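The 64-to-32-bit forms behave analogously, keeping the low half of each quadword lane. A sketch under the same assumptions (hypothetical helper name):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn narrow_64_to_32(a: __m512i) -> __m256i {
    // 0x1_0000_0002 becomes 2: only the low 32 bits of each lane survive.
    _mm512_cvtepi64_epi32(a)
}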
@@ -12195,8 +13230,8 @@ pub unsafe fn _mm256_maskz_cvtepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm_cvtepi64_epi32(a: __m128i) -> __m128i { - transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) +pub fn _mm_cvtepi64_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12206,8 +13241,8 @@ pub unsafe fn _mm_cvtepi64_epi32(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovqd128(a.as_i64x2(), src.as_i32x4(), k)) +pub fn _mm_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqd128(a.as_i64x2(), src.as_i32x4(), k)) } } /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12217,8 +13252,8 @@ pub unsafe fn _mm_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqd))] -pub unsafe fn _mm_maskz_cvtepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, k)) +pub fn _mm_maskz_cvtepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqd128(a.as_i64x2(), i32x4::ZERO, k)) } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. @@ -12228,9 +13263,11 @@ pub unsafe fn _mm_maskz_cvtepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i { - let a = a.as_i64x8(); - transmute::(simd_cast(a)) +pub fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i { + unsafe { + let a = a.as_i64x8(); + transmute::(simd_cast(a)) + } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -12240,9 +13277,11 @@ pub unsafe fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - let convert = _mm512_cvtepi64_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, src.as_i16x8())) +pub fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { + let convert = _mm512_cvtepi64_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, src.as_i16x8())) + } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12252,9 +13291,11 @@ pub unsafe fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { - let convert = _mm512_cvtepi64_epi16(a).as_i16x8(); - transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) +pub fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { + let convert = _mm512_cvtepi64_epi16(a).as_i16x8(); + transmute(simd_select_bitmask(k, convert, i16x8::ZERO)) + } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. @@ -12264,8 +13305,8 @@ pub unsafe fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm256_cvtepi64_epi16(a: __m256i) -> __m128i { - transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) +pub fn _mm256_cvtepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12275,8 +13316,8 @@ pub unsafe fn _mm256_cvtepi64_epi16(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm256_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovqw256(a.as_i64x4(), src.as_i16x8(), k)) +pub fn _mm256_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), src.as_i16x8(), k)) } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -12286,8 +13327,8 @@ pub unsafe fn _mm256_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm256_maskz_cvtepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, k)) +pub fn _mm256_maskz_cvtepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqw256(a.as_i64x4(), i16x8::ZERO, k)) } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst. @@ -12297,8 +13338,8 @@ pub unsafe fn _mm256_maskz_cvtepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm_cvtepi64_epi16(a: __m128i) -> __m128i { - transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) +pub fn _mm_cvtepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12308,8 +13349,8 @@ pub unsafe fn _mm_cvtepi64_epi16(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovqw128(a.as_i64x2(), src.as_i16x8(), k)) +pub fn _mm_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), src.as_i16x8(), k)) } } /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12319,8 +13360,8 @@ pub unsafe fn _mm_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqw))] -pub unsafe fn _mm_maskz_cvtepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, k)) +pub fn _mm_maskz_cvtepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqw128(a.as_i64x2(), i16x8::ZERO, k)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. @@ -12330,8 +13371,8 @@ pub unsafe fn _mm_maskz_cvtepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i { - transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) +pub fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
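For the 64-to-16 and 64-to-8-bit forms, the eight results at most fill a 128-bit register: the word form occupies the whole __m128i, while the byte form leaves its upper eight bytes zeroed. A sketch under the same assumptions (hypothetical helper name):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn narrow_64_to_small(a: __m512i) -> (__m128i, __m128i) {
    // Eight 16-bit results fill the __m128i exactly; eight 8-bit results
    // occupy its low 64 bits, with the upper 64 bits zeroed.
    (_mm512_cvtepi64_epi16(a), _mm512_cvtepi64_epi8(a))
}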
@@ -12341,8 +13382,8 @@ pub unsafe fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovqb(a.as_i64x8(), src.as_i8x16(), k)) +pub fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), src.as_i8x16(), k)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12352,8 +13393,8 @@ pub unsafe fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, k)) +pub fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovqb(a.as_i64x8(), i8x16::ZERO, k)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. @@ -12363,8 +13404,8 @@ pub unsafe fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm256_cvtepi64_epi8(a: __m256i) -> __m128i { - transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) +pub fn _mm256_cvtepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12374,8 +13415,8 @@ pub unsafe fn _mm256_cvtepi64_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm256_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovqb256(a.as_i64x4(), src.as_i8x16(), k)) +pub fn _mm256_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), src.as_i8x16(), k)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12385,8 +13426,8 @@ pub unsafe fn _mm256_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm256_maskz_cvtepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, k)) +pub fn _mm256_maskz_cvtepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovqb256(a.as_i64x4(), i8x16::ZERO, k)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst. 
@@ -12396,8 +13437,8 @@ pub unsafe fn _mm256_maskz_cvtepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm_cvtepi64_epi8(a: __m128i) -> __m128i { - transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) +pub fn _mm_cvtepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12407,8 +13448,8 @@ pub unsafe fn _mm_cvtepi64_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovqb128(a.as_i64x2(), src.as_i8x16(), k)) +pub fn _mm_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqb128(a.as_i64x2(), src.as_i8x16(), k)) } } /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12418,8 +13459,8 @@ pub unsafe fn _mm_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovqb))] -pub unsafe fn _mm_maskz_cvtepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, k)) +pub fn _mm_maskz_cvtepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovqb128(a.as_i64x2(), i8x16::ZERO, k)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. @@ -12429,8 +13470,8 @@ pub unsafe fn _mm_maskz_cvtepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i { - transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, 0b11111111_11111111)) +pub fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i { + unsafe { transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, 0b11111111_11111111)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12440,8 +13481,8 @@ pub unsafe fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { - transmute(vpmovsdw(a.as_i32x16(), src.as_i16x16(), k)) +pub fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsdw(a.as_i32x16(), src.as_i16x16(), k)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. 
@@ -12451,8 +13492,8 @@ pub unsafe fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { - transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, k)) +pub fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsdw(a.as_i32x16(), i16x16::ZERO, k)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. @@ -12462,8 +13503,8 @@ pub unsafe fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i { - transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, 0b11111111)) +pub fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, 0b11111111)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12473,8 +13514,8 @@ pub unsafe fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsdw256(a.as_i32x8(), src.as_i16x8(), k)) +pub fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdw256(a.as_i32x8(), src.as_i16x8(), k)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. @@ -12484,8 +13525,8 @@ pub unsafe fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, k)) +pub fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdw256(a.as_i32x8(), i16x8::ZERO, k)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. @@ -12495,8 +13536,8 @@ pub unsafe fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i { - transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) +pub fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, 0b11111111)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
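Unlike the plain `vpmovdw` truncation earlier, the `vpmovsdw` family clamps out-of-range values to the i16 range. A sketch contrasting the two, under the same assumptions (hypothetical helper name):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn saturate_vs_truncate(a: __m512i) -> (__m256i, __m256i) {
    // For a lane holding 70_000: truncation keeps the low 16 bits (4464),
    // while signed saturation clamps to i16::MAX (32767).
    (_mm512_cvtepi32_epi16(a), _mm512_cvtsepi32_epi16(a))
}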
@@ -12506,8 +13547,8 @@ pub unsafe fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsdw128(a.as_i32x4(), src.as_i16x8(), k)) +pub fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdw128(a.as_i32x4(), src.as_i16x8(), k)) } } /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. @@ -12517,8 +13558,8 @@ pub unsafe fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdw))] -pub unsafe fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, k)) +pub fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdw128(a.as_i32x4(), i16x8::ZERO, k)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. @@ -12528,8 +13569,8 @@ pub unsafe fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i { - transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, 0b11111111_11111111)) +pub fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, 0b11111111_11111111)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12539,8 +13580,8 @@ pub unsafe fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { - transmute(vpmovsdb(a.as_i32x16(), src.as_i8x16(), k)) +pub fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), src.as_i8x16(), k)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12550,8 +13591,8 @@ pub unsafe fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { - transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, k)) +pub fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsdb(a.as_i32x16(), i8x16::ZERO, k)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
@@ -12561,8 +13602,8 @@ pub unsafe fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i { - transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) +pub fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, 0b11111111)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12572,8 +13613,8 @@ pub unsafe fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsdb256(a.as_i32x8(), src.as_i8x16(), k)) +pub fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), src.as_i8x16(), k)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12583,8 +13624,8 @@ pub unsafe fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, k)) +pub fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsdb256(a.as_i32x8(), i8x16::ZERO, k)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. @@ -12594,8 +13635,8 @@ pub unsafe fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i { - transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) +pub fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, 0b11111111)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -12605,8 +13646,8 @@ pub unsafe fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsdb128(a.as_i32x4(), src.as_i8x16(), k)) +pub fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), src.as_i8x16(), k)) } } /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12616,8 +13657,8 @@ pub unsafe fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsdb))] -pub unsafe fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, k)) +pub fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsdb128(a.as_i32x4(), i8x16::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. @@ -12627,8 +13668,8 @@ pub unsafe fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i { - transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, 0b11111111)) +pub fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12638,8 +13679,8 @@ pub unsafe fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { - transmute(vpmovsqd(a.as_i64x8(), src.as_i32x8(), k)) +pub fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), src.as_i32x8(), k)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12649,8 +13690,8 @@ pub unsafe fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { - transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, k)) +pub fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovsqd(a.as_i64x8(), i32x8::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. 
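The writemask/zeromask wording in the doc comments above corresponds to the two masked forms as in this sketch (illustrative only, same nightly assumptions as the earlier example):

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn masked_narrow_q_to_d(src: __m256i, a: __m512i) -> (__m256i, __m256i) {
        let k: __mmask8 = 0b0000_1111; // convert lanes 0..=3 only
        // Writemask: unselected lanes are copied from `src`.
        let merged = _mm512_mask_cvtsepi64_epi32(src, k, a);
        // Zeromask: unselected lanes are zeroed.
        let zeroed = _mm512_maskz_cvtsepi64_epi32(k, a);
        (merged, zeroed)
    }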
@@ -12660,8 +13701,8 @@ pub unsafe fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i { - transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, 0b11111111)) +pub fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12671,8 +13712,8 @@ pub unsafe fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsqd256(a.as_i64x4(), src.as_i32x4(), k)) +pub fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), src.as_i32x4(), k)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12682,8 +13723,8 @@ pub unsafe fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, k)) +pub fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqd256(a.as_i64x4(), i32x4::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst. @@ -12693,8 +13734,8 @@ pub unsafe fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i { - transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) +pub fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -12704,8 +13745,8 @@ pub unsafe fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsqd128(a.as_i64x2(), src.as_i32x4(), k)) +pub fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), src.as_i32x4(), k)) } } /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12715,8 +13756,8 @@ pub unsafe fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqd))] -pub unsafe fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, k)) +pub fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqd128(a.as_i64x2(), i32x4::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. @@ -12726,8 +13767,8 @@ pub unsafe fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i { - transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, 0b11111111)) +pub fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12737,8 +13778,8 @@ pub unsafe fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovsqw(a.as_i64x8(), src.as_i16x8(), k)) +pub fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), src.as_i16x8(), k)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12748,8 +13789,8 @@ pub unsafe fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, k)) +pub fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqw(a.as_i64x8(), i16x8::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. 
@@ -12759,8 +13800,8 @@ pub unsafe fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i { - transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) +pub fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12770,8 +13811,8 @@ pub unsafe fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsqw256(a.as_i64x4(), src.as_i16x8(), k)) +pub fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), src.as_i16x8(), k)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12781,8 +13822,8 @@ pub unsafe fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, k)) +pub fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqw256(a.as_i64x4(), i16x8::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst. @@ -12792,8 +13833,8 @@ pub unsafe fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i { - transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) +pub fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -12803,8 +13844,8 @@ pub unsafe fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsqw128(a.as_i64x2(), src.as_i16x8(), k)) +pub fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), src.as_i16x8(), k)) } } /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12814,8 +13855,8 @@ pub unsafe fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqw))] -pub unsafe fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, k)) +pub fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqw128(a.as_i64x2(), i16x8::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. @@ -12825,8 +13866,8 @@ pub unsafe fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i { - transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) +pub fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12836,8 +13877,8 @@ pub unsafe fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovsqb(a.as_i64x8(), src.as_i8x16(), k)) +pub fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), src.as_i8x16(), k)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12847,8 +13888,8 @@ pub unsafe fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, k)) +pub fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovsqb(a.as_i64x8(), i8x16::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. 
@@ -12858,8 +13899,8 @@ pub unsafe fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i { - transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) +pub fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12869,8 +13910,8 @@ pub unsafe fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsqb256(a.as_i64x4(), src.as_i8x16(), k)) +pub fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), src.as_i8x16(), k)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12880,8 +13921,8 @@ pub unsafe fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, k)) +pub fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovsqb256(a.as_i64x4(), i8x16::ZERO, k)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst. @@ -12891,8 +13932,8 @@ pub unsafe fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i { - transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) +pub fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, 0b11111111)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
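To make the "signed saturation" wording concrete, here is a plain-Rust scalar model of what each 64-bit-to-8-bit lane conversion above does (illustration only; the intrinsics apply this to every lane at once):

    /// Scalar model of the per-lane behaviour of the vpmovsqb family:
    /// out-of-range values clamp to the i8 range instead of wrapping.
    fn saturate_i64_to_i8(x: i64) -> i8 {
        x.clamp(i8::MIN as i64, i8::MAX as i64) as i8
    }

    #[test]
    fn saturation_clamps_instead_of_wrapping() {
        assert_eq!(saturate_i64_to_i8(1_000), i8::MAX); // 127, not a wrapped value
        assert_eq!(saturate_i64_to_i8(-1_000), i8::MIN); // -128
        assert_eq!(saturate_i64_to_i8(5), 5);
    }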
@@ -12902,8 +13943,8 @@ pub unsafe fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsqb128(a.as_i64x2(), src.as_i8x16(), k)) +pub fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), src.as_i8x16(), k)) } } /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12913,8 +13954,8 @@ pub unsafe fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovsqb))] -pub unsafe fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, k)) +pub fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovsqb128(a.as_i64x2(), i8x16::ZERO, k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. @@ -12924,8 +13965,8 @@ pub unsafe fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i { - transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, 0b11111111_11111111)) +pub fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, 0b11111111_11111111)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12935,8 +13976,8 @@ pub unsafe fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { - transmute(vpmovusdw(a.as_u32x16(), src.as_u16x16(), k)) +pub fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), src.as_u16x16(), k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -12946,8 +13987,8 @@ pub unsafe fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { - transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, k)) +pub fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusdw(a.as_u32x16(), u16x16::ZERO, k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. @@ -12957,8 +13998,8 @@ pub unsafe fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i { - transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, 0b11111111)) +pub fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, 0b11111111)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -12968,8 +14009,8 @@ pub unsafe fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusdw256(a.as_u32x8(), src.as_u16x8(), k)) +pub fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), src.as_u16x8(), k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -12979,8 +14020,8 @@ pub unsafe fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, k)) +pub fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdw256(a.as_u32x8(), u16x8::ZERO, k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. 
@@ -12990,8 +14031,8 @@ pub unsafe fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i { - transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, 0b11111111)) +pub fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, 0b11111111)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13001,8 +14042,8 @@ pub unsafe fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusdw128(a.as_u32x4(), src.as_u16x8(), k)) +pub fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), src.as_u16x8(), k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13012,8 +14053,8 @@ pub unsafe fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdw))] -pub unsafe fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, k)) +pub fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdw128(a.as_u32x4(), u16x8::ZERO, k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. @@ -13023,8 +14064,8 @@ pub unsafe fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i { - transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, 0b11111111_11111111)) +pub fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, 0b11111111_11111111)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
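The unsigned forms clamp to the unsigned range of the narrower type instead. A short sketch using one of the intrinsics from the hunks above (illustrative only, same nightly assumptions as before):

    use core::arch::x86_64::*;

    // Any unsigned 32-bit lane larger than u16::MAX comes back as 0xFFFF.
    #[target_feature(enable = "avx512f,avx512vl")]
    fn clamp_u32x4_to_u16(a: __m128i) -> __m128i {
        _mm_cvtusepi32_epi16(a)
    }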
@@ -13034,8 +14075,8 @@ pub unsafe fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { - transmute(vpmovusdb(a.as_u32x16(), src.as_u8x16(), k)) +pub fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), src.as_u8x16(), k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13045,8 +14086,8 @@ pub unsafe fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { - transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, k)) +pub fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusdb(a.as_u32x16(), u8x16::ZERO, k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. @@ -13056,8 +14097,8 @@ pub unsafe fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i { - transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, 0b11111111)) +pub fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, 0b11111111)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13067,8 +14108,8 @@ pub unsafe fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusdb256(a.as_u32x8(), src.as_u8x16(), k)) +pub fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), src.as_u8x16(), k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -13078,8 +14119,8 @@ pub unsafe fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, k)) +pub fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusdb256(a.as_u32x8(), u8x16::ZERO, k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. @@ -13089,8 +14130,8 @@ pub unsafe fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i { - transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, 0b11111111)) +pub fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, 0b11111111)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13100,8 +14141,8 @@ pub unsafe fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusdb128(a.as_u32x4(), src.as_u8x16(), k)) +pub fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), src.as_u8x16(), k)) } } /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13111,8 +14152,8 @@ pub unsafe fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusdb))] -pub unsafe fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, k)) +pub fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusdb128(a.as_u32x4(), u8x16::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. 
@@ -13122,8 +14163,8 @@ pub unsafe fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i { - transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, 0b11111111)) +pub fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13133,8 +14174,8 @@ pub unsafe fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { - transmute(vpmovusqd(a.as_u64x8(), src.as_u32x8(), k)) +pub fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), src.as_u32x8(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13144,8 +14185,8 @@ pub unsafe fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { - transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, k)) +pub fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i { + unsafe { transmute(vpmovusqd(a.as_u64x8(), u32x8::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. @@ -13155,8 +14196,8 @@ pub unsafe fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i { - transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, 0b11111111)) +pub fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -13166,8 +14207,8 @@ pub unsafe fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusqd256(a.as_u64x4(), src.as_u32x4(), k)) +pub fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), src.as_u32x4(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13177,8 +14218,8 @@ pub unsafe fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, k)) +pub fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqd256(a.as_u64x4(), u32x4::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst. @@ -13188,8 +14229,8 @@ pub unsafe fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i { - transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, 0b11111111)) +pub fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13199,8 +14240,8 @@ pub unsafe fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusqd128(a.as_u64x2(), src.as_u32x4(), k)) +pub fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), src.as_u32x4(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -13210,8 +14251,8 @@ pub unsafe fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqd))] -pub unsafe fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, k)) +pub fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqd128(a.as_u64x2(), u32x4::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. @@ -13221,8 +14262,8 @@ pub unsafe fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i { - transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, 0b11111111)) +pub fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13232,8 +14273,8 @@ pub unsafe fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovusqw(a.as_u64x8(), src.as_u16x8(), k)) +pub fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), src.as_u16x8(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13243,8 +14284,8 @@ pub unsafe fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, k)) +pub fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqw(a.as_u64x8(), u16x8::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. 
@@ -13254,8 +14295,8 @@ pub unsafe fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i { - transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, 0b11111111)) +pub fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13265,8 +14306,8 @@ pub unsafe fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusqw256(a.as_u64x4(), src.as_u16x8(), k)) +pub fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), src.as_u16x8(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13276,8 +14317,8 @@ pub unsafe fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, k)) +pub fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqw256(a.as_u64x4(), u16x8::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst. @@ -13287,8 +14328,8 @@ pub unsafe fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i { - transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, 0b11111111)) +pub fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -13298,8 +14339,8 @@ pub unsafe fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusqw128(a.as_u64x2(), src.as_u16x8(), k)) +pub fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), src.as_u16x8(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13309,8 +14350,8 @@ pub unsafe fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqw))] -pub unsafe fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, k)) +pub fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqw128(a.as_u64x2(), u16x8::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. @@ -13320,8 +14361,8 @@ pub unsafe fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i { - transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, 0b11111111)) +pub fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13331,8 +14372,8 @@ pub unsafe fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovusqb(a.as_u64x8(), src.as_u8x16(), k)) +pub fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), src.as_u8x16(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -13342,8 +14383,8 @@ pub unsafe fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { - transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, k)) +pub fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { + unsafe { transmute(vpmovusqb(a.as_u64x8(), u8x16::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. @@ -13353,8 +14394,8 @@ pub unsafe fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i { - transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, 0b11111111)) +pub fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13364,8 +14405,8 @@ pub unsafe fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusqb256(a.as_u64x4(), src.as_u8x16(), k)) +pub fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), src.as_u8x16(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13375,8 +14416,8 @@ pub unsafe fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { - transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, k)) +pub fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { + unsafe { transmute(vpmovusqb256(a.as_u64x4(), u8x16::ZERO, k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst. 
@@ -13386,8 +14427,8 @@ pub unsafe fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i { - transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, 0b11111111)) +pub fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, 0b11111111)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -13397,8 +14438,8 @@ pub unsafe fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusqb128(a.as_u64x2(), src.as_u8x16(), k)) +pub fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), src.as_u8x16(), k)) } } /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -13408,8 +14449,8 @@ pub unsafe fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmovusqb))] -pub unsafe fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, k)) +pub fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpmovusqb128(a.as_u64x2(), u8x16::ZERO, k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst. 
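Finally, a sketch of reaching one of these conversions from code that does not enable AVX-512 statically (the wrapper name is made up for illustration; assumes nightly with `stdarch_x86_avx512`, and that calling a `#[target_feature]` function from a non-matching context remains an unsafe operation):

    use std::arch::x86_64::*;

    fn narrow_if_supported(a: __m512i) -> Option<__m128i> {
        #[target_feature(enable = "avx512f")]
        fn narrow(a: __m512i) -> __m128i {
            // Safe call: this inner function statically enables avx512f.
            _mm512_cvtusepi64_epi8(a)
        }
        if is_x86_feature_detected!("avx512f") {
            // SAFETY: avx512f support was just verified at runtime.
            Some(unsafe { narrow(a) })
        } else {
            None
        }
    }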
@@ -13420,18 +14461,20 @@ pub unsafe fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i {
 /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
 /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
 /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-/// 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi32&expand=1335) 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epi32&expand=1335)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm512_cvt_roundps_epi32<const ROUNDING: i32>(a: __m512) -> __m512i {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x16();
-    let r = vcvtps2dq(a, i32x16::ZERO, 0b11111111_11111111, ROUNDING);
-    transmute(r)
+pub fn _mm512_cvt_roundps_epi32<const ROUNDING: i32>(a: __m512) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let r = vcvtps2dq(a, i32x16::ZERO, 0b11111111_11111111, ROUNDING);
+        transmute(r)
+    }
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
@@ -13442,23 +14485,25 @@ pub unsafe fn _mm512_cvt_roundps_epi32<const ROUNDING: i32>(a: __m512) -> __m512
 /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
 /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
 /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-/// 
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epi32&expand=1336)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm512_mask_cvt_roundps_epi32<const ROUNDING: i32>(
+pub fn _mm512_mask_cvt_roundps_epi32<const ROUNDING: i32>(
     src: __m512i,
     k: __mmask16,
     a: __m512,
 ) -> __m512i {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x16();
-    let src = src.as_i32x16();
-    let r = vcvtps2dq(a, src, k, ROUNDING);
-    transmute(r)
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let src = src.as_i32x16();
+        let r = vcvtps2dq(a, src, k, ROUNDING);
+        transmute(r)
+    }
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
@@ -13469,21 +14514,20 @@ pub unsafe fn _mm512_mask_cvt_roundps_epi32<const ROUNDING: i32>(
 /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
 /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
 /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-/// 
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epi32&expand=1337)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm512_maskz_cvt_roundps_epi32<const ROUNDING: i32>(
-    k: __mmask16,
-    a: __m512,
-) -> __m512i {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x16();
-    let r = vcvtps2dq(a, i32x16::ZERO, k, ROUNDING);
-    transmute(r)
+pub fn _mm512_maskz_cvt_roundps_epi32<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let r = vcvtps2dq(a, i32x16::ZERO, k, ROUNDING);
+        transmute(r)
+    }
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\
@@ -13494,18 +14538,20 @@ pub unsafe fn _mm512_maskz_cvt_roundps_epi32<const ROUNDING: i32>(
 /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
 /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
 /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-/// 
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_epu32&expand=1341)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm512_cvt_roundps_epu32<const ROUNDING: i32>(a: __m512) -> __m512i {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x16();
-    let r = vcvtps2udq(a, u32x16::ZERO, 0b11111111_11111111, ROUNDING);
-    transmute(r)
+pub fn _mm512_cvt_roundps_epu32<const ROUNDING: i32>(a: __m512) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let r = vcvtps2udq(a, u32x16::ZERO, 0b11111111_11111111, ROUNDING);
+        transmute(r)
+    }
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
@@ -13516,23 +14562,25 @@ pub unsafe fn _mm512_cvt_roundps_epu32<const ROUNDING: i32>(a: __m512) -> __m512
 /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
 /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
 /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-/// 
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_epu32&expand=1342)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm512_mask_cvt_roundps_epu32<const ROUNDING: i32>(
+pub fn _mm512_mask_cvt_roundps_epu32<const ROUNDING: i32>(
     src: __m512i,
     k: __mmask16,
     a: __m512,
 ) -> __m512i {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x16();
-    let src = src.as_u32x16();
-    let r = vcvtps2udq(a, src, k, ROUNDING);
-    transmute(r)
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let src = src.as_u32x16();
+        let r = vcvtps2udq(a, src, k, ROUNDING);
+        transmute(r)
+    }
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
@@ -13543,37 +14591,38 @@ pub unsafe fn _mm512_mask_cvt_roundps_epu32<const ROUNDING: i32>(
 /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
 /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
 /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
-/// 
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_epu32&expand=1343)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm512_maskz_cvt_roundps_epu32<const ROUNDING: i32>(
-    k: __mmask16,
-    a: __m512,
-) -> __m512i {
-    static_assert_rounding!(ROUNDING);
-    let a = a.as_f32x16();
-    let r = vcvtps2udq(a, u32x16::ZERO, k, ROUNDING);
-    transmute(r)
+pub fn _mm512_maskz_cvt_roundps_epu32<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m512i {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        let a = a.as_f32x16();
+        let r = vcvtps2udq(a, u32x16::ZERO, k, ROUNDING);
+        transmute(r)
+    }
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.\
 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
-/// 
-/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_pd&expand=1347) 
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_pd&expand=1347)
 #[inline]
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm512_cvt_roundps_pd<const SAE: i32>(a: __m256) -> __m512d {
-    static_assert_sae!(SAE);
-    let a = a.as_f32x8();
-    let r = vcvtps2pd(a, f64x8::ZERO, 0b11111111, SAE);
-    transmute(r)
+pub fn _mm512_cvt_roundps_pd<const SAE: i32>(a: __m256) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x8();
+        let r = vcvtps2pd(a, f64x8::ZERO, 0b11111111, SAE);
+        transmute(r)
+    }
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
@@ -13585,16 +14634,14 @@ pub unsafe fn _mm512_cvt_roundps_pd<const SAE: i32>(a: __m256) -> __m512d {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm512_mask_cvt_roundps_pd<const SAE: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m256,
-) -> __m512d {
-    static_assert_sae!(SAE);
-    let a = a.as_f32x8();
-    let src = src.as_f64x8();
-    let r = vcvtps2pd(a, src, k, SAE);
-    transmute(r)
+pub fn _mm512_mask_cvt_roundps_pd<const SAE: i32>(src: __m512d, k: __mmask8, a: __m256) -> __m512d {
+    unsafe {
+        static_assert_sae!(SAE);
+        let a = a.as_f32x8();
+        let src = src.as_f64x8();
+        let r = vcvtps2pd(a, src, k, SAE);
+        transmute(r)
+    }
 }
 
 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
@@ -13606,11 +14653,13 @@ pub unsafe fn _mm512_mask_cvt_roundps_pd<const SAE: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn
_mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256) -> __m512d { - static_assert_sae!(SAE); - let a = a.as_f32x8(); - let r = vcvtps2pd(a, f64x8::ZERO, k, SAE); - transmute(r) +pub fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256) -> __m512d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x8(); + let r = vcvtps2pd(a, f64x8::ZERO, k, SAE); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.\ @@ -13628,11 +14677,13 @@ pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d) -> __m256i { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2dq(a, i32x8::ZERO, 0b11111111, ROUNDING); - transmute(r) +pub fn _mm512_cvt_roundpd_epi32(a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2dq(a, i32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -13650,16 +14701,18 @@ pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d) -> __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_epi32( +pub fn _mm512_mask_cvt_roundpd_epi32( src: __m256i, k: __mmask8, a: __m512d, ) -> __m256i { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let src = src.as_i32x8(); - let r = vcvtpd2dq(a, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_i32x8(); + let r = vcvtpd2dq(a, src, k, ROUNDING); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -13677,14 +14730,13 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_epi32( - k: __mmask8, - a: __m512d, -) -> __m256i { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2dq(a, i32x8::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2dq(a, i32x8::ZERO, k, ROUNDING); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\ @@ -13702,11 +14754,13 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d) -> __m256i { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2udq(a, u32x8::ZERO, 0b11111111, ROUNDING); - transmute(r) +pub fn 
_mm512_cvt_roundpd_epu32(a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2udq(a, u32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -13724,16 +14778,18 @@ pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d) -> __m25 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_epu32( +pub fn _mm512_mask_cvt_roundpd_epu32( src: __m256i, k: __mmask8, a: __m512d, ) -> __m256i { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let src = src.as_u32x8(); - let r = vcvtpd2udq(a, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_u32x8(); + let r = vcvtpd2udq(a, src, k, ROUNDING); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -13751,14 +14807,13 @@ pub unsafe fn _mm512_mask_cvt_roundpd_epu32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_epu32( - k: __mmask8, - a: __m512d, -) -> __m256i { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2udq(a, u32x8::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2udq(a, u32x8::ZERO, k, ROUNDING); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ @@ -13776,11 +14831,13 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_epu32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d) -> __m256 { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2ps(a, f32x8::ZERO, 0b11111111, ROUNDING); - transmute(r) +pub fn _mm512_cvt_roundpd_ps(a: __m512d) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2ps(a, f32x8::ZERO, 0b11111111, ROUNDING); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -13798,16 +14855,18 @@ pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d) -> __m256 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvt_roundpd_ps( +pub fn _mm512_mask_cvt_roundpd_ps( src: __m256, k: __mmask8, a: __m512d, ) -> __m256 { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let src = src.as_f32x8(); - let r = vcvtpd2ps(a, 
src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let src = src.as_f32x8(); + let r = vcvtpd2ps(a, src, k, ROUNDING); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -13825,11 +14884,13 @@ pub unsafe fn _mm512_mask_cvt_roundpd_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d) -> __m256 { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x8(); - let r = vcvtpd2ps(a, f32x8::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d) -> __m256 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x8(); + let r = vcvtpd2ps(a, f32x8::ZERO, k, ROUNDING); + transmute(r) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ @@ -13847,11 +14908,13 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_i32x16(); - let r = vcvtdq2ps(a, ROUNDING); - transmute(r) +pub fn _mm512_cvt_roundepi32_ps(a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(r) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -13869,15 +14932,17 @@ pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvt_roundepi32_ps( +pub fn _mm512_mask_cvt_roundepi32_ps( src: __m512, k: __mmask16, a: __m512i, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_i32x16(); - let r = vcvtdq2ps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -13895,14 +14960,13 @@ pub unsafe fn _mm512_mask_cvt_roundepi32_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvt_roundepi32_ps( - k: __mmask16, - a: __m512i, -) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_i32x16(); - let r = vcvtdq2ps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) +pub fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + 
static_assert_rounding!(ROUNDING); + let a = a.as_i32x16(); + let r = vcvtdq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ @@ -13920,11 +14984,13 @@ pub unsafe fn _mm512_maskz_cvt_roundepi32_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_u32x16(); - let r = vcvtudq2ps(a, ROUNDING); - transmute(r) +pub fn _mm512_cvt_roundepu32_ps(a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(r) + } } /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -13942,15 +15008,17 @@ pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvt_roundepu32_ps( +pub fn _mm512_mask_cvt_roundepu32_ps( src: __m512, k: __mmask16, a: __m512i, ) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_u32x16(); - let r = vcvtudq2ps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, src.as_f32x16())) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, src.as_f32x16())) + } } /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -13968,67 +15036,72 @@ pub unsafe fn _mm512_mask_cvt_roundepu32_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvt_roundepu32_ps( - k: __mmask16, - a: __m512i, -) -> __m512 { - static_assert_rounding!(ROUNDING); - let a = a.as_u32x16(); - let r = vcvtudq2ps(a, ROUNDING); - transmute(simd_select_bitmask(k, r, f32x16::ZERO)) +pub fn _mm512_maskz_cvt_roundepu32_ps(k: __mmask16, a: __m512i) -> __m512 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_u32x16(); + let r = vcvtudq2ps(a, ROUNDING); + transmute(simd_select_bitmask(k, r, f32x16::ZERO)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_ph&expand=1354) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundps_ph&expand=1354) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvt_roundps_ph(a: __m512) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvtps2ph(a, SAE, i16x16::ZERO, 0b11111111_11111111); - transmute(r) +pub fn _mm512_cvt_roundps_ph(a: __m512) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, SAE, i16x16::ZERO, 0b11111111_11111111); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_ph&expand=1355) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundps_ph&expand=1355) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvt_roundps_ph( +pub fn _mm512_mask_cvt_roundps_ph( src: __m256i, k: __mmask16, a: __m512, ) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_i16x16(); - let r = vcvtps2ph(a, SAE, src, k); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_i16x16(); + let r = vcvtps2ph(a, SAE, src, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundps_ph&expand=1356) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvtps2ph(a, SAE, i16x16::ZERO, k); - transmute(r) +pub fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, SAE, i16x16::ZERO, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -14038,23 +15111,25 @@ pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m51 /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvt_roundps_ph&expand=1352) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvt_roundps_ph&expand=1352) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_cvt_roundps_ph( +pub fn _mm256_mask_cvt_roundps_ph( src: __m128i, k: __mmask8, a: __m256, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let src = src.as_i16x8(); - let r = vcvtps2ph256(a, IMM8, src, k); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let src = src.as_i16x8(); + let r = vcvtps2ph256(a, IMM8, src, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -14064,18 +15139,20 @@ pub unsafe fn _mm256_mask_cvt_roundps_ph( /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvt_roundps_ph&expand=1353) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m256) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); - transmute(r) +pub fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m256) -> __m128i { + unsafe { + 
static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -14085,23 +15162,21 @@ pub unsafe fn _mm256_maskz_cvt_roundps_ph(k: __mmask8, a: __m25 /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvt_roundps_ph&expand=1350) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvt_roundps_ph&expand=1350) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_cvt_roundps_ph( - src: __m128i, - k: __mmask8, - a: __m128, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let src = src.as_i16x8(); - let r = vcvtps2ph128(a, IMM8, src, k); - transmute(r) +pub fn _mm_mask_cvt_roundps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let src = src.as_i16x8(); + let r = vcvtps2ph128(a, IMM8, src, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -14111,71 +15186,75 @@ pub unsafe fn _mm_mask_cvt_roundps_ph( /// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions /// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvt_roundps_ph&expand=1351) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); - transmute(r) +pub fn _mm_maskz_cvt_roundps_ph(k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtps_ph&expand=1778) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvtps_ph(a: __m512) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvtps2ph(a, SAE, i16x16::ZERO, 0b11111111_11111111); - transmute(r) +pub fn _mm512_cvtps_ph(a: __m512) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, SAE, i16x16::ZERO, 0b11111111_11111111); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvtps_ph&expand=1779) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvtps_ph( - src: __m256i, - k: __mmask16, - a: __m512, -) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_i16x16(); - let r = vcvtps2ph(a, SAE, src, k); - transmute(r) +pub fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_i16x16(); + let r = vcvtps2ph(a, SAE, src, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvtps_ph&expand=1780) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvtps2ph(a, SAE, i16x16::ZERO, k); - transmute(r) +pub fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvtps2ph(a, SAE, i16x16::ZERO, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -14185,23 +15264,21 @@ pub unsafe fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512) -> /// * [`_MM_FROUND_TO_POS_INF`] : round up /// * [`_MM_FROUND_TO_ZERO`] : truncate /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_cvtps_ph&expand=1776) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_cvtps_ph( - src: __m128i, - k: __mmask8, - a: __m256, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let src = src.as_i16x8(); - let r = vcvtps2ph256(a, IMM8, src, k); - transmute(r) +pub fn _mm256_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m256) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let src = src.as_i16x8(); + let r = vcvtps2ph256(a, IMM8, src, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -14211,18 +15288,20 @@ pub unsafe fn _mm256_mask_cvtps_ph( /// * [`_MM_FROUND_TO_POS_INF`] : round up /// * [`_MM_FROUND_TO_ZERO`] : truncate /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskz_cvtps_ph&expand=1777) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_cvtps_ph(k: __mmask8, a: __m256) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x8(); - let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); - transmute(r) +pub fn _mm256_maskz_cvtps_ph(k: __mmask8, a: __m256) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x8(); + let r = vcvtps2ph256(a, IMM8, i16x8::ZERO, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the 
corresponding mask bit is not set).\ @@ -14232,19 +15311,21 @@ pub unsafe fn _mm256_maskz_cvtps_ph(k: __mmask8, a: __m256) -> /// * [`_MM_FROUND_TO_POS_INF`] : round up /// * [`_MM_FROUND_TO_ZERO`] : truncate /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_cvtps_ph&expand=1773) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let src = src.as_i16x8(); - let r = vcvtps2ph128(a, IMM8, src, k); - transmute(r) +pub fn _mm_mask_cvtps_ph(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let src = src.as_i16x8(); + let r = vcvtps2ph128(a, IMM8, src, k); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -14254,71 +15335,75 @@ pub unsafe fn _mm_mask_cvtps_ph(src: __m128i, k: __mmask8, a: _ /// * [`_MM_FROUND_TO_POS_INF`] : round up /// * [`_MM_FROUND_TO_ZERO`] : truncate /// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`] -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskz_cvtps_ph&expand=1774) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_cvtps_ph(k: __mmask8, a: __m128) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); - transmute(r) +pub fn _mm_maskz_cvtps_ph(k: __mmask8, a: __m128) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let r = vcvtps2ph128(a, IMM8, i16x8::ZERO, k); + transmute(r) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvt_roundph_ps&expand=1332) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvt_roundph_ps(a: __m256i) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_i16x16(); - let r = vcvtph2ps(a, f32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) +pub fn _mm512_cvt_roundph_ps(a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let r = vcvtph2ps(a, f32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_mask_cvt_roundph_ps&expand=1333) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvt_roundph_ps( - src: __m512, - k: __mmask16, - a: __m256i, -) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_i16x16(); - let src = src.as_f32x16(); - let r = vcvtph2ps(a, src, k, SAE); - transmute(r) +pub fn _mm512_mask_cvt_roundph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let src = src.as_f32x16(); + let r = vcvtph2ps(a, src, k, SAE); + transmute(r) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_maskz_cvt_roundph_ps&expand=1334) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i) -> __m512 { - static_assert_sae!(SAE); - let a = a.as_i16x16(); - let r = vcvtph2ps(a, f32x16::ZERO, k, SAE); - transmute(r) +pub fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i) -> __m512 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_i16x16(); + let r = vcvtph2ps(a, f32x16::ZERO, k, SAE); + transmute(r) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst. 
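Illustrative usage, not part of the patch: with the signature change above, a caller that is already compiled with `avx512f` enabled can invoke the SAE-taking conversions without an `unsafe` block; the exception-suppression flag stays a const generic. A minimal sketch, assuming a nightly toolchain with `#![feature(stdarch_x86_avx512)]` and a compiler that accepts safe `#[target_feature]` functions:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn halves_to_singles(a: __m256i) -> __m512 {
        // Suppress floating-point exceptions via the const SAE parameter;
        // the call itself no longer needs an `unsafe` block.
        _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(a)
    }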
@@ -14328,13 +15413,15 @@ pub unsafe fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m25 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps))] -pub unsafe fn _mm512_cvtph_ps(a: __m256i) -> __m512 { - transmute(vcvtph2ps( - a.as_i16x16(), - f32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_NO_EXC, - )) +pub fn _mm512_cvtph_ps(a: __m256i) -> __m512 { + unsafe { + transmute(vcvtph2ps( + a.as_i16x16(), + f32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_NO_EXC, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14344,13 +15431,15 @@ pub unsafe fn _mm512_cvtph_ps(a: __m256i) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps))] -pub unsafe fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { - transmute(vcvtph2ps( - a.as_i16x16(), - src.as_f32x16(), - k, - _MM_FROUND_NO_EXC, - )) +pub fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 { + unsafe { + transmute(vcvtph2ps( + a.as_i16x16(), + src.as_f32x16(), + k, + _MM_FROUND_NO_EXC, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14360,8 +15449,8 @@ pub unsafe fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps))] -pub unsafe fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 { - transmute(vcvtph2ps(a.as_i16x16(), f32x16::ZERO, k, _MM_FROUND_NO_EXC)) +pub fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 { + unsafe { transmute(vcvtph2ps(a.as_i16x16(), f32x16::ZERO, k, _MM_FROUND_NO_EXC)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14371,9 +15460,11 @@ pub unsafe fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps))] -pub unsafe fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m256 { - let convert = _mm256_cvtph_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x8(), src.as_f32x8())) +pub fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m256 { + unsafe { + let convert = _mm256_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x8(), src.as_f32x8())) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -14383,9 +15474,11 @@ pub unsafe fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps))] -pub unsafe fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 { - let convert = _mm256_cvtph_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x8(), f32x8::ZERO)) +pub fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 { + unsafe { + let convert = _mm256_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x8(), f32x8::ZERO)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14395,9 +15488,11 @@ pub unsafe fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps))] -pub unsafe fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { - let convert = _mm_cvtph_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) +pub fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 { + unsafe { + let convert = _mm_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), src.as_f32x4())) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14407,25 +15502,29 @@ pub unsafe fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtph2ps))] -pub unsafe fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 { - let convert = _mm_cvtph_ps(a); - transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) +pub fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 { + unsafe { + let convert = _mm_cvtph_ps(a); + transmute(simd_select_bitmask(k, convert.as_f32x4(), f32x4::ZERO)) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epi32&expand=1916) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epi32&expand=1916) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512) -> __m512i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvttps2dq(a, i32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) +pub fn _mm512_cvtt_roundps_epi32(a: __m512) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvttps2dq(a, i32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -14437,16 +15536,18 @@ pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvtt_roundps_epi32( +pub fn _mm512_mask_cvtt_roundps_epi32( src: __m512i, k: __mmask16, a: __m512, ) -> __m512i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_i32x16(); - let r = vcvttps2dq(a, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_i32x16(); + let r = vcvttps2dq(a, src, k, SAE); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -14458,27 +15559,31 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512) -> __m512i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvttps2dq(a, i32x16::ZERO, k, SAE); - transmute(r) +pub fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvttps2dq(a, i32x16::ZERO, k, SAE); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// -/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epu32&expand=1922) +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundps_epu32&expand=1922) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512) -> __m512i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvttps2udq(a, u32x16::ZERO, 0b11111111_11111111, SAE); - transmute(r) +pub fn _mm512_cvtt_roundps_epu32(a: __m512) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvttps2udq(a, u32x16::ZERO, 0b11111111_11111111, SAE); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -14490,16 +15595,18 @@ pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvtt_roundps_epu32( +pub fn _mm512_mask_cvtt_roundps_epu32( src: __m512i, k: __mmask16, a: __m512, ) -> __m512i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let src = src.as_u32x16(); - let r = vcvttps2udq(a, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let src = src.as_u32x16(); + let r = vcvttps2udq(a, src, k, SAE); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -14511,27 +15618,31 @@ pub unsafe fn _mm512_mask_cvtt_roundps_epu32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { - static_assert_sae!(SAE); - let a = a.as_f32x16(); - let r = vcvttps2udq(a, u32x16::ZERO, k, SAE); - transmute(r) +pub fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x16(); + let r = vcvttps2udq(a, u32x16::ZERO, k, SAE); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epi32&expand=1904) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vcvttpd2dq(a, i32x8::ZERO, 0b11111111, SAE); - transmute(r) +pub fn _mm512_cvtt_roundpd_epi32(a: __m512d) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vcvttpd2dq(a, i32x8::ZERO, 0b11111111, SAE); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -14543,16 +15654,18 @@ pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d) -> __m256i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvtt_roundpd_epi32( +pub fn _mm512_mask_cvtt_roundpd_epi32( src: __m256i, k: __mmask8, a: __m512d, ) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_i32x8(); - let r = vcvttpd2dq(a, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_i32x8(); + let r = vcvttpd2dq(a, src, k, SAE); + transmute(r) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -14564,27 +15677,31 @@ pub unsafe fn _mm512_mask_cvtt_roundpd_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vcvttpd2dq(a, i32x8::ZERO, k, SAE); - transmute(r) +pub fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vcvttpd2dq(a, i32x8::ZERO, k, SAE); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvtt_roundpd_epu32&expand=1910) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vcvttpd2udq(a, i32x8::ZERO, 0b11111111, SAE); - transmute(r) +pub fn _mm512_cvtt_roundpd_epu32(a: __m512d) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vcvttpd2udq(a, i32x8::ZERO, 0b11111111, SAE); + transmute(r) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\ @@ -14596,32 +15713,36 @@ pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d) -> __m256i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_cvtt_roundpd_epu32( +pub fn _mm512_mask_cvtt_roundpd_epu32( src: __m256i, k: __mmask8, a: __m512d, ) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let src = src.as_i32x8(); - let r = vcvttpd2udq(a, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let src = src.as_i32x8(); + let r = vcvttpd2udq(a, src, k, SAE); + transmute(r) + } } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epi32&expand=1984) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq))] -pub unsafe fn _mm512_cvttps_epi32(a: __m512) -> __m512i { - transmute(vcvttps2dq( - a.as_f32x16(), - i32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvttps_epi32(a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + i32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
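For the truncating forms the calling pattern is the same once they are safe; a brief, illustrative-only sketch under the assumptions stated earlier:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn truncate_to_i32(a: __m512) -> __m512i {
        // Rounds toward zero regardless of MXCSR.RC; no `unsafe` needed at the call site.
        _mm512_cvttps_epi32(a)
    }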
@@ -14631,13 +15752,15 @@ pub unsafe fn _mm512_cvttps_epi32(a: __m512) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq))] -pub unsafe fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { - transmute(vcvttps2dq( - a.as_f32x16(), - src.as_i32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14647,13 +15770,15 @@ pub unsafe fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq))] -pub unsafe fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { - transmute(vcvttps2dq( - a.as_f32x16(), - i32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2dq( + a.as_f32x16(), + i32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14663,8 +15788,8 @@ pub unsafe fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq))] -pub unsafe fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { - transmute(vcvttps2dq256(a.as_f32x8(), src.as_i32x8(), k)) +pub fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2dq256(a.as_f32x8(), src.as_i32x8(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14674,8 +15799,8 @@ pub unsafe fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq))] -pub unsafe fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i { - transmute(vcvttps2dq256(a.as_f32x8(), i32x8::ZERO, k)) +pub fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2dq256(a.as_f32x8(), i32x8::ZERO, k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -14685,8 +15810,8 @@ pub unsafe fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq))] -pub unsafe fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - transmute(vcvttps2dq128(a.as_f32x4(), src.as_i32x4(), k)) +pub fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2dq128(a.as_f32x4(), src.as_i32x4(), k)) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14696,24 +15821,26 @@ pub unsafe fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2dq))] -pub unsafe fn _mm_maskz_cvttps_epi32(k: __mmask8, a: __m128) -> __m128i { - transmute(vcvttps2dq128(a.as_f32x4(), i32x4::ZERO, k)) +pub fn _mm_maskz_cvttps_epi32(k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2dq128(a.as_f32x4(), i32x4::ZERO, k)) } } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttps_epu32&expand=2002) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm512_cvttps_epu32(a: __m512) -> __m512i { - transmute(vcvttps2udq( - a.as_f32x16(), - u32x16::ZERO, - 0b11111111_11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvttps_epu32(a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2udq( + a.as_f32x16(), + u32x16::ZERO, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14723,13 +15850,15 @@ pub unsafe fn _mm512_cvttps_epu32(a: __m512) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { - transmute(vcvttps2udq( - a.as_f32x16(), - src.as_u32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2udq( + a.as_f32x16(), + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
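// --- Plain-Rust model of the writemask/zeromask semantics; not part of the patch. ---
// For every lane i: the `mask_` form keeps src[i] when bit i of k is clear, while
// the `maskz_` form zeroes that lane instead. Truncation is toward zero, as for
// `vcvttps2dq`. Out-of-range inputs are not modeled here.
fn mask_cvtt_model(src: [i32; 16], k: u16, a: [f32; 16]) -> [i32; 16] {
    core::array::from_fn(|i| {
        if (k >> i) & 1 != 0 { a[i].trunc() as i32 } else { src[i] }
    })
}

fn maskz_cvtt_model(k: u16, a: [f32; 16]) -> [i32; 16] {
    mask_cvtt_model([0; 16], k, a)
}

fn main() {
    let a = [7.9_f32; 16];
    let src = [-1; 16];
    let k = 0b0000_0000_0000_0011; // only lanes 0 and 1 are active
    assert_eq!(&mask_cvtt_model(src, k, a)[..4], &[7, 7, -1, -1]);
    assert_eq!(&maskz_cvtt_model(k, a)[..4], &[7, 7, 0, 0]);
}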
@@ -14739,24 +15868,26 @@ pub unsafe fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i { - transmute(vcvttps2udq( - a.as_f32x16(), - u32x16::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i { + unsafe { + transmute(vcvttps2udq( + a.as_f32x16(), + u32x16::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epu32&expand=1999) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm256_cvttps_epu32(a: __m256) -> __m256i { - transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) +pub fn _mm256_cvttps_epu32(a: __m256) -> __m256i { + unsafe { transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, 0b11111111)) } } /// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14766,8 +15897,8 @@ pub unsafe fn _mm256_cvttps_epu32(a: __m256) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { - transmute(vcvttps2udq256(a.as_f32x8(), src.as_u32x8(), k)) +pub fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2udq256(a.as_f32x8(), src.as_u32x8(), k)) } } /// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14777,19 +15908,19 @@ pub unsafe fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm256_maskz_cvttps_epu32(k: __mmask8, a: __m256) -> __m256i { - transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, k)) +pub fn _mm256_maskz_cvttps_epu32(k: __mmask8, a: __m256) -> __m256i { + unsafe { transmute(vcvttps2udq256(a.as_f32x8(), u32x8::ZERO, k)) } } -/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epu32&expand=1996) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm_cvttps_epu32(a: __m128) -> __m128i { - transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) +pub fn _mm_cvttps_epu32(a: __m128) -> __m128i { + unsafe { transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, 0b11111111)) } } /// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14799,8 +15930,8 @@ pub unsafe fn _mm_cvttps_epu32(a: __m128) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { - transmute(vcvttps2udq128(a.as_f32x4(), src.as_u32x4(), k)) +pub fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2udq128(a.as_f32x4(), src.as_u32x4(), k)) } } /// Convert packed double-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14810,8 +15941,8 @@ pub unsafe fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttps2udq))] -pub unsafe fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i { - transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, k)) +pub fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i { + unsafe { transmute(vcvttps2udq128(a.as_f32x4(), u32x4::ZERO, k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\ @@ -14823,27 +15954,31 @@ pub unsafe fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { - static_assert_sae!(SAE); - let a = a.as_f64x8(); - let r = vcvttpd2udq(a, i32x8::ZERO, k, SAE); - transmute(r) +pub fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x8(); + let r = vcvttpd2udq(a, i32x8::ZERO, k, SAE); + transmute(r) + } } -/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. +/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epi32&expand=1947) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub unsafe fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i { - transmute(vcvttpd2dq( - a.as_f64x8(), - i32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2dq( + a.as_f64x8(), + i32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14853,13 +15988,15 @@ pub unsafe fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub unsafe fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { - transmute(vcvttpd2dq( - a.as_f64x8(), - src.as_i32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2dq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14869,13 +16006,15 @@ pub unsafe fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub unsafe fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i { - transmute(vcvttpd2dq( - a.as_f64x8(), - i32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2dq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14885,8 +16024,8 @@ pub unsafe fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub unsafe fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { - transmute(vcvttpd2dq256(a.as_f64x4(), src.as_i32x4(), k)) +pub fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2dq256(a.as_f64x4(), src.as_i32x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -14896,8 +16035,8 @@ pub unsafe fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub unsafe fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i { - transmute(vcvttpd2dq256(a.as_f64x4(), i32x4::ZERO, k)) +pub fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2dq256(a.as_f64x4(), i32x4::ZERO, k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14907,8 +16046,8 @@ pub unsafe fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub unsafe fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvttpd2dq128(a.as_f64x2(), src.as_i32x4(), k)) +pub fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2dq128(a.as_f64x2(), src.as_i32x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14918,24 +16057,26 @@ pub unsafe fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] -pub unsafe fn _mm_maskz_cvttpd_epi32(k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvttpd2dq128(a.as_f64x2(), i32x4::ZERO, k)) +pub fn _mm_maskz_cvttpd_epi32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2dq128(a.as_f64x2(), i32x4::ZERO, k)) } } -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_cvttpd_epu32&expand=1965) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i { - transmute(vcvttpd2udq( - a.as_f64x8(), - i32x8::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + i32x8::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -14945,13 +16086,15 @@ pub unsafe fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { - transmute(vcvttpd2udq( - a.as_f64x8(), - src.as_i32x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + src.as_i32x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -14961,24 +16104,26 @@ pub unsafe fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i { - transmute(vcvttpd2udq( - a.as_f64x8(), - i32x8::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i { + unsafe { + transmute(vcvttpd2udq( + a.as_f64x8(), + i32x8::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epu32&expand=1962) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i { - transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, 0b11111111)) +pub fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, 0b11111111)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -14988,8 +16133,8 @@ pub unsafe fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { - transmute(vcvttpd2udq256(a.as_f64x4(), src.as_i32x4(), k)) +pub fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), src.as_i32x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
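// --- Plain-Rust sketch of the signed vs. unsigned destinations; not part of the patch. ---
// `cvtt*_epi32` truncates toward zero into i32 lanes, `cvtt*_epu32` into u32 lanes.
// Only in-range, non-negative inputs are modeled; out-of-range and negative inputs
// are not modeled here and follow the behavior documented by Intel instead.
fn cvttpd_epi32_model(a: [f64; 8]) -> [i32; 8] {
    core::array::from_fn(|i| a[i].trunc() as i32)
}

fn cvttpd_epu32_model(a: [f64; 8]) -> [u32; 8] {
    core::array::from_fn(|i| a[i].trunc() as u32)
}

fn main() {
    let a = [3.999_f64; 8];
    assert_eq!(cvttpd_epi32_model(a), [3_i32; 8]);
    assert_eq!(cvttpd_epu32_model(a), [3_u32; 8]);
}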
@@ -14999,19 +16144,19 @@ pub unsafe fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm256_maskz_cvttpd_epu32(k: __mmask8, a: __m256d) -> __m128i { - transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, k)) +pub fn _mm256_maskz_cvttpd_epu32(k: __mmask8, a: __m256d) -> __m128i { + unsafe { transmute(vcvttpd2udq256(a.as_f64x4(), i32x4::ZERO, k)) } } -/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. +/// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epu32&expand=1959) #[inline] #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm_cvttpd_epu32(a: __m128d) -> __m128i { - transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, 0b11111111)) +pub fn _mm_cvttpd_epu32(a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, 0b11111111)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -15021,8 +16166,8 @@ pub unsafe fn _mm_cvttpd_epu32(a: __m128d) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvttpd2udq128(a.as_f64x2(), src.as_i32x4(), k)) +pub fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), src.as_i32x4(), k)) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -15032,8 +16177,8 @@ pub unsafe fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttpd2udq))] -pub unsafe fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { - transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, k)) +pub fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { + unsafe { transmute(vcvttpd2udq128(a.as_f64x2(), i32x4::ZERO, k)) } } /// Returns vector of type `__m512d` with all elements set to zero. @@ -15043,9 +16188,9 @@ pub unsafe fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxorps))] -pub unsafe fn _mm512_setzero_pd() -> __m512d { +pub fn _mm512_setzero_pd() -> __m512d { // All-0 is a properly initialized __m512d - const { mem::zeroed() } + unsafe { const { mem::zeroed() } } } /// Returns vector of type `__m512` with all elements set to zero. 
@@ -15055,9 +16200,9 @@ pub unsafe fn _mm512_setzero_pd() -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxorps))] -pub unsafe fn _mm512_setzero_ps() -> __m512 { +pub fn _mm512_setzero_ps() -> __m512 { // All-0 is a properly initialized __m512 - const { mem::zeroed() } + unsafe { const { mem::zeroed() } } } /// Return vector of type `__m512` with all elements set to zero. @@ -15067,9 +16212,9 @@ pub unsafe fn _mm512_setzero_ps() -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxorps))] -pub unsafe fn _mm512_setzero() -> __m512 { +pub fn _mm512_setzero() -> __m512 { // All-0 is a properly initialized __m512 - const { mem::zeroed() } + unsafe { const { mem::zeroed() } } } /// Returns vector of type `__m512i` with all elements set to zero. @@ -15079,9 +16224,9 @@ pub unsafe fn _mm512_setzero() -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxorps))] -pub unsafe fn _mm512_setzero_si512() -> __m512i { +pub fn _mm512_setzero_si512() -> __m512i { // All-0 is a properly initialized __m512i - const { mem::zeroed() } + unsafe { const { mem::zeroed() } } } /// Return vector of type `__m512i` with all elements set to zero. @@ -15091,9 +16236,9 @@ pub unsafe fn _mm512_setzero_si512() -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxorps))] -pub unsafe fn _mm512_setzero_epi32() -> __m512i { +pub fn _mm512_setzero_epi32() -> __m512i { // All-0 is a properly initialized __m512i - const { mem::zeroed() } + unsafe { const { mem::zeroed() } } } /// Sets packed 32-bit integers in `dst` with the supplied values in reverse @@ -15103,7 +16248,7 @@ pub unsafe fn _mm512_setzero_epi32() -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_setr_epi32( +pub fn _mm512_setr_epi32( e15: i32, e14: i32, e13: i32, @@ -15121,10 +16266,12 @@ pub unsafe fn _mm512_setr_epi32( e1: i32, e0: i32, ) -> __m512i { - let r = i32x16::new( - e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, - ); - transmute(r) + unsafe { + let r = i32x16::new( + e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, + ); + transmute(r) + } } /// Set packed 8-bit integers in dst with the supplied values. 
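// --- Plain-Rust illustration of the "All-0 is a properly initialized ..." comments
// in the setzero hunks above; not part of the patch. The vector types are plain
// lane arrays, and the all-zero bit pattern is a valid value for every lane (the
// 0 bit pattern reinterprets as +0.0 for floats), which is why `const { mem::zeroed() }`
// needs no runtime work and no caller-side precondition.
fn main() {
    let zero_ps: [f32; 16] = unsafe { core::mem::zeroed() };
    let zero_si: [i32; 16] = unsafe { core::mem::zeroed() };
    assert!(zero_ps.iter().all(|&x| x == 0.0));
    assert!(zero_si.iter().all(|&x| x == 0));
}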
@@ -15133,7 +16280,7 @@ pub unsafe fn _mm512_setr_epi32( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set_epi8( +pub fn _mm512_set_epi8( e63: i8, e62: i8, e61: i8, @@ -15199,13 +16346,15 @@ pub unsafe fn _mm512_set_epi8( e1: i8, e0: i8, ) -> __m512i { - let r = i8x64::new( - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, - e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, e32, e33, e34, e35, e36, e37, - e38, e39, e40, e41, e42, e43, e44, e45, e46, e47, e48, e49, e50, e51, e52, e53, e54, e55, - e56, e57, e58, e59, e60, e61, e62, e63, - ); - transmute(r) + unsafe { + let r = i8x64::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, + e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, e32, e33, e34, e35, + e36, e37, e38, e39, e40, e41, e42, e43, e44, e45, e46, e47, e48, e49, e50, e51, e52, + e53, e54, e55, e56, e57, e58, e59, e60, e61, e62, e63, + ); + transmute(r) + } } /// Set packed 16-bit integers in dst with the supplied values. @@ -15214,7 +16363,7 @@ pub unsafe fn _mm512_set_epi8( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set_epi16( +pub fn _mm512_set_epi16( e31: i16, e30: i16, e29: i16, @@ -15248,11 +16397,13 @@ pub unsafe fn _mm512_set_epi16( e1: i16, e0: i16, ) -> __m512i { - let r = i16x32::new( - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, - e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, - ); - transmute(r) + unsafe { + let r = i16x32::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, + e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, + ); + transmute(r) + } } /// Set packed 32-bit integers in dst with the repeated 4 element sequence. 
@@ -15261,7 +16412,7 @@ pub unsafe fn _mm512_set_epi16( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { +pub fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { _mm512_set_epi32(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a) } @@ -15271,7 +16422,7 @@ pub unsafe fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { +pub fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { _mm512_set_ps(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a) } @@ -15281,7 +16432,7 @@ pub unsafe fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { +pub fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { _mm512_set_pd(d, c, b, a, d, c, b, a) } @@ -15291,7 +16442,7 @@ pub unsafe fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { +pub fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { _mm512_set_epi32(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d) } @@ -15301,7 +16452,7 @@ pub unsafe fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { +pub fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { _mm512_set_ps(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d) } @@ -15311,7 +16462,7 @@ pub unsafe fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { +pub fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { _mm512_set_pd(a, b, c, d, a, b, c, d) } @@ -15321,7 +16472,7 @@ pub unsafe fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set_epi64( +pub fn _mm512_set_epi64( e0: i64, e1: i64, e2: i64, @@ -15340,7 +16491,7 @@ pub unsafe fn _mm512_set_epi64( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_setr_epi64( +pub fn _mm512_setr_epi64( e0: i64, e1: i64, e2: i64, @@ -15350,8 +16501,10 @@ pub unsafe fn _mm512_setr_epi64( e6: i64, e7: i64, ) -> __m512i { - let r = i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); - transmute(r) + unsafe { + let r = i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); + transmute(r) + } } /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8. 
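// --- Plain-Rust model of the argument ordering of the set/setr constructors above;
// not part of the patch. `setr` takes lanes in memory order (the first argument
// becomes element 0), while `set` takes them highest-element-first, so the two are
// reverses of each other for the same argument list.
fn setr_epi64_model(e: [i64; 8]) -> [i64; 8] {
    e // first argument -> lane 0, already memory order
}

fn set_epi64_model(e: [i64; 8]) -> [i64; 8] {
    let mut r = e;
    r.reverse(); // first argument -> lane 7
    r
}

fn main() {
    let args = [0, 1, 2, 3, 4, 5, 6, 7];
    assert_eq!(setr_epi64_model(args), [0, 1, 2, 3, 4, 5, 6, 7]);
    assert_eq!(set_epi64_model(args), [7, 6, 5, 4, 3, 2, 1, 0]);
}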
@@ -17188,8 +18341,8 @@ pub unsafe fn _mm_mmask_i64gather_ps( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - transmute(vpcompressd(a.as_i32x16(), src.as_i32x16(), k)) +pub fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressd(a.as_i32x16(), src.as_i32x16(), k)) } } /// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17199,8 +18352,8 @@ pub unsafe fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i { - transmute(vpcompressd(a.as_i32x16(), i32x16::ZERO, k)) +pub fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressd(a.as_i32x16(), i32x16::ZERO, k)) } } /// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -17210,8 +18363,8 @@ pub unsafe fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm256_mask_compress_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - transmute(vpcompressd256(a.as_i32x8(), src.as_i32x8(), k)) +pub fn _mm256_mask_compress_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressd256(a.as_i32x8(), src.as_i32x8(), k)) } } /// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17221,8 +18374,8 @@ pub unsafe fn _mm256_mask_compress_epi32(src: __m256i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm256_maskz_compress_epi32(k: __mmask8, a: __m256i) -> __m256i { - transmute(vpcompressd256(a.as_i32x8(), i32x8::ZERO, k)) +pub fn _mm256_maskz_compress_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressd256(a.as_i32x8(), i32x8::ZERO, k)) } } /// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -17232,8 +18385,8 @@ pub unsafe fn _mm256_maskz_compress_epi32(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm_mask_compress_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpcompressd128(a.as_i32x4(), src.as_i32x4(), k)) +pub fn _mm_mask_compress_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressd128(a.as_i32x4(), src.as_i32x4(), k)) } } /// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
@@ -17243,8 +18396,8 @@ pub unsafe fn _mm_mask_compress_epi32(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressd))] -pub unsafe fn _mm_maskz_compress_epi32(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpcompressd128(a.as_i32x4(), i32x4::ZERO, k)) +pub fn _mm_maskz_compress_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressd128(a.as_i32x4(), i32x4::ZERO, k)) } } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -17254,8 +18407,8 @@ pub unsafe fn _mm_maskz_compress_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - transmute(vpcompressq(a.as_i64x8(), src.as_i64x8(), k)) +pub fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressq(a.as_i64x8(), src.as_i64x8(), k)) } } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17265,8 +18418,8 @@ pub unsafe fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i { - transmute(vpcompressq(a.as_i64x8(), i64x8::ZERO, k)) +pub fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressq(a.as_i64x8(), i64x8::ZERO, k)) } } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -17276,8 +18429,8 @@ pub unsafe fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm256_mask_compress_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - transmute(vpcompressq256(a.as_i64x4(), src.as_i64x4(), k)) +pub fn _mm256_mask_compress_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressq256(a.as_i64x4(), src.as_i64x4(), k)) } } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17287,8 +18440,8 @@ pub unsafe fn _mm256_mask_compress_epi64(src: __m256i, k: __mmask8, a: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm256_maskz_compress_epi64(k: __mmask8, a: __m256i) -> __m256i { - transmute(vpcompressq256(a.as_i64x4(), i64x4::ZERO, k)) +pub fn _mm256_maskz_compress_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressq256(a.as_i64x4(), i64x4::ZERO, k)) } } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
@@ -17298,8 +18451,8 @@ pub unsafe fn _mm256_maskz_compress_epi64(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm_mask_compress_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpcompressq128(a.as_i64x2(), src.as_i64x2(), k)) +pub fn _mm_mask_compress_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressq128(a.as_i64x2(), src.as_i64x2(), k)) } } /// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17309,8 +18462,8 @@ pub unsafe fn _mm_mask_compress_epi64(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressq))] -pub unsafe fn _mm_maskz_compress_epi64(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpcompressq128(a.as_i64x2(), i64x2::ZERO, k)) +pub fn _mm_maskz_compress_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressq128(a.as_i64x2(), i64x2::ZERO, k)) } } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -17320,8 +18473,8 @@ pub unsafe fn _mm_maskz_compress_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - transmute(vcompressps(a.as_f32x16(), src.as_f32x16(), k)) +pub fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vcompressps(a.as_f32x16(), src.as_f32x16(), k)) } } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17331,8 +18484,8 @@ pub unsafe fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 { - transmute(vcompressps(a.as_f32x16(), f32x16::ZERO, k)) +pub fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vcompressps(a.as_f32x16(), f32x16::ZERO, k)) } } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
@@ -17342,8 +18495,8 @@ pub unsafe fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm256_mask_compress_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - transmute(vcompressps256(a.as_f32x8(), src.as_f32x8(), k)) +pub fn _mm256_mask_compress_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vcompressps256(a.as_f32x8(), src.as_f32x8(), k)) } } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17353,8 +18506,8 @@ pub unsafe fn _mm256_mask_compress_ps(src: __m256, k: __mmask8, a: __m256) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm256_maskz_compress_ps(k: __mmask8, a: __m256) -> __m256 { - transmute(vcompressps256(a.as_f32x8(), f32x8::ZERO, k)) +pub fn _mm256_maskz_compress_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vcompressps256(a.as_f32x8(), f32x8::ZERO, k)) } } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -17364,8 +18517,8 @@ pub unsafe fn _mm256_maskz_compress_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm_mask_compress_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - transmute(vcompressps128(a.as_f32x4(), src.as_f32x4(), k)) +pub fn _mm_mask_compress_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vcompressps128(a.as_f32x4(), src.as_f32x4(), k)) } } /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17375,8 +18528,8 @@ pub unsafe fn _mm_mask_compress_ps(src: __m128, k: __mmask8, a: __m128) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompressps))] -pub unsafe fn _mm_maskz_compress_ps(k: __mmask8, a: __m128) -> __m128 { - transmute(vcompressps128(a.as_f32x4(), f32x4::ZERO, k)) +pub fn _mm_maskz_compress_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vcompressps128(a.as_f32x4(), f32x4::ZERO, k)) } } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
@@ -17386,8 +18539,8 @@ pub unsafe fn _mm_maskz_compress_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - transmute(vcompresspd(a.as_f64x8(), src.as_f64x8(), k)) +pub fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vcompresspd(a.as_f64x8(), src.as_f64x8(), k)) } } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17397,8 +18550,8 @@ pub unsafe fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d { - transmute(vcompresspd(a.as_f64x8(), f64x8::ZERO, k)) +pub fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vcompresspd(a.as_f64x8(), f64x8::ZERO, k)) } } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -17408,8 +18561,8 @@ pub unsafe fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm256_mask_compress_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - transmute(vcompresspd256(a.as_f64x4(), src.as_f64x4(), k)) +pub fn _mm256_mask_compress_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vcompresspd256(a.as_f64x4(), src.as_f64x4(), k)) } } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17419,8 +18572,8 @@ pub unsafe fn _mm256_mask_compress_pd(src: __m256d, k: __mmask8, a: __m256d) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm256_maskz_compress_pd(k: __mmask8, a: __m256d) -> __m256d { - transmute(vcompresspd256(a.as_f64x4(), f64x4::ZERO, k)) +pub fn _mm256_maskz_compress_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vcompresspd256(a.as_f64x4(), f64x4::ZERO, k)) } } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
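// --- Plain-Rust model of the compress operations in the surrounding hunks; not part
// of the patch. Active lanes (mask bit set) are packed contiguously into the low
// elements of the result; the `mask_` form fills the remaining upper positions from
// `src`, and the `maskz_` form fills them with zeros.
fn mask_compress_model(src: [f64; 8], k: u8, a: [f64; 8]) -> [f64; 8] {
    let mut r = src;
    let mut out = 0;
    for i in 0..8 {
        if (k >> i) & 1 != 0 {
            r[out] = a[i];
            out += 1;
        }
    }
    r
}

fn maskz_compress_model(k: u8, a: [f64; 8]) -> [f64; 8] {
    mask_compress_model([0.0; 8], k, a)
}

fn main() {
    let a = [10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0];
    let src = [-1.0; 8];
    let k = 0b1010_0101; // lanes 0, 2, 5 and 7 are active
    assert_eq!(
        mask_compress_model(src, k, a),
        [10.0, 12.0, 15.0, 17.0, -1.0, -1.0, -1.0, -1.0]
    );
    assert_eq!(
        maskz_compress_model(k, a),
        [10.0, 12.0, 15.0, 17.0, 0.0, 0.0, 0.0, 0.0]
    );
}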
@@ -17430,8 +18583,8 @@ pub unsafe fn _mm256_maskz_compress_pd(k: __mmask8, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm_mask_compress_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - transmute(vcompresspd128(a.as_f64x2(), src.as_f64x2(), k)) +pub fn _mm_mask_compress_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vcompresspd128(a.as_f64x2(), src.as_f64x2(), k)) } } /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -17441,8 +18594,8 @@ pub unsafe fn _mm_mask_compress_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcompresspd))] -pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d { - transmute(vcompresspd128(a.as_f64x2(), f64x2::ZERO, k)) +pub fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vcompresspd128(a.as_f64x2(), f64x2::ZERO, k)) } } /// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr. @@ -17584,8 +18737,8 @@ pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandd))] -pub unsafe fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - transmute(vpexpandd(a.as_i32x16(), src.as_i32x16(), k)) +pub fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandd(a.as_i32x16(), src.as_i32x16(), k)) } } /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17595,8 +18748,8 @@ pub unsafe fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandd))] -pub unsafe fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i { - transmute(vpexpandd(a.as_i32x16(), i32x16::ZERO, k)) +pub fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandd(a.as_i32x16(), i32x16::ZERO, k)) } } /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -17606,8 +18759,8 @@ pub unsafe fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandd))] -pub unsafe fn _mm256_mask_expand_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - transmute(vpexpandd256(a.as_i32x8(), src.as_i32x8(), k)) +pub fn _mm256_mask_expand_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandd256(a.as_i32x8(), src.as_i32x8(), k)) } } /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17617,8 +18770,8 @@ pub unsafe fn _mm256_mask_expand_epi32(src: __m256i, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandd))] -pub unsafe fn _mm256_maskz_expand_epi32(k: __mmask8, a: __m256i) -> __m256i { - transmute(vpexpandd256(a.as_i32x8(), i32x8::ZERO, k)) +pub fn _mm256_maskz_expand_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandd256(a.as_i32x8(), i32x8::ZERO, k)) } } /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17628,8 +18781,8 @@ pub unsafe fn _mm256_maskz_expand_epi32(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandd))] -pub unsafe fn _mm_mask_expand_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpexpandd128(a.as_i32x4(), src.as_i32x4(), k)) +pub fn _mm_mask_expand_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandd128(a.as_i32x4(), src.as_i32x4(), k)) } } /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17639,8 +18792,8 @@ pub unsafe fn _mm_mask_expand_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandd))] -pub unsafe fn _mm_maskz_expand_epi32(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpexpandd128(a.as_i32x4(), i32x4::ZERO, k)) +pub fn _mm_maskz_expand_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandd128(a.as_i32x4(), i32x4::ZERO, k)) } } /// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -17650,8 +18803,8 @@ pub unsafe fn _mm_maskz_expand_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandq))] -pub unsafe fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - transmute(vpexpandq(a.as_i64x8(), src.as_i64x8(), k)) +pub fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandq(a.as_i64x8(), src.as_i64x8(), k)) } } /// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17661,8 +18814,8 @@ pub unsafe fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandq))] -pub unsafe fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i { - transmute(vpexpandq(a.as_i64x8(), i64x8::ZERO, k)) +pub fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandq(a.as_i64x8(), i64x8::ZERO, k)) } } /// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17672,8 +18825,8 @@ pub unsafe fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandq))] -pub unsafe fn _mm256_mask_expand_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - transmute(vpexpandq256(a.as_i64x4(), src.as_i64x4(), k)) +pub fn _mm256_mask_expand_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandq256(a.as_i64x4(), src.as_i64x4(), k)) } } /// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17683,8 +18836,8 @@ pub unsafe fn _mm256_mask_expand_epi64(src: __m256i, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandq))] -pub unsafe fn _mm256_maskz_expand_epi64(k: __mmask8, a: __m256i) -> __m256i { - transmute(vpexpandq256(a.as_i64x4(), i64x4::ZERO, k)) +pub fn _mm256_maskz_expand_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandq256(a.as_i64x4(), i64x4::ZERO, k)) } } /// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -17694,8 +18847,8 @@ pub unsafe fn _mm256_maskz_expand_epi64(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandq))] -pub unsafe fn _mm_mask_expand_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpexpandq128(a.as_i64x2(), src.as_i64x2(), k)) +pub fn _mm_mask_expand_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandq128(a.as_i64x2(), src.as_i64x2(), k)) } } /// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17705,8 +18858,8 @@ pub unsafe fn _mm_mask_expand_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandq))] -pub unsafe fn _mm_maskz_expand_epi64(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpexpandq128(a.as_i64x2(), i64x2::ZERO, k)) +pub fn _mm_maskz_expand_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandq128(a.as_i64x2(), i64x2::ZERO, k)) } } /// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17716,8 +18869,8 @@ pub unsafe fn _mm_maskz_expand_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandps))] -pub unsafe fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - transmute(vexpandps(a.as_f32x16(), src.as_f32x16(), k)) +pub fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vexpandps(a.as_f32x16(), src.as_f32x16(), k)) } } /// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17727,8 +18880,8 @@ pub unsafe fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandps))] -pub unsafe fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 { - transmute(vexpandps(a.as_f32x16(), f32x16::ZERO, k)) +pub fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { transmute(vexpandps(a.as_f32x16(), f32x16::ZERO, k)) } } /// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -17738,8 +18891,8 @@ pub unsafe fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandps))] -pub unsafe fn _mm256_mask_expand_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - transmute(vexpandps256(a.as_f32x8(), src.as_f32x8(), k)) +pub fn _mm256_mask_expand_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vexpandps256(a.as_f32x8(), src.as_f32x8(), k)) } } /// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17749,8 +18902,8 @@ pub unsafe fn _mm256_mask_expand_ps(src: __m256, k: __mmask8, a: __m256) -> __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandps))] -pub unsafe fn _mm256_maskz_expand_ps(k: __mmask8, a: __m256) -> __m256 { - transmute(vexpandps256(a.as_f32x8(), f32x8::ZERO, k)) +pub fn _mm256_maskz_expand_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { transmute(vexpandps256(a.as_f32x8(), f32x8::ZERO, k)) } } /// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17760,8 +18913,8 @@ pub unsafe fn _mm256_maskz_expand_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandps))] -pub unsafe fn _mm_mask_expand_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - transmute(vexpandps128(a.as_f32x4(), src.as_f32x4(), k)) +pub fn _mm_mask_expand_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vexpandps128(a.as_f32x4(), src.as_f32x4(), k)) } } /// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17771,8 +18924,8 @@ pub unsafe fn _mm_mask_expand_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandps))] -pub unsafe fn _mm_maskz_expand_ps(k: __mmask8, a: __m128) -> __m128 { - transmute(vexpandps128(a.as_f32x4(), f32x4::ZERO, k)) +pub fn _mm_maskz_expand_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { transmute(vexpandps128(a.as_f32x4(), f32x4::ZERO, k)) } } /// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
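(Illustrative note, same assumptions as the sketch above.) Expand is the inverse of compress: the lowest elements of `a` are scattered, in order, into the lanes selected by the mask. The write-masked and zero-masked forms differ only in what the unselected lanes receive, as this hypothetical sketch shows.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
fn expand_f32_demo(src: __m128, a: __m128) -> (__m128, __m128) {
    let k: __mmask8 = 0b1010; // select lanes 1 and 3
    // Zero-masked form: [0.0, a[0], 0.0, a[1]]
    let zeroed = _mm_maskz_expand_ps(k, a);
    // Write-masked form: [src[0], a[0], src[2], a[1]]
    let merged = _mm_mask_expand_ps(src, k, a);
    (zeroed, merged)
}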
@@ -17782,8 +18935,8 @@ pub unsafe fn _mm_maskz_expand_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandpd))] -pub unsafe fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - transmute(vexpandpd(a.as_f64x8(), src.as_f64x8(), k)) +pub fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vexpandpd(a.as_f64x8(), src.as_f64x8(), k)) } } /// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17793,8 +18946,8 @@ pub unsafe fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandpd))] -pub unsafe fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d { - transmute(vexpandpd(a.as_f64x8(), f64x8::ZERO, k)) +pub fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { transmute(vexpandpd(a.as_f64x8(), f64x8::ZERO, k)) } } /// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17804,8 +18957,8 @@ pub unsafe fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandpd))] -pub unsafe fn _mm256_mask_expand_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - transmute(vexpandpd256(a.as_f64x4(), src.as_f64x4(), k)) +pub fn _mm256_mask_expand_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vexpandpd256(a.as_f64x4(), src.as_f64x4(), k)) } } /// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17815,8 +18968,8 @@ pub unsafe fn _mm256_mask_expand_pd(src: __m256d, k: __mmask8, a: __m256d) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandpd))] -pub unsafe fn _mm256_maskz_expand_pd(k: __mmask8, a: __m256d) -> __m256d { - transmute(vexpandpd256(a.as_f64x4(), f64x4::ZERO, k)) +pub fn _mm256_maskz_expand_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { transmute(vexpandpd256(a.as_f64x4(), f64x4::ZERO, k)) } } /// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -17826,8 +18979,8 @@ pub unsafe fn _mm256_maskz_expand_pd(k: __mmask8, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandpd))] -pub unsafe fn _mm_mask_expand_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - transmute(vexpandpd128(a.as_f64x2(), src.as_f64x2(), k)) +pub fn _mm_mask_expand_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vexpandpd128(a.as_f64x2(), src.as_f64x2(), k)) } } /// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17837,8 +18990,8 @@ pub unsafe fn _mm_mask_expand_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vexpandpd))] -pub unsafe fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d { - transmute(vexpandpd128(a.as_f64x2(), f64x2::ZERO, k)) +pub fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { transmute(vexpandpd128(a.as_f64x2(), f64x2::ZERO, k)) } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -17849,11 +19002,13 @@ pub unsafe fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_rol_epi32(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(r) +pub fn _mm512_rol_epi32(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprold(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17864,15 +19019,13 @@ pub unsafe fn _mm512_rol_epi32(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_rol_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) +pub fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprold(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -17883,11 +19036,13 @@ pub unsafe fn _mm512_mask_rol_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprold(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) +pub fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprold(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -17898,11 +19053,13 @@ pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_rol_epi32(a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(r) +pub fn _mm256_rol_epi32(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprold256(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17913,15 +19070,13 @@ pub unsafe fn _mm256_rol_epi32(a: __m256i) -> __m256i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_rol_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) +pub fn _mm256_mask_rol_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprold256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17932,11 +19087,13 @@ pub unsafe fn _mm256_mask_rol_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprold256(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) +pub fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprold256(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. 
@@ -17947,11 +19104,13 @@ pub unsafe fn _mm256_maskz_rol_epi32(k: __mmask8, a: __m256i) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_rol_epi32(a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(r) +pub fn _mm_rol_epi32(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprold128(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -17962,15 +19121,13 @@ pub unsafe fn _mm_rol_epi32(a: __m128i) -> __m128i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_rol_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) +pub fn _mm_mask_rol_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprold128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -17981,11 +19138,13 @@ pub unsafe fn _mm_mask_rol_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprold128(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) +pub fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprold128(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -17996,11 +19155,13 @@ pub unsafe fn _mm_maskz_rol_epi32(k: __mmask8, a: __m128i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_ror_epi32(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(r) +pub fn _mm512_ror_epi32(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprord(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
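(Illustrative note, same assumptions as above.) The rotate intrinsics take the rotation count as a const generic, kept callable in the C-like trailing-immediate form by `rustc_legacy_const_generics`; a minimal hypothetical use:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn rol32_by_7(a: __m512i) -> __m512i {
    // Rotate every 32-bit lane of `a` left by 7 bits; the count must be a
    // compile-time constant that fits in 8 bits (static_assert_uimm_bits!).
    _mm512_rol_epi32::<7>(a)
}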
@@ -18011,15 +19172,13 @@ pub unsafe fn _mm512_ror_epi32(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_ror_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) +pub fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprord(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18030,11 +19189,13 @@ pub unsafe fn _mm512_mask_ror_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let r = vprord(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) +pub fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let r = vprord(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -18045,11 +19206,13 @@ pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_ror_epi32(a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(r) +pub fn _mm256_ror_epi32(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprord256(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18060,15 +19223,13 @@ pub unsafe fn _mm256_ror_epi32(a: __m256i) -> __m256i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_ror_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) +pub fn _mm256_mask_ror_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprord256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -18079,11 +19240,13 @@ pub unsafe fn _mm256_mask_ror_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let r = vprord256(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) +pub fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let r = vprord256(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -18094,11 +19257,13 @@ pub unsafe fn _mm256_maskz_ror_epi32(k: __mmask8, a: __m256i) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_ror_epi32(a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(r) +pub fn _mm_ror_epi32(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprord128(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18109,15 +19274,13 @@ pub unsafe fn _mm_ror_epi32(a: __m128i) -> __m128i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_ror_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) +pub fn _mm_mask_ror_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprord128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18128,11 +19291,13 @@ pub unsafe fn _mm_mask_ror_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprold, IMM8 = 123))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let r = vprord128(a, IMM8); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) +pub fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let r = vprord128(a, IMM8); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. 
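(Illustrative note, same assumptions as above.) Rotating a 32-bit lane right by n is equivalent to rotating it left by 32 − n, which is presumably why the `assert_instr` annotations above accept `vprold` for the right-rotate forms. A hypothetical sanity check:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn ror_equals_complementary_rol(a: __m512i) -> bool {
    let right = _mm512_ror_epi32::<24>(a);
    let left = _mm512_rol_epi32::<8>(a); // 32 - 24
    // All sixteen 32-bit lanes compare equal.
    _mm512_cmpeq_epi32_mask(right, left) == 0xffff
}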
@@ -18143,11 +19308,13 @@ pub unsafe fn _mm_maskz_ror_epi32(k: __mmask8, a: __m128i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_rol_epi64(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(r) +pub fn _mm512_rol_epi64(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprolq(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18158,15 +19325,13 @@ pub unsafe fn _mm512_rol_epi64(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_rol_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x8())) +pub fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprolq(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18177,11 +19342,13 @@ pub unsafe fn _mm512_mask_rol_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprolq(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) +pub fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprolq(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -18192,11 +19359,13 @@ pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_rol_epi64(a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(r) +pub fn _mm256_rol_epi64(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprolq256(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -18207,15 +19376,13 @@ pub unsafe fn _mm256_rol_epi64(a: __m256i) -> __m256i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_rol_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) +pub fn _mm256_mask_rol_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprolq256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18226,11 +19393,13 @@ pub unsafe fn _mm256_mask_rol_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprolq256(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) +pub fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprolq256(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst. @@ -18241,11 +19410,13 @@ pub unsafe fn _mm256_maskz_rol_epi64(k: __mmask8, a: __m256i) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_rol_epi64(a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(r) +pub fn _mm_rol_epi64(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprolq128(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18256,15 +19427,13 @@ pub unsafe fn _mm_rol_epi64(a: __m128i) -> __m128i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_rol_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) +pub fn _mm_mask_rol_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprolq128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
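(Illustrative note, same assumptions as above.) The masked rotates follow the usual writemask convention: selected lanes take the rotated value, the rest are copied from `src` (or zeroed in the maskz variants). A hypothetical sketch for the 64-bit form:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
fn rotate_even_lanes(src: __m256i, a: __m256i) -> __m256i {
    // Lanes 0 and 2 receive `a` rotated left by 13 bits; lanes 1 and 3 keep `src`.
    _mm256_mask_rol_epi64::<13>(src, 0b0101, a)
}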
@@ -18275,11 +19444,13 @@ pub unsafe fn _mm_mask_rol_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprolq128(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) +pub fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprolq128(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -18290,11 +19461,13 @@ pub unsafe fn _mm_maskz_rol_epi64(k: __mmask8, a: __m128i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_ror_epi64(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(r) +pub fn _mm512_ror_epi64(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprorq(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18305,15 +19478,13 @@ pub unsafe fn _mm512_ror_epi64(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_ror_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x8())) +pub fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprorq(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18324,11 +19495,13 @@ pub unsafe fn _mm512_mask_ror_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x8(); - let r = vprorq(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) +pub fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x8(); + let r = vprorq(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. 
@@ -18339,11 +19512,13 @@ pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_ror_epi64(a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(r) +pub fn _mm256_ror_epi64(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprorq256(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -18354,15 +19529,13 @@ pub unsafe fn _mm256_ror_epi64(a: __m256i) -> __m256i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_ror_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) +pub fn _mm256_mask_ror_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprorq256(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18373,11 +19546,13 @@ pub unsafe fn _mm256_mask_ror_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x4(); - let r = vprorq256(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) +pub fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x4(); + let r = vprorq256(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst. @@ -18388,11 +19563,13 @@ pub unsafe fn _mm256_maskz_ror_epi64(k: __mmask8, a: __m256i) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_ror_epi64(a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(r) +pub fn _mm_ror_epi64(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprorq128(a, IMM8); + transmute(r) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -18403,15 +19580,13 @@ pub unsafe fn _mm_ror_epi64(a: __m128i) -> __m128i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_ror_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) +pub fn _mm_mask_ror_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprorq128(a, IMM8); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18422,11 +19597,13 @@ pub unsafe fn _mm_mask_ror_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i64x2(); - let r = vprorq128(a, IMM8); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) +pub fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i64x2(); + let r = vprorq128(a, IMM8); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst. @@ -18437,12 +19614,14 @@ pub unsafe fn _mm_maskz_ror_epi64(k: __mmask8, a: __m128i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_slli_epi32(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm512_setzero_si512() - } else { - transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8))) +pub fn _mm512_slli_epi32(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + transmute(simd_shl(a.as_u32x16(), u32x16::splat(IMM8))) + } } } @@ -18454,18 +19633,16 @@ pub unsafe fn _mm512_slli_epi32(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_slli_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 32 { - u32x16::ZERO - } else { - simd_shl(a.as_u32x16(), u32x16::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u32x16())) +pub fn _mm512_mask_slli_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 32 { + u32x16::ZERO + } else { + simd_shl(a.as_u32x16(), u32x16::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u32x16())) + } } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
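(Illustrative note, not part of the patch.) Marking these intrinsics safe only removes the `unsafe` for callers that already run with the required target features enabled; from a generic context the call still needs an `unsafe` block justified by a run-time feature check, roughly like this hypothetical pair of functions:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn ror64_demo() -> bool {
    // Safe body: everything here only needs avx512f, which this fn enables.
    let a = _mm512_set1_epi64(1);
    let r = _mm512_ror_epi64::<1>(a); // every lane becomes 1 << 63
    _mm512_cmpeq_epi64_mask(r, _mm512_set1_epi64(i64::MIN)) == 0xff
}

fn caller() -> bool {
    if is_x86_feature_detected!("avx512f") {
        // SAFETY: the run-time check above guarantees AVX-512F is available.
        unsafe { ror64_demo() }
    } else {
        false
    }
}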
@@ -18476,13 +19653,15 @@ pub unsafe fn _mm512_mask_slli_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_slli_epi32(k: __mmask16, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm512_setzero_si512() - } else { - let shf = simd_shl(a.as_u32x16(), u32x16::splat(IMM8)); - transmute(simd_select_bitmask(k, shf, u32x16::ZERO)) +pub fn _mm512_maskz_slli_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + let shf = simd_shl(a.as_u32x16(), u32x16::splat(IMM8)); + transmute(simd_select_bitmask(k, shf, u32x16::ZERO)) + } } } @@ -18494,18 +19673,16 @@ pub unsafe fn _mm512_maskz_slli_epi32(k: __mmask16, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_slli_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 32 { - u32x8::ZERO - } else { - simd_shl(a.as_u32x8(), u32x8::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, r, src.as_u32x8())) +pub fn _mm256_mask_slli_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 32 { + u32x8::ZERO + } else { + simd_shl(a.as_u32x8(), u32x8::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x8())) + } } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -18516,13 +19693,15 @@ pub unsafe fn _mm256_mask_slli_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_slli_epi32(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - let r = simd_shl(a.as_u32x8(), u32x8::splat(IMM8)); - transmute(simd_select_bitmask(k, r, u32x8::ZERO)) +pub fn _mm256_maskz_slli_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + let r = simd_shl(a.as_u32x8(), u32x8::splat(IMM8)); + transmute(simd_select_bitmask(k, r, u32x8::ZERO)) + } } } @@ -18534,18 +19713,16 @@ pub unsafe fn _mm256_maskz_slli_epi32(k: __mmask8, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_slli_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 32 { - u32x4::ZERO - } else { - simd_shl(a.as_u32x4(), u32x4::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, r, src.as_u32x4())) +pub fn _mm_mask_slli_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 32 { + u32x4::ZERO + } else { + simd_shl(a.as_u32x4(), u32x4::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x4())) + } } /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18556,13 +19733,15 @@ pub unsafe fn _mm_mask_slli_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_slli_epi32(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm_setzero_si128() - } else { - let r = simd_shl(a.as_u32x4(), u32x4::splat(IMM8)); - transmute(simd_select_bitmask(k, r, u32x4::ZERO)) +pub fn _mm_maskz_slli_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + let r = simd_shl(a.as_u32x4(), u32x4::splat(IMM8)); + transmute(simd_select_bitmask(k, r, u32x4::ZERO)) + } } } @@ -18574,12 +19753,14 @@ pub unsafe fn _mm_maskz_slli_epi32(k: __mmask8, a: __m128i) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_srli_epi32(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm512_setzero_si512() - } else { - transmute(simd_shr(a.as_u32x16(), u32x16::splat(IMM8))) +pub fn _mm512_srli_epi32(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + transmute(simd_shr(a.as_u32x16(), u32x16::splat(IMM8))) + } } } @@ -18591,18 +19772,16 @@ pub unsafe fn _mm512_srli_epi32(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_srli_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, -) -> __m512i { - 
static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 32 { - u32x16::ZERO - } else { - simd_shr(a.as_u32x16(), u32x16::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u32x16())) +pub fn _mm512_mask_srli_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 32 { + u32x16::ZERO + } else { + simd_shr(a.as_u32x16(), u32x16::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u32x16())) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18613,13 +19792,15 @@ pub unsafe fn _mm512_mask_srli_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_srli_epi32(k: __mmask16, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm512_setzero_si512() - } else { - let shf = simd_shr(a.as_u32x16(), u32x16::splat(IMM8)); - transmute(simd_select_bitmask(k, shf, u32x16::ZERO)) +pub fn _mm512_maskz_srli_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm512_setzero_si512() + } else { + let shf = simd_shr(a.as_u32x16(), u32x16::splat(IMM8)); + transmute(simd_select_bitmask(k, shf, u32x16::ZERO)) + } } } @@ -18631,18 +19812,16 @@ pub unsafe fn _mm512_maskz_srli_epi32(k: __mmask16, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_srli_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 32 { - u32x8::ZERO - } else { - simd_shr(a.as_u32x8(), u32x8::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, r, src.as_u32x8())) +pub fn _mm256_mask_srli_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 32 { + u32x8::ZERO + } else { + simd_shr(a.as_u32x8(), u32x8::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x8())) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
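(Illustrative note, same assumptions as above.) The immediate shifts handle over-wide counts explicitly: a count of 32 or more zeroes the affected 32-bit lanes instead of leaving the result target-dependent. A hypothetical sketch:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn srli_demo(src: __m512i, k: __mmask16, a: __m512i) -> (__m512i, __m512i) {
    // Logical right shift by 4 in the lanes selected by `k`; other lanes keep `src`.
    let shifted = _mm512_mask_srli_epi32::<4>(src, k, a);
    // A count of 32 or more simply produces zero.
    let zeroed = _mm512_maskz_srli_epi32::<32>(k, a);
    (shifted, zeroed)
}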
@@ -18653,13 +19832,15 @@ pub unsafe fn _mm256_mask_srli_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_srli_epi32(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm256_setzero_si256() - } else { - let r = simd_shr(a.as_u32x8(), u32x8::splat(IMM8)); - transmute(simd_select_bitmask(k, r, u32x8::ZERO)) +pub fn _mm256_maskz_srli_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm256_setzero_si256() + } else { + let r = simd_shr(a.as_u32x8(), u32x8::splat(IMM8)); + transmute(simd_select_bitmask(k, r, u32x8::ZERO)) + } } } @@ -18671,18 +19852,16 @@ pub unsafe fn _mm256_maskz_srli_epi32(k: __mmask8, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_srli_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 32 { - u32x4::ZERO - } else { - simd_shr(a.as_u32x4(), u32x4::splat(IMM8)) - }; - transmute(simd_select_bitmask(k, r, src.as_u32x4())) +pub fn _mm_mask_srli_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 32 { + u32x4::ZERO + } else { + simd_shr(a.as_u32x4(), u32x4::splat(IMM8)) + }; + transmute(simd_select_bitmask(k, r, src.as_u32x4())) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18693,13 +19872,15 @@ pub unsafe fn _mm_mask_srli_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_srli_epi32(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 32 { - _mm_setzero_si128() - } else { - let r = simd_shr(a.as_u32x4(), u32x4::splat(IMM8)); - transmute(simd_select_bitmask(k, r, u32x4::ZERO)) +pub fn _mm_maskz_srli_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 32 { + _mm_setzero_si128() + } else { + let r = simd_shr(a.as_u32x4(), u32x4::splat(IMM8)); + transmute(simd_select_bitmask(k, r, u32x4::ZERO)) + } } } @@ -18711,12 +19892,14 @@ pub unsafe fn _mm_maskz_srli_epi32(k: __mmask8, a: __m128i) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_slli_epi64(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm512_setzero_si512() - } else { - transmute(simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64))) +pub fn _mm512_slli_epi64(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm512_setzero_si512() + } else { + transmute(simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64))) + } } } @@ -18728,18 +19911,16 @@ pub unsafe fn _mm512_slli_epi64(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_slli_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, -) -> __m512i { - 
static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 64 { - u64x8::ZERO - } else { - simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u64x8())) +pub fn _mm512_mask_slli_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 64 { + u64x8::ZERO + } else { + simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u64x8())) + } } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18750,13 +19931,15 @@ pub unsafe fn _mm512_mask_slli_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_slli_epi64(k: __mmask8, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm512_setzero_si512() - } else { - let shf = simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, shf, u64x8::ZERO)) +pub fn _mm512_maskz_slli_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm512_setzero_si512() + } else { + let shf = simd_shl(a.as_u64x8(), u64x8::splat(IMM8 as u64)); + transmute(simd_select_bitmask(k, shf, u64x8::ZERO)) + } } } @@ -18768,18 +19951,16 @@ pub unsafe fn _mm512_maskz_slli_epi64(k: __mmask8, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_slli_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 64 { - u64x4::ZERO - } else { - simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, r, src.as_u64x4())) +pub fn _mm256_mask_slli_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 64 { + u64x4::ZERO + } else { + simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, r, src.as_u64x4())) + } } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -18790,13 +19971,15 @@ pub unsafe fn _mm256_mask_slli_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_slli_epi64(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - let r = simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, r, u64x4::ZERO)) +pub fn _mm256_maskz_slli_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + let r = simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)); + transmute(simd_select_bitmask(k, r, u64x4::ZERO)) + } } } @@ -18808,18 +19991,16 @@ pub unsafe fn _mm256_maskz_slli_epi64(k: __mmask8, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_slli_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 64 { - u64x2::ZERO - } else { - simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, r, src.as_u64x2())) +pub fn _mm_mask_slli_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 64 { + u64x2::ZERO + } else { + simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, r, src.as_u64x2())) + } } /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -18830,13 +20011,15 @@ pub unsafe fn _mm_mask_slli_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_slli_epi64(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm_setzero_si128() - } else { - let r = simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, r, u64x2::ZERO)) +pub fn _mm_maskz_slli_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + let r = simd_shl(a.as_u64x2(), u64x2::splat(IMM8 as u64)); + transmute(simd_select_bitmask(k, r, u64x2::ZERO)) + } } } @@ -18848,12 +20031,14 @@ pub unsafe fn _mm_maskz_slli_epi64(k: __mmask8, a: __m128i) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_srli_epi64(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm512_setzero_si512() - } else { - transmute(simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64))) +pub fn _mm512_srli_epi64(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm512_setzero_si512() + } else { + transmute(simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64))) + } } } @@ -18865,18 +20050,16 @@ pub unsafe fn _mm512_srli_epi64(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_srli_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = if IMM8 >= 64 { - u64x8::ZERO - } else { - simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, shf, src.as_u64x8())) +pub fn _mm512_mask_srli_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = if IMM8 >= 64 { + u64x8::ZERO + } else { + simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, shf, src.as_u64x8())) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
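Another illustrative sketch under the same assumptions as the one further above: the 64-bit immediate shifts do not clamp the immediate, so a shift of 64 or more falls into the `IMM8 >= 64` branch shown here and simply zeroes the whole vector.

    #![feature(stdarch_x86_avx512)]
    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn shift64_demo() -> (i64, i64, i64) {
        let a = _mm512_set1_epi64(1);
        let left = _mm512_slli_epi64::<4>(a); // every lane: 1 << 4 = 16
        let right = _mm512_srli_epi64::<1>(left); // every lane: 16 >> 1 = 8
        // An out-of-range immediate does not wrap around; the result is all zeros.
        let cleared = _mm512_srli_epi64::<64>(left);
        (
            _mm512_reduce_add_epi64(left),    // 128
            _mm512_reduce_add_epi64(right),   // 64
            _mm512_reduce_add_epi64(cleared), // 0
        )
    }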
@@ -18887,13 +20070,15 @@ pub unsafe fn _mm512_mask_srli_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_srli_epi64(k: __mmask8, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm512_setzero_si512() - } else { - let shf = simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, shf, u64x8::ZERO)) +pub fn _mm512_maskz_srli_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm512_setzero_si512() + } else { + let shf = simd_shr(a.as_u64x8(), u64x8::splat(IMM8 as u64)); + transmute(simd_select_bitmask(k, shf, u64x8::ZERO)) + } } } @@ -18905,18 +20090,16 @@ pub unsafe fn _mm512_maskz_srli_epi64(k: __mmask8, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_srli_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 64 { - u64x4::ZERO - } else { - simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, r, src.as_u64x4())) +pub fn _mm256_mask_srli_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 64 { + u64x4::ZERO + } else { + simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, r, src.as_u64x4())) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -18927,13 +20110,15 @@ pub unsafe fn _mm256_mask_srli_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_srli_epi64(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm256_setzero_si256() - } else { - let r = simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, r, u64x4::ZERO)) +pub fn _mm256_maskz_srli_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm256_setzero_si256() + } else { + let r = simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)); + transmute(simd_select_bitmask(k, r, u64x4::ZERO)) + } } } @@ -18945,18 +20130,16 @@ pub unsafe fn _mm256_maskz_srli_epi64(k: __mmask8, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_srli_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = if IMM8 >= 64 { - u64x2::ZERO - } else { - simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)) - }; - transmute(simd_select_bitmask(k, r, src.as_u64x2())) +pub fn _mm_mask_srli_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = if IMM8 >= 64 { + u64x2::ZERO + } else { + simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)) + }; + transmute(simd_select_bitmask(k, r, src.as_u64x2())) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -18967,13 +20150,15 @@ pub unsafe fn _mm_mask_srli_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_srli_epi64(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - if IMM8 >= 64 { - _mm_setzero_si128() - } else { - let r = simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)); - transmute(simd_select_bitmask(k, r, u64x2::ZERO)) +pub fn _mm_maskz_srli_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + if IMM8 >= 64 { + _mm_setzero_si128() + } else { + let r = simd_shr(a.as_u64x2(), u64x2::splat(IMM8 as u64)); + transmute(simd_select_bitmask(k, r, u64x2::ZERO)) + } } } @@ -18984,8 +20169,8 @@ pub unsafe fn _mm_maskz_srli_epi64(k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i { - transmute(vpslld(a.as_i32x16(), count.as_i32x4())) +pub fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpslld(a.as_i32x16(), count.as_i32x4())) } } /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -18995,14 +20180,11 @@ pub unsafe fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm512_mask_sll_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_sll_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) +pub fn _mm512_mask_sll_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } } /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19012,9 +20194,11 @@ pub unsafe fn _mm512_mask_sll_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_sll_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19024,14 +20208,11 @@ pub unsafe fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm256_mask_sll_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_sll_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) +pub fn _mm256_mask_sll_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } } /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19041,9 +20222,11 @@ pub unsafe fn _mm256_mask_sll_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm256_maskz_sll_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_sll_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_sll_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -19053,9 +20236,11 @@ pub unsafe fn _mm256_maskz_sll_epi32(k: __mmask8, a: __m256i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm_mask_sll_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sll_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) +pub fn _mm_mask_sll_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } } /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19065,9 +20250,11 @@ pub unsafe fn _mm_mask_sll_epi32(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpslld))] -pub unsafe fn _mm_maskz_sll_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sll_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_sll_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst. @@ -19077,8 +20264,8 @@ pub unsafe fn _mm_maskz_sll_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i { - transmute(vpsrld(a.as_i32x16(), count.as_i32x4())) +pub fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrld(a.as_i32x16(), count.as_i32x4())) } } /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19088,14 +20275,11 @@ pub unsafe fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm512_mask_srl_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_srl_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) +pub fn _mm512_mask_srl_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } } /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
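Illustrative sketch (same assumptions as above) for the `sll`/`srl` forms, where the shift amount is a runtime value taken from the low 64 bits of a 128-bit vector rather than a const generic:

    #![feature(stdarch_x86_avx512)]
    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn shift_by_count_demo() -> (i32, i32) {
        let a = _mm512_set1_epi32(3);
        let count = _mm_set_epi64x(0, 5); // shift amount lives in the low 64 bits
        let shifted = _mm512_sll_epi32(a, count); // every lane: 3 << 5 = 96
        // Zeromask variant: keep the even lanes, zero the odd ones.
        let masked = _mm512_maskz_sll_epi32(0b0101_0101_0101_0101, a, count);
        (_mm512_reduce_add_epi32(shifted), _mm512_reduce_add_epi32(masked))
    }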
@@ -19105,9 +20289,11 @@ pub unsafe fn _mm512_mask_srl_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_srl_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19117,14 +20303,11 @@ pub unsafe fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm256_mask_srl_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_srl_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) +pub fn _mm256_mask_srl_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } } /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19134,9 +20317,11 @@ pub unsafe fn _mm256_mask_srl_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm256_maskz_srl_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_srl_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_srl_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19146,9 +20331,11 @@ pub unsafe fn _mm256_maskz_srl_epi32(k: __mmask8, a: __m256i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm_mask_srl_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srl_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) +pub fn _mm_mask_srl_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } } /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -19158,9 +20345,11 @@ pub unsafe fn _mm_mask_srl_epi32(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrld))] -pub unsafe fn _mm_maskz_srl_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srl_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_srl_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst. @@ -19170,8 +20359,8 @@ pub unsafe fn _mm_maskz_srl_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i { - transmute(vpsllq(a.as_i64x8(), count.as_i64x2())) +pub fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsllq(a.as_i64x8(), count.as_i64x2())) } } /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19181,14 +20370,11 @@ pub unsafe fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm512_mask_sll_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_sll_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +pub fn _mm512_mask_sll_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } } /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19198,9 +20384,11 @@ pub unsafe fn _mm512_mask_sll_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_sll_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sll_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -19210,14 +20398,11 @@ pub unsafe fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm256_mask_sll_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_sll_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +pub fn _mm256_mask_sll_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } } /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19227,9 +20412,11 @@ pub unsafe fn _mm256_mask_sll_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm256_maskz_sll_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_sll_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_sll_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sll_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19239,9 +20426,11 @@ pub unsafe fn _mm256_maskz_sll_epi64(k: __mmask8, a: __m256i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm_mask_sll_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sll_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +pub fn _mm_mask_sll_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19251,9 +20440,11 @@ pub unsafe fn _mm_mask_sll_epi64(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllq))] -pub unsafe fn _mm_maskz_sll_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sll_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_sll_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sll_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst. 
@@ -19263,8 +20454,8 @@ pub unsafe fn _mm_maskz_sll_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i { - transmute(vpsrlq(a.as_i64x8(), count.as_i64x2())) +pub fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrlq(a.as_i64x8(), count.as_i64x2())) } } /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19274,14 +20465,11 @@ pub unsafe fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm512_mask_srl_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_srl_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +pub fn _mm512_mask_srl_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } } /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19291,9 +20479,11 @@ pub unsafe fn _mm512_mask_srl_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_srl_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_srl_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19303,14 +20493,11 @@ pub unsafe fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm256_mask_srl_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_srl_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +pub fn _mm256_mask_srl_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } } /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -19320,9 +20507,11 @@ pub unsafe fn _mm256_mask_srl_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm256_maskz_srl_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_srl_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_srl_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_srl_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19332,9 +20521,11 @@ pub unsafe fn _mm256_maskz_srl_epi64(k: __mmask8, a: __m256i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm_mask_srl_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srl_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +pub fn _mm_mask_srl_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19344,9 +20535,11 @@ pub unsafe fn _mm_mask_srl_epi64(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlq))] -pub unsafe fn _mm_maskz_srl_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srl_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_srl_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srl_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst. @@ -19356,8 +20549,8 @@ pub unsafe fn _mm_maskz_srl_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i { - transmute(vpsrad(a.as_i32x16(), count.as_i32x4())) +pub fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsrad(a.as_i32x16(), count.as_i32x4())) } } /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -19367,14 +20560,11 @@ pub unsafe fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm512_mask_sra_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_sra_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) +pub fn _mm512_mask_sra_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } } /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19384,9 +20574,11 @@ pub unsafe fn _mm512_mask_sra_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_sra_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19396,14 +20588,11 @@ pub unsafe fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm256_mask_sra_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_sra_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) +pub fn _mm256_mask_sra_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } } /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19413,9 +20602,11 @@ pub unsafe fn _mm256_mask_sra_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm256_maskz_sra_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_sra_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_sra_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -19425,9 +20616,11 @@ pub unsafe fn _mm256_maskz_sra_epi32(k: __mmask8, a: __m256i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm_mask_sra_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sra_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) +pub fn _mm_mask_sra_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } } /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19437,9 +20630,11 @@ pub unsafe fn _mm_mask_sra_epi32(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad))] -pub unsafe fn _mm_maskz_sra_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sra_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_sra_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. @@ -19449,8 +20644,8 @@ pub unsafe fn _mm_maskz_sra_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i { - transmute(vpsraq(a.as_i64x8(), count.as_i64x2())) +pub fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i { + unsafe { transmute(vpsraq(a.as_i64x8(), count.as_i64x2())) } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19460,14 +20655,11 @@ pub unsafe fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm512_mask_sra_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - count: __m128i, -) -> __m512i { - let shf = _mm512_sra_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +pub fn _mm512_mask_sra_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
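Illustrative sketch (same assumptions as above) contrasting the arithmetic and logical right shifts on negative 64-bit lanes:

    #![feature(stdarch_x86_avx512)]
    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn sra_demo() -> (i64, i64) {
        let a = _mm512_set1_epi64(-64);
        let count = _mm_set_epi64x(0, 2);
        // `sra` shifts in copies of the sign bit: every lane becomes -64 >> 2 = -16.
        let arithmetic = _mm512_sra_epi64(a, count);
        // `srl` shifts in zeros, so the same negative lanes turn into large positive values.
        let logical = _mm512_srl_epi64(a, count);
        (
            _mm512_reduce_add_epi64(arithmetic), // -128
            _mm512_reduce_add_epi64(logical),
        )
    }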
@@ -19477,9 +20669,11 @@ pub unsafe fn _mm512_mask_sra_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { - let shf = _mm512_sra_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i { + unsafe { + let shf = _mm512_sra_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. @@ -19489,8 +20683,8 @@ pub unsafe fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm256_sra_epi64(a: __m256i, count: __m128i) -> __m256i { - transmute(vpsraq256(a.as_i64x4(), count.as_i64x2())) +pub fn _mm256_sra_epi64(a: __m256i, count: __m128i) -> __m256i { + unsafe { transmute(vpsraq256(a.as_i64x4(), count.as_i64x2())) } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19500,14 +20694,11 @@ pub unsafe fn _mm256_sra_epi64(a: __m256i, count: __m128i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm256_mask_sra_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m128i, -) -> __m256i { - let shf = _mm256_sra_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +pub fn _mm256_mask_sra_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19517,9 +20708,11 @@ pub unsafe fn _mm256_mask_sra_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm256_maskz_sra_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { - let shf = _mm256_sra_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_sra_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i { + unsafe { + let shf = _mm256_sra_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst. 
@@ -19529,8 +20722,8 @@ pub unsafe fn _mm256_maskz_sra_epi64(k: __mmask8, a: __m256i, count: __m128i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm_sra_epi64(a: __m128i, count: __m128i) -> __m128i { - transmute(vpsraq128(a.as_i64x2(), count.as_i64x2())) +pub fn _mm_sra_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsraq128(a.as_i64x2(), count.as_i64x2())) } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19540,9 +20733,11 @@ pub unsafe fn _mm_sra_epi64(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm_mask_sra_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sra_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +pub fn _mm_mask_sra_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19552,9 +20747,11 @@ pub unsafe fn _mm_mask_sra_epi64(src: __m128i, k: __mmask8, a: __m128i, count: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq))] -pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sra_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sra_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst. @@ -19565,9 +20762,11 @@ pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_srai_epi32(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - transmute(simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32))) +pub fn _mm512_srai_epi32(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32))) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -19578,14 +20777,12 @@ pub unsafe fn _mm512_srai_epi32(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_srai_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) +pub fn _mm512_mask_srai_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19596,10 +20793,12 @@ pub unsafe fn _mm512_mask_srai_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) +pub fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = simd_shr(a.as_i32x16(), i32x16::splat(IMM8.min(31) as i32)); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19610,13 +20809,11 @@ pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_srai_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32)); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) +pub fn _mm256_mask_srai_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32)); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -19627,9 +20824,11 @@ pub unsafe fn _mm256_mask_srai_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i) -> __m256i { - let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32)); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) +pub fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + let r = simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31) as i32)); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19640,13 +20839,11 @@ pub unsafe fn _mm256_maskz_srai_epi32(k: __mmask8, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_srai_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32)); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) +pub fn _mm_mask_srai_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32)); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } } /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19657,9 +20854,11 @@ pub unsafe fn _mm_mask_srai_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_srai_epi32(k: __mmask8, a: __m128i) -> __m128i { - let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32)); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) +pub fn _mm_maskz_srai_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let r = simd_shr(a.as_i32x4(), i32x4::splat(IMM8.min(31) as i32)); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst. @@ -19670,9 +20869,11 @@ pub unsafe fn _mm_maskz_srai_epi32(k: __mmask8, a: __m128i) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_srai_epi64(a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - transmute(simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64))) +pub fn _mm512_srai_epi64(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64))) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
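Illustrative sketch (same assumptions as above, plus AVX-512VL for the 128-bit form): unlike the logical `srli` forms, the arithmetic `srai` implementations clamp the immediate (`IMM8.min(63)` above), so an oversized shift fills each lane with its sign bit instead of zeroing it.

    #![feature(stdarch_x86_avx512)]
    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512vl")]
    fn srai_demo() -> (i64, i64) {
        let a = _mm_set_epi64x(1000, -1000); // high lane: 1000, low lane: -1000
        // Clamped to a shift of 63: the negative lane becomes -1, the positive lane 0.
        let saturated = _mm_srai_epi64::<200>(a);
        // A small immediate is an ordinary arithmetic shift: -1000 >> 3 = -125.
        let shifted = _mm_srai_epi64::<3>(a);
        // Return the low lane of each result: (-1, -125).
        (_mm_cvtsi128_si64(saturated), _mm_cvtsi128_si64(shifted))
    }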
@@ -19683,14 +20884,12 @@ pub unsafe fn _mm512_srai_epi64(a: __m512i) -> __m512i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_srai_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +pub fn _mm512_mask_srai_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19701,10 +20900,12 @@ pub unsafe fn _mm512_mask_srai_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_srai_epi64(k: __mmask8, a: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_srai_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = simd_shr(a.as_i64x8(), i64x8::splat(IMM8.min(63) as i64)); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst. @@ -19715,9 +20916,11 @@ pub unsafe fn _mm512_maskz_srai_epi64(k: __mmask8, a: __m512i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_srai_epi64(a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - transmute(simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64))) +pub fn _mm256_srai_epi64(a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64))) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19728,14 +20931,12 @@ pub unsafe fn _mm256_srai_epi64(a: __m256i) -> __m256i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_srai_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +pub fn _mm256_mask_srai_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -19746,10 +20947,12 @@ pub unsafe fn _mm256_mask_srai_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_srai_epi64(k: __mmask8, a: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_srai_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = simd_shr(a.as_i64x4(), i64x4::splat(IMM8.min(63) as i64)); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst. @@ -19760,9 +20963,11 @@ pub unsafe fn _mm256_maskz_srai_epi64(k: __mmask8, a: __m256i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_srai_epi64(a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - transmute(simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64))) +pub fn _mm_srai_epi64(a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + transmute(simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64))) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19773,14 +20978,12 @@ pub unsafe fn _mm_srai_epi64(a: __m128i) -> __m128i { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_srai_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +pub fn _mm_mask_srai_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19791,10 +20994,12 @@ pub unsafe fn _mm_mask_srai_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_srai_epi64(k: __mmask8, a: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_srai_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = simd_shr(a.as_i64x2(), i64x2::splat(IMM8.min(63) as i64)); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. 
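
As a usage sketch (not part of the patch): once the srai family above consists of safe fns, a caller that itself enables the required target feature can use them without any `unsafe` blocks; only the entry into the `#[target_feature]` helper still needs `unsafe` when it relies on runtime detection. The helper name below is illustrative, the sketch is x86_64-only, and older nightlies additionally need the `avx512_target_feature`, `target_feature_11` and `stdarch_x86_avx512` gates.

    use std::arch::x86_64::*;

    // Illustrative helper: because it enables avx512f, the (now safe) srai
    // intrinsics can be called without any `unsafe` blocks in the body.
    #[target_feature(enable = "avx512f")]
    fn srai_demo() -> bool {
        let a = _mm512_set1_epi64(-16);
        // Counts above 63 are clamped, so the shift only ever fills with sign bits.
        let all = _mm512_srai_epi64::<2>(a);
        // Zero-masking keeps the shifted value only in lanes whose mask bit is set.
        let some = _mm512_maskz_srai_epi64::<2>(0b0000_1111, a);
        _mm512_cmpeq_epi64_mask(all, _mm512_set1_epi64(-4)) == 0xff
            && _mm512_cmpeq_epi64_mask(some, _mm512_set1_epi64(-4)) == 0x0f
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") {
            // The caller does not itself enable avx512f, so entering the
            // #[target_feature] helper still needs `unsafe`; the runtime
            // detection above is what justifies it.
            assert!(unsafe { srai_demo() });
        }
    }
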
@@ -19804,8 +21009,8 @@ pub unsafe fn _mm_maskz_srai_epi64(k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) +pub fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsravd(a.as_i32x16(), count.as_i32x16())) } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19815,14 +21020,11 @@ pub unsafe fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm512_mask_srav_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_srav_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) +pub fn _mm512_mask_srav_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19832,9 +21034,11 @@ pub unsafe fn _mm512_mask_srav_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_srav_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19844,14 +21048,11 @@ pub unsafe fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm256_mask_srav_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_srav_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) +pub fn _mm256_mask_srav_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -19861,9 +21062,11 @@ pub unsafe fn _mm256_mask_srav_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm256_maskz_srav_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_srav_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_srav_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19873,14 +21076,11 @@ pub unsafe fn _mm256_maskz_srav_epi32(k: __mmask8, a: __m256i, count: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm_mask_srav_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_srav_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) +pub fn _mm_mask_srav_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19890,9 +21090,11 @@ pub unsafe fn _mm_mask_srav_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravd))] -pub unsafe fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srav_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. @@ -19902,8 +21104,8 @@ pub unsafe fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) +pub fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsravq(a.as_i64x8(), count.as_i64x8())) } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
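
A similar sketch for the variable-count srav intrinsics above, under the same assumptions as the earlier one (illustrative helper, x86_64 only): the shift amount comes from the matching lane of `count`, and merge-masking copies `src` into unselected lanes.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn srav_demo() -> bool {
        let a = _mm512_set1_epi32(-64);
        let count = _mm512_set1_epi32(3);
        let src = _mm512_set1_epi32(7);
        // Each lane of `a` is shifted by the matching lane of `count`: -64 >> 3 = -8.
        let shifted = _mm512_srav_epi32(a, count);
        // Merge-masking copies `src` into the lanes whose mask bit is clear.
        let merged = _mm512_mask_srav_epi32(src, 0x00ff, a, count);
        _mm512_cmpeq_epi32_mask(shifted, _mm512_set1_epi32(-8)) == 0xffff
            && _mm512_cmpeq_epi32_mask(merged, src) == 0xff00
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") {
            assert!(unsafe { srav_demo() });
        }
    }
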
@@ -19913,14 +21115,11 @@ pub unsafe fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm512_mask_srav_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_srav_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +pub fn _mm512_mask_srav_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -19930,9 +21129,11 @@ pub unsafe fn _mm512_mask_srav_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_srav_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srav_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. @@ -19942,8 +21143,8 @@ pub unsafe fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i { - transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) +pub fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i { + unsafe { transmute(vpsravq256(a.as_i64x4(), count.as_i64x4())) } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19953,14 +21154,11 @@ pub unsafe fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm256_mask_srav_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_srav_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +pub fn _mm256_mask_srav_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -19970,9 +21168,11 @@ pub unsafe fn _mm256_mask_srav_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_srav_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srav_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst. @@ -19982,8 +21182,8 @@ pub unsafe fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i { - transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) +pub fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i { + unsafe { transmute(vpsravq128(a.as_i64x2(), count.as_i64x2())) } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -19993,14 +21193,11 @@ pub unsafe fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm_mask_srav_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_srav_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +pub fn _mm_mask_srav_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20010,9 +21207,11 @@ pub unsafe fn _mm_mask_srav_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsravq))] -pub unsafe fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srav_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srav_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. 
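
The avx512vl variants work the same way from safe code, as long as the helper enables both features. A minimal sketch (not part of the patch) for the 128-bit srav above; inputs are built from avx512f broadcasts plus casts on the assumption that those intrinsics are likewise safe after this change.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512vl")]
    fn srav128_demo() -> bool {
        let a = _mm512_castsi512_si128(_mm512_set1_epi64(-40));
        let count = _mm512_castsi512_si128(_mm512_set1_epi64(2));
        let r = _mm_srav_epi64(a, count); // both lanes: -40 >> 2 = -10
        _mm_cmpeq_epi64_mask(r, _mm512_castsi512_si128(_mm512_set1_epi64(-10))) == 0b11
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            assert!(unsafe { srav128_demo() });
        }
    }
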
@@ -20022,8 +21221,8 @@ pub unsafe fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprolvd(a.as_i32x16(), b.as_i32x16())) } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20033,14 +21232,11 @@ pub unsafe fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm512_mask_rolv_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let rol = _mm512_rolv_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, rol, src.as_i32x16())) +pub fn _mm512_mask_rolv_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, rol, src.as_i32x16())) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20050,9 +21246,11 @@ pub unsafe fn _mm512_mask_rolv_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let rol = _mm512_rolv_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, rol, i32x16::ZERO)) +pub fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, rol, i32x16::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20062,8 +21260,8 @@ pub unsafe fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprolvd256(a.as_i32x8(), b.as_i32x8())) } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -20073,9 +21271,11 @@ pub unsafe fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm256_mask_rolv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let rol = _mm256_rolv_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, rol, src.as_i32x8())) +pub fn _mm256_mask_rolv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, rol, src.as_i32x8())) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20085,9 +21285,11 @@ pub unsafe fn _mm256_mask_rolv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let rol = _mm256_rolv_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, rol, i32x8::ZERO)) +pub fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, rol, i32x8::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20097,8 +21299,8 @@ pub unsafe fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprolvd128(a.as_i32x4(), b.as_i32x4())) } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20108,9 +21310,11 @@ pub unsafe fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm_mask_rolv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let rol = _mm_rolv_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, rol, src.as_i32x4())) +pub fn _mm_mask_rolv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, rol, src.as_i32x4())) + } } /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -20120,9 +21324,11 @@ pub unsafe fn _mm_mask_rolv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvd))] -pub unsafe fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let rol = _mm_rolv_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, rol, i32x4::ZERO)) +pub fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, rol, i32x4::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20132,8 +21338,8 @@ pub unsafe fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprorvd(a.as_i32x16(), b.as_i32x16())) } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20143,14 +21349,11 @@ pub unsafe fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm512_mask_rorv_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let ror = _mm512_rorv_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, ror, src.as_i32x16())) +pub fn _mm512_mask_rorv_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, ror, src.as_i32x16())) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20160,9 +21363,11 @@ pub unsafe fn _mm512_mask_rorv_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let ror = _mm512_rorv_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, ror, i32x16::ZERO)) +pub fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, ror, i32x16::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. 
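
For the variable rotates above, a short sketch under the same assumptions: unlike a shift, a rotate never discards bits, so rotating the sign bit left by one brings it back in at bit 0.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn rolv_demo() -> bool {
        let a = _mm512_set1_epi32(i32::MIN); // 0x8000_0000 in every lane
        let one = _mm512_set1_epi32(1);
        // A rotate never discards bits: the sign bit wraps around into bit 0.
        let r = _mm512_rolv_epi32(a, one);
        _mm512_cmpeq_epi32_mask(r, one) == 0xffff
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") {
            assert!(unsafe { rolv_demo() });
        }
    }
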
@@ -20172,8 +21377,8 @@ pub unsafe fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprorvd256(a.as_i32x8(), b.as_i32x8())) } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20183,9 +21388,11 @@ pub unsafe fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm256_mask_rorv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let ror = _mm256_rorv_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, ror, src.as_i32x8())) +pub fn _mm256_mask_rorv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, ror, src.as_i32x8())) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20195,9 +21402,11 @@ pub unsafe fn _mm256_mask_rorv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let ror = _mm256_rorv_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, ror, i32x8::ZERO)) +pub fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, ror, i32x8::ZERO)) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20207,8 +21416,8 @@ pub unsafe fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprorvd128(a.as_i32x4(), b.as_i32x4())) } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -20218,9 +21427,11 @@ pub unsafe fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm_mask_rorv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let ror = _mm_rorv_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, ror, src.as_i32x4())) +pub fn _mm_mask_rorv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, ror, src.as_i32x4())) + } } /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20230,9 +21441,11 @@ pub unsafe fn _mm_mask_rorv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvd))] -pub unsafe fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let ror = _mm_rorv_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, ror, i32x4::ZERO)) +pub fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, ror, i32x4::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20242,8 +21455,8 @@ pub unsafe fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprolvq(a.as_i64x8(), b.as_i64x8())) } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20253,9 +21466,11 @@ pub unsafe fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let rol = _mm512_rolv_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, rol, src.as_i64x8())) +pub fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, rol, src.as_i64x8())) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
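
A zero-masked example for the right-rotate forms above (same assumptions; 256-bit inputs built via avx512f casts for the reason given earlier).

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512vl")]
    fn rorv256_demo() -> bool {
        let a = _mm512_castsi512_si256(_mm512_set1_epi32(1));
        let by = _mm512_castsi512_si256(_mm512_set1_epi32(1));
        // Rotating right by one wraps bit 0 into the sign bit; the zero-mask
        // clears lanes 4..7.
        let r = _mm256_maskz_rorv_epi32(0x0f, a, by);
        let expect = _mm512_castsi512_si256(_mm512_set1_epi32(i32::MIN));
        _mm256_cmpeq_epi32_mask(r, expect) == 0x0f
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            assert!(unsafe { rorv256_demo() });
        }
    }
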
@@ -20265,9 +21480,11 @@ pub unsafe fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let rol = _mm512_rolv_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, rol, i64x8::ZERO)) +pub fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let rol = _mm512_rolv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, rol, i64x8::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20277,8 +21494,8 @@ pub unsafe fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprolvq256(a.as_i64x4(), b.as_i64x4())) } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20288,9 +21505,11 @@ pub unsafe fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm256_mask_rolv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let rol = _mm256_rolv_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, rol, src.as_i64x4())) +pub fn _mm256_mask_rolv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, rol, src.as_i64x4())) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20300,9 +21519,11 @@ pub unsafe fn _mm256_mask_rolv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let rol = _mm256_rolv_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, rol, i64x4::ZERO)) +pub fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let rol = _mm256_rolv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, rol, i64x4::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst. 
@@ -20312,8 +21533,8 @@ pub unsafe fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i { - transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprolvq128(a.as_i64x2(), b.as_i64x2())) } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20323,9 +21544,11 @@ pub unsafe fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm_mask_rolv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let rol = _mm_rolv_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, rol, src.as_i64x2())) +pub fn _mm_mask_rolv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, rol, src.as_i64x2())) + } } /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20335,9 +21558,11 @@ pub unsafe fn _mm_mask_rolv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprolvq))] -pub unsafe fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let rol = _mm_rolv_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, rol, i64x2::ZERO)) +pub fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let rol = _mm_rolv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, rol, i64x2::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20347,8 +21572,8 @@ pub unsafe fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vprorvq(a.as_i64x8(), b.as_i64x8())) } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
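
The 64-bit left rotates above follow the same pattern; a minimal 128-bit sketch, again illustrative only:

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512vl")]
    fn rolv64_demo() -> bool {
        let a = _mm512_castsi512_si128(_mm512_set1_epi64(i64::MIN));
        let one = _mm512_castsi512_si128(_mm512_set1_epi64(1));
        let r = _mm_rolv_epi64(a, one); // the sign bit wraps around to bit 0
        _mm_cmpeq_epi64_mask(r, one) == 0b11
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            assert!(unsafe { rolv64_demo() });
        }
    }
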
@@ -20358,9 +21583,11 @@ pub unsafe fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let ror = _mm512_rorv_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, ror, src.as_i64x8())) +pub fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, ror, src.as_i64x8())) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20370,9 +21597,11 @@ pub unsafe fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let ror = _mm512_rorv_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, ror, i64x8::ZERO)) +pub fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let ror = _mm512_rorv_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, ror, i64x8::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20382,8 +21611,8 @@ pub unsafe fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vprorvq256(a.as_i64x4(), b.as_i64x4())) } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20393,9 +21622,11 @@ pub unsafe fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm256_mask_rorv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let ror = _mm256_rorv_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, ror, src.as_i64x4())) +pub fn _mm256_mask_rorv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, ror, src.as_i64x4())) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -20405,9 +21636,11 @@ pub unsafe fn _mm256_mask_rorv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let ror = _mm256_rorv_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, ror, i64x4::ZERO)) +pub fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let ror = _mm256_rorv_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, ror, i64x4::ZERO)) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst. @@ -20417,8 +21650,8 @@ pub unsafe fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i { - transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vprorvq128(a.as_i64x2(), b.as_i64x2())) } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20428,9 +21661,11 @@ pub unsafe fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm_mask_rorv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let ror = _mm_rorv_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, ror, src.as_i64x2())) +pub fn _mm_mask_rorv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, ror, src.as_i64x2())) + } } /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20440,9 +21675,11 @@ pub unsafe fn _mm_mask_rorv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vprorvq))] -pub unsafe fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let ror = _mm_rorv_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, ror, i64x2::ZERO)) +pub fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let ror = _mm_rorv_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, ror, i64x2::ZERO)) + } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. 
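
A merge-masked sketch for the 64-bit right rotates above, under the same assumptions as before: selected lanes take the rotated value, unselected lanes keep `src`.

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn rorv64_demo() -> bool {
        let a = _mm512_set1_epi64(2);
        let one = _mm512_set1_epi64(1);
        let src = _mm512_set1_epi64(5);
        // Lanes 0..3 take the rotated value (2 rotated right by 1 is 1);
        // lanes 4..7 keep `src` because their mask bits are clear.
        let r = _mm512_mask_rorv_epi64(src, 0x0f, a, one);
        _mm512_cmpeq_epi64_mask(r, one) == 0x0f && _mm512_cmpeq_epi64_mask(r, src) == 0xf0
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") {
            assert!(unsafe { rorv64_demo() });
        }
    }
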
@@ -20452,8 +21689,8 @@ pub unsafe fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) +pub fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsllvd(a.as_i32x16(), count.as_i32x16())) } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20463,14 +21700,11 @@ pub unsafe fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm512_mask_sllv_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_sllv_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) +pub fn _mm512_mask_sllv_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20480,9 +21714,11 @@ pub unsafe fn _mm512_mask_sllv_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_sllv_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20492,14 +21728,11 @@ pub unsafe fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm256_mask_sllv_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_sllv_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) +pub fn _mm256_mask_sllv_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -20509,9 +21742,11 @@ pub unsafe fn _mm256_mask_sllv_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm256_maskz_sllv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_sllv_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_sllv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20521,14 +21756,11 @@ pub unsafe fn _mm256_maskz_sllv_epi32(k: __mmask8, a: __m256i, count: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm_mask_sllv_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_sllv_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) +pub fn _mm_mask_sllv_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } } /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20538,9 +21770,11 @@ pub unsafe fn _mm_mask_sllv_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvd))] -pub unsafe fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sllv_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -20550,8 +21784,8 @@ pub unsafe fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) +pub fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16())) } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
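
For the variable left shifts above, a zero-masked 128-bit sketch (same assumptions; inputs via avx512f casts):

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f,avx512vl")]
    fn sllv128_demo() -> bool {
        let a = _mm512_castsi512_si128(_mm512_set1_epi32(3));
        let by = _mm512_castsi512_si128(_mm512_set1_epi32(2));
        // Lanes 0 and 1 get 3 << 2 = 12; lanes 2 and 3 are zeroed by the mask.
        let r = _mm_maskz_sllv_epi32(0b0011, a, by);
        let expect = _mm512_castsi512_si128(_mm512_set1_epi32(12));
        _mm_cmpeq_epi32_mask(r, expect) == 0b0011
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            assert!(unsafe { sllv128_demo() });
        }
    }
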
@@ -20561,14 +21795,11 @@ pub unsafe fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm512_mask_srlv_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_srlv_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) +pub fn _mm512_mask_srlv_epi32(src: __m512i, k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20578,9 +21809,11 @@ pub unsafe fn _mm512_mask_srlv_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_srlv_epi32(a, count).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi32(a, count).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20590,14 +21823,11 @@ pub unsafe fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm256_mask_srlv_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_srlv_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) +pub fn _mm256_mask_srlv_epi32(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -20607,9 +21837,11 @@ pub unsafe fn _mm256_mask_srlv_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm256_maskz_srlv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_srlv_epi32(a, count).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_srlv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi32(a, count).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20619,14 +21851,11 @@ pub unsafe fn _mm256_maskz_srlv_epi32(k: __mmask8, a: __m256i, count: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm_mask_srlv_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_srlv_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) +pub fn _mm_mask_srlv_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } } /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20636,9 +21865,11 @@ pub unsafe fn _mm_mask_srlv_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvd))] -pub unsafe fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srlv_epi32(a, count).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi32(a, count).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -20648,8 +21879,8 @@ pub unsafe fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) +pub fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsllvq(a.as_i64x8(), count.as_i64x8())) } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
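
The logical right shifts above differ from the arithmetic ones only in what they shift in; a sketch under the same assumptions:

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn srlv_demo() -> bool {
        let a = _mm512_set1_epi32(-1); // all bits set
        let by = _mm512_set1_epi32(28);
        // The logical shift fills with zeros, so only the low four bits survive,
        // unlike the arithmetic srav variants above, which replicate the sign bit.
        let r = _mm512_srlv_epi32(a, by);
        _mm512_cmpeq_epi32_mask(r, _mm512_set1_epi32(0xf)) == 0xffff
    }

    fn main() {
        if is_x86_feature_detected!("avx512f") {
            assert!(unsafe { srlv_demo() });
        }
    }
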
@@ -20659,14 +21890,11 @@ pub unsafe fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm512_mask_sllv_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_sllv_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +pub fn _mm512_mask_sllv_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20676,9 +21904,11 @@ pub unsafe fn _mm512_mask_sllv_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_sllv_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_sllv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20688,14 +21918,11 @@ pub unsafe fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm256_mask_sllv_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_sllv_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +pub fn _mm256_mask_sllv_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -20705,9 +21932,11 @@ pub unsafe fn _mm256_mask_sllv_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm256_maskz_sllv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_sllv_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_sllv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_sllv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20717,14 +21946,11 @@ pub unsafe fn _mm256_maskz_sllv_epi64(k: __mmask8, a: __m256i, count: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm_mask_sllv_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_sllv_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +pub fn _mm_mask_sllv_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20734,9 +21960,11 @@ pub unsafe fn _mm_mask_sllv_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsllvq))] -pub unsafe fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_sllv_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_sllv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst. @@ -20746,8 +21974,8 @@ pub unsafe fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i { - transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) +pub fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i { + unsafe { transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8())) } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
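The writemask and zeromask variants differ only in what happens to lanes whose mask bit is clear; a side-by-side sketch, illustration only, under the same assumed setup as above:

    #[target_feature(enable = "avx512f")]
    fn sllv_mask_vs_maskz(src: __m512i, a: __m512i, cnt: __m512i) -> (__m512i, __m512i) {
        let k: __mmask8 = 0b0000_1111;
        let merged = _mm512_mask_sllv_epi64(src, k, a, cnt); // upper four lanes copied from src
        let zeroed = _mm512_maskz_sllv_epi64(k, a, cnt); // upper four lanes zeroed
        (merged, zeroed)
    }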
@@ -20757,14 +21985,11 @@ pub unsafe fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm512_mask_srlv_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - count: __m512i, -) -> __m512i { - let shf = _mm512_srlv_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) +pub fn _mm512_mask_srlv_epi64(src: __m512i, k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20774,9 +21999,11 @@ pub unsafe fn _mm512_mask_srlv_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { - let shf = _mm512_srlv_epi64(a, count).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i { + unsafe { + let shf = _mm512_srlv_epi64(a, count).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20786,14 +22013,11 @@ pub unsafe fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm256_mask_srlv_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - count: __m256i, -) -> __m256i { - let shf = _mm256_srlv_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) +pub fn _mm256_mask_srlv_epi64(src: __m256i, k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -20803,9 +22027,11 @@ pub unsafe fn _mm256_mask_srlv_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm256_maskz_srlv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { - let shf = _mm256_srlv_epi64(a, count).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_srlv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i { + unsafe { + let shf = _mm256_srlv_epi64(a, count).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -20815,14 +22041,11 @@ pub unsafe fn _mm256_maskz_srlv_epi64(k: __mmask8, a: __m256i, count: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm_mask_srlv_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, - count: __m128i, -) -> __m128i { - let shf = _mm_srlv_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) +pub fn _mm_mask_srlv_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -20832,9 +22055,11 @@ pub unsafe fn _mm_mask_srlv_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpsrlvq))] -pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { - let shf = _mm_srlv_epi64(a, count).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i { + unsafe { + let shf = _mm_srlv_epi64(a, count).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst. 
@@ -20845,30 +22070,32 @@ pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
-    static_assert_uimm_bits!(MASK, 8);
-    simd_shuffle!(
-        a,
-        a,
-        [
-            MASK as u32 & 0b11,
-            (MASK as u32 >> 2) & 0b11,
-            ((MASK as u32 >> 4) & 0b11),
-            ((MASK as u32 >> 6) & 0b11),
-            (MASK as u32 & 0b11) + 4,
-            ((MASK as u32 >> 2) & 0b11) + 4,
-            ((MASK as u32 >> 4) & 0b11) + 4,
-            ((MASK as u32 >> 6) & 0b11) + 4,
-            (MASK as u32 & 0b11) + 8,
-            ((MASK as u32 >> 2) & 0b11) + 8,
-            ((MASK as u32 >> 4) & 0b11) + 8,
-            ((MASK as u32 >> 6) & 0b11) + 8,
-            (MASK as u32 & 0b11) + 12,
-            ((MASK as u32 >> 2) & 0b11) + 12,
-            ((MASK as u32 >> 4) & 0b11) + 12,
-            ((MASK as u32 >> 6) & 0b11) + 12,
-        ],
-    )
+pub fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+                (MASK as u32 & 0b11) + 8,
+                ((MASK as u32 >> 2) & 0b11) + 8,
+                ((MASK as u32 >> 4) & 0b11) + 8,
+                ((MASK as u32 >> 6) & 0b11) + 8,
+                (MASK as u32 & 0b11) + 12,
+                ((MASK as u32 >> 2) & 0b11) + 12,
+                ((MASK as u32 >> 4) & 0b11) + 12,
+                ((MASK as u32 >> 6) & 0b11) + 12,
+            ],
+        )
+    }
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -20879,14 +22106,12 @@ pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm512_mask_permute_ps<const MASK: i32>(
-    src: __m512,
-    k: __mmask16,
-    a: __m512,
-) -> __m512 {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm512_permute_ps::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+pub fn _mm512_mask_permute_ps<const MASK: i32>(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16()))
+    }
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
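The const generic exposed by these wrappers is the immediate lane control; a usage sketch, not from the patch: two bits per output slot select an element within the same 128-bit lane, so 0b00_01_10_11 reverses every lane.

    #[target_feature(enable = "avx512f")]
    fn reverse_each_lane_ps(a: __m512) -> __m512 {
        _mm512_permute_ps::<0b00_01_10_11>(a)
    }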
@@ -20897,10 +22122,12 @@ pub unsafe fn _mm512_mask_permute_ps<const MASK: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) -> __m512 {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm512_permute_ps::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
+pub fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) -> __m512 {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO))
+    }
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -20911,13 +22138,11 @@ pub unsafe fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm256_mask_permute_ps<const MASK: i32>(
-    src: __m256,
-    k: __mmask8,
-    a: __m256,
-) -> __m256 {
-    let r = _mm256_permute_ps::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+pub fn _mm256_mask_permute_ps<const MASK: i32>(src: __m256, k: __mmask8, a: __m256) -> __m256 {
+    unsafe {
+        let r = _mm256_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
+    }
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20928,9 +22153,11 @@ pub unsafe fn _mm256_mask_permute_ps<const MASK: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -> __m256 {
-    let r = _mm256_permute_ps::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
+pub fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -> __m256 {
+    unsafe {
+        let r = _mm256_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO))
+    }
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -20941,9 +22168,11 @@ pub unsafe fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
-    let r = _mm_permute_ps::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+pub fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
+    unsafe {
+        let r = _mm_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
+    }
 }
 
 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -20954,9 +22183,11 @@ pub unsafe fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a:
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufps, MASK = 0b11_00_01_11))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> __m128 {
-    let r = _mm_permute_ps::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO))
+pub fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> __m128 {
+    unsafe {
+        let r = _mm_permute_ps::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
@@ -20967,22 +22198,24 @@ pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> _
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
-    static_assert_uimm_bits!(MASK, 8);
-    simd_shuffle!(
-        a,
-        a,
-        [
-            MASK as u32 & 0b1,
-            ((MASK as u32 >> 1) & 0b1),
-            ((MASK as u32 >> 2) & 0b1) + 2,
-            ((MASK as u32 >> 3) & 0b1) + 2,
-            ((MASK as u32 >> 4) & 0b1) + 4,
-            ((MASK as u32 >> 5) & 0b1) + 4,
-            ((MASK as u32 >> 6) & 0b1) + 6,
-            ((MASK as u32 >> 7) & 0b1) + 6,
-        ],
-    )
+pub fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b1,
+                ((MASK as u32 >> 1) & 0b1),
+                ((MASK as u32 >> 2) & 0b1) + 2,
+                ((MASK as u32 >> 3) & 0b1) + 2,
+                ((MASK as u32 >> 4) & 0b1) + 4,
+                ((MASK as u32 >> 5) & 0b1) + 4,
+                ((MASK as u32 >> 6) & 0b1) + 6,
+                ((MASK as u32 >> 7) & 0b1) + 6,
+            ],
+        )
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -20993,14 +22226,12 @@ pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm512_mask_permute_pd<const MASK: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m512d,
-) -> __m512d {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm512_permute_pd::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+pub fn _mm512_mask_permute_pd<const MASK: i32>(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -21011,10 +22242,12 @@ pub unsafe fn _mm512_mask_permute_pd<const MASK: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01_10_01))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm512_permute_pd::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+pub fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21025,14 +22258,12 @@ pub unsafe fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm256_mask_permute_pd<const MASK: i32>(
-    src: __m256d,
-    k: __mmask8,
-    a: __m256d,
-) -> __m256d {
-    static_assert_uimm_bits!(MASK, 4);
-    let r = _mm256_permute_pd::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+pub fn _mm256_mask_permute_pd<const MASK: i32>(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 4);
+        let r = _mm256_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -21043,10 +22274,12 @@ pub unsafe fn _mm256_mask_permute_pd<const MASK: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufpd, MASK = 0b11_01))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
-    static_assert_uimm_bits!(MASK, 4);
-    let r = _mm256_permute_pd::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+pub fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 4);
+        let r = _mm256_permute_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21057,14 +22290,12 @@ pub unsafe fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm_mask_permute_pd<const IMM2: i32>(
-    src: __m128d,
-    k: __mmask8,
-    a: __m128d,
-) -> __m128d {
-    static_assert_uimm_bits!(IMM2, 2);
-    let r = _mm_permute_pd::<IMM2>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
+pub fn _mm_mask_permute_pd<const IMM2: i32>(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM2, 2);
+        let r = _mm_permute_pd::<IMM2>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2()))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -21075,10 +22306,12 @@ pub unsafe fn _mm_mask_permute_pd<const IMM2: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0b01))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d {
-    static_assert_uimm_bits!(IMM2, 2);
-    let r = _mm_permute_pd::<IMM2>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO))
+pub fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d {
+    unsafe {
+        static_assert_uimm_bits!(IMM2, 2);
+        let r = _mm_permute_pd::<IMM2>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO))
+    }
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
@@ -21089,22 +22322,24 @@ pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) ->
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
-    static_assert_uimm_bits!(MASK, 8);
-    simd_shuffle!(
-        a,
-        a,
-        [
-            MASK as u32 & 0b11,
-            (MASK as u32 >> 2) & 0b11,
-            ((MASK as u32 >> 4) & 0b11),
-            ((MASK as u32 >> 6) & 0b11),
-            (MASK as u32 & 0b11) + 4,
-            ((MASK as u32 >> 2) & 0b11) + 4,
-            ((MASK as u32 >> 4) & 0b11) + 4,
-            ((MASK as u32 >> 6) & 0b11) + 4,
-        ],
-    )
+pub fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21115,14 +22350,16 @@ pub unsafe fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm512_mask_permutex_epi64<const MASK: i32>(
+pub fn _mm512_mask_permutex_epi64<const MASK: i32>(
     src: __m512i,
     k: __mmask8,
     a: __m512i,
 ) -> __m512i {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm512_permutex_epi64::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8()))
+    }
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -21133,10 +22370,12 @@ pub unsafe fn _mm512_mask_permutex_epi64<const MASK: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m512i) -> __m512i {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm512_permutex_epi64::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO))
+pub fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm512_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO))
+    }
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
@@ -21147,18 +22386,20 @@ pub unsafe fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m51
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
-    static_assert_uimm_bits!(MASK, 8);
-    simd_shuffle!(
-        a,
-        a,
-        [
-            MASK as u32 & 0b11,
-            (MASK as u32 >> 2) & 0b11,
-            ((MASK as u32 >> 4) & 0b11),
-            ((MASK as u32 >> 6) & 0b11),
-        ],
-    )
+pub fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+            ],
+        )
+    }
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21169,14 +22410,16 @@ pub unsafe fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm256_mask_permutex_epi64<const MASK: i32>(
+pub fn _mm256_mask_permutex_epi64<const MASK: i32>(
     src: __m256i,
     k: __mmask8,
     a: __m256i,
 ) -> __m256i {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm256_permutex_epi64::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4()))
+    }
 }
 
 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -21187,10 +22430,12 @@ pub unsafe fn _mm256_mask_permutex_epi64<const MASK: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m256i) -> __m256i {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm256_permutex_epi64::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO))
+pub fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m256i) -> __m256i {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_epi64::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
@@ -21201,22 +22446,24 @@ pub unsafe fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m25
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
-    static_assert_uimm_bits!(MASK, 8);
-    simd_shuffle!(
-        a,
-        a,
-        [
-            MASK as u32 & 0b11,
-            (MASK as u32 >> 2) & 0b11,
-            ((MASK as u32 >> 4) & 0b11),
-            ((MASK as u32 >> 6) & 0b11),
-            (MASK as u32 & 0b11) + 4,
-            ((MASK as u32 >> 2) & 0b11) + 4,
-            ((MASK as u32 >> 4) & 0b11) + 4,
-            ((MASK as u32 >> 6) & 0b11) + 4,
-        ],
-    )
+pub fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+                (MASK as u32 & 0b11) + 4,
+                ((MASK as u32 >> 2) & 0b11) + 4,
+                ((MASK as u32 >> 4) & 0b11) + 4,
+                ((MASK as u32 >> 6) & 0b11) + 4,
+            ],
+        )
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
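Unlike the lane-local permute above, permutex selects within each 256-bit half of the vector; a sketch, not from the patch:

    #[target_feature(enable = "avx512f")]
    fn reverse_each_half_epi64(a: __m512i) -> __m512i {
        // [3, 2, 1, 0] inside each 256-bit half of the vector.
        _mm512_permutex_epi64::<0b00_01_10_11>(a)
    }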
@@ -21227,13 +22474,11 @@ pub unsafe fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm512_mask_permutex_pd<const MASK: i32>(
-    src: __m512d,
-    k: __mmask8,
-    a: __m512d,
-) -> __m512d {
-    let r = _mm512_permutex_pd::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+pub fn _mm512_mask_permutex_pd<const MASK: i32>(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        let r = _mm512_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -21244,9 +22489,11 @@ pub unsafe fn _mm512_mask_permutex_pd<const MASK: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
-    let r = _mm512_permutex_pd::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+pub fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
+    unsafe {
+        let r = _mm512_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
@@ -21257,18 +22504,20 @@ pub unsafe fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d)
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
 #[rustc_legacy_const_generics(1)]
-pub unsafe fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
-    static_assert_uimm_bits!(MASK, 8);
-    simd_shuffle!(
-        a,
-        a,
-        [
-            MASK as u32 & 0b11,
-            (MASK as u32 >> 2) & 0b11,
-            ((MASK as u32 >> 4) & 0b11),
-            ((MASK as u32 >> 6) & 0b11),
-        ],
-    )
+pub fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        simd_shuffle!(
+            a,
+            a,
+            [
+                MASK as u32 & 0b11,
+                (MASK as u32 >> 2) & 0b11,
+                ((MASK as u32 >> 4) & 0b11),
+                ((MASK as u32 >> 6) & 0b11),
+            ],
+        )
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -21279,14 +22528,12 @@ pub unsafe fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm256_mask_permutex_pd<const MASK: i32>(
-    src: __m256d,
-    k: __mmask8,
-    a: __m256d,
-) -> __m256d {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm256_permutex_pd::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+pub fn _mm256_mask_permutex_pd<const MASK: i32>(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4()))
+    }
 }
 
 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -21297,10 +22544,12 @@ pub unsafe fn _mm256_mask_permutex_pd<const MASK: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm256_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
-    static_assert_uimm_bits!(MASK, 8);
-    let r = _mm256_permutex_pd::<MASK>(a);
-    transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+pub fn _mm256_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
+    unsafe {
+        static_assert_uimm_bits!(MASK, 8);
+        let r = _mm256_permutex_pd::<MASK>(a);
+        transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO))
+    }
 }
 
 /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.
@@ -21310,8 +22559,8 @@ pub unsafe fn _mm256_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m256d)
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vperm))] //should be vpermd
-pub unsafe fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i {
-    transmute(vpermd(a.as_i32x16(), idx.as_i32x16()))
+pub fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i {
+    unsafe { transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) }
 }
 
 /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name.
@@ -21321,14 +22570,16 @@ pub unsafe fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermd))] -pub unsafe fn _mm512_mask_permutevar_epi32( +pub fn _mm512_mask_permutevar_epi32( src: __m512i, k: __mmask16, idx: __m512i, a: __m512i, ) -> __m512i { - let permute = _mm512_permutevar_epi32(idx, a).as_i32x16(); - transmute(simd_select_bitmask(k, permute, src.as_i32x16())) + unsafe { + let permute = _mm512_permutevar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, src.as_i32x16())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. @@ -21338,8 +22589,8 @@ pub unsafe fn _mm512_mask_permutevar_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilps))] -pub unsafe fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 { - transmute(vpermilps(a.as_f32x16(), b.as_i32x16())) +pub fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 { + unsafe { transmute(vpermilps(a.as_f32x16(), b.as_i32x16())) } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21349,14 +22600,11 @@ pub unsafe fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilps))] -pub unsafe fn _mm512_mask_permutevar_ps( - src: __m512, - k: __mmask16, - a: __m512, - b: __m512i, -) -> __m512 { - let permute = _mm512_permutevar_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, permute, src.as_f32x16())) +pub fn _mm512_mask_permutevar_ps(src: __m512, k: __mmask16, a: __m512, b: __m512i) -> __m512 { + unsafe { + let permute = _mm512_permutevar_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21366,9 +22614,11 @@ pub unsafe fn _mm512_mask_permutevar_ps( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilps))] -pub unsafe fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 { - let permute = _mm512_permutevar_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) +pub fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 { + unsafe { + let permute = _mm512_permutevar_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
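permutevar takes its control as a vector at run time rather than as an immediate, but selection still stays inside each 128-bit lane (only the low two bits of every control element are used). A sketch, not from the patch:

    #[target_feature(enable = "avx512f")]
    fn swap_pairs_ps(a: __m512) -> __m512 {
        // Control [1, 0, 3, 2] per lane swaps neighbouring elements.
        let ctrl = _mm512_setr_epi32(1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2);
        _mm512_permutevar_ps(a, ctrl)
    }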
@@ -21378,9 +22628,11 @@ pub unsafe fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilps))] -pub unsafe fn _mm256_mask_permutevar_ps(src: __m256, k: __mmask8, a: __m256, b: __m256i) -> __m256 { - let permute = _mm256_permutevar_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, permute, src.as_f32x8())) +pub fn _mm256_mask_permutevar_ps(src: __m256, k: __mmask8, a: __m256, b: __m256i) -> __m256 { + unsafe { + let permute = _mm256_permutevar_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, src.as_f32x8())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21390,9 +22642,11 @@ pub unsafe fn _mm256_mask_permutevar_ps(src: __m256, k: __mmask8, a: __m256, b: #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilps))] -pub unsafe fn _mm256_maskz_permutevar_ps(k: __mmask8, a: __m256, b: __m256i) -> __m256 { - let permute = _mm256_permutevar_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) +pub fn _mm256_maskz_permutevar_ps(k: __mmask8, a: __m256, b: __m256i) -> __m256 { + unsafe { + let permute = _mm256_permutevar_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21402,9 +22656,11 @@ pub unsafe fn _mm256_maskz_permutevar_ps(k: __mmask8, a: __m256, b: __m256i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilps))] -pub unsafe fn _mm_mask_permutevar_ps(src: __m128, k: __mmask8, a: __m128, b: __m128i) -> __m128 { - let permute = _mm_permutevar_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, permute, src.as_f32x4())) +pub fn _mm_mask_permutevar_ps(src: __m128, k: __mmask8, a: __m128, b: __m128i) -> __m128 { + unsafe { + let permute = _mm_permutevar_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, src.as_f32x4())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -21414,9 +22670,11 @@ pub unsafe fn _mm_mask_permutevar_ps(src: __m128, k: __mmask8, a: __m128, b: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilps))] -pub unsafe fn _mm_maskz_permutevar_ps(k: __mmask8, a: __m128, b: __m128i) -> __m128 { - let permute = _mm_permutevar_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) +pub fn _mm_maskz_permutevar_ps(k: __mmask8, a: __m128, b: __m128i) -> __m128 { + unsafe { + let permute = _mm_permutevar_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst. @@ -21426,8 +22684,8 @@ pub unsafe fn _mm_maskz_permutevar_ps(k: __mmask8, a: __m128, b: __m128i) -> __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilpd))] -pub unsafe fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d { - transmute(vpermilpd(a.as_f64x8(), b.as_i64x8())) +pub fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d { + unsafe { transmute(vpermilpd(a.as_f64x8(), b.as_i64x8())) } } /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21437,14 +22695,11 @@ pub unsafe fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilpd))] -pub unsafe fn _mm512_mask_permutevar_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512i, -) -> __m512d { - let permute = _mm512_permutevar_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, permute, src.as_f64x8())) +pub fn _mm512_mask_permutevar_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512i) -> __m512d { + unsafe { + let permute = _mm512_permutevar_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) + } } /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21454,9 +22709,11 @@ pub unsafe fn _mm512_mask_permutevar_pd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilpd))] -pub unsafe fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d { - let permute = _mm512_permutevar_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) +pub fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d { + unsafe { + let permute = _mm512_permutevar_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -21466,14 +22723,11 @@ pub unsafe fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilpd))] -pub unsafe fn _mm256_mask_permutevar_pd( - src: __m256d, - k: __mmask8, - a: __m256d, - b: __m256i, -) -> __m256d { - let permute = _mm256_permutevar_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, permute, src.as_f64x4())) +pub fn _mm256_mask_permutevar_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256i) -> __m256d { + unsafe { + let permute = _mm256_permutevar_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, src.as_f64x4())) + } } /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21483,9 +22737,11 @@ pub unsafe fn _mm256_mask_permutevar_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilpd))] -pub unsafe fn _mm256_maskz_permutevar_pd(k: __mmask8, a: __m256d, b: __m256i) -> __m256d { - let permute = _mm256_permutevar_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) +pub fn _mm256_maskz_permutevar_pd(k: __mmask8, a: __m256d, b: __m256i) -> __m256d { + unsafe { + let permute = _mm256_permutevar_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21495,9 +22751,11 @@ pub unsafe fn _mm256_maskz_permutevar_pd(k: __mmask8, a: __m256d, b: __m256i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilpd))] -pub unsafe fn _mm_mask_permutevar_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128i) -> __m128d { - let permute = _mm_permutevar_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, permute, src.as_f64x2())) +pub fn _mm_mask_permutevar_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128i) -> __m128d { + unsafe { + let permute = _mm_permutevar_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, src.as_f64x2())) + } } /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21507,9 +22765,11 @@ pub unsafe fn _mm_mask_permutevar_pd(src: __m128d, k: __mmask8, a: __m128d, b: _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermilpd))] -pub unsafe fn _mm_maskz_permutevar_pd(k: __mmask8, a: __m128d, b: __m128i) -> __m128d { - let permute = _mm_permutevar_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, permute, f64x2::ZERO)) +pub fn _mm_maskz_permutevar_pd(k: __mmask8, a: __m128d, b: __m128i) -> __m128d { + unsafe { + let permute = _mm_permutevar_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, f64x2::ZERO)) + } } /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
@@ -21519,8 +22779,8 @@ pub unsafe fn _mm_maskz_permutevar_pd(k: __mmask8, a: __m128d, b: __m128i) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermd -pub unsafe fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i { - transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) +pub fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermd(a.as_i32x16(), idx.as_i32x16())) } } /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21530,14 +22790,16 @@ pub unsafe fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermd))] -pub unsafe fn _mm512_mask_permutexvar_epi32( +pub fn _mm512_mask_permutexvar_epi32( src: __m512i, k: __mmask16, idx: __m512i, a: __m512i, ) -> __m512i { - let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); - transmute(simd_select_bitmask(k, permute, src.as_i32x16())) + unsafe { + let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, src.as_i32x16())) + } } /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21547,9 +22809,11 @@ pub unsafe fn _mm512_mask_permutexvar_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermd))] -pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i { - let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); - transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) +pub fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16(); + transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) + } } /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
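permutexvar indexes across the whole 512-bit vector, so one instruction can, for example, reverse all sixteen 32-bit elements. A sketch, not from the patch:

    #[target_feature(enable = "avx512f")]
    fn reverse_epi32(a: __m512i) -> __m512i {
        let idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        _mm512_permutexvar_epi32(idx, a)
    }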
@@ -21559,7 +22823,7 @@ pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m5 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermd -pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i { +pub fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i { _mm256_permutevar8x32_epi32(a, idx) // llvm use llvm.x86.avx2.permd } @@ -21570,14 +22834,16 @@ pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermd))] -pub unsafe fn _mm256_mask_permutexvar_epi32( +pub fn _mm256_mask_permutexvar_epi32( src: __m256i, k: __mmask8, idx: __m256i, a: __m256i, ) -> __m256i { - let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); - transmute(simd_select_bitmask(k, permute, src.as_i32x8())) + unsafe { + let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); + transmute(simd_select_bitmask(k, permute, src.as_i32x8())) + } } /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21587,9 +22853,11 @@ pub unsafe fn _mm256_mask_permutexvar_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermd))] -pub unsafe fn _mm256_maskz_permutexvar_epi32(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { - let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); - transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) +pub fn _mm256_maskz_permutexvar_epi32(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi32(idx, a).as_i32x8(); + transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) + } } /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. @@ -21599,8 +22867,8 @@ pub unsafe fn _mm256_maskz_permutexvar_epi32(k: __mmask8, idx: __m256i, a: __m25 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermq -pub unsafe fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i { - transmute(vpermq(a.as_i64x8(), idx.as_i64x8())) +pub fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermq(a.as_i64x8(), idx.as_i64x8())) } } /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -21610,14 +22878,16 @@ pub unsafe fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermq))] -pub unsafe fn _mm512_mask_permutexvar_epi64( +pub fn _mm512_mask_permutexvar_epi64( src: __m512i, k: __mmask8, idx: __m512i, a: __m512i, ) -> __m512i { - let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); - transmute(simd_select_bitmask(k, permute, src.as_i64x8())) + unsafe { + let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); + transmute(simd_select_bitmask(k, permute, src.as_i64x8())) + } } /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21627,9 +22897,11 @@ pub unsafe fn _mm512_mask_permutexvar_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermq))] -pub unsafe fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i { - let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); - transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) +pub fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8(); + transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) + } } /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. @@ -21639,8 +22911,8 @@ pub unsafe fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m51 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermq -pub unsafe fn _mm256_permutexvar_epi64(idx: __m256i, a: __m256i) -> __m256i { - transmute(vpermq256(a.as_i64x4(), idx.as_i64x4())) +pub fn _mm256_permutexvar_epi64(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermq256(a.as_i64x4(), idx.as_i64x4())) } } /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21650,14 +22922,16 @@ pub unsafe fn _mm256_permutexvar_epi64(idx: __m256i, a: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermq))] -pub unsafe fn _mm256_mask_permutexvar_epi64( +pub fn _mm256_mask_permutexvar_epi64( src: __m256i, k: __mmask8, idx: __m256i, a: __m256i, ) -> __m256i { - let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); - transmute(simd_select_bitmask(k, permute, src.as_i64x4())) + unsafe { + let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); + transmute(simd_select_bitmask(k, permute, src.as_i64x4())) + } } /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -21667,9 +22941,11 @@ pub unsafe fn _mm256_mask_permutexvar_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermq))] -pub unsafe fn _mm256_maskz_permutexvar_epi64(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { - let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); - transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) +pub fn _mm256_maskz_permutexvar_epi64(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi64(idx, a).as_i64x4(); + transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx. @@ -21679,8 +22955,8 @@ pub unsafe fn _mm256_maskz_permutexvar_epi64(k: __mmask8, idx: __m256i, a: __m25 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermps))] -pub unsafe fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 { - transmute(vpermps(a.as_f32x16(), idx.as_i32x16())) +pub fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 { + unsafe { transmute(vpermps(a.as_f32x16(), idx.as_i32x16())) } } /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21690,14 +22966,11 @@ pub unsafe fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermps))] -pub unsafe fn _mm512_mask_permutexvar_ps( - src: __m512, - k: __mmask16, - idx: __m512i, - a: __m512, -) -> __m512 { - let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); - transmute(simd_select_bitmask(k, permute, src.as_f32x16())) +pub fn _mm512_mask_permutexvar_ps(src: __m512, k: __mmask16, idx: __m512i, a: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); + transmute(simd_select_bitmask(k, permute, src.as_f32x16())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21707,9 +22980,11 @@ pub unsafe fn _mm512_mask_permutexvar_ps( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermps))] -pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 { - let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); - transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) +pub fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutexvar_ps(idx, a).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx. 
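To illustrate what the relaxed signatures mean for callers (an aside, not part of the patch): a variable-index permute no longer needs a local `unsafe` block, provided the calling code itself enables the feature. A minimal sketch, assuming a nightly crate with `#![feature(stdarch_x86_avx512)]`, safe `#[target_feature]` functions, and that `_mm512_setr_epi32` is likewise covered by this change; `reverse_epi32_lanes` is an illustrative helper, not an intrinsic:

use core::arch::x86_64::*;

// Reverse the sixteen 32-bit lanes of `a` with vpermd; the index vector is
// runtime data, which is what distinguishes permutexvar from the imm8 shuffles.
#[target_feature(enable = "avx512f")]
fn reverse_epi32_lanes(a: __m512i) -> __m512i {
    let idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    _mm512_permutexvar_epi32(idx, a) // no `unsafe` block needed after this patch
}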
@@ -21719,7 +22994,7 @@ pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermps))] -pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 { +pub fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 { _mm256_permutevar8x32_ps(a, idx) //llvm.x86.avx2.permps } @@ -21730,14 +23005,11 @@ pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermps))] -pub unsafe fn _mm256_mask_permutexvar_ps( - src: __m256, - k: __mmask8, - idx: __m256i, - a: __m256, -) -> __m256 { - let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); - transmute(simd_select_bitmask(k, permute, src.as_f32x8())) +pub fn _mm256_mask_permutexvar_ps(src: __m256, k: __mmask8, idx: __m256i, a: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); + transmute(simd_select_bitmask(k, permute, src.as_f32x8())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21747,9 +23019,11 @@ pub unsafe fn _mm256_mask_permutexvar_ps( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermps))] -pub unsafe fn _mm256_maskz_permutexvar_ps(k: __mmask8, idx: __m256i, a: __m256) -> __m256 { - let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); - transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) +pub fn _mm256_maskz_permutexvar_ps(k: __mmask8, idx: __m256i, a: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutexvar_ps(idx, a).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. @@ -21759,8 +23033,8 @@ pub unsafe fn _mm256_maskz_permutexvar_ps(k: __mmask8, idx: __m256i, a: __m256) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermpd))] -pub unsafe fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d { - transmute(vpermpd(a.as_f64x8(), idx.as_i64x8())) +pub fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d { + unsafe { transmute(vpermpd(a.as_f64x8(), idx.as_i64x8())) } } /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -21770,14 +23044,11 @@ pub unsafe fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermpd))] -pub unsafe fn _mm512_mask_permutexvar_pd( - src: __m512d, - k: __mmask8, - idx: __m512i, - a: __m512d, -) -> __m512d { - let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); - transmute(simd_select_bitmask(k, permute, src.as_f64x8())) +pub fn _mm512_mask_permutexvar_pd(src: __m512d, k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); + transmute(simd_select_bitmask(k, permute, src.as_f64x8())) + } } /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21787,9 +23058,11 @@ pub unsafe fn _mm512_mask_permutexvar_pd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermpd))] -pub unsafe fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { - let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); - transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) +pub fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutexvar_pd(idx, a).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst. @@ -21799,8 +23072,8 @@ pub unsafe fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermpd))] -pub unsafe fn _mm256_permutexvar_pd(idx: __m256i, a: __m256d) -> __m256d { - transmute(vpermpd256(a.as_f64x4(), idx.as_i64x4())) +pub fn _mm256_permutexvar_pd(idx: __m256i, a: __m256d) -> __m256d { + unsafe { transmute(vpermpd256(a.as_f64x4(), idx.as_i64x4())) } } /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -21810,14 +23083,11 @@ pub unsafe fn _mm256_permutexvar_pd(idx: __m256i, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermpd))] -pub unsafe fn _mm256_mask_permutexvar_pd( - src: __m256d, - k: __mmask8, - idx: __m256i, - a: __m256d, -) -> __m256d { - let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); - transmute(simd_select_bitmask(k, permute, src.as_f64x4())) +pub fn _mm256_mask_permutexvar_pd(src: __m256d, k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); + transmute(simd_select_bitmask(k, permute, src.as_f64x4())) + } } /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -21827,9 +23097,11 @@ pub unsafe fn _mm256_mask_permutexvar_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermpd))] -pub unsafe fn _mm256_maskz_permutexvar_pd(k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { - let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); - transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) +pub fn _mm256_maskz_permutexvar_pd(k: __mmask8, idx: __m256i, a: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutexvar_pd(idx, a).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -21839,8 +23111,8 @@ pub unsafe fn _mm256_maskz_permutexvar_pd(k: __mmask8, idx: __m256i, a: __m256d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub unsafe fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { - transmute(vpermi2d(a.as_i32x16(), idx.as_i32x16(), b.as_i32x16())) +pub fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2d(a.as_i32x16(), idx.as_i32x16(), b.as_i32x16())) } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -21850,14 +23122,16 @@ pub unsafe fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2d))] -pub unsafe fn _mm512_mask_permutex2var_epi32( +pub fn _mm512_mask_permutex2var_epi32( a: __m512i, k: __mmask16, idx: __m512i, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); - transmute(simd_select_bitmask(k, permute, a.as_i32x16())) + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, a.as_i32x16())) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21867,14 +23141,16 @@ pub unsafe fn _mm512_mask_permutex2var_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub unsafe fn _mm512_maskz_permutex2var_epi32( +pub fn _mm512_maskz_permutex2var_epi32( k: __mmask16, a: __m512i, idx: __m512i, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); - transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, i32x16::ZERO)) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
@@ -21884,14 +23160,16 @@ pub unsafe fn _mm512_maskz_permutex2var_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2d))] -pub unsafe fn _mm512_mask2_permutex2var_epi32( +pub fn _mm512_mask2_permutex2var_epi32( a: __m512i, idx: __m512i, k: __mmask16, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); - transmute(simd_select_bitmask(k, permute, idx.as_i32x16())) + unsafe { + let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x16())) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -21901,8 +23179,8 @@ pub unsafe fn _mm512_mask2_permutex2var_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub unsafe fn _mm256_permutex2var_epi32(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { - transmute(vpermi2d256(a.as_i32x8(), idx.as_i32x8(), b.as_i32x8())) +pub fn _mm256_permutex2var_epi32(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2d256(a.as_i32x8(), idx.as_i32x8(), b.as_i32x8())) } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -21912,14 +23190,16 @@ pub unsafe fn _mm256_permutex2var_epi32(a: __m256i, idx: __m256i, b: __m256i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2d))] -pub unsafe fn _mm256_mask_permutex2var_epi32( +pub fn _mm256_mask_permutex2var_epi32( a: __m256i, k: __mmask8, idx: __m256i, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); - transmute(simd_select_bitmask(k, permute, a.as_i32x8())) + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, a.as_i32x8())) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21929,14 +23209,16 @@ pub unsafe fn _mm256_mask_permutex2var_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub unsafe fn _mm256_maskz_permutex2var_epi32( +pub fn _mm256_maskz_permutex2var_epi32( k: __mmask8, a: __m256i, idx: __m256i, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); - transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, i32x8::ZERO)) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
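As a reading aid for the masked forms above (illustrative only, same nightly assumptions as in the earlier aside; `permute_fallbacks` is a hypothetical helper): all three compute the same permutation and differ only in where a lane comes from when its mask bit is clear.

use core::arch::x86_64::*;

// For lanes whose bit in `k` is clear: `mask_` keeps the lane from `a`,
// `maskz_` zeroes it, and `mask2_` keeps the lane from `idx`.
#[target_feature(enable = "avx512f")]
fn permute_fallbacks(a: __m512i, idx: __m512i, b: __m512i, k: __mmask16) -> [__m512i; 3] {
    [
        _mm512_mask_permutex2var_epi32(a, k, idx, b),
        _mm512_maskz_permutex2var_epi32(k, a, idx, b),
        _mm512_mask2_permutex2var_epi32(a, idx, k, b),
    ]
}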
@@ -21946,14 +23228,16 @@ pub unsafe fn _mm256_maskz_permutex2var_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2d))] -pub unsafe fn _mm256_mask2_permutex2var_epi32( +pub fn _mm256_mask2_permutex2var_epi32( a: __m256i, idx: __m256i, k: __mmask8, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); - transmute(simd_select_bitmask(k, permute, idx.as_i32x8())) + unsafe { + let permute = _mm256_permutex2var_epi32(a, idx, b).as_i32x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x8())) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -21963,8 +23247,8 @@ pub unsafe fn _mm256_mask2_permutex2var_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub unsafe fn _mm_permutex2var_epi32(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - transmute(vpermi2d128(a.as_i32x4(), idx.as_i32x4(), b.as_i32x4())) +pub fn _mm_permutex2var_epi32(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2d128(a.as_i32x4(), idx.as_i32x4(), b.as_i32x4())) } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -21974,14 +23258,11 @@ pub unsafe fn _mm_permutex2var_epi32(a: __m128i, idx: __m128i, b: __m128i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2d))] -pub unsafe fn _mm_mask_permutex2var_epi32( - a: __m128i, - k: __mmask8, - idx: __m128i, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); - transmute(simd_select_bitmask(k, permute, a.as_i32x4())) +pub fn _mm_mask_permutex2var_epi32(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, a.as_i32x4())) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -21991,14 +23272,11 @@ pub unsafe fn _mm_mask_permutex2var_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d -pub unsafe fn _mm_maskz_permutex2var_epi32( - k: __mmask8, - a: __m128i, - idx: __m128i, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); - transmute(simd_select_bitmask(k, permute, i32x4::ZERO)) +pub fn _mm_maskz_permutex2var_epi32(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, i32x4::ZERO)) + } } /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
@@ -22008,14 +23286,11 @@ pub unsafe fn _mm_maskz_permutex2var_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2d))] -pub unsafe fn _mm_mask2_permutex2var_epi32( - a: __m128i, - idx: __m128i, - k: __mmask8, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); - transmute(simd_select_bitmask(k, permute, idx.as_i32x4())) +pub fn _mm_mask2_permutex2var_epi32(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi32(a, idx, b).as_i32x4(); + transmute(simd_select_bitmask(k, permute, idx.as_i32x4())) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -22025,8 +23300,8 @@ pub unsafe fn _mm_mask2_permutex2var_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub unsafe fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { - transmute(vpermi2q(a.as_i64x8(), idx.as_i64x8(), b.as_i64x8())) +pub fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2q(a.as_i64x8(), idx.as_i64x8(), b.as_i64x8())) } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -22036,14 +23311,16 @@ pub unsafe fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2q))] -pub unsafe fn _mm512_mask_permutex2var_epi64( +pub fn _mm512_mask_permutex2var_epi64( a: __m512i, k: __mmask8, idx: __m512i, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); - transmute(simd_select_bitmask(k, permute, a.as_i64x8())) + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, a.as_i64x8())) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22053,14 +23330,16 @@ pub unsafe fn _mm512_mask_permutex2var_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub unsafe fn _mm512_maskz_permutex2var_epi64( +pub fn _mm512_maskz_permutex2var_epi64( k: __mmask8, a: __m512i, idx: __m512i, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); - transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, i64x8::ZERO)) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
@@ -22070,14 +23349,16 @@ pub unsafe fn _mm512_maskz_permutex2var_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2q))] -pub unsafe fn _mm512_mask2_permutex2var_epi64( +pub fn _mm512_mask2_permutex2var_epi64( a: __m512i, idx: __m512i, k: __mmask8, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); - transmute(simd_select_bitmask(k, permute, idx.as_i64x8())) + unsafe { + let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x8())) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -22087,8 +23368,8 @@ pub unsafe fn _mm512_mask2_permutex2var_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub unsafe fn _mm256_permutex2var_epi64(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { - transmute(vpermi2q256(a.as_i64x4(), idx.as_i64x4(), b.as_i64x4())) +pub fn _mm256_permutex2var_epi64(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2q256(a.as_i64x4(), idx.as_i64x4(), b.as_i64x4())) } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -22098,14 +23379,16 @@ pub unsafe fn _mm256_permutex2var_epi64(a: __m256i, idx: __m256i, b: __m256i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2q))] -pub unsafe fn _mm256_mask_permutex2var_epi64( +pub fn _mm256_mask_permutex2var_epi64( a: __m256i, k: __mmask8, idx: __m256i, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); - transmute(simd_select_bitmask(k, permute, a.as_i64x4())) + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, a.as_i64x4())) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22115,14 +23398,16 @@ pub unsafe fn _mm256_mask_permutex2var_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub unsafe fn _mm256_maskz_permutex2var_epi64( +pub fn _mm256_maskz_permutex2var_epi64( k: __mmask8, a: __m256i, idx: __m256i, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); - transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, i64x4::ZERO)) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
@@ -22132,14 +23417,16 @@ pub unsafe fn _mm256_maskz_permutex2var_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2q))] -pub unsafe fn _mm256_mask2_permutex2var_epi64( +pub fn _mm256_mask2_permutex2var_epi64( a: __m256i, idx: __m256i, k: __mmask8, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); - transmute(simd_select_bitmask(k, permute, idx.as_i64x4())) + unsafe { + let permute = _mm256_permutex2var_epi64(a, idx, b).as_i64x4(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x4())) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -22149,8 +23436,8 @@ pub unsafe fn _mm256_mask2_permutex2var_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub unsafe fn _mm_permutex2var_epi64(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - transmute(vpermi2q128(a.as_i64x2(), idx.as_i64x2(), b.as_i64x2())) +pub fn _mm_permutex2var_epi64(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2q128(a.as_i64x2(), idx.as_i64x2(), b.as_i64x2())) } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -22160,14 +23447,11 @@ pub unsafe fn _mm_permutex2var_epi64(a: __m128i, idx: __m128i, b: __m128i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2q))] -pub unsafe fn _mm_mask_permutex2var_epi64( - a: __m128i, - k: __mmask8, - idx: __m128i, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); - transmute(simd_select_bitmask(k, permute, a.as_i64x2())) +pub fn _mm_mask_permutex2var_epi64(a: __m128i, k: __mmask8, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, a.as_i64x2())) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22177,14 +23461,11 @@ pub unsafe fn _mm_mask_permutex2var_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q -pub unsafe fn _mm_maskz_permutex2var_epi64( - k: __mmask8, - a: __m128i, - idx: __m128i, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); - transmute(simd_select_bitmask(k, permute, i64x2::ZERO)) +pub fn _mm_maskz_permutex2var_epi64(k: __mmask8, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, i64x2::ZERO)) + } } /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
@@ -22194,14 +23475,11 @@ pub unsafe fn _mm_maskz_permutex2var_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2q))] -pub unsafe fn _mm_mask2_permutex2var_epi64( - a: __m128i, - idx: __m128i, - k: __mmask8, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); - transmute(simd_select_bitmask(k, permute, idx.as_i64x2())) +pub fn _mm_mask2_permutex2var_epi64(a: __m128i, idx: __m128i, k: __mmask8, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi64(a, idx, b).as_i64x2(); + transmute(simd_select_bitmask(k, permute, idx.as_i64x2())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -22211,8 +23489,8 @@ pub unsafe fn _mm_mask2_permutex2var_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub unsafe fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 { - transmute(vpermi2ps(a.as_f32x16(), idx.as_i32x16(), b.as_f32x16())) +pub fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 { + unsafe { transmute(vpermi2ps(a.as_f32x16(), idx.as_i32x16(), b.as_f32x16())) } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -22222,14 +23500,11 @@ pub unsafe fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2ps))] -pub unsafe fn _mm512_mask_permutex2var_ps( - a: __m512, - k: __mmask16, - idx: __m512i, - b: __m512, -) -> __m512 { - let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); - transmute(simd_select_bitmask(k, permute, a.as_f32x16())) +pub fn _mm512_mask_permutex2var_ps(a: __m512, k: __mmask16, idx: __m512i, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, a.as_f32x16())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -22239,14 +23514,11 @@ pub unsafe fn _mm512_mask_permutex2var_ps( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub unsafe fn _mm512_maskz_permutex2var_ps( - k: __mmask16, - a: __m512, - idx: __m512i, - b: __m512, -) -> __m512 { - let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); - transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) +pub fn _mm512_maskz_permutex2var_ps(k: __mmask16, a: __m512, idx: __m512i, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + transmute(simd_select_bitmask(k, permute, f32x16::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). @@ -22256,15 +23528,12 @@ pub unsafe fn _mm512_maskz_permutex2var_ps( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps -pub unsafe fn _mm512_mask2_permutex2var_ps( - a: __m512, - idx: __m512i, - k: __mmask16, - b: __m512, -) -> __m512 { - let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); - let idx = _mm512_castsi512_ps(idx).as_f32x16(); - transmute(simd_select_bitmask(k, permute, idx)) +pub fn _mm512_mask2_permutex2var_ps(a: __m512, idx: __m512i, k: __mmask16, b: __m512) -> __m512 { + unsafe { + let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16(); + let idx = _mm512_castsi512_ps(idx).as_f32x16(); + transmute(simd_select_bitmask(k, permute, idx)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -22274,8 +23543,8 @@ pub unsafe fn _mm512_mask2_permutex2var_ps( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub unsafe fn _mm256_permutex2var_ps(a: __m256, idx: __m256i, b: __m256) -> __m256 { - transmute(vpermi2ps256(a.as_f32x8(), idx.as_i32x8(), b.as_f32x8())) +pub fn _mm256_permutex2var_ps(a: __m256, idx: __m256i, b: __m256) -> __m256 { + unsafe { transmute(vpermi2ps256(a.as_f32x8(), idx.as_i32x8(), b.as_f32x8())) } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -22285,14 +23554,11 @@ pub unsafe fn _mm256_permutex2var_ps(a: __m256, idx: __m256i, b: __m256) -> __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2ps))] -pub unsafe fn _mm256_mask_permutex2var_ps( - a: __m256, - k: __mmask8, - idx: __m256i, - b: __m256, -) -> __m256 { - let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); - transmute(simd_select_bitmask(k, permute, a.as_f32x8())) +pub fn _mm256_mask_permutex2var_ps(a: __m256, k: __mmask8, idx: __m256i, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, a.as_f32x8())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22302,14 +23568,11 @@ pub unsafe fn _mm256_mask_permutex2var_ps( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub unsafe fn _mm256_maskz_permutex2var_ps( - k: __mmask8, - a: __m256, - idx: __m256i, - b: __m256, -) -> __m256 { - let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); - transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) +pub fn _mm256_maskz_permutex2var_ps(k: __mmask8, a: __m256, idx: __m256i, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + transmute(simd_select_bitmask(k, permute, f32x8::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). @@ -22319,15 +23582,12 @@ pub unsafe fn _mm256_maskz_permutex2var_ps( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps -pub unsafe fn _mm256_mask2_permutex2var_ps( - a: __m256, - idx: __m256i, - k: __mmask8, - b: __m256, -) -> __m256 { - let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); - let idx = _mm256_castsi256_ps(idx).as_f32x8(); - transmute(simd_select_bitmask(k, permute, idx)) +pub fn _mm256_mask2_permutex2var_ps(a: __m256, idx: __m256i, k: __mmask8, b: __m256) -> __m256 { + unsafe { + let permute = _mm256_permutex2var_ps(a, idx, b).as_f32x8(); + let idx = _mm256_castsi256_ps(idx).as_f32x8(); + transmute(simd_select_bitmask(k, permute, idx)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. 
@@ -22337,8 +23597,8 @@ pub unsafe fn _mm256_mask2_permutex2var_ps( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub unsafe fn _mm_permutex2var_ps(a: __m128, idx: __m128i, b: __m128) -> __m128 { - transmute(vpermi2ps128(a.as_f32x4(), idx.as_i32x4(), b.as_f32x4())) +pub fn _mm_permutex2var_ps(a: __m128, idx: __m128i, b: __m128) -> __m128 { + unsafe { transmute(vpermi2ps128(a.as_f32x4(), idx.as_i32x4(), b.as_f32x4())) } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -22348,9 +23608,11 @@ pub unsafe fn _mm_permutex2var_ps(a: __m128, idx: __m128i, b: __m128) -> __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2ps))] -pub unsafe fn _mm_mask_permutex2var_ps(a: __m128, k: __mmask8, idx: __m128i, b: __m128) -> __m128 { - let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); - transmute(simd_select_bitmask(k, permute, a.as_f32x4())) +pub fn _mm_mask_permutex2var_ps(a: __m128, k: __mmask8, idx: __m128i, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, a.as_f32x4())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22360,9 +23622,11 @@ pub unsafe fn _mm_mask_permutex2var_ps(a: __m128, k: __mmask8, idx: __m128i, b: #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps -pub unsafe fn _mm_maskz_permutex2var_ps(k: __mmask8, a: __m128, idx: __m128i, b: __m128) -> __m128 { - let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); - transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) +pub fn _mm_maskz_permutex2var_ps(k: __mmask8, a: __m128, idx: __m128i, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + transmute(simd_select_bitmask(k, permute, f32x4::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set). 
@@ -22372,10 +23636,12 @@ pub unsafe fn _mm_maskz_permutex2var_ps(k: __mmask8, a: __m128, idx: __m128i, b: #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps -pub unsafe fn _mm_mask2_permutex2var_ps(a: __m128, idx: __m128i, k: __mmask8, b: __m128) -> __m128 { - let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); - let idx = _mm_castsi128_ps(idx).as_f32x4(); - transmute(simd_select_bitmask(k, permute, idx)) +pub fn _mm_mask2_permutex2var_ps(a: __m128, idx: __m128i, k: __mmask8, b: __m128) -> __m128 { + unsafe { + let permute = _mm_permutex2var_ps(a, idx, b).as_f32x4(); + let idx = _mm_castsi128_ps(idx).as_f32x4(); + transmute(simd_select_bitmask(k, permute, idx)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -22385,8 +23651,8 @@ pub unsafe fn _mm_mask2_permutex2var_ps(a: __m128, idx: __m128i, k: __mmask8, b: #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub unsafe fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d { - transmute(vpermi2pd(a.as_f64x8(), idx.as_i64x8(), b.as_f64x8())) +pub fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d { + unsafe { transmute(vpermi2pd(a.as_f64x8(), idx.as_i64x8(), b.as_f64x8())) } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -22396,14 +23662,11 @@ pub unsafe fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2pd))] -pub unsafe fn _mm512_mask_permutex2var_pd( - a: __m512d, - k: __mmask8, - idx: __m512i, - b: __m512d, -) -> __m512d { - let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); - transmute(simd_select_bitmask(k, permute, a.as_f64x8())) +pub fn _mm512_mask_permutex2var_pd(a: __m512d, k: __mmask8, idx: __m512i, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, a.as_f64x8())) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -22413,14 +23676,11 @@ pub unsafe fn _mm512_mask_permutex2var_pd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub unsafe fn _mm512_maskz_permutex2var_pd( - k: __mmask8, - a: __m512d, - idx: __m512i, - b: __m512d, -) -> __m512d { - let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); - transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) +pub fn _mm512_maskz_permutex2var_pd(k: __mmask8, a: __m512d, idx: __m512i, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + transmute(simd_select_bitmask(k, permute, f64x8::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) @@ -22430,15 +23690,12 @@ pub unsafe fn _mm512_maskz_permutex2var_pd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd -pub unsafe fn _mm512_mask2_permutex2var_pd( - a: __m512d, - idx: __m512i, - k: __mmask8, - b: __m512d, -) -> __m512d { - let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); - let idx = _mm512_castsi512_pd(idx).as_f64x8(); - transmute(simd_select_bitmask(k, permute, idx)) +pub fn _mm512_mask2_permutex2var_pd(a: __m512d, idx: __m512i, k: __mmask8, b: __m512d) -> __m512d { + unsafe { + let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8(); + let idx = _mm512_castsi512_pd(idx).as_f64x8(); + transmute(simd_select_bitmask(k, permute, idx)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. @@ -22448,8 +23705,8 @@ pub unsafe fn _mm512_mask2_permutex2var_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub unsafe fn _mm256_permutex2var_pd(a: __m256d, idx: __m256i, b: __m256d) -> __m256d { - transmute(vpermi2pd256(a.as_f64x4(), idx.as_i64x4(), b.as_f64x4())) +pub fn _mm256_permutex2var_pd(a: __m256d, idx: __m256i, b: __m256d) -> __m256d { + unsafe { transmute(vpermi2pd256(a.as_f64x4(), idx.as_i64x4(), b.as_f64x4())) } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
@@ -22459,14 +23716,11 @@ pub unsafe fn _mm256_permutex2var_pd(a: __m256d, idx: __m256i, b: __m256d) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2pd))] -pub unsafe fn _mm256_mask_permutex2var_pd( - a: __m256d, - k: __mmask8, - idx: __m256i, - b: __m256d, -) -> __m256d { - let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); - transmute(simd_select_bitmask(k, permute, a.as_f64x4())) +pub fn _mm256_mask_permutex2var_pd(a: __m256d, k: __mmask8, idx: __m256i, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, a.as_f64x4())) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22476,14 +23730,11 @@ pub unsafe fn _mm256_mask_permutex2var_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub unsafe fn _mm256_maskz_permutex2var_pd( - k: __mmask8, - a: __m256d, - idx: __m256i, - b: __m256d, -) -> __m256d { - let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); - transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) +pub fn _mm256_maskz_permutex2var_pd(k: __mmask8, a: __m256d, idx: __m256i, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + transmute(simd_select_bitmask(k, permute, f64x4::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) @@ -22493,15 +23744,12 @@ pub unsafe fn _mm256_maskz_permutex2var_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd -pub unsafe fn _mm256_mask2_permutex2var_pd( - a: __m256d, - idx: __m256i, - k: __mmask8, - b: __m256d, -) -> __m256d { - let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); - let idx = _mm256_castsi256_pd(idx).as_f64x4(); - transmute(simd_select_bitmask(k, permute, idx)) +pub fn _mm256_mask2_permutex2var_pd(a: __m256d, idx: __m256i, k: __mmask8, b: __m256d) -> __m256d { + unsafe { + let permute = _mm256_permutex2var_pd(a, idx, b).as_f64x4(); + let idx = _mm256_castsi256_pd(idx).as_f64x4(); + transmute(simd_select_bitmask(k, permute, idx)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. 
@@ -22511,8 +23759,8 @@ pub unsafe fn _mm256_mask2_permutex2var_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub unsafe fn _mm_permutex2var_pd(a: __m128d, idx: __m128i, b: __m128d) -> __m128d { - transmute(vpermi2pd128(a.as_f64x2(), idx.as_i64x2(), b.as_f64x2())) +pub fn _mm_permutex2var_pd(a: __m128d, idx: __m128i, b: __m128d) -> __m128d { + unsafe { transmute(vpermi2pd128(a.as_f64x2(), idx.as_i64x2(), b.as_f64x2())) } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -22522,14 +23770,11 @@ pub unsafe fn _mm_permutex2var_pd(a: __m128d, idx: __m128i, b: __m128d) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2pd))] -pub unsafe fn _mm_mask_permutex2var_pd( - a: __m128d, - k: __mmask8, - idx: __m128i, - b: __m128d, -) -> __m128d { - let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); - transmute(simd_select_bitmask(k, permute, a.as_f64x2())) +pub fn _mm_mask_permutex2var_pd(a: __m128d, k: __mmask8, idx: __m128i, b: __m128d) -> __m128d { + unsafe { + let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, a.as_f64x2())) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -22539,14 +23784,11 @@ pub unsafe fn _mm_mask_permutex2var_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd -pub unsafe fn _mm_maskz_permutex2var_pd( - k: __mmask8, - a: __m128d, - idx: __m128i, - b: __m128d, -) -> __m128d { - let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); - transmute(simd_select_bitmask(k, permute, f64x2::ZERO)) +pub fn _mm_maskz_permutex2var_pd(k: __mmask8, a: __m128d, idx: __m128i, b: __m128d) -> __m128d { + unsafe { + let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); + transmute(simd_select_bitmask(k, permute, f64x2::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set) @@ -22556,15 +23798,12 @@ pub unsafe fn _mm_maskz_permutex2var_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd -pub unsafe fn _mm_mask2_permutex2var_pd( - a: __m128d, - idx: __m128i, - k: __mmask8, - b: __m128d, -) -> __m128d { - let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); - let idx = _mm_castsi128_pd(idx).as_f64x2(); - transmute(simd_select_bitmask(k, permute, idx)) +pub fn _mm_mask2_permutex2var_pd(a: __m128d, idx: __m128i, k: __mmask8, b: __m128d) -> __m128d { + unsafe { + let permute = _mm_permutex2var_pd(a, idx, b).as_f64x2(); + let idx = _mm_castsi128_pd(idx).as_f64x2(); + transmute(simd_select_bitmask(k, permute, idx)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst. 
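For context on the permutex2var family as a whole (again an aside, not part of the patch): the two sources act as a single 32-entry table, so one safe call can pick each output lane from either input. A sketch under the same nightly assumptions; `gather_from_pair` is a made-up helper name:

use core::arch::x86_64::*;

// dst lane i takes lo[idx[i] & 15] when bit 4 of idx[i] is clear, otherwise
// hi[idx[i] & 15]; only the low five bits of each index are consulted.
#[target_feature(enable = "avx512f")]
fn gather_from_pair(lo: __m512, idx: __m512i, hi: __m512) -> __m512 {
    _mm512_permutex2var_ps(lo, idx, hi)
}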
@@ -22575,31 +23814,33 @@ pub unsafe fn _mm_mask2_permutex2var_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufps, MASK = 9))] //should be vpshufd #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_shuffle_epi32(a: __m512i) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let r: i32x16 = simd_shuffle!( - a.as_i32x16(), - a.as_i32x16(), - [ - MASK as u32 & 0b11, - (MASK as u32 >> 2) & 0b11, - (MASK as u32 >> 4) & 0b11, - (MASK as u32 >> 6) & 0b11, - (MASK as u32 & 0b11) + 4, - ((MASK as u32 >> 2) & 0b11) + 4, - ((MASK as u32 >> 4) & 0b11) + 4, - ((MASK as u32 >> 6) & 0b11) + 4, - (MASK as u32 & 0b11) + 8, - ((MASK as u32 >> 2) & 0b11) + 8, - ((MASK as u32 >> 4) & 0b11) + 8, - ((MASK as u32 >> 6) & 0b11) + 8, - (MASK as u32 & 0b11) + 12, - ((MASK as u32 >> 2) & 0b11) + 12, - ((MASK as u32 >> 4) & 0b11) + 12, - ((MASK as u32 >> 6) & 0b11) + 12, - ], - ); - transmute(r) +pub fn _mm512_shuffle_epi32(a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r: i32x16 = simd_shuffle!( + a.as_i32x16(), + a.as_i32x16(), + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + (MASK as u32 >> 4) & 0b11, + (MASK as u32 >> 6) & 0b11, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 4, + ((MASK as u32 >> 6) & 0b11) + 4, + (MASK as u32 & 0b11) + 8, + ((MASK as u32 >> 2) & 0b11) + 8, + ((MASK as u32 >> 4) & 0b11) + 8, + ((MASK as u32 >> 6) & 0b11) + 8, + (MASK as u32 & 0b11) + 12, + ((MASK as u32 >> 2) & 0b11) + 12, + ((MASK as u32 >> 4) & 0b11) + 12, + ((MASK as u32 >> 6) & 0b11) + 12, + ], + ); + transmute(r) + } } /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22610,14 +23851,16 @@ pub unsafe fn _mm512_shuffle_epi32(a: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_shuffle_epi32( +pub fn _mm512_mask_shuffle_epi32( src: __m512i, k: __mmask16, a: __m512i, ) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + } } /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -22628,13 +23871,12 @@ pub unsafe fn _mm512_mask_shuffle_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_shuffle_epi32( - k: __mmask16, - a: __m512i, -) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) +pub fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) + } } /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22645,14 +23887,16 @@ pub unsafe fn _mm512_maskz_shuffle_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_shuffle_epi32( +pub fn _mm256_mask_shuffle_epi32( src: __m256i, k: __mmask8, a: __m256i, ) -> __m256i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + } } /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22663,13 +23907,12 @@ pub unsafe fn _mm256_mask_shuffle_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_shuffle_epi32( - k: __mmask8, - a: __m256i, -) -> __m256i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) +pub fn _mm256_maskz_shuffle_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) + } } /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22680,14 +23923,16 @@ pub unsafe fn _mm256_maskz_shuffle_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_mask_shuffle_epi32( +pub fn _mm_mask_shuffle_epi32( src: __m128i, k: __mmask8, a: __m128i, ) -> __m128i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm_shuffle_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } } /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -22698,13 +23943,12 @@ pub unsafe fn _mm_mask_shuffle_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshufd, MASK = 9))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_maskz_shuffle_epi32( - k: __mmask8, - a: __m128i, -) -> __m128i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm_shuffle_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) +pub fn _mm_maskz_shuffle_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst. @@ -22715,30 +23959,32 @@ pub unsafe fn _mm_maskz_shuffle_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shuffle_ps(a: __m512, b: __m512) -> __m512 { - static_assert_uimm_bits!(MASK, 8); - simd_shuffle!( - a, - b, - [ - MASK as u32 & 0b11, - (MASK as u32 >> 2) & 0b11, - ((MASK as u32 >> 4) & 0b11) + 16, - ((MASK as u32 >> 6) & 0b11) + 16, - (MASK as u32 & 0b11) + 4, - ((MASK as u32 >> 2) & 0b11) + 4, - ((MASK as u32 >> 4) & 0b11) + 20, - ((MASK as u32 >> 6) & 0b11) + 20, - (MASK as u32 & 0b11) + 8, - ((MASK as u32 >> 2) & 0b11) + 8, - ((MASK as u32 >> 4) & 0b11) + 24, - ((MASK as u32 >> 6) & 0b11) + 24, - (MASK as u32 & 0b11) + 12, - ((MASK as u32 >> 2) & 0b11) + 12, - ((MASK as u32 >> 4) & 0b11) + 28, - ((MASK as u32 >> 6) & 0b11) + 28, - ], - ) +pub fn _mm512_shuffle_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + simd_shuffle!( + a, + b, + [ + MASK as u32 & 0b11, + (MASK as u32 >> 2) & 0b11, + ((MASK as u32 >> 4) & 0b11) + 16, + ((MASK as u32 >> 6) & 0b11) + 16, + (MASK as u32 & 0b11) + 4, + ((MASK as u32 >> 2) & 0b11) + 4, + ((MASK as u32 >> 4) & 0b11) + 20, + ((MASK as u32 >> 6) & 0b11) + 20, + (MASK as u32 & 0b11) + 8, + ((MASK as u32 >> 2) & 0b11) + 8, + ((MASK as u32 >> 4) & 0b11) + 24, + ((MASK as u32 >> 6) & 0b11) + 24, + (MASK as u32 & 0b11) + 12, + ((MASK as u32 >> 2) & 0b11) + 12, + ((MASK as u32 >> 4) & 0b11) + 28, + ((MASK as u32 >> 6) & 0b11) + 28, + ], + ) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22749,15 +23995,17 @@ pub unsafe fn _mm512_shuffle_ps(a: __m512, b: __m512) -> __m512 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shuffle_ps( +pub fn _mm512_mask_shuffle_ps( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_ps::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_ps::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
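A similar sketch for the `vshufps`-style intrinsics, under the same assumptions as above (illustrative only, hypothetical helper names):

use core::arch::x86_64::*;

// 0b10_00_10_00 (i.e. _MM_SHUFFLE(2, 0, 2, 0)): per 128-bit lane, take elements
// 0 and 2 of `a`, then elements 0 and 2 of `b`.
#[target_feature(enable = "avx512f")]
pub fn even_elements(a: __m512, b: __m512) -> __m512 {
    _mm512_shuffle_ps::<0b10_00_10_00>(a, b)
}

// Write-masked form: unselected elements are copied from `src`.
#[target_feature(enable = "avx512f")]
pub fn even_elements_masked(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    _mm512_mask_shuffle_ps::<0b10_00_10_00>(src, k, a, b)
}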
@@ -22767,15 +24015,13 @@ pub unsafe fn _mm512_mask_shuffle_ps( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shuffle_ps( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_ps::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) +#[rustc_legacy_const_generics(3)] +pub fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_ps::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22786,15 +24032,17 @@ pub unsafe fn _mm512_maskz_shuffle_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shuffle_ps( +pub fn _mm256_mask_shuffle_ps( src: __m256, k: __mmask8, a: __m256, b: __m256, ) -> __m256 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_ps::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_ps::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22805,14 +24053,12 @@ pub unsafe fn _mm256_mask_shuffle_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shuffle_ps( - k: __mmask8, - a: __m256, - b: __m256, -) -> __m256 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_ps::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) +pub fn _mm256_maskz_shuffle_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_ps::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) + } } /// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -22823,15 +24069,17 @@ pub unsafe fn _mm256_maskz_shuffle_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_shuffle_ps( +pub fn _mm_mask_shuffle_ps( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm_shuffle_ps::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_ps::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) + } } /// Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22842,10 +24090,12 @@ pub unsafe fn _mm_mask_shuffle_ps( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufps, MASK = 3))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_shuffle_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm_shuffle_ps::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) +pub fn _mm_maskz_shuffle_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_ps::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst. @@ -22856,22 +24106,24 @@ pub unsafe fn _mm_maskz_shuffle_ps(k: __mmask8, a: __m128, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shuffle_pd(a: __m512d, b: __m512d) -> __m512d { - static_assert_uimm_bits!(MASK, 8); - simd_shuffle!( - a, - b, - [ - MASK as u32 & 0b1, - ((MASK as u32 >> 1) & 0b1) + 8, - ((MASK as u32 >> 2) & 0b1) + 2, - ((MASK as u32 >> 3) & 0b1) + 10, - ((MASK as u32 >> 4) & 0b1) + 4, - ((MASK as u32 >> 5) & 0b1) + 12, - ((MASK as u32 >> 6) & 0b1) + 6, - ((MASK as u32 >> 7) & 0b1) + 14, - ], - ) +pub fn _mm512_shuffle_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + simd_shuffle!( + a, + b, + [ + MASK as u32 & 0b1, + ((MASK as u32 >> 1) & 0b1) + 8, + ((MASK as u32 >> 2) & 0b1) + 2, + ((MASK as u32 >> 3) & 0b1) + 10, + ((MASK as u32 >> 4) & 0b1) + 4, + ((MASK as u32 >> 5) & 0b1) + 12, + ((MASK as u32 >> 6) & 0b1) + 6, + ((MASK as u32 >> 7) & 0b1) + 14, + ], + ) + } } /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -22882,15 +24134,17 @@ pub unsafe fn _mm512_shuffle_pd(a: __m512d, b: __m512d) -> __m5 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shuffle_pd( +pub fn _mm512_mask_shuffle_pd( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + } } /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22901,14 +24155,12 @@ pub unsafe fn _mm512_mask_shuffle_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shuffle_pd( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) +pub fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22919,15 +24171,17 @@ pub unsafe fn _mm512_maskz_shuffle_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shuffle_pd( +pub fn _mm256_mask_shuffle_pd( src: __m256d, k: __mmask8, a: __m256d, b: __m256d, ) -> __m256d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + } } /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -22938,14 +24192,12 @@ pub unsafe fn _mm256_mask_shuffle_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufpd, MASK = 3))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shuffle_pd( - k: __mmask8, - a: __m256d, - b: __m256d, -) -> __m256d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) +pub fn _mm256_maskz_shuffle_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) + } } /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22956,15 +24208,17 @@ pub unsafe fn _mm256_maskz_shuffle_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufpd, MASK = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_shuffle_pd( +pub fn _mm_mask_shuffle_pd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x2(), src.as_f64x2())) + } } /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -22975,14 +24229,12 @@ pub unsafe fn _mm_mask_shuffle_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufpd, MASK = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_shuffle_pd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm_shuffle_pd::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO)) +pub fn _mm_maskz_shuffle_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm_shuffle_pd::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x2(), f64x2::ZERO)) + } } /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst. 
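A sketch for the `vshufpd` family, same assumptions as the earlier examples; note that the 128/256-bit masked forms additionally require AVX512VL:

use core::arch::x86_64::*;

// Each mask bit picks the low (0) or high (1) element of the corresponding source
// pair; all-zero reproduces an unpacklo within every 128-bit lane.
#[target_feature(enable = "avx512f")]
pub fn interleave_low_pd(a: __m512d, b: __m512d) -> __m512d {
    _mm512_shuffle_pd::<0b0000_0000>(a, b)
}

// 128-bit zero-masked form: elements with a clear mask bit are zeroed.
#[target_feature(enable = "avx512f,avx512vl")]
pub fn shuffle_pd_maskz(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    _mm_maskz_shuffle_pd::<0b01>(k, a, b)
}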
@@ -22993,33 +24245,35 @@ pub unsafe fn _mm_maskz_shuffle_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_01_01_01))] //should be vshufi32x4 #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let r: i32x16 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b11) * 4 + 0, - (MASK as u32 & 0b11) * 4 + 1, - (MASK as u32 & 0b11) * 4 + 2, - (MASK as u32 & 0b11) * 4 + 3, - ((MASK as u32 >> 2) & 0b11) * 4 + 0, - ((MASK as u32 >> 2) & 0b11) * 4 + 1, - ((MASK as u32 >> 2) & 0b11) * 4 + 2, - ((MASK as u32 >> 2) & 0b11) * 4 + 3, - ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, - ], - ); - transmute(r) +pub fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let r: i32x16 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); + transmute(r) + } } /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23030,15 +24284,17 @@ pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shuffle_i32x4( +pub fn _mm512_mask_shuffle_i32x4( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_i32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_i32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + } } /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -23049,14 +24305,16 @@ pub unsafe fn _mm512_mask_shuffle_i32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shuffle_i32x4( +pub fn _mm512_maskz_shuffle_i32x4( k: __mmask16, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_i32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_i32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) + } } /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst. @@ -23067,25 +24325,27 @@ pub unsafe fn _mm512_maskz_shuffle_i32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm, MASK = 0b11))] //should be vshufi32x4 #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r: i32x8 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b1) * 4 + 0, - (MASK as u32 & 0b1) * 4 + 1, - (MASK as u32 & 0b1) * 4 + 2, - (MASK as u32 & 0b1) * 4 + 3, - ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, - ], - ); - transmute(r) +pub fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r: i32x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); + transmute(r) + } } /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23096,15 +24356,17 @@ pub unsafe fn _mm256_shuffle_i32x4(a: __m256i, b: __m256i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shuffle_i32x4( +pub fn _mm256_mask_shuffle_i32x4( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_i32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_i32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + } } /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
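A sketch for the 128-bit-lane integer shuffles (`vshufi32x4`), same assumptions as above:

use core::arch::x86_64::*;

// Two selector bits per output lane: result lanes 0-1 come from `a`, lanes 2-3 from `b`.
// 0b11_10_01_00 keeps a's lanes 0,1 and b's lanes 2,3 in place.
#[target_feature(enable = "avx512f")]
pub fn mix_128bit_lanes(a: __m512i, b: __m512i) -> __m512i {
    _mm512_shuffle_i32x4::<0b11_10_01_00>(a, b)
}

// Zero-masked form: 32-bit elements whose mask bit is clear are zeroed.
#[target_feature(enable = "avx512f")]
pub fn mix_128bit_lanes_maskz(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    _mm512_maskz_shuffle_i32x4::<0b11_10_01_00>(k, a, b)
}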
@@ -23115,14 +24377,12 @@ pub unsafe fn _mm256_mask_shuffle_i32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shuffle_i32x4( - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_i32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) +pub fn _mm256_maskz_shuffle_i32x4(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_i32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) + } } /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst. @@ -23133,25 +24393,27 @@ pub unsafe fn _mm256_maskz_shuffle_i32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r: i64x8 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b11) * 2 + 0, - (MASK as u32 & 0b11) * 2 + 1, - ((MASK as u32 >> 2) & 0b11) * 2 + 0, - ((MASK as u32 >> 2) & 0b11) * 2 + 1, - ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, - ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, - ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, - ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, - ], - ); - transmute(r) +pub fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let r: i64x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) + } } /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23162,15 +24424,17 @@ pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shuffle_i64x2( +pub fn _mm512_mask_shuffle_i64x2( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_i64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_i64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + } } /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -23181,14 +24445,12 @@ pub unsafe fn _mm512_mask_shuffle_i64x2( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shuffle_i64x2( - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_i64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) +pub fn _mm512_maskz_shuffle_i64x2(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_i64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) + } } /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst. @@ -23199,21 +24461,23 @@ pub unsafe fn _mm512_maskz_shuffle_i64x2( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshufi64x2 #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r: i64x4 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b1) * 2 + 0, - (MASK as u32 & 0b1) * 2 + 1, - ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, - ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, - ], - ); - transmute(r) +pub fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let r: i64x4 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); + transmute(r) + } } /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23224,15 +24488,17 @@ pub unsafe fn _mm256_shuffle_i64x2(a: __m256i, b: __m256i) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shuffle_i64x2( +pub fn _mm256_mask_shuffle_i64x2( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_i64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_i64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + } } /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
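A sketch for `vshufi64x2`, same assumptions; the 256-bit variant also needs AVX512VL:

use core::arch::x86_64::*;

// 0b01_00_11_10: result lanes 0,1 take a's lanes 2,3; result lanes 2,3 take b's lanes 0,1.
#[target_feature(enable = "avx512f")]
pub fn swap_halves_from_each(a: __m512i, b: __m512i) -> __m512i {
    _mm512_shuffle_i64x2::<0b01_00_11_10>(a, b)
}

// 256-bit variant: bit 0 picks a's 128-bit half, bit 1 picks b's half.
// 0b01 yields a's high half followed by b's low half.
#[target_feature(enable = "avx512f,avx512vl")]
pub fn high_of_a_low_of_b(a: __m256i, b: __m256i) -> __m256i {
    _mm256_shuffle_i64x2::<0b01>(a, b)
}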
@@ -23243,14 +24509,12 @@ pub unsafe fn _mm256_mask_shuffle_i64x2( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shuffle_i64x2( - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_i64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) +pub fn _mm256_maskz_shuffle_i64x2(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_i64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) + } } /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. @@ -23261,33 +24525,35 @@ pub unsafe fn _mm256_maskz_shuffle_i64x2( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b1011))] //should be vshuff32x4, but generate vshuff64x2 #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m512 { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r: f32x16 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b11) * 4 + 0, - (MASK as u32 & 0b11) * 4 + 1, - (MASK as u32 & 0b11) * 4 + 2, - (MASK as u32 & 0b11) * 4 + 3, - ((MASK as u32 >> 2) & 0b11) * 4 + 0, - ((MASK as u32 >> 2) & 0b11) * 4 + 1, - ((MASK as u32 >> 2) & 0b11) * 4 + 2, - ((MASK as u32 >> 2) & 0b11) * 4 + 3, - ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, - ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, - ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, - ], - ); - transmute(r) +pub fn _mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r: f32x16 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 4 + 0, + (MASK as u32 & 0b11) * 4 + 1, + (MASK as u32 & 0b11) * 4 + 2, + (MASK as u32 & 0b11) * 4 + 3, + ((MASK as u32 >> 2) & 0b11) * 4 + 0, + ((MASK as u32 >> 2) & 0b11) * 4 + 1, + ((MASK as u32 >> 2) & 0b11) * 4 + 2, + ((MASK as u32 >> 2) & 0b11) * 4 + 3, + ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16, + ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16, + ], + ); + transmute(r) + } } /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -23298,15 +24564,17 @@ pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shuffle_f32x4( +pub fn _mm512_mask_shuffle_f32x4( src: __m512, k: __mmask16, a: __m512, b: __m512, ) -> __m512 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_f32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + } } /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23317,14 +24585,12 @@ pub unsafe fn _mm512_mask_shuffle_f32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shuffle_f32x4( - k: __mmask16, - a: __m512, - b: __m512, -) -> __m512 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_f32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) +pub fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) + } } /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. @@ -23335,25 +24601,27 @@ pub unsafe fn _mm512_maskz_shuffle_f32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff32x4 #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256) -> __m256 { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let r: f32x8 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b1) * 4 + 0, - (MASK as u32 & 0b1) * 4 + 1, - (MASK as u32 & 0b1) * 4 + 2, - (MASK as u32 & 0b1) * 4 + 3, - ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, - ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, - ], - ); - transmute(r) +pub fn _mm256_shuffle_f32x4(a: __m256, b: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let r: f32x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 4 + 0, + (MASK as u32 & 0b1) * 4 + 1, + (MASK as u32 & 0b1) * 4 + 2, + (MASK as u32 & 0b1) * 4 + 3, + ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8, + ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8, + ], + ); + transmute(r) + } } /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -23364,15 +24632,17 @@ pub unsafe fn _mm256_shuffle_f32x4(a: __m256, b: __m256) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shuffle_f32x4( +pub fn _mm256_mask_shuffle_f32x4( src: __m256, k: __mmask8, a: __m256, b: __m256, ) -> __m256 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_f32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + } } /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23383,14 +24653,12 @@ pub unsafe fn _mm256_mask_shuffle_f32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shuffle_f32x4( - k: __mmask8, - a: __m256, - b: __m256, -) -> __m256 { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_f32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) +pub fn _mm256_maskz_shuffle_f32x4(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) + } } /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. @@ -23401,25 +24669,27 @@ pub unsafe fn _mm256_maskz_shuffle_f32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d) -> __m512d { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r: f64x8 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b11) * 2 + 0, - (MASK as u32 & 0b11) * 2 + 1, - ((MASK as u32 >> 2) & 0b11) * 2 + 0, - ((MASK as u32 >> 2) & 0b11) * 2 + 1, - ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, - ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, - ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, - ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, - ], - ); - transmute(r) +pub fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r: f64x8 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b11) * 2 + 0, + (MASK as u32 & 0b11) * 2 + 1, + ((MASK as u32 >> 2) & 0b11) * 2 + 0, + ((MASK as u32 >> 2) & 0b11) * 2 + 1, + ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8, + ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8, + ], + ); + transmute(r) + } } /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
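A sketch for `vshuff32x4`, same assumptions as the earlier examples:

use core::arch::x86_64::*;

// 0b00_00_00_00: result lanes 0,1 repeat a's 128-bit lane 0; lanes 2,3 repeat b's lane 0.
#[target_feature(enable = "avx512f")]
pub fn broadcast_lane0_of_each(a: __m512, b: __m512) -> __m512 {
    _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b)
}

// Zero-masked form of the same shuffle.
#[target_feature(enable = "avx512f")]
pub fn broadcast_lane0_maskz(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(k, a, b)
}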
@@ -23430,15 +24700,17 @@ pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shuffle_f64x2( +pub fn _mm512_mask_shuffle_f64x2( src: __m512d, k: __mmask8, a: __m512d, b: __m512d, ) -> __m512d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_f64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + } } /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23449,14 +24721,12 @@ pub unsafe fn _mm512_mask_shuffle_f64x2( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shuffle_f64x2( - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm512_shuffle_f64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) +pub fn _mm512_maskz_shuffle_f64x2(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm512_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) + } } /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst. @@ -23467,21 +24737,23 @@ pub unsafe fn _mm512_maskz_shuffle_f64x2( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff64x2 #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d) -> __m256d { - static_assert_uimm_bits!(MASK, 8); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let r: f64x4 = simd_shuffle!( - a, - b, - [ - (MASK as u32 & 0b1) * 2 + 0, - (MASK as u32 & 0b1) * 2 + 1, - ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, - ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, - ], - ); - transmute(r) +pub fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let r: f64x4 = simd_shuffle!( + a, + b, + [ + (MASK as u32 & 0b1) * 2 + 0, + (MASK as u32 & 0b1) * 2 + 1, + ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4, + ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4, + ], + ); + transmute(r) + } } /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
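A sketch for `vshuff64x2`, same assumptions:

use core::arch::x86_64::*;

// 0b01_00_01_00: result lanes 0,1 are a's lanes 0,1; result lanes 2,3 are b's lanes 0,1.
#[target_feature(enable = "avx512f")]
pub fn keep_low_lanes_pd(a: __m512d, b: __m512d) -> __m512d {
    _mm512_shuffle_f64x2::<0b01_00_01_00>(a, b)
}

// Write-masked form: unselected 64-bit elements are copied from `src`.
#[target_feature(enable = "avx512f")]
pub fn keep_low_lanes_pd_masked(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    _mm512_mask_shuffle_f64x2::<0b01_00_01_00>(src, k, a, b)
}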
@@ -23492,15 +24764,17 @@ pub unsafe fn _mm256_shuffle_f64x2(a: __m256d, b: __m256d) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shuffle_f64x2( +pub fn _mm256_mask_shuffle_f64x2( src: __m256d, k: __mmask8, a: __m256d, b: __m256d, ) -> __m256d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_f64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + } } /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23511,14 +24785,12 @@ pub unsafe fn _mm256_mask_shuffle_f64x2( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shuffle_f64x2( - k: __mmask8, - a: __m256d, - b: __m256d, -) -> __m256d { - static_assert_uimm_bits!(MASK, 8); - let r = _mm256_shuffle_f64x2::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) +pub fn _mm256_maskz_shuffle_f64x2(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + static_assert_uimm_bits!(MASK, 8); + let r = _mm256_shuffle_f64x2::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) + } } /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. @@ -23532,13 +24804,15 @@ pub unsafe fn _mm256_maskz_shuffle_f64x2( assert_instr(vextractf32x4, IMM8 = 3) )] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_extractf32x4_ps(a: __m512) -> __m128 { - static_assert_uimm_bits!(IMM8, 2); - match IMM8 & 0x3 { - 0 => simd_shuffle!(a, _mm512_undefined_ps(), [0, 1, 2, 3]), - 1 => simd_shuffle!(a, _mm512_undefined_ps(), [4, 5, 6, 7]), - 2 => simd_shuffle!(a, _mm512_undefined_ps(), [8, 9, 10, 11]), - _ => simd_shuffle!(a, _mm512_undefined_ps(), [12, 13, 14, 15]), +pub fn _mm512_extractf32x4_ps(a: __m512) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + match IMM8 & 0x3 { + 0 => simd_shuffle!(a, _mm512_undefined_ps(), [0, 1, 2, 3]), + 1 => simd_shuffle!(a, _mm512_undefined_ps(), [4, 5, 6, 7]), + 2 => simd_shuffle!(a, _mm512_undefined_ps(), [8, 9, 10, 11]), + _ => simd_shuffle!(a, _mm512_undefined_ps(), [12, 13, 14, 15]), + } } } @@ -23553,14 +24827,12 @@ pub unsafe fn _mm512_extractf32x4_ps(a: __m512) -> __m128 { assert_instr(vextractf32x4, IMM8 = 3) )] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_extractf32x4_ps( - src: __m128, - k: __mmask8, - a: __m512, -) -> __m128 { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_extractf32x4_ps::(a); - transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) +pub fn _mm512_mask_extractf32x4_ps(src: __m128, k: __mmask8, a: __m512) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) + } } /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit 
is not set). @@ -23574,10 +24846,12 @@ pub unsafe fn _mm512_mask_extractf32x4_ps( assert_instr(vextractf32x4, IMM8 = 3) )] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_extractf32x4_ps(k: __mmask8, a: __m512) -> __m128 { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_extractf32x4_ps::(a); - transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) +pub fn _mm512_maskz_extractf32x4_ps(k: __mmask8, a: __m512) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) + } } /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst. @@ -23591,11 +24865,13 @@ pub unsafe fn _mm512_maskz_extractf32x4_ps(k: __mmask8, a: __m5 assert_instr(vextract, IMM8 = 1) //should be vextractf32x4 )] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_extractf32x4_ps(a: __m256) -> __m128 { - static_assert_uimm_bits!(IMM8, 1); - match IMM8 & 0x1 { - 0 => simd_shuffle!(a, _mm256_undefined_ps(), [0, 1, 2, 3]), - _ => simd_shuffle!(a, _mm256_undefined_ps(), [4, 5, 6, 7]), +pub fn _mm256_extractf32x4_ps(a: __m256) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 0x1 { + 0 => simd_shuffle!(a, _mm256_undefined_ps(), [0, 1, 2, 3]), + _ => simd_shuffle!(a, _mm256_undefined_ps(), [4, 5, 6, 7]), + } } } @@ -23610,14 +24886,12 @@ pub unsafe fn _mm256_extractf32x4_ps(a: __m256) -> __m128 { assert_instr(vextractf32x4, IMM8 = 1) )] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_extractf32x4_ps( - src: __m128, - k: __mmask8, - a: __m256, -) -> __m128 { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_extractf32x4_ps::(a); - transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) +pub fn _mm256_mask_extractf32x4_ps(src: __m128, k: __mmask8, a: __m256) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4())) + } } /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23631,10 +24905,12 @@ pub unsafe fn _mm256_mask_extractf32x4_ps( assert_instr(vextractf32x4, IMM8 = 1) )] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_extractf32x4_ps(k: __mmask8, a: __m256) -> __m128 { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_extractf32x4_ps::(a); - transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) +pub fn _mm256_maskz_extractf32x4_ps(k: __mmask8, a: __m256) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_extractf32x4_ps::(a); + transmute(simd_select_bitmask(k, r.as_f32x4(), f32x4::ZERO)) + } } /// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the result in dst. 
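A sketch for the 128-bit float extracts (`vextractf32x4`), same assumptions as above:

use core::arch::x86_64::*;

// IMM8 = 2 selects 32-bit elements 8..12 of `a` (the third 128-bit lane).
#[target_feature(enable = "avx512f")]
pub fn third_quarter(a: __m512) -> __m128 {
    _mm512_extractf32x4_ps::<2>(a)
}

// Write-masked form; only the low four mask bits matter for a 128-bit destination.
#[target_feature(enable = "avx512f")]
pub fn third_quarter_masked(src: __m128, k: __mmask8, a: __m512) -> __m128 {
    _mm512_mask_extractf32x4_ps::<2>(src, k, a)
}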
@@ -23648,11 +24924,13 @@ pub unsafe fn _mm256_maskz_extractf32x4_ps(k: __mmask8, a: __m2 assert_instr(vextractf64x4, IMM1 = 1) //should be vextracti64x4 )] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_extracti64x4_epi64(a: __m512i) -> __m256i { - static_assert_uimm_bits!(IMM1, 1); - match IMM1 { - 0 => simd_shuffle!(a, _mm512_setzero_si512(), [0, 1, 2, 3]), - _ => simd_shuffle!(a, _mm512_setzero_si512(), [4, 5, 6, 7]), +pub fn _mm512_extracti64x4_epi64(a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + match IMM1 { + 0 => simd_shuffle!(a, _mm512_setzero_si512(), [0, 1, 2, 3]), + _ => simd_shuffle!(a, _mm512_setzero_si512(), [4, 5, 6, 7]), + } } } @@ -23667,14 +24945,16 @@ pub unsafe fn _mm512_extracti64x4_epi64(a: __m512i) -> __m256i assert_instr(vextracti64x4, IMM1 = 1) )] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_extracti64x4_epi64( +pub fn _mm512_mask_extracti64x4_epi64( src: __m256i, k: __mmask8, a: __m512i, ) -> __m256i { - static_assert_uimm_bits!(IMM1, 1); - let r = _mm512_extracti64x4_epi64::(a); - transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm512_extracti64x4_epi64::(a); + transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + } } /// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23688,10 +24968,12 @@ pub unsafe fn _mm512_mask_extracti64x4_epi64( assert_instr(vextracti64x4, IMM1 = 1) )] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_extracti64x4_epi64(k: __mmask8, a: __m512i) -> __m256i { - static_assert_uimm_bits!(IMM1, 1); - let r = _mm512_extracti64x4_epi64::(a); - transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) +pub fn _mm512_maskz_extracti64x4_epi64(k: __mmask8, a: __m512i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm512_extracti64x4_epi64::(a); + transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) + } } /// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst. 
@@ -23705,11 +24987,13 @@ pub unsafe fn _mm512_maskz_extracti64x4_epi64(k: __mmask8, a: _ assert_instr(vextractf64x4, IMM8 = 1) )] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_extractf64x4_pd(a: __m512d) -> __m256d { - static_assert_uimm_bits!(IMM8, 1); - match IMM8 & 0x1 { - 0 => simd_shuffle!(a, _mm512_undefined_pd(), [0, 1, 2, 3]), - _ => simd_shuffle!(a, _mm512_undefined_pd(), [4, 5, 6, 7]), +pub fn _mm512_extractf64x4_pd(a: __m512d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + match IMM8 & 0x1 { + 0 => simd_shuffle!(a, _mm512_undefined_pd(), [0, 1, 2, 3]), + _ => simd_shuffle!(a, _mm512_undefined_pd(), [4, 5, 6, 7]), + } } } @@ -23724,14 +25008,16 @@ pub unsafe fn _mm512_extractf64x4_pd(a: __m512d) -> __m256d { assert_instr(vextractf64x4, IMM8 = 1) )] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_extractf64x4_pd( +pub fn _mm512_mask_extractf64x4_pd( src: __m256d, k: __mmask8, a: __m512d, ) -> __m256d { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm512_extractf64x4_pd::(a); - transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_extractf64x4_pd::(a); + transmute(simd_select_bitmask(k, r.as_f64x4(), src.as_f64x4())) + } } /// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23745,10 +25031,12 @@ pub unsafe fn _mm512_mask_extractf64x4_pd( assert_instr(vextractf64x4, IMM8 = 1) )] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_extractf64x4_pd(k: __mmask8, a: __m512d) -> __m256d { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm512_extractf64x4_pd::(a); - transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) +pub fn _mm512_maskz_extractf64x4_pd(k: __mmask8, a: __m512d) -> __m256d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_extractf64x4_pd::(a); + transmute(simd_select_bitmask(k, r.as_f64x4(), f64x4::ZERO)) + } } /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the result in dst. @@ -23762,17 +25050,19 @@ pub unsafe fn _mm512_maskz_extractf64x4_pd(k: __mmask8, a: __m5 assert_instr(vextractf32x4, IMM2 = 3) //should be vextracti32x4 )] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm512_extracti32x4_epi32(a: __m512i) -> __m128i { - static_assert_uimm_bits!(IMM2, 2); - let a = a.as_i32x16(); - let zero = i32x16::ZERO; - let extract: i32x4 = match IMM2 { - 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), - 1 => simd_shuffle!(a, zero, [4, 5, 6, 7]), - 2 => simd_shuffle!(a, zero, [8, 9, 10, 11]), - _ => simd_shuffle!(a, zero, [12, 13, 14, 15]), - }; - transmute(extract) +pub fn _mm512_extracti32x4_epi32(a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let a = a.as_i32x16(); + let zero = i32x16::ZERO; + let extract: i32x4 = match IMM2 { + 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), + 1 => simd_shuffle!(a, zero, [4, 5, 6, 7]), + 2 => simd_shuffle!(a, zero, [8, 9, 10, 11]), + _ => simd_shuffle!(a, zero, [12, 13, 14, 15]), + }; + transmute(extract) + } } /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
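A sketch for the 256-bit extracts (`vextracti64x4` / `vextractf64x4`), same assumptions:

use core::arch::x86_64::*;

// IMM1 = 1 selects 64-bit elements 4..8, i.e. the upper 256 bits of `a`.
#[target_feature(enable = "avx512f")]
pub fn upper_half_epi64(a: __m512i) -> __m256i {
    _mm512_extracti64x4_epi64::<1>(a)
}

// Zero-masked floating-point form: elements with a clear mask bit come back as 0.0.
#[target_feature(enable = "avx512f")]
pub fn upper_half_pd_maskz(k: __mmask8, a: __m512d) -> __m256d {
    _mm512_maskz_extractf64x4_pd::<1>(k, a)
}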
@@ -23786,14 +25076,16 @@ pub unsafe fn _mm512_extracti32x4_epi32(a: __m512i) -> __m128i assert_instr(vextracti32x4, IMM2 = 3) )] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_mask_extracti32x4_epi32( +pub fn _mm512_mask_extracti32x4_epi32( src: __m128i, k: __mmask8, a: __m512i, ) -> __m128i { - static_assert_uimm_bits!(IMM2, 2); - let r = _mm512_extracti32x4_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let r = _mm512_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } } /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM2, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23807,10 +25099,12 @@ pub unsafe fn _mm512_mask_extracti32x4_epi32( assert_instr(vextracti32x4, IMM2 = 3) )] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_maskz_extracti32x4_epi32(k: __mmask8, a: __m512i) -> __m128i { - static_assert_uimm_bits!(IMM2, 2); - let r = _mm512_extracti32x4_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) +pub fn _mm512_maskz_extracti32x4_epi32(k: __mmask8, a: __m512i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM2, 2); + let r = _mm512_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } } /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the result in dst. @@ -23824,15 +25118,17 @@ pub unsafe fn _mm512_maskz_extracti32x4_epi32(k: __mmask8, a: _ assert_instr(vextract, IMM1 = 1) //should be vextracti32x4 )] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm256_extracti32x4_epi32(a: __m256i) -> __m128i { - static_assert_uimm_bits!(IMM1, 1); - let a = a.as_i32x8(); - let zero = i32x8::ZERO; - let extract: i32x4 = match IMM1 { - 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), - _ => simd_shuffle!(a, zero, [4, 5, 6, 7]), - }; - transmute(extract) +pub fn _mm256_extracti32x4_epi32(a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let a = a.as_i32x8(); + let zero = i32x8::ZERO; + let extract: i32x4 = match IMM1 { + 0 => simd_shuffle!(a, zero, [0, 1, 2, 3]), + _ => simd_shuffle!(a, zero, [4, 5, 6, 7]), + }; + transmute(extract) + } } /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23846,14 +25142,16 @@ pub unsafe fn _mm256_extracti32x4_epi32(a: __m256i) -> __m128i assert_instr(vextracti32x4, IMM1 = 1) )] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_mask_extracti32x4_epi32( +pub fn _mm256_mask_extracti32x4_epi32( src: __m128i, k: __mmask8, a: __m256i, ) -> __m128i { - static_assert_uimm_bits!(IMM1, 1); - let r = _mm256_extracti32x4_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm256_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } } /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with IMM1, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -23867,10 +25165,12 @@ pub unsafe fn _mm256_mask_extracti32x4_epi32( assert_instr(vextracti32x4, IMM1 = 1) )] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_maskz_extracti32x4_epi32(k: __mmask8, a: __m256i) -> __m128i { - static_assert_uimm_bits!(IMM1, 1); - let r = _mm256_extracti32x4_epi32::(a); - transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) +pub fn _mm256_maskz_extracti32x4_epi32(k: __mmask8, a: __m256i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM1, 1); + let r = _mm256_extracti32x4_epi32::(a); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } } /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. @@ -23880,9 +25180,11 @@ pub unsafe fn _mm256_maskz_extracti32x4_epi32(k: __mmask8, a: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsldup))] -pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 { - let r: f32x16 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); - transmute(r) +pub fn _mm512_moveldup_ps(a: __m512) -> __m512 { + unsafe { + let r: f32x16 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(r) + } } /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23892,9 +25194,12 @@ pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsldup))] -pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - let mov: f32x16 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); - transmute(simd_select_bitmask(k, mov, src.as_f32x16())) +pub fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } } /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23904,9 +25209,12 @@ pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsldup))] -pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 { - let mov: f32x16 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); - transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) +pub fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } } /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -23916,9 +25224,11 @@ pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsldup))] -pub unsafe fn _mm256_mask_moveldup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - let mov = _mm256_moveldup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) +pub fn _mm256_mask_moveldup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) + } } /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23928,9 +25238,11 @@ pub unsafe fn _mm256_mask_moveldup_ps(src: __m256, k: __mmask8, a: __m256) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsldup))] -pub unsafe fn _mm256_maskz_moveldup_ps(k: __mmask8, a: __m256) -> __m256 { - let mov = _mm256_moveldup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) +pub fn _mm256_maskz_moveldup_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) + } } /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23940,9 +25252,11 @@ pub unsafe fn _mm256_maskz_moveldup_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsldup))] -pub unsafe fn _mm_mask_moveldup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - let mov = _mm_moveldup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) +pub fn _mm_mask_moveldup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) + } } /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23952,9 +25266,11 @@ pub unsafe fn _mm_mask_moveldup_ps(src: __m128, k: __mmask8, a: __m128) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsldup))] -pub unsafe fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 { - let mov = _mm_moveldup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) +pub fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_moveldup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) + } } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst. 
@@ -23964,9 +25280,11 @@ pub unsafe fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovshdup))] -pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 { - let r: f32x16 = simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); - transmute(r) +pub fn _mm512_movehdup_ps(a: __m512) -> __m512 { + unsafe { + let r: f32x16 = simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(r) + } } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -23976,9 +25294,12 @@ pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovshdup))] -pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { - let mov: f32x16 = simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); - transmute(simd_select_bitmask(k, mov, src.as_f32x16())) +pub fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) + } } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -23988,9 +25309,12 @@ pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovshdup))] -pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 { - let mov: f32x16 = simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); - transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) +pub fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 { + unsafe { + let mov: f32x16 = + simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]); + transmute(simd_select_bitmask(k, mov, f32x16::ZERO)) + } } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24000,9 +25324,11 @@ pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovshdup))] -pub unsafe fn _mm256_mask_movehdup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { - let mov = _mm256_movehdup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) +pub fn _mm256_mask_movehdup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), src.as_f32x8())) + } } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -24012,9 +25338,11 @@ pub unsafe fn _mm256_mask_movehdup_ps(src: __m256, k: __mmask8, a: __m256) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovshdup))] -pub unsafe fn _mm256_maskz_movehdup_ps(k: __mmask8, a: __m256) -> __m256 { - let mov = _mm256_movehdup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) +pub fn _mm256_maskz_movehdup_ps(k: __mmask8, a: __m256) -> __m256 { + unsafe { + let mov = _mm256_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x8(), f32x8::ZERO)) + } } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24024,9 +25352,11 @@ pub unsafe fn _mm256_maskz_movehdup_ps(k: __mmask8, a: __m256) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovshdup))] -pub unsafe fn _mm_mask_movehdup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - let mov = _mm_movehdup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) +pub fn _mm_mask_movehdup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), src.as_f32x4())) + } } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24036,9 +25366,11 @@ pub unsafe fn _mm_mask_movehdup_ps(src: __m128, k: __mmask8, a: __m128) -> __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovshdup))] -pub unsafe fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 { - let mov = _mm_movehdup_ps(a); - transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) +pub fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let mov = _mm_movehdup_ps(a); + transmute(simd_select_bitmask(k, mov.as_f32x4(), f32x4::ZERO)) + } } /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst. @@ -24048,9 +25380,11 @@ pub unsafe fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovddup))] -pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d { - let r: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); - transmute(r) +pub fn _mm512_movedup_pd(a: __m512d) -> __m512d { + unsafe { + let r: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(r) + } } /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -24060,9 +25394,11 @@ pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovddup))] -pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { - let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); - transmute(simd_select_bitmask(k, mov, src.as_f64x8())) +pub fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) + } } /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24072,9 +25408,11 @@ pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovddup))] -pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { - let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); - transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) +pub fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { + unsafe { + let mov: f64x8 = simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]); + transmute(simd_select_bitmask(k, mov, f64x8::ZERO)) + } } /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24084,9 +25422,11 @@ pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovddup))] -pub unsafe fn _mm256_mask_movedup_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { - let mov = _mm256_movedup_pd(a); - transmute(simd_select_bitmask(k, mov.as_f64x4(), src.as_f64x4())) +pub fn _mm256_mask_movedup_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = _mm256_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x4(), src.as_f64x4())) + } } /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24096,9 +25436,11 @@ pub unsafe fn _mm256_mask_movedup_pd(src: __m256d, k: __mmask8, a: __m256d) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovddup))] -pub unsafe fn _mm256_maskz_movedup_pd(k: __mmask8, a: __m256d) -> __m256d { - let mov = _mm256_movedup_pd(a); - transmute(simd_select_bitmask(k, mov.as_f64x4(), f64x4::ZERO)) +pub fn _mm256_maskz_movedup_pd(k: __mmask8, a: __m256d) -> __m256d { + unsafe { + let mov = _mm256_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x4(), f64x4::ZERO)) + } } /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
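Another hedged sketch, not part of the diff: the VL-sized movedup variants require both avx512f and avx512vl on the caller before the unsafe block can be dropped. The helper name is made up.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
fn dup_even_pd_256(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    // Elements whose mask bit is clear are taken from `src`.
    _mm256_mask_movedup_pd(src, k, a)
}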
@@ -24108,9 +25450,11 @@ pub unsafe fn _mm256_maskz_movedup_pd(k: __mmask8, a: __m256d) -> __m256d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovddup))] -pub unsafe fn _mm_mask_movedup_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { - let mov = _mm_movedup_pd(a); - transmute(simd_select_bitmask(k, mov.as_f64x2(), src.as_f64x2())) +pub fn _mm_mask_movedup_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = _mm_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x2(), src.as_f64x2())) + } } /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24120,9 +25464,11 @@ pub unsafe fn _mm_mask_movedup_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m1 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovddup))] -pub unsafe fn _mm_maskz_movedup_pd(k: __mmask8, a: __m128d) -> __m128d { - let mov = _mm_movedup_pd(a); - transmute(simd_select_bitmask(k, mov.as_f64x2(), f64x2::ZERO)) +pub fn _mm_maskz_movedup_pd(k: __mmask8, a: __m128d) -> __m128d { + unsafe { + let mov = _mm_movedup_pd(a); + transmute(simd_select_bitmask(k, mov.as_f64x2(), f64x2::ZERO)) + } } /// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. @@ -24133,29 +25479,39 @@ pub unsafe fn _mm_maskz_movedup_pd(k: __mmask8, a: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] //should be vinserti32x4 #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i) -> __m512i { - static_assert_uimm_bits!(IMM8, 2); - let a = a.as_i32x16(); - let b = _mm512_castsi128_si512(b).as_i32x16(); - let ret: i32x16 = match IMM8 & 0b11 { - 0 => simd_shuffle!( - a, - b, - [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - ), - 1 => simd_shuffle!( - a, - b, - [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], - ), - 2 => simd_shuffle!( - a, - b, - [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], - ), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), - }; - transmute(ret) +pub fn _mm512_inserti32x4(a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let a = a.as_i32x16(); + let b = _mm512_castsi128_si512(b).as_i32x16(); + let ret: i32x16 = match IMM8 & 0b11 { + 0 => { + simd_shuffle!( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ) + } + 1 => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ) + } + 2 => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ) + } + _ => { + simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]) + } + }; + transmute(ret) + } } /// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -24166,15 +25522,17 @@ pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_inserti32x4( +pub fn _mm512_mask_inserti32x4( src: __m512i, k: __mmask16, a: __m512i, b: __m128i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_inserti32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_inserti32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + } } /// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24185,14 +25543,12 @@ pub unsafe fn _mm512_mask_inserti32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_inserti32x4( - k: __mmask16, - a: __m512i, - b: __m128i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_inserti32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) +pub fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_inserti32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) + } } /// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8. @@ -24206,15 +25562,17 @@ pub unsafe fn _mm512_maskz_inserti32x4( assert_instr(vinsert, IMM8 = 1) //should be vinserti32x4 )] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_inserti32x4(a: __m256i, b: __m128i) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let a = a.as_i32x8(); - let b = _mm256_castsi128_si256(b).as_i32x8(); - let ret: i32x8 = match IMM8 & 0b1 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), - }; - transmute(ret) +pub fn _mm256_inserti32x4(a: __m256i, b: __m128i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let a = a.as_i32x8(); + let b = _mm256_castsi128_si256(b).as_i32x8(); + let ret: i32x8 = match IMM8 & 0b1 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + }; + transmute(ret) + } } /// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
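Sketch only, not part of the diff: the insert intrinsics keep their const-generic immediate, so the turbofish form still applies; only the unsafe block at the call site goes away for feature-enabled callers.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn overwrite_lane1(a: __m512i, b: __m128i) -> __m512i {
    // IMM8 = 1 replaces the second 128-bit lane of `a` with `b`.
    _mm512_inserti32x4::<1>(a, b)
}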
@@ -24228,15 +25586,17 @@ pub unsafe fn _mm256_inserti32x4(a: __m256i, b: __m128i) -> __m assert_instr(vinserti32x4, IMM8 = 1) )] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_inserti32x4( +pub fn _mm256_mask_inserti32x4( src: __m256i, k: __mmask8, a: __m256i, b: __m128i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_inserti32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_inserti32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + } } /// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24250,14 +25610,12 @@ pub unsafe fn _mm256_mask_inserti32x4( assert_instr(vinserti32x4, IMM8 = 1) )] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_inserti32x4( - k: __mmask8, - a: __m256i, - b: __m128i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_inserti32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) +pub fn _mm256_maskz_inserti32x4(k: __mmask8, a: __m256i, b: __m128i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_inserti32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) + } } /// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8. @@ -24268,12 +25626,14 @@ pub unsafe fn _mm256_maskz_inserti32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] //should be vinserti64x4 #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i) -> __m512i { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_castsi256_si512(b); - match IMM8 & 0b1 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), +pub fn _mm512_inserti64x4(a: __m512i, b: __m256i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_castsi256_si512(b); + match IMM8 & 0b1 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + } } } @@ -24285,15 +25645,17 @@ pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_inserti64x4( +pub fn _mm512_mask_inserti64x4( src: __m512i, k: __mmask8, a: __m512i, b: __m256i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm512_inserti64x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_inserti64x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + } } /// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -24304,14 +25666,12 @@ pub unsafe fn _mm512_mask_inserti64x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_inserti64x4( - k: __mmask8, - a: __m512i, - b: __m256i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm512_inserti64x4::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) +pub fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_inserti64x4::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) + } } /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8. @@ -24322,26 +25682,36 @@ pub unsafe fn _mm512_maskz_inserti64x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128) -> __m512 { - static_assert_uimm_bits!(IMM8, 2); - let b = _mm512_castps128_ps512(b); - match IMM8 & 0b11 { - 0 => simd_shuffle!( - a, - b, - [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - ), - 1 => simd_shuffle!( - a, - b, - [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], - ), - 2 => simd_shuffle!( - a, - b, - [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], - ), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]), +pub fn _mm512_insertf32x4(a: __m512, b: __m128) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let b = _mm512_castps128_ps512(b); + match IMM8 & 0b11 { + 0 => { + simd_shuffle!( + a, + b, + [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + ) + } + 1 => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15], + ) + } + 2 => { + simd_shuffle!( + a, + b, + [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15], + ) + } + _ => { + simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]) + } + } } } @@ -24353,15 +25723,17 @@ pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128) -> __m51 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_insertf32x4( +pub fn _mm512_mask_insertf32x4( src: __m512, k: __mmask16, a: __m512, b: __m128, ) -> __m512 { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_insertf32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_insertf32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), src.as_f32x16())) + } } /// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
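A further illustrative sketch under the same assumptions (nightly, stdarch_x86_avx512, AVX-512 hardware); the helper name is invented.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn insert_low_ps_masked(src: __m512, k: __mmask16, a: __m512, b: __m128) -> __m512 {
    // IMM8 = 0: `b` lands in the lowest 128-bit lane before the writemask
    // blend against `src`.
    _mm512_mask_insertf32x4::<0>(src, k, a, b)
}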
@@ -24372,14 +25744,12 @@ pub unsafe fn _mm512_mask_insertf32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_insertf32x4( - k: __mmask16, - a: __m512, - b: __m128, -) -> __m512 { - static_assert_uimm_bits!(IMM8, 2); - let r = _mm512_insertf32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) +pub fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128) -> __m512 { + unsafe { + static_assert_uimm_bits!(IMM8, 2); + let r = _mm512_insertf32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x16(), f32x16::ZERO)) + } } /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8. @@ -24393,12 +25763,14 @@ pub unsafe fn _mm512_maskz_insertf32x4( assert_instr(vinsert, IMM8 = 1) //should be vinsertf32x4 )] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_insertf32x4(a: __m256, b: __m128) -> __m256 { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm256_castps128_ps256(b); - match IMM8 & 0b1 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), +pub fn _mm256_insertf32x4(a: __m256, b: __m128) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm256_castps128_ps256(b); + match IMM8 & 0b1 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + } } } @@ -24413,15 +25785,17 @@ pub unsafe fn _mm256_insertf32x4(a: __m256, b: __m128) -> __m25 assert_instr(vinsertf32x4, IMM8 = 1) )] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_insertf32x4( +pub fn _mm256_mask_insertf32x4( src: __m256, k: __mmask8, a: __m256, b: __m128, ) -> __m256 { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_insertf32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_insertf32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8())) + } } /// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24435,14 +25809,12 @@ pub unsafe fn _mm256_mask_insertf32x4( assert_instr(vinsertf32x4, IMM8 = 1) )] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_insertf32x4( - k: __mmask8, - a: __m256, - b: __m128, -) -> __m256 { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm256_insertf32x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) +pub fn _mm256_maskz_insertf32x4(k: __mmask8, a: __m256, b: __m128) -> __m256 { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm256_insertf32x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f32x8(), f32x8::ZERO)) + } } /// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8. 
@@ -24453,12 +25825,14 @@ pub unsafe fn _mm256_maskz_insertf32x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d) -> __m512d { - static_assert_uimm_bits!(IMM8, 1); - let b = _mm512_castpd256_pd512(b); - match IMM8 & 0b1 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), - _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), +pub fn _mm512_insertf64x4(a: __m512d, b: __m256d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let b = _mm512_castpd256_pd512(b); + match IMM8 & 0b1 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 4, 5, 6, 7]), + _ => simd_shuffle!(a, b, [0, 1, 2, 3, 8, 9, 10, 11]), + } } } @@ -24470,15 +25844,17 @@ pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_insertf64x4( +pub fn _mm512_mask_insertf64x4( src: __m512d, k: __mmask8, a: __m512d, b: __m256d, ) -> __m512d { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm512_insertf64x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_insertf64x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8())) + } } /// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24489,14 +25865,12 @@ pub unsafe fn _mm512_mask_insertf64x4( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_insertf64x4( - k: __mmask8, - a: __m512d, - b: __m256d, -) -> __m512d { - static_assert_uimm_bits!(IMM8, 1); - let r = _mm512_insertf64x4::(a, b); - transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) +pub fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d) -> __m512d { + unsafe { + static_assert_uimm_bits!(IMM8, 1); + let r = _mm512_insertf64x4::(a, b); + transmute(simd_select_bitmask(k, r.as_f64x8(), f64x8::ZERO)) + } } /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. 
@@ -24506,18 +25880,20 @@ pub unsafe fn _mm512_maskz_insertf64x4( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq -pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i32x16(); - let b = b.as_i32x16(); - #[rustfmt::skip] - let r: i32x16 = simd_shuffle!( - a, b, - [ 2, 18, 3, 19, - 2 + 4, 18 + 4, 3 + 4, 19 + 4, - 2 + 8, 18 + 8, 3 + 8, 19 + 8, - 2 + 12, 18 + 12, 3 + 12, 19 + 12], - ); - transmute(r) +pub fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let b = b.as_i32x16(); + #[rustfmt::skip] + let r: i32x16 = simd_shuffle!( + a, b, + [ 2, 18, 3, 19, + 2 + 4, 18 + 4, 3 + 4, 19 + 4, + 2 + 8, 18 + 8, 3 + 8, 19 + 8, + 2 + 12, 18 + 12, 3 + 12, 19 + 12], + ); + transmute(r) + } } /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24527,14 +25903,11 @@ pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhdq))] -pub unsafe fn _mm512_mask_unpackhi_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16())) +pub fn _mm512_mask_unpackhi_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16())) + } } /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24544,9 +25917,11 @@ pub unsafe fn _mm512_mask_unpackhi_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhdq))] -pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpackhi, i32x16::ZERO)) +pub fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpackhi, i32x16::ZERO)) + } } /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
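Hedged usage sketch for the unpack-high family (illustrative, not part of the diff):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn interleave_high_dwords(a: __m512i, b: __m512i) -> (__m512i, __m512i) {
    // Per 128-bit lane the result is [a2, b2, a3, b3]; the maskz form zeroes
    // elements whose mask bit is clear.
    (
        _mm512_unpackhi_epi32(a, b),
        _mm512_maskz_unpackhi_epi32(0x00ff, a, b),
    )
}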
@@ -24556,14 +25931,11 @@ pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhdq))] -pub unsafe fn _mm256_mask_unpackhi_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i32x8())) +pub fn _mm256_mask_unpackhi_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x8())) + } } /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24573,9 +25945,11 @@ pub unsafe fn _mm256_mask_unpackhi_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhdq))] -pub unsafe fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, unpackhi, i32x8::ZERO)) +pub fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpackhi, i32x8::ZERO)) + } } /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24585,14 +25959,11 @@ pub unsafe fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhdq))] -pub unsafe fn _mm_mask_unpackhi_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i32x4())) +pub fn _mm_mask_unpackhi_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i32x4())) + } } /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24602,9 +25973,11 @@ pub unsafe fn _mm_mask_unpackhi_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhdq))] -pub unsafe fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, unpackhi, i32x4::ZERO)) +pub fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpackhi = _mm_unpackhi_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpackhi, i32x4::ZERO)) + } } /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst. 
@@ -24614,8 +25987,8 @@ pub unsafe fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq -pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { - simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) +pub fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } } /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24625,14 +25998,11 @@ pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub unsafe fn _mm512_mask_unpackhi_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8())) +pub fn _mm512_mask_unpackhi_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8())) + } } /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24642,9 +26012,11 @@ pub unsafe fn _mm512_mask_unpackhi_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpackhi, i64x8::ZERO)) +pub fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpackhi, i64x8::ZERO)) + } } /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24654,14 +26026,11 @@ pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] -pub unsafe fn _mm256_mask_unpackhi_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, unpackhi, src.as_i64x4())) +pub fn _mm256_mask_unpackhi_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_i64x4())) + } } /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -24671,9 +26040,11 @@ pub unsafe fn _mm256_mask_unpackhi_epi64(
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpunpckhqdq))]
-pub unsafe fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
-    let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4();
-    transmute(simd_select_bitmask(k, unpackhi, i64x4::ZERO))
+pub fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let unpackhi = _mm256_unpackhi_epi64(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, unpackhi, i64x4::ZERO))
+    }
 }

 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -24683,14 +26054,11 @@ pub unsafe fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpunpckhqdq))]
-pub unsafe fn _mm_mask_unpackhi_epi64(
-    src: __m128i,
-    k: __mmask8,
-    a: __m128i,
-    b: __m128i,
-) -> __m128i {
-    let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2();
-    transmute(simd_select_bitmask(k, unpackhi, src.as_i64x2()))
+pub fn _mm_mask_unpackhi_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, unpackhi, src.as_i64x2()))
+    }
 }

 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -24700,9 +26068,11 @@ pub unsafe fn _mm_mask_unpackhi_epi64(
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpunpckhqdq))]
-pub unsafe fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
-    let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2();
-    transmute(simd_select_bitmask(k, unpackhi, i64x2::ZERO))
+pub fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let unpackhi = _mm_unpackhi_epi64(a, b).as_i64x2();
+        transmute(simd_select_bitmask(k, unpackhi, i64x2::ZERO))
+    }
 }

 /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
@@ -24712,15 +26082,17 @@ pub unsafe fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhps))] -pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 { - #[rustfmt::skip] - simd_shuffle!( - a, b, - [ 2, 18, 3, 19, - 2 + 4, 18 + 4, 3 + 4, 19 + 4, - 2 + 8, 18 + 8, 3 + 8, 19 + 8, - 2 + 12, 18 + 12, 3 + 12, 19 + 12], - ) +pub fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + #[rustfmt::skip] + simd_shuffle!( + a, b, + [ 2, 18, 3, 19, + 2 + 4, 18 + 4, 3 + 4, 19 + 4, + 2 + 8, 18 + 8, 3 + 8, 19 + 8, + 2 + 12, 18 + 12, 3 + 12, 19 + 12], + ) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24730,9 +26102,11 @@ pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhps))] -pub unsafe fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16())) +pub fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16())) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24742,9 +26116,11 @@ pub unsafe fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhps))] -pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpackhi, f32x16::ZERO)) +pub fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpackhi, f32x16::ZERO)) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
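The same pattern applies to the floating-point unpack-high intrinsics; the sketch below is illustrative and not from the patch.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn blend_high_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    // Interleaves the high halves of each 128-bit lane, then copies `src`
    // elements wherever `k` has a zero bit.
    _mm512_mask_unpackhi_ps(src, k, a, b)
}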
@@ -24754,9 +26130,11 @@ pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhps))] -pub unsafe fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f32x8())) +pub fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x8())) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24766,9 +26144,11 @@ pub unsafe fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhps))] -pub unsafe fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, unpackhi, f32x8::ZERO)) +pub fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpackhi = _mm256_unpackhi_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpackhi, f32x8::ZERO)) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24778,9 +26158,11 @@ pub unsafe fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhps))] -pub unsafe fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f32x4())) +pub fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f32x4())) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -24790,9 +26172,11 @@ pub unsafe fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhps))] -pub unsafe fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, unpackhi, f32x4::ZERO)) +pub fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpackhi = _mm_unpackhi_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpackhi, f32x4::ZERO)) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst. @@ -24802,8 +26186,8 @@ pub unsafe fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhpd))] -pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { - simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) +pub fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { simd_shuffle!(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6]) } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24813,14 +26197,11 @@ pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhpd))] -pub unsafe fn _mm512_mask_unpackhi_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8())) +pub fn _mm512_mask_unpackhi_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8())) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24830,9 +26211,11 @@ pub unsafe fn _mm512_mask_unpackhi_pd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhpd))] -pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpackhi, f64x8::ZERO)) +pub fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpackhi, f64x8::ZERO)) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -24842,14 +26225,11 @@ pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhpd))] -pub unsafe fn _mm256_mask_unpackhi_pd( - src: __m256d, - k: __mmask8, - a: __m256d, - b: __m256d, -) -> __m256d { - let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f64x4())) +pub fn _mm256_mask_unpackhi_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x4())) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24859,9 +26239,11 @@ pub unsafe fn _mm256_mask_unpackhi_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhpd))] -pub unsafe fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, unpackhi, f64x4::ZERO)) +pub fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpackhi = _mm256_unpackhi_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpackhi, f64x4::ZERO)) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24871,9 +26253,11 @@ pub unsafe fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhpd))] -pub unsafe fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, unpackhi, src.as_f64x2())) +pub fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpackhi, src.as_f64x2())) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -24883,9 +26267,11 @@ pub unsafe fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpckhpd))] -pub unsafe fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, unpackhi, f64x2::ZERO)) +pub fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpackhi = _mm_unpackhi_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpackhi, f64x2::ZERO)) + } } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. @@ -24895,18 +26281,20 @@ pub unsafe fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m1 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklps))] //should be vpunpckldq -pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { - let a = a.as_i32x16(); - let b = b.as_i32x16(); - #[rustfmt::skip] - let r: i32x16 = simd_shuffle!( - a, b, - [ 0, 16, 1, 17, - 0 + 4, 16 + 4, 1 + 4, 17 + 4, - 0 + 8, 16 + 8, 1 + 8, 17 + 8, - 0 + 12, 16 + 12, 1 + 12, 17 + 12], - ); - transmute(r) +pub fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + let a = a.as_i32x16(); + let b = b.as_i32x16(); + #[rustfmt::skip] + let r: i32x16 = simd_shuffle!( + a, b, + [ 0, 16, 1, 17, + 0 + 4, 16 + 4, 1 + 4, 17 + 4, + 0 + 8, 16 + 8, 1 + 8, 17 + 8, + 0 + 12, 16 + 12, 1 + 12, 17 + 12], + ); + transmute(r) + } } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24916,14 +26304,11 @@ pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckldq))] -pub unsafe fn _mm512_mask_unpacklo_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i32x16())) +pub fn _mm512_mask_unpacklo_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x16())) + } } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -24933,9 +26318,11 @@ pub unsafe fn _mm512_mask_unpacklo_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckldq))] -pub unsafe fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, unpacklo, i32x16::ZERO)) +pub fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, unpacklo, i32x16::ZERO)) + } } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -24945,14 +26332,11 @@ pub unsafe fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckldq))] -pub unsafe fn _mm256_mask_unpacklo_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i32x8())) +pub fn _mm256_mask_unpacklo_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x8())) + } } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24962,9 +26346,11 @@ pub unsafe fn _mm256_mask_unpacklo_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckldq))] -pub unsafe fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, unpacklo, i32x8::ZERO)) +pub fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, unpacklo, i32x8::ZERO)) + } } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -24974,14 +26360,11 @@ pub unsafe fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckldq))] -pub unsafe fn _mm_mask_unpacklo_epi32( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i32x4())) +pub fn _mm_mask_unpacklo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i32x4())) + } } /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -24991,9 +26374,11 @@ pub unsafe fn _mm_mask_unpacklo_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpckldq))] -pub unsafe fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, unpacklo, i32x4::ZERO)) +pub fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, unpacklo, i32x4::ZERO)) + } } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst. @@ -25003,8 +26388,8 @@ pub unsafe fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq -pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i { - simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) +pub fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) } } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25014,14 +26399,11 @@ pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub unsafe fn _mm512_mask_unpacklo_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8())) +pub fn _mm512_mask_unpacklo_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8())) + } } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -25031,9 +26413,11 @@ pub unsafe fn _mm512_mask_unpacklo_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub unsafe fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, unpacklo, i64x8::ZERO)) +pub fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, unpacklo, i64x8::ZERO)) + } } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25043,14 +26427,11 @@ pub unsafe fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub unsafe fn _mm256_mask_unpacklo_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i64x4())) +pub fn _mm256_mask_unpacklo_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x4())) + } } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25060,9 +26441,11 @@ pub unsafe fn _mm256_mask_unpacklo_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub unsafe fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, unpacklo, i64x4::ZERO)) +pub fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let unpacklo = _mm256_unpacklo_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, unpacklo, i64x4::ZERO)) + } } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -25072,14 +26455,11 @@ pub unsafe fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub unsafe fn _mm_mask_unpacklo_epi64( - src: __m128i, - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, unpacklo, src.as_i64x2())) +pub fn _mm_mask_unpacklo_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpacklo, src.as_i64x2())) + } } /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25089,9 +26469,11 @@ pub unsafe fn _mm_mask_unpacklo_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] -pub unsafe fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, unpacklo, i64x2::ZERO)) +pub fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let unpacklo = _mm_unpacklo_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, unpacklo, i64x2::ZERO)) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. @@ -25101,14 +26483,16 @@ pub unsafe fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklps))] -pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 { - #[rustfmt::skip] - simd_shuffle!(a, b, - [ 0, 16, 1, 17, - 0 + 4, 16 + 4, 1 + 4, 17 + 4, - 0 + 8, 16 + 8, 1 + 8, 17 + 8, - 0 + 12, 16 + 12, 1 + 12, 17 + 12], - ) +pub fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 { + unsafe { + #[rustfmt::skip] + simd_shuffle!(a, b, + [ 0, 16, 1, 17, + 0 + 4, 16 + 4, 1 + 4, 17 + 4, + 0 + 8, 16 + 8, 1 + 8, 17 + 8, + 0 + 12, 16 + 12, 1 + 12, 17 + 12], + ) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -25118,9 +26502,11 @@ pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklps))] -pub unsafe fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { - let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16())) +pub fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16())) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25130,9 +26516,11 @@ pub unsafe fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklps))] -pub unsafe fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); - transmute(simd_select_bitmask(k, unpacklo, f32x16::ZERO)) +pub fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { + let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16(); + transmute(simd_select_bitmask(k, unpacklo, f32x16::ZERO)) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25142,9 +26530,11 @@ pub unsafe fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklps))] -pub unsafe fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { - let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f32x8())) +pub fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x8())) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -25154,9 +26544,11 @@ pub unsafe fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklps))] -pub unsafe fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); - transmute(simd_select_bitmask(k, unpacklo, f32x8::ZERO)) +pub fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { + let unpacklo = _mm256_unpacklo_ps(a, b).as_f32x8(); + transmute(simd_select_bitmask(k, unpacklo, f32x8::ZERO)) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25166,9 +26558,11 @@ pub unsafe fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklps))] -pub unsafe fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f32x4())) +pub fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f32x4())) + } } /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25178,9 +26572,11 @@ pub unsafe fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklps))] -pub unsafe fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); - transmute(simd_select_bitmask(k, unpacklo, f32x4::ZERO)) +pub fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let unpacklo = _mm_unpacklo_ps(a, b).as_f32x4(); + transmute(simd_select_bitmask(k, unpacklo, f32x4::ZERO)) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst. @@ -25190,8 +26586,8 @@ pub unsafe fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklpd))] -pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d { - simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) +pub fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d { + unsafe { simd_shuffle!(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6]) } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -25201,14 +26597,11 @@ pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklpd))] -pub unsafe fn _mm512_mask_unpacklo_pd( - src: __m512d, - k: __mmask8, - a: __m512d, - b: __m512d, -) -> __m512d { - let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8())) +pub fn _mm512_mask_unpacklo_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8())) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25218,9 +26611,11 @@ pub unsafe fn _mm512_mask_unpacklo_pd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklpd))] -pub unsafe fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); - transmute(simd_select_bitmask(k, unpacklo, f64x8::ZERO)) +pub fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { + let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8(); + transmute(simd_select_bitmask(k, unpacklo, f64x8::ZERO)) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25230,14 +26625,11 @@ pub unsafe fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklpd))] -pub unsafe fn _mm256_mask_unpacklo_pd( - src: __m256d, - k: __mmask8, - a: __m256d, - b: __m256d, -) -> __m256d { - let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f64x4())) +pub fn _mm256_mask_unpacklo_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x4())) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -25247,9 +26639,11 @@ pub unsafe fn _mm256_mask_unpacklo_pd( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklpd))] -pub unsafe fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); - transmute(simd_select_bitmask(k, unpacklo, f64x4::ZERO)) +pub fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { + let unpacklo = _mm256_unpacklo_pd(a, b).as_f64x4(); + transmute(simd_select_bitmask(k, unpacklo, f64x4::ZERO)) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25259,9 +26653,11 @@ pub unsafe fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklpd))] -pub unsafe fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, unpacklo, src.as_f64x2())) +pub fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpacklo, src.as_f64x2())) + } } /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25271,9 +26667,11 @@ pub unsafe fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vunpcklpd))] -pub unsafe fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); - transmute(simd_select_bitmask(k, unpacklo, f64x2::ZERO)) +pub fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let unpacklo = _mm_unpacklo_pd(a, b).as_f64x2(); + transmute(simd_select_bitmask(k, unpacklo, f64x2::ZERO)) + } } /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25282,12 +26680,14 @@ pub unsafe fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m1 #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { - simd_shuffle!( - a, - _mm_undefined_ps(), - [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], - ) +pub fn _mm512_castps128_ps512(a: __m128) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ps(), + [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], + ) + } } /// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
@@ -25296,12 +26696,14 @@ pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 { - simd_shuffle!( - a, - _mm256_undefined_ps(), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], - ) +pub fn _mm512_castps256_ps512(a: __m256) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm256_undefined_ps(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } } /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25310,12 +26712,14 @@ pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 { - simd_shuffle!( - a, - _mm_set1_ps(0.), - [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], - ) +pub fn _mm512_zextps128_ps512(a: __m128) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm_set1_ps(0.), + [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], + ) + } } /// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25324,12 +26728,14 @@ pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 { - simd_shuffle!( - a, - _mm256_set1_ps(0.), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], - ) +pub fn _mm512_zextps256_ps512(a: __m256) -> __m512 { + unsafe { + simd_shuffle!( + a, + _mm256_set1_ps(0.), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8], + ) + } } /// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25338,8 +26744,8 @@ pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 { - simd_shuffle!(a, a, [0, 1, 2, 3]) +pub fn _mm512_castps512_ps128(a: __m512) -> __m128 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } } /// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25348,8 +26754,8 @@ pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 { - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) +pub fn _mm512_castps512_ps256(a: __m512) -> __m256 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } } /// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. 
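The cast and zext pairs above are both purely type-level widenings, but only the zext form pins down the new upper lanes. A small sketch under the same assumptions as the earlier example (nightly toolchain, unstable stdarch_x86_avx512 feature; the helper name is made up for illustration):

#![feature(stdarch_x86_avx512)]
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn widen_ps(a: __m128) -> (__m512, __m512) {
    // `zext` clears the upper 384 bits; the plain `cast` leaves them undefined,
    // so its result is only useful where the upper lanes are never read.
    let zeroed = _mm512_zextps128_ps512(a);
    let upper_undefined = _mm512_castps128_ps512(a);
    (zeroed, upper_undefined)
}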
@@ -25358,8 +26764,8 @@ pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castps_pd(a: __m512) -> __m512d { - transmute(a) +pub fn _mm512_castps_pd(a: __m512) -> __m512d { + unsafe { transmute(a) } } /// Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25368,8 +26774,8 @@ pub unsafe fn _mm512_castps_pd(a: __m512) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i { - transmute(a) +pub fn _mm512_castps_si512(a: __m512) -> __m512i { + unsafe { transmute(a) } } /// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25378,8 +26784,8 @@ pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d { - simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2, 2, 2, 2, 2]) +pub fn _mm512_castpd128_pd512(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2, 2, 2, 2, 2]) } } /// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25388,8 +26794,8 @@ pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d { - simd_shuffle!(a, _mm256_undefined_pd(), [0, 1, 2, 3, 4, 4, 4, 4]) +pub fn _mm512_castpd256_pd512(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, _mm256_undefined_pd(), [0, 1, 2, 3, 4, 4, 4, 4]) } } /// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25398,8 +26804,8 @@ pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d { - simd_shuffle!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2]) +pub fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2]) } } /// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25408,8 +26814,8 @@ pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d { - simd_shuffle!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4]) +pub fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4]) } } /// Cast vector of type __m512d to type __m128d. 
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25418,8 +26824,8 @@ pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d { - simd_shuffle!(a, a, [0, 1]) +pub fn _mm512_castpd512_pd128(a: __m512d) -> __m128d { + unsafe { simd_shuffle!(a, a, [0, 1]) } } /// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25428,8 +26834,8 @@ pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d { - simd_shuffle!(a, a, [0, 1, 2, 3]) +pub fn _mm512_castpd512_pd256(a: __m512d) -> __m256d { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } } /// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25438,8 +26844,8 @@ pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castpd_ps(a: __m512d) -> __m512 { - transmute(a) +pub fn _mm512_castpd_ps(a: __m512d) -> __m512 { + unsafe { transmute(a) } } /// Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25448,8 +26854,8 @@ pub unsafe fn _mm512_castpd_ps(a: __m512d) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i { - transmute(a) +pub fn _mm512_castpd_si512(a: __m512d) -> __m512i { + unsafe { transmute(a) } } /// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25458,8 +26864,8 @@ pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i { - simd_shuffle!(a, _mm_undefined_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) +pub fn _mm512_castsi128_si512(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, _mm_undefined_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) } } /// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25468,8 +26874,8 @@ pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i { - simd_shuffle!(a, _mm256_undefined_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) +pub fn _mm512_castsi256_si512(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, _mm256_undefined_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) } } /// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. 
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25478,8 +26884,8 @@ pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i { - simd_shuffle!(a, _mm_setzero_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) +pub fn _mm512_zextsi128_si512(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, _mm_setzero_si128(), [0, 1, 2, 2, 2, 2, 2, 2]) } } /// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25488,8 +26894,8 @@ pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i { - simd_shuffle!(a, _mm256_setzero_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) +pub fn _mm512_zextsi256_si512(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, _mm256_setzero_si256(), [0, 1, 2, 3, 4, 4, 4, 4]) } } /// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25498,8 +26904,8 @@ pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i { - simd_shuffle!(a, a, [0, 1]) +pub fn _mm512_castsi512_si128(a: __m512i) -> __m128i { + unsafe { simd_shuffle!(a, a, [0, 1]) } } /// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25508,8 +26914,8 @@ pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i { - simd_shuffle!(a, a, [0, 1, 2, 3]) +pub fn _mm512_castsi512_si256(a: __m512i) -> __m256i { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3]) } } /// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25518,8 +26924,8 @@ pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castsi512_ps(a: __m512i) -> __m512 { - transmute(a) +pub fn _mm512_castsi512_ps(a: __m512i) -> __m512 { + unsafe { transmute(a) } } /// Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency. @@ -25528,8 +26934,8 @@ pub unsafe fn _mm512_castsi512_ps(a: __m512i) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { - transmute(a) +pub fn _mm512_castsi512_pd(a: __m512i) -> __m512d { + unsafe { transmute(a) } } /// Copy the lower 32-bit integer in a to dst. 
@@ -25539,8 +26945,8 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(vmovd))] -pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { - simd_extract!(a.as_i32x16(), 0) +pub fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { + unsafe { simd_extract!(a.as_i32x16(), 0) } } /// Copy the lower single-precision (32-bit) floating-point element of a to dst. @@ -25549,8 +26955,8 @@ pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtss_f32(a: __m512) -> f32 { - simd_extract!(a, 0) +pub fn _mm512_cvtss_f32(a: __m512) -> f32 { + unsafe { simd_extract!(a, 0) } } /// Copy the lower double-precision (64-bit) floating-point element of a to dst. @@ -25559,8 +26965,8 @@ pub unsafe fn _mm512_cvtss_f32(a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_cvtsd_f64(a: __m512d) -> f64 { - simd_extract!(a, 0) +pub fn _mm512_cvtsd_f64(a: __m512d) -> f64 { + unsafe { simd_extract!(a, 0) } } /// Broadcast the low packed 32-bit integer from a to all elements of dst. @@ -25570,10 +26976,12 @@ pub unsafe fn _mm512_cvtsd_f64(a: __m512d) -> f64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd -pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i { - let a = _mm512_castsi128_si512(a).as_i32x16(); - let ret: i32x16 = simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); - transmute(ret) +pub fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i { + unsafe { + let a = _mm512_castsi128_si512(a).as_i32x16(); + let ret: i32x16 = simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + transmute(ret) + } } /// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25583,9 +26991,11 @@ pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub unsafe fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) +pub fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) + } } /// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
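Taken together, the lane-0 extraction and broadcast hunks above give a safe round trip between scalars and vectors. A hypothetical sketch under the same toolchain assumptions as before:

#![feature(stdarch_x86_avx512)]
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
fn splat_and_read_back(x: i32) -> i32 {
    // Broadcast lane 0 of a 128-bit vector to all sixteen 32-bit lanes,
    // then read lane 0 back out; the result is `x` again.
    let v = _mm512_broadcastd_epi32(_mm_set1_epi32(x));
    _mm512_cvtsi512_si32(v)
}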
@@ -25595,9 +27005,11 @@ pub unsafe fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) +pub fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastd_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) + } } /// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25607,9 +27019,11 @@ pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub unsafe fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) +pub fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) + } } /// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25619,9 +27033,11 @@ pub unsafe fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub unsafe fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) +pub fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastd_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) + } } /// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25631,9 +27047,11 @@ pub unsafe fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub unsafe fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x4())) +pub fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x4())) + } } /// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -25643,9 +27061,11 @@ pub unsafe fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd -pub unsafe fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i { - let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, broadcast, i32x4::ZERO)) +pub fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastd_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, broadcast, i32x4::ZERO)) + } } /// Broadcast the low packed 64-bit integer from a to all elements of dst. @@ -25655,8 +27075,8 @@ pub unsafe fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastq -pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { - simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) +pub fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } } /// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25666,9 +27086,11 @@ pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub unsafe fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) +pub fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) + } } /// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25678,9 +27100,11 @@ pub unsafe fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub unsafe fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) +pub fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcastq_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) + } } /// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -25690,9 +27114,11 @@ pub unsafe fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub unsafe fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, broadcast, src.as_i64x4())) +pub fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x4())) + } } /// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25702,9 +27128,11 @@ pub unsafe fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub unsafe fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); - transmute(simd_select_bitmask(k, broadcast, i64x4::ZERO)) +pub fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcastq_epi64(a).as_i64x4(); + transmute(simd_select_bitmask(k, broadcast, i64x4::ZERO)) + } } /// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25714,9 +27142,11 @@ pub unsafe fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub unsafe fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, broadcast, src.as_i64x2())) +pub fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x2())) + } } /// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25726,9 +27156,11 @@ pub unsafe fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq -pub unsafe fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i { - let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); - transmute(simd_select_bitmask(k, broadcast, i64x2::ZERO)) +pub fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + let broadcast = _mm_broadcastq_epi64(a).as_i64x2(); + transmute(simd_select_bitmask(k, broadcast, i64x2::ZERO)) + } } /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst. 
@@ -25738,8 +27170,8 @@ pub unsafe fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastss))] -pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 { - simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) +pub fn _mm512_broadcastss_ps(a: __m128) -> __m512 { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) } } /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25749,9 +27181,11 @@ pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastss))] -pub unsafe fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 { - let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) +pub fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) + } } /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25761,9 +27195,11 @@ pub unsafe fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastss))] -pub unsafe fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 { - let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); - transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) +pub fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcastss_ps(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) + } } /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25773,9 +27209,11 @@ pub unsafe fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastss))] -pub unsafe fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> __m256 { - let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) +pub fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) + } } /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -25785,9 +27223,11 @@ pub unsafe fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastss))] -pub unsafe fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 { - let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); - transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) +pub fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcastss_ps(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) + } } /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25797,9 +27237,11 @@ pub unsafe fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastss))] -pub unsafe fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { - let broadcast = _mm_broadcastss_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x4())) +pub fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 { + unsafe { + let broadcast = _mm_broadcastss_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x4())) + } } /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25809,9 +27251,11 @@ pub unsafe fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastss))] -pub unsafe fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 { - let broadcast = _mm_broadcastss_ps(a).as_f32x4(); - transmute(simd_select_bitmask(k, broadcast, f32x4::ZERO)) +pub fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 { + unsafe { + let broadcast = _mm_broadcastss_ps(a).as_f32x4(); + transmute(simd_select_bitmask(k, broadcast, f32x4::ZERO)) + } } /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst. @@ -25821,8 +27265,8 @@ pub unsafe fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastsd))] -pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d { - simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) +pub fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d { + unsafe { simd_shuffle!(a, a, [0, 0, 0, 0, 0, 0, 0, 0]) } } /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -25832,9 +27276,11 @@ pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastsd))] -pub unsafe fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { - let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) +pub fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) + } } /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25844,9 +27290,11 @@ pub unsafe fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastsd))] -pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { - let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); - transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) +pub fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcastsd_pd(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) + } } /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25856,9 +27304,11 @@ pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastsd))] -pub unsafe fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { - let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, broadcast, src.as_f64x4())) +pub fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x4())) + } } /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25868,9 +27318,11 @@ pub unsafe fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vbroadcastsd))] -pub unsafe fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d { - let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); - transmute(simd_select_bitmask(k, broadcast, f64x4::ZERO)) +pub fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d { + unsafe { + let broadcast = _mm256_broadcastsd_pd(a).as_f64x4(); + transmute(simd_select_bitmask(k, broadcast, f64x4::ZERO)) + } } /// Broadcast the 4 packed 32-bit integers from a to all elements of dst. 
@@ -25879,10 +27331,12 @@ pub unsafe fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d { #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { - let a = a.as_i32x4(); - let ret: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); - transmute(ret) +pub fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { + unsafe { + let a = a.as_i32x4(); + let ret: i32x16 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]); + transmute(ret) + } } /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25891,9 +27345,11 @@ pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) +pub fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x16())) + } } /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25902,9 +27358,11 @@ pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { - let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); - transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) +pub fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i32x4(a).as_i32x16(); + transmute(simd_select_bitmask(k, broadcast, i32x16::ZERO)) + } } /// Broadcast the 4 packed 32-bit integers from a to all elements of dst. @@ -25913,10 +27371,12 @@ pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i #[inline] #[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i { - let a = a.as_i32x4(); - let ret: i32x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]); - transmute(ret) +pub fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i { + unsafe { + let a = a.as_i32x4(); + let ret: i32x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]); + transmute(ret) + } } /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -25925,9 +27385,11 @@ pub unsafe fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i { #[inline] #[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) +pub fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i32x8())) + } } /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -25936,9 +27398,11 @@ pub unsafe fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) #[inline] #[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcasti32x4, linux: vshuf #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i { - let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); - transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) +pub fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i { + unsafe { + let broadcast = _mm256_broadcast_i32x4(a).as_i32x8(); + transmute(simd_select_bitmask(k, broadcast, i32x8::ZERO)) + } } /// Broadcast the 4 packed 64-bit integers from a to all elements of dst. @@ -25947,8 +27411,8 @@ pub unsafe fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i { #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { - simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) +pub fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } } /// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25957,9 +27421,11 @@ pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { - let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) +pub fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_i64x8())) + } } /// Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -25968,9 +27434,11 @@ pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { - let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); - transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) +pub fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { + unsafe { + let broadcast = _mm512_broadcast_i64x4(a).as_i64x8(); + transmute(simd_select_bitmask(k, broadcast, i64x8::ZERO)) + } } /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. @@ -25979,8 +27447,8 @@ pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { - simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) +pub fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]) } } /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -25989,9 +27457,11 @@ pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { - let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) +pub fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x16())) + } } /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26000,9 +27470,11 @@ pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshu #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { - let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); - transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) +pub fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { + unsafe { + let broadcast = _mm512_broadcast_f32x4(a).as_f32x16(); + transmute(simd_select_bitmask(k, broadcast, f32x16::ZERO)) + } } /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst. 
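[Editorial note, not patch content] The lane broadcasts rewritten above amount to repeating one 128-bit (or 256-bit) group across the destination. A plain-Rust reference model of `_mm512_broadcast_i32x4`, given only as an illustration of the shuffle pattern:

    fn broadcast_i32x4_model(a: [i32; 4]) -> [i32; 16] {
        let mut dst = [0i32; 16];
        for lane in 0..4 {
            // Copy the 4-element group into each of the four 128-bit lanes.
            dst[lane * 4..lane * 4 + 4].copy_from_slice(&a);
        }
        dst
    }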
@@ -26011,8 +27483,8 @@ pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 { #[inline] #[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshuf #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_broadcast_f32x4(a: __m128) -> __m256 { - simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) +pub fn _mm256_broadcast_f32x4(a: __m128) -> __m256 { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } } /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26021,9 +27493,11 @@ pub unsafe fn _mm256_broadcast_f32x4(a: __m128) -> __m256 { #[inline] #[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshu #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) -> __m256 { - let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) +pub fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f32x8())) + } } /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26032,9 +27506,11 @@ pub unsafe fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) - #[inline] #[target_feature(enable = "avx512f,avx512vl")] //msvc: vbroadcastf32x4, linux: vshu #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 { - let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); - transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) +pub fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 { + unsafe { + let broadcast = _mm256_broadcast_f32x4(a).as_f32x8(); + transmute(simd_select_bitmask(k, broadcast, f32x8::ZERO)) + } } /// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst. @@ -26043,8 +27519,8 @@ pub unsafe fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 { #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { - simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) +pub fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 0, 1, 2, 3]) } } /// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -26053,9 +27529,11 @@ pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { - let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); - transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) +pub fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, src.as_f64x8())) + } } /// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26064,9 +27542,11 @@ pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) #[inline] #[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vper #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { - let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); - transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) +pub fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { + unsafe { + let broadcast = _mm512_broadcast_f64x4(a).as_f64x8(); + transmute(simd_select_bitmask(k, broadcast, f64x8::ZERO)) + } } /// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. @@ -26076,8 +27556,8 @@ pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd -pub unsafe fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16())) +pub fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16())) } } /// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. @@ -26087,8 +27567,8 @@ pub unsafe fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> _ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd -pub unsafe fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - transmute(simd_select_bitmask(k, b.as_i32x8(), a.as_i32x8())) +pub fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x8(), a.as_i32x8())) } } /// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst. 
@@ -26098,8 +27578,8 @@ pub unsafe fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd -pub unsafe fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(simd_select_bitmask(k, b.as_i32x4(), a.as_i32x4())) +pub fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i32x4(), a.as_i32x4())) } } /// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. @@ -26109,8 +27589,8 @@ pub unsafe fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq -pub unsafe fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8())) +pub fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8())) } } /// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. @@ -26120,8 +27600,8 @@ pub unsafe fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq -pub unsafe fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - transmute(simd_select_bitmask(k, b.as_i64x4(), a.as_i64x4())) +pub fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x4(), a.as_i64x4())) } } /// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst. @@ -26131,8 +27611,8 @@ pub unsafe fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq -pub unsafe fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - transmute(simd_select_bitmask(k, b.as_i64x2(), a.as_i64x2())) +pub fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_select_bitmask(k, b.as_i64x2(), a.as_i64x2())) } } /// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. @@ -26142,8 +27622,8 @@ pub unsafe fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps -pub unsafe fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { - transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16())) +pub fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16())) } } /// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. 
@@ -26153,8 +27633,8 @@ pub unsafe fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps -pub unsafe fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { - transmute(simd_select_bitmask(k, b.as_f32x8(), a.as_f32x8())) +pub fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x8(), a.as_f32x8())) } } /// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst. @@ -26164,8 +27644,8 @@ pub unsafe fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps -pub unsafe fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(simd_select_bitmask(k, b.as_f32x4(), a.as_f32x4())) +pub fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(simd_select_bitmask(k, b.as_f32x4(), a.as_f32x4())) } } /// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. @@ -26175,8 +27655,8 @@ pub unsafe fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd -pub unsafe fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { - transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8())) +pub fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8())) } } /// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. @@ -26186,8 +27666,8 @@ pub unsafe fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m51 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd -pub unsafe fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { - transmute(simd_select_bitmask(k, b.as_f64x4(), a.as_f64x4())) +pub fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x4(), a.as_f64x4())) } } /// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst. 
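[Editorial note, not patch content] The blend intrinsics above all reduce to `simd_select_bitmask(k, b, a)`: bit i of the control mask picks the i-th element of `b`, a cleared bit keeps the element of `a`. A scalar reference model for the 32-bit case:

    fn mask_blend_epi32_model(k: u16, a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
        let mut dst = [0i32; 16];
        for i in 0..16 {
            // Bit i of the control mask selects b[i]; a cleared bit keeps a[i].
            dst[i] = if (k >> i) & 1 == 1 { b[i] } else { a[i] };
        }
        dst
    }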
@@ -26197,8 +27677,8 @@ pub unsafe fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd -pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(simd_select_bitmask(k, b.as_f64x2(), a.as_f64x2())) +pub fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(simd_select_bitmask(k, b.as_f64x2(), a.as_f64x2())) } } /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst. @@ -26211,75 +27691,77 @@ pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_alignr_epi32(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x16(); - let b = b.as_i32x16(); - let imm8: i32 = IMM8 % 16; - let r: i32x16 = match imm8 { - 0 => simd_shuffle!( - a, - b, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ], - ), - 1 => simd_shuffle!( - a, - b, - [ - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, - ], - ), - 2 => simd_shuffle!( - a, - b, - [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1], - ), - 3 => simd_shuffle!( - a, - b, - [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2], - ), - 4 => simd_shuffle!( - a, - b, - [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3], - ), - 5 => simd_shuffle!( - a, - b, - [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4], - ), - 6 => simd_shuffle!( - a, - b, - [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5], - ), - 7 => simd_shuffle!( - a, - b, - [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6], - ), - 8 => simd_shuffle!( - a, - b, - [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7], - ), - 9 => simd_shuffle!( - a, - b, - [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8], - ), - 10 => simd_shuffle!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - 11 => simd_shuffle!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), - 12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), - 13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), - 14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]), - 15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]), - _ => unreachable_unchecked(), - }; - transmute(r) +pub fn _mm512_alignr_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x16(); + let b = b.as_i32x16(); + let imm8: i32 = IMM8 % 16; + let r: i32x16 = match imm8 { + 0 => simd_shuffle!( + a, + b, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ], + ), + 1 => simd_shuffle!( + a, + b, + [ + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, + ], + ), + 2 => simd_shuffle!( + a, + b, + [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1], + ), + 3 => simd_shuffle!( + a, + b, + [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2], + ), + 4 => simd_shuffle!( + a, + b, + [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3], + ), + 5 => 
simd_shuffle!( + a, + b, + [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4], + ), + 6 => simd_shuffle!( + a, + b, + [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5], + ), + 7 => simd_shuffle!( + a, + b, + [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6], + ), + 8 => simd_shuffle!( + a, + b, + [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7], + ), + 9 => simd_shuffle!( + a, + b, + [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8], + ), + 10 => simd_shuffle!(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + 11 => simd_shuffle!(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), + 12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]), + 13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), + 14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]), + 15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]), + _ => unreachable_unchecked(), + }; + transmute(r) + } } /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26290,15 +27772,17 @@ pub unsafe fn _mm512_alignr_epi32(a: __m512i, b: __m512i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_alignr_epi32( +pub fn _mm512_mask_alignr_epi32( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), src.as_i32x16())) + } } /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and stores the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26309,14 +27793,12 @@ pub unsafe fn _mm512_mask_alignr_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_alignr_epi32( - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) +pub fn _mm512_maskz_alignr_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x16(), i32x16::ZERO)) + } } /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst. 
@@ -26329,23 +27811,25 @@ pub unsafe fn _mm512_maskz_alignr_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_alignr_epi32(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let imm8: i32 = IMM8 % 8; - let r: i32x8 = match imm8 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), - 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), - 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), - 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), - 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), - 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), - 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), - _ => unreachable_unchecked(), - }; - transmute(r) +pub fn _mm256_alignr_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let imm8: i32 = IMM8 % 8; + let r: i32x8 = match imm8 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), + 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), + 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), + 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), + 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), + 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), + 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), + _ => unreachable_unchecked(), + }; + transmute(r) + } } /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26356,15 +27840,17 @@ pub unsafe fn _mm256_alignr_epi32(a: __m256i, b: __m256i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_alignr_epi32( +pub fn _mm256_mask_alignr_epi32( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), src.as_i32x8())) + } } /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
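[Editorial note, not patch content] The valignd match tables above encode the following operation: concatenate the two inputs with `b` in the low half and `a` in the high half, shift right by `IMM8 % N` 32-bit lanes, and keep the low N lanes. A generic reference model, for illustration only:

    fn alignr_epi32_model<const N: usize>(a: [i32; N], b: [i32; N], imm8: u32) -> [i32; N] {
        let shift = (imm8 as usize) % N;
        // Conceptual concatenation: `b` forms the low half, `a` the high half.
        let mut concat = vec![0i32; 2 * N];
        concat[..N].copy_from_slice(&b);
        concat[N..].copy_from_slice(&a);
        let mut dst = [0i32; N];
        dst.copy_from_slice(&concat[shift..shift + N]);
        dst
    }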
@@ -26375,14 +27861,12 @@ pub unsafe fn _mm256_mask_alignr_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_alignr_epi32( - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) +pub fn _mm256_maskz_alignr_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x8(), i32x8::ZERO)) + } } /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst. @@ -26395,19 +27879,21 @@ pub unsafe fn _mm256_maskz_alignr_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignd #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_alignr_epi32(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_i32x4(); - let b = b.as_i32x4(); - let imm8: i32 = IMM8 % 4; - let r: i32x4 = match imm8 { - 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), - 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), - 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), - _ => unreachable_unchecked(), - }; - transmute(r) +pub fn _mm_alignr_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_i32x4(); + let b = b.as_i32x4(); + let imm8: i32 = IMM8 % 4; + let r: i32x4 = match imm8 { + 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), + _ => unreachable_unchecked(), + }; + transmute(r) + } } /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26418,15 +27904,17 @@ pub unsafe fn _mm_alignr_epi32(a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_alignr_epi32( +pub fn _mm_mask_alignr_epi32( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x4(), src.as_i32x4())) + } } /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -26436,15 +27924,13 @@ pub unsafe fn _mm_mask_alignr_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignd, IMM8 = 1))] -#[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_alignr_epi32( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi32::(a, b); - transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) +#[rustc_legacy_const_generics(3)] +pub fn _mm_maskz_alignr_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi32::(a, b); + transmute(simd_select_bitmask(k, r.as_i32x4(), i32x4::ZERO)) + } } /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst. @@ -26457,21 +27943,23 @@ pub unsafe fn _mm_maskz_alignr_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_alignr_epi64(a: __m512i, b: __m512i) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let imm8: i32 = IMM8 % 8; - let r: i64x8 = match imm8 { - 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), - 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), - 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), - 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), - 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), - 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), - 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), - 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), - _ => unreachable_unchecked(), - }; - transmute(r) +pub fn _mm512_alignr_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 8; + let r: i64x8 = match imm8 { + 0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]), + 1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]), + 2 => simd_shuffle!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]), + 3 => simd_shuffle!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]), + 4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]), + 5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]), + 6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]), + 7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]), + _ => unreachable_unchecked(), + }; + transmute(r) + } } /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -26482,15 +27970,17 @@ pub unsafe fn _mm512_alignr_epi64(a: __m512i, b: __m512i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_alignr_epi64( +pub fn _mm512_mask_alignr_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), src.as_i64x8())) + } } /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and stores the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26501,14 +27991,12 @@ pub unsafe fn _mm512_mask_alignr_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_alignr_epi64( - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm512_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) +pub fn _mm512_maskz_alignr_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm512_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x8(), i64x8::ZERO)) + } } /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst. @@ -26521,17 +28009,19 @@ pub unsafe fn _mm512_maskz_alignr_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_alignr_epi64(a: __m256i, b: __m256i) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let imm8: i32 = IMM8 % 4; - let r: i64x4 = match imm8 { - 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), - 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), - 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), - 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), - _ => unreachable_unchecked(), - }; - transmute(r) +pub fn _mm256_alignr_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 4; + let r: i64x4 = match imm8 { + 0 => simd_shuffle!(a, b, [4, 5, 6, 7]), + 1 => simd_shuffle!(a, b, [5, 6, 7, 0]), + 2 => simd_shuffle!(a, b, [6, 7, 0, 1]), + 3 => simd_shuffle!(a, b, [7, 0, 1, 2]), + _ => unreachable_unchecked(), + }; + transmute(r) + } } /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -26542,15 +28032,17 @@ pub unsafe fn _mm256_alignr_epi64(a: __m256i, b: __m256i) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_alignr_epi64( +pub fn _mm256_mask_alignr_epi64( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), src.as_i64x4())) + } } /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26561,14 +28053,12 @@ pub unsafe fn _mm256_mask_alignr_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_alignr_epi64( - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm256_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) +pub fn _mm256_maskz_alignr_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm256_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x4(), i64x4::ZERO)) + } } /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst. @@ -26581,15 +28071,17 @@ pub unsafe fn _mm256_maskz_alignr_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignq #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_alignr_epi64(a: __m128i, b: __m128i) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let imm8: i32 = IMM8 % 2; - let r: i64x2 = match imm8 { - 0 => simd_shuffle!(a, b, [2, 3]), - 1 => simd_shuffle!(a, b, [3, 0]), - _ => unreachable_unchecked(), - }; - transmute(r) +pub fn _mm_alignr_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let imm8: i32 = IMM8 % 2; + let r: i64x2 = match imm8 { + 0 => simd_shuffle!(a, b, [2, 3]), + 1 => simd_shuffle!(a, b, [3, 0]), + _ => unreachable_unchecked(), + }; + transmute(r) + } } /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
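[Editorial note, not patch content] The const-generic alignr intrinsics keep their `IMM8` parameter after the change; only the `unsafe` on the signature is gone. A minimal usage sketch, under the same assumptions as the earlier broadcast example:

    use std::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    fn alignr_by_one(a: __m512i, b: __m512i) -> __m512i {
        // IMM8 is still supplied as a const argument; no unsafe block needed
        // inside an avx512f-enabled caller.
        _mm512_alignr_epi64::<1>(a, b)
    }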
@@ -26600,15 +28092,17 @@ pub unsafe fn _mm_alignr_epi64(a: __m128i, b: __m128i) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_alignr_epi64( +pub fn _mm_mask_alignr_epi64( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x2(), src.as_i64x2())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x2(), src.as_i64x2())) + } } /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26619,14 +28113,12 @@ pub unsafe fn _mm_mask_alignr_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(valignq, IMM8 = 1))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_alignr_epi64( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let r = _mm_alignr_epi64::(a, b); - transmute(simd_select_bitmask(k, r.as_i64x2(), i64x2::ZERO)) +pub fn _mm_maskz_alignr_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let r = _mm_alignr_epi64::(a, b); + transmute(simd_select_bitmask(k, r.as_i64x2(), i64x2::ZERO)) + } } /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst. @@ -26636,8 +28128,8 @@ pub unsafe fn _mm_maskz_alignr_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] //should be vpandd, but generate vpandq -pub unsafe fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_and(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i32x16(), b.as_i32x16())) } } /// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26647,9 +28139,11 @@ pub unsafe fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let and = _mm512_and_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, and, src.as_i32x16())) +pub fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, and, src.as_i32x16())) + } } /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -26659,9 +28153,11 @@ pub unsafe fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let and = _mm512_and_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, and, i32x16::ZERO)) +pub fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, and, i32x16::ZERO)) + } } /// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26671,9 +28167,11 @@ pub unsafe fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm256_mask_and_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let and = simd_and(a.as_i32x8(), b.as_i32x8()); - transmute(simd_select_bitmask(k, and, src.as_i32x8())) +pub fn _mm256_mask_and_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, and, src.as_i32x8())) + } } /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26683,9 +28181,11 @@ pub unsafe fn _mm256_mask_and_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm256_maskz_and_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let and = simd_and(a.as_i32x8(), b.as_i32x8()); - transmute(simd_select_bitmask(k, and, i32x8::ZERO)) +pub fn _mm256_maskz_and_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, and, i32x8::ZERO)) + } } /// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26695,9 +28195,11 @@ pub unsafe fn _mm256_maskz_and_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm_mask_and_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let and = simd_and(a.as_i32x4(), b.as_i32x4()); - transmute(simd_select_bitmask(k, and, src.as_i32x4())) +pub fn _mm_mask_and_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, and, src.as_i32x4())) + } } /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -26707,9 +28209,11 @@ pub unsafe fn _mm_mask_and_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandd))] -pub unsafe fn _mm_maskz_and_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let and = simd_and(a.as_i32x4(), b.as_i32x4()); - transmute(simd_select_bitmask(k, and, i32x4::ZERO)) +pub fn _mm_maskz_and_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, and, i32x4::ZERO)) + } } /// Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst. @@ -26719,8 +28223,8 @@ pub unsafe fn _mm_maskz_and_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_and(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i64x8(), b.as_i64x8())) } } /// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26730,9 +28234,11 @@ pub unsafe fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let and = _mm512_and_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, and, src.as_i64x8())) +pub fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, and, src.as_i64x8())) + } } /// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26742,9 +28248,11 @@ pub unsafe fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let and = _mm512_and_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, and, i64x8::ZERO)) +pub fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let and = _mm512_and_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, and, i64x8::ZERO)) + } } /// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -26754,9 +28262,11 @@ pub unsafe fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm256_mask_and_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let and = simd_and(a.as_i64x4(), b.as_i64x4()); - transmute(simd_select_bitmask(k, and, src.as_i64x4())) +pub fn _mm256_mask_and_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, and, src.as_i64x4())) + } } /// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26766,9 +28276,11 @@ pub unsafe fn _mm256_mask_and_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm256_maskz_and_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let and = simd_and(a.as_i64x4(), b.as_i64x4()); - transmute(simd_select_bitmask(k, and, i64x4::ZERO)) +pub fn _mm256_maskz_and_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let and = simd_and(a.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, and, i64x4::ZERO)) + } } /// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26778,9 +28290,11 @@ pub unsafe fn _mm256_maskz_and_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm_mask_and_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let and = simd_and(a.as_i64x2(), b.as_i64x2()); - transmute(simd_select_bitmask(k, and, src.as_i64x2())) +pub fn _mm_mask_and_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, and, src.as_i64x2())) + } } /// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26790,9 +28304,11 @@ pub unsafe fn _mm_mask_and_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm_maskz_and_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let and = simd_and(a.as_i64x2(), b.as_i64x2()); - transmute(simd_select_bitmask(k, and, i64x2::ZERO)) +pub fn _mm_maskz_and_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let and = simd_and(a.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, and, i64x2::ZERO)) + } } /// Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst. 
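[Editorial note, not patch content] The AND hunks above follow the same writemask/zeromask split as the rest of the file: masked-off lanes either keep `src` or become zero. Scalar reference models, for illustration only:

    fn mask_and_epi32_model(src: [i32; 16], k: u16, a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
        let mut dst = src; // writemask form: masked-off lanes keep `src`
        for i in 0..16 {
            if (k >> i) & 1 == 1 {
                dst[i] = a[i] & b[i];
            }
        }
        dst
    }

    fn maskz_and_epi32_model(k: u16, a: [i32; 16], b: [i32; 16]) -> [i32; 16] {
        // zeromask form: masked-off lanes become zero
        mask_and_epi32_model([0i32; 16], k, a, b)
    }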
@@ -26802,8 +28318,8 @@ pub unsafe fn _mm_maskz_and_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandq))] -pub unsafe fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_and(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_and(a.as_i32x16(), b.as_i32x16())) } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. @@ -26813,8 +28329,8 @@ pub unsafe fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_or(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_or(a.as_i32x16(), b.as_i32x16())) } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26824,9 +28340,11 @@ pub unsafe fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpord))] -pub unsafe fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let or = _mm512_or_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, or, src.as_i32x16())) +pub fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, or, src.as_i32x16())) + } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26836,9 +28354,11 @@ pub unsafe fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpord))] -pub unsafe fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let or = _mm512_or_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, or, i32x16::ZERO)) +pub fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, or, i32x16::ZERO)) + } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. @@ -26848,8 +28368,8 @@ pub unsafe fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vor))] //should be vpord -pub unsafe fn _mm256_or_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_or(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_or_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
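A small aside that may help reviewers: for the unmasked forms the lane width is irrelevant, since a full-width bitwise AND produces the same 512 bits however they are grouped; the epi32/epi64/si512 variants only diverge once a per-lane mask is applied. A minimal sketch (the helper name `and_three_ways` is illustrative; assumes avx512f on the caller):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn and_three_ways(a: __m512i, b: __m512i) -> (__m512i, __m512i, __m512i) {
    // All three results carry identical bits; only the masked variants care about lane width.
    (
        _mm512_and_epi32(a, b),
        _mm512_and_epi64(a, b),
        _mm512_and_si512(a, b),
    )
}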
@@ -26859,9 +28379,11 @@ pub unsafe fn _mm256_or_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpord))] -pub unsafe fn _mm256_mask_or_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let or = _mm256_or_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, or, src.as_i32x8())) +pub fn _mm256_mask_or_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, or, src.as_i32x8())) + } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26871,9 +28393,11 @@ pub unsafe fn _mm256_mask_or_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpord))] -pub unsafe fn _mm256_maskz_or_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let or = _mm256_or_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, or, i32x8::ZERO)) +pub fn _mm256_maskz_or_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, or, i32x8::ZERO)) + } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst. @@ -26883,8 +28407,8 @@ pub unsafe fn _mm256_maskz_or_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vor))] //should be vpord -pub unsafe fn _mm_or_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_or(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_or_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_or(a.as_i32x4(), b.as_i32x4())) } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26894,9 +28418,11 @@ pub unsafe fn _mm_or_epi32(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpord))] -pub unsafe fn _mm_mask_or_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let or = _mm_or_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, or, src.as_i32x4())) +pub fn _mm_mask_or_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, or, src.as_i32x4())) + } } /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -26906,9 +28432,11 @@ pub unsafe fn _mm_mask_or_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpord))] -pub unsafe fn _mm_maskz_or_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let or = _mm_or_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, or, i32x4::ZERO)) +pub fn _mm_maskz_or_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, or, i32x4::ZERO)) + } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. @@ -26918,8 +28446,8 @@ pub unsafe fn _mm_maskz_or_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_or(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_or(a.as_i64x8(), b.as_i64x8())) } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26929,9 +28457,11 @@ pub unsafe fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let or = _mm512_or_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, or, src.as_i64x8())) +pub fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, or, src.as_i64x8())) + } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26941,9 +28471,11 @@ pub unsafe fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let or = _mm512_or_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, or, i64x8::ZERO)) +pub fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let or = _mm512_or_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, or, i64x8::ZERO)) + } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. 
@@ -26953,8 +28485,8 @@ pub unsafe fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m5 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vor))] //should be vporq -pub unsafe fn _mm256_or_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_or(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_or_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_or(a.as_i64x4(), b.as_i64x4())) } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -26964,9 +28496,11 @@ pub unsafe fn _mm256_or_epi64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm256_mask_or_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let or = _mm256_or_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, or, src.as_i64x4())) +pub fn _mm256_mask_or_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, or, src.as_i64x4())) + } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -26976,9 +28510,11 @@ pub unsafe fn _mm256_mask_or_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm256_maskz_or_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let or = _mm256_or_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, or, i64x4::ZERO)) +pub fn _mm256_maskz_or_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let or = _mm256_or_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, or, i64x4::ZERO)) + } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the resut in dst. @@ -26988,8 +28524,8 @@ pub unsafe fn _mm256_maskz_or_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m2 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vor))] //should be vporq -pub unsafe fn _mm_or_epi64(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_or(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_or_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_or(a.as_i64x2(), b.as_i64x2())) } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -26999,9 +28535,11 @@ pub unsafe fn _mm_or_epi64(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm_mask_or_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let or = _mm_or_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, or, src.as_i64x2())) +pub fn _mm_mask_or_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, or, src.as_i64x2())) + } } /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27011,9 +28549,11 @@ pub unsafe fn _mm_mask_or_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm_maskz_or_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let or = _mm_or_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, or, i64x2::ZERO)) +pub fn _mm_maskz_or_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let or = _mm_or_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, or, i64x2::ZERO)) + } } /// Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst. @@ -27023,8 +28563,8 @@ pub unsafe fn _mm_maskz_or_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vporq))] -pub unsafe fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_or(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_or(a.as_i32x16(), b.as_i32x16())) } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. @@ -27034,8 +28574,8 @@ pub unsafe fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] //should be vpxord -pub unsafe fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
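One idiom worth noting here: XOR against an all-ones vector is a per-lane bitwise NOT, and it is exactly the building block the andnot intrinsics further down are composed from. A minimal sketch (the helper name `not_epi32` is illustrative; assumes avx512f on the caller and that `_mm512_set1_epi32` is likewise safe after this patch):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn not_epi32(a: __m512i) -> __m512i {
    // u32::MAX as i32 == -1, so set1 yields an all-ones vector; XOR with it flips every bit.
    _mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32))
}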
@@ -27045,9 +28585,11 @@ pub unsafe fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxord))] -pub unsafe fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let xor = _mm512_xor_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, xor, src.as_i32x16())) +pub fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, xor, src.as_i32x16())) + } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27057,9 +28599,11 @@ pub unsafe fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxord))] -pub unsafe fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let xor = _mm512_xor_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, xor, i32x16::ZERO)) +pub fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, xor, i32x16::ZERO)) + } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. @@ -27069,8 +28613,8 @@ pub unsafe fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxor))] //should be vpxord -pub unsafe fn _mm256_xor_epi32(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_xor(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_xor_epi32(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_xor(a.as_i32x8(), b.as_i32x8())) } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -27080,9 +28624,11 @@ pub unsafe fn _mm256_xor_epi32(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxord))] -pub unsafe fn _mm256_mask_xor_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let xor = _mm256_xor_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, xor, src.as_i32x8())) +pub fn _mm256_mask_xor_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, xor, src.as_i32x8())) + } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -27092,9 +28638,11 @@ pub unsafe fn _mm256_mask_xor_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxord))] -pub unsafe fn _mm256_maskz_xor_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let xor = _mm256_xor_epi32(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, xor, i32x8::ZERO)) +pub fn _mm256_maskz_xor_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi32(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, xor, i32x8::ZERO)) + } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst. @@ -27104,8 +28652,8 @@ pub unsafe fn _mm256_maskz_xor_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxor))] //should be vpxord -pub unsafe fn _mm_xor_epi32(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_xor_epi32(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_xor(a.as_i32x4(), b.as_i32x4())) } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -27115,9 +28663,11 @@ pub unsafe fn _mm_xor_epi32(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxord))] -pub unsafe fn _mm_mask_xor_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let xor = _mm_xor_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, xor, src.as_i32x4())) +pub fn _mm_mask_xor_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, xor, src.as_i32x4())) + } } /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27127,9 +28677,11 @@ pub unsafe fn _mm_mask_xor_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxord))] -pub unsafe fn _mm_maskz_xor_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let xor = _mm_xor_epi32(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, xor, i32x4::ZERO)) +pub fn _mm_maskz_xor_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi32(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, xor, i32x4::ZERO)) + } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. 
@@ -27139,8 +28691,8 @@ pub unsafe fn _mm_maskz_xor_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_xor(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i64x8(), b.as_i64x8())) } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -27150,9 +28702,11 @@ pub unsafe fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let xor = _mm512_xor_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, xor, src.as_i64x8())) +pub fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, xor, src.as_i64x8())) + } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27162,9 +28716,11 @@ pub unsafe fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let xor = _mm512_xor_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, xor, i64x8::ZERO)) +pub fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let xor = _mm512_xor_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, xor, i64x8::ZERO)) + } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. @@ -27174,8 +28730,8 @@ pub unsafe fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxor))] //should be vpxorq -pub unsafe fn _mm256_xor_epi64(a: __m256i, b: __m256i) -> __m256i { - transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_xor_epi64(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -27185,9 +28741,11 @@ pub unsafe fn _mm256_xor_epi64(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm256_mask_xor_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let xor = _mm256_xor_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, xor, src.as_i64x4())) +pub fn _mm256_mask_xor_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, xor, src.as_i64x4())) + } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27197,9 +28755,11 @@ pub unsafe fn _mm256_mask_xor_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm256_maskz_xor_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let xor = _mm256_xor_epi64(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, xor, i64x4::ZERO)) +pub fn _mm256_maskz_xor_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let xor = _mm256_xor_epi64(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, xor, i64x4::ZERO)) + } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst. @@ -27209,8 +28769,8 @@ pub unsafe fn _mm256_maskz_xor_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vxor))] //should be vpxorq -pub unsafe fn _mm_xor_epi64(a: __m128i, b: __m128i) -> __m128i { - transmute(simd_xor(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_xor_epi64(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(simd_xor(a.as_i64x2(), b.as_i64x2())) } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -27220,9 +28780,11 @@ pub unsafe fn _mm_xor_epi64(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm_mask_xor_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let xor = _mm_xor_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, xor, src.as_i64x2())) +pub fn _mm_mask_xor_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, xor, src.as_i64x2())) + } } /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -27232,9 +28794,11 @@ pub unsafe fn _mm_mask_xor_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m12 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm_maskz_xor_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let xor = _mm_xor_epi64(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, xor, i64x2::ZERO)) +pub fn _mm_maskz_xor_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let xor = _mm_xor_epi64(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, xor, i64x2::ZERO)) + } } /// Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst. @@ -27244,8 +28808,8 @@ pub unsafe fn _mm_maskz_xor_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpxorq))] -pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i { - transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(simd_xor(a.as_i32x16(), b.as_i32x16())) } } /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst. @@ -27255,7 +28819,7 @@ pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd -pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i { +pub fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i { _mm512_and_epi32(_mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32)), b) } @@ -27266,14 +28830,11 @@ pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnd))] -pub unsafe fn _mm512_mask_andnot_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, andnot, src.as_i32x16())) +pub fn _mm512_mask_andnot_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, andnot, src.as_i32x16())) + } } /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27283,9 +28844,11 @@ pub unsafe fn _mm512_mask_andnot_epi32( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnd))] -pub unsafe fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { - let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, andnot, i32x16::ZERO)) +pub fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi32(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, andnot, i32x16::ZERO)) + } } /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
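For readers skimming the andnot family: per lane this is simply NOT of `a` followed by AND with `b`, which is what the xor-with-all-ones composition above implements. A scalar model (illustrative only):

fn andnot_lane(a: i32, b: i32) -> i32 {
    // Matches and(xor(a, all_ones), b): complement `a`, then AND with `b`.
    !a & b
}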
@@ -27295,15 +28858,12 @@ pub unsafe fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnd))] -pub unsafe fn _mm256_mask_andnot_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); - let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); - transmute(simd_select_bitmask(k, andnot, src.as_i32x8())) +pub fn _mm256_mask_andnot_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, andnot, src.as_i32x8())) + } } /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27313,10 +28873,12 @@ pub unsafe fn _mm256_mask_andnot_epi32( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnd))] -pub unsafe fn _mm256_maskz_andnot_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); - let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); - transmute(simd_select_bitmask(k, andnot, i32x8::ZERO)) +pub fn _mm256_maskz_andnot_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi32(a, _mm256_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x8(), b.as_i32x8()); + transmute(simd_select_bitmask(k, andnot, i32x8::ZERO)) + } } /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -27326,10 +28888,12 @@ pub unsafe fn _mm256_maskz_andnot_epi32(k: __mmask8, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnd))] -pub unsafe fn _mm_mask_andnot_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); - let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); - transmute(simd_select_bitmask(k, andnot, src.as_i32x4())) +pub fn _mm_mask_andnot_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, andnot, src.as_i32x4())) + } } /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -27339,10 +28903,12 @@ pub unsafe fn _mm_mask_andnot_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnd))] -pub unsafe fn _mm_maskz_andnot_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); - let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); - transmute(simd_select_bitmask(k, andnot, i32x4::ZERO)) +pub fn _mm_maskz_andnot_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi32(a, _mm_set1_epi32(u32::MAX as i32)); + let andnot = simd_and(not.as_i32x4(), b.as_i32x4()); + transmute(simd_select_bitmask(k, andnot, i32x4::ZERO)) + } } /// Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in a and then AND with b, and store the results in dst. @@ -27352,7 +28918,7 @@ pub unsafe fn _mm_maskz_andnot_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd -pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i { +pub fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i { _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) } @@ -27363,14 +28929,11 @@ pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm512_mask_andnot_epi64( - src: __m512i, - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) +pub fn _mm512_mask_andnot_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, andnot, src.as_i64x8())) + } } /// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27380,9 +28943,11 @@ pub unsafe fn _mm512_mask_andnot_epi64( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { - let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, andnot, i64x8::ZERO)) +pub fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let andnot = _mm512_andnot_epi64(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, andnot, i64x8::ZERO)) + } } /// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
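Taken together, and/andnot/or give the classic bitwise-select idiom, which may serve as a sanity check of the semantics above: bits of `a` are taken where `mask` is 1 and bits of `b` where it is 0. A hedged sketch (the helper name `bit_select` is illustrative; assumes avx512f on the caller):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn bit_select(mask: __m512i, a: __m512i, b: __m512i) -> __m512i {
    // (mask & a) | (!mask & b), built from the intrinsics changed in this patch.
    _mm512_or_epi64(_mm512_and_epi64(mask, a), _mm512_andnot_epi64(mask, b))
}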
@@ -27392,15 +28957,12 @@ pub unsafe fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm256_mask_andnot_epi64( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); - let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); - transmute(simd_select_bitmask(k, andnot, src.as_i64x4())) +pub fn _mm256_mask_andnot_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, andnot, src.as_i64x4())) + } } /// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -27410,10 +28972,12 @@ pub unsafe fn _mm256_mask_andnot_epi64( #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm256_maskz_andnot_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { - let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); - let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); - transmute(simd_select_bitmask(k, andnot, i64x4::ZERO)) +pub fn _mm256_maskz_andnot_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let not = _mm256_xor_epi64(a, _mm256_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x4(), b.as_i64x4()); + transmute(simd_select_bitmask(k, andnot, i64x4::ZERO)) + } } /// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -27423,10 +28987,12 @@ pub unsafe fn _mm256_maskz_andnot_epi64(k: __mmask8, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm_mask_andnot_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); - let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); - transmute(simd_select_bitmask(k, andnot, src.as_i64x2())) +pub fn _mm_mask_andnot_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, andnot, src.as_i64x2())) + } } /// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
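The 128/256-bit masked forms additionally require avx512vl, so a safe caller has to enable both features. A minimal sketch (the helper name and the literal mask are illustrative):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
fn andnot_low_lanes(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
    // Four 64-bit lanes: mask bits 0-1 keep the andnot result,
    // lanes 2-3 are copied from `src`.
    _mm256_mask_andnot_epi64(src, 0b0011, a, b)
}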
@@ -27436,10 +29002,12 @@ pub unsafe fn _mm_mask_andnot_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __ #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm_maskz_andnot_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); - let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); - transmute(simd_select_bitmask(k, andnot, i64x2::ZERO)) +pub fn _mm_maskz_andnot_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let not = _mm_xor_epi64(a, _mm_set1_epi64x(u64::MAX as i64)); + let andnot = simd_and(not.as_i64x2(), b.as_i64x2()); + transmute(simd_select_bitmask(k, andnot, i64x2::ZERO)) + } } /// Compute the bitwise NOT of 512 bits (representing integer data) in a and then AND with b, and store the result in dst. @@ -27449,7 +29017,7 @@ pub unsafe fn _mm_maskz_andnot_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpandnq))] -pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { +pub fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b) } @@ -27459,7 +29027,7 @@ pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _cvtmask16_u32(a: __mmask16) -> u32 { +pub fn _cvtmask16_u32(a: __mmask16) -> u32 { a as u32 } @@ -27469,7 +29037,7 @@ pub unsafe fn _cvtmask16_u32(a: __mmask16) -> u32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _cvtu32_mask16(a: u32) -> __mmask16 { +pub fn _cvtu32_mask16(a: u32) -> __mmask16 { a as __mmask16 } @@ -27480,7 +29048,7 @@ pub unsafe fn _cvtu32_mask16(a: u32) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw -pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { a & b } @@ -27491,7 +29059,7 @@ pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw -pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { a & b } @@ -27502,7 +29070,7 @@ pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw -pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { a | b } @@ -27513,7 +29081,7 @@ pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw -pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) 
-> __mmask16 { +pub fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { a | b } @@ -27524,7 +29092,7 @@ pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw -pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { a ^ b } @@ -27535,7 +29103,7 @@ pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw -pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { a ^ b } @@ -27545,7 +29113,7 @@ pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 { +pub fn _knot_mask16(a: __mmask16) -> __mmask16 { a ^ 0b11111111_11111111 } @@ -27555,7 +29123,7 @@ pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 { +pub fn _mm512_knot(a: __mmask16) -> __mmask16 { a ^ 0b11111111_11111111 } @@ -27566,7 +29134,7 @@ pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw -pub unsafe fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { _mm512_kand(_mm512_knot(a), b) } @@ -27577,7 +29145,7 @@ pub unsafe fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(not))] // generate normal and code instead of kandw -pub unsafe fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 { _mm512_kand(_mm512_knot(a), b) } @@ -27588,7 +29156,7 @@ pub unsafe fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw -pub unsafe fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { _mm512_knot(_mm512_kxor(a, b)) } @@ -27599,7 +29167,7 @@ pub unsafe fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of kandw -pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 { _mm512_knot(_mm512_kxor(a, b)) } @@ -27623,7 +29191,7 @@ pub unsafe fn _kortest_mask16_u8(a: __mmask16, b: __mmask16, all_ones: *mut u8) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", 
issue = "111137")] -pub unsafe fn _kortestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { +pub fn _kortestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { (_kor_mask16(a, b) == 0xffff) as u8 } @@ -27634,7 +29202,7 @@ pub unsafe fn _kortestc_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { +pub fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { (_kor_mask16(a, b) == 0) as u8 } @@ -27645,7 +29213,7 @@ pub unsafe fn _kortestz_mask16_u8(a: __mmask16, b: __mmask16) -> u8 { #[target_feature(enable = "avx512f")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kshiftli_mask16(a: __mmask16) -> __mmask16 { +pub fn _kshiftli_mask16(a: __mmask16) -> __mmask16 { a << COUNT } @@ -27656,7 +29224,7 @@ pub unsafe fn _kshiftli_mask16(a: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _kshiftri_mask16(a: __mmask16) -> __mmask16 { +pub fn _kshiftri_mask16(a: __mmask16) -> __mmask16 { a >> COUNT } @@ -27687,7 +29255,7 @@ pub unsafe fn _store_mask16(mem_addr: *mut __mmask16, a: __mmask16) { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw -pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { +pub fn _mm512_kmov(a: __mmask16) -> __mmask16 { a } @@ -27697,7 +29265,7 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { +pub fn _mm512_int2mask(mask: i32) -> __mmask16 { mask as u16 } @@ -27708,7 +29276,7 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw -pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { +pub fn _mm512_mask2int(k1: __mmask16) -> i32 { k1 as i32 } @@ -27719,7 +29287,7 @@ pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckbw -pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 { +pub fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 { ((a & 0xff) << 8) | (b & 0xff) } @@ -27730,7 +29298,7 @@ pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(cmp))] // generate normal and code instead of kortestw -pub unsafe fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 { +pub fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 { let r = (a | b) == 0b11111111_11111111; r as i32 } @@ -27742,7 +29310,7 @@ pub unsafe fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(xor))] // generate normal and code instead of 
kortestw -pub unsafe fn _mm512_kortestz(a: __mmask16, b: __mmask16) -> i32 { +pub fn _mm512_kortestz(a: __mmask16, b: __mmask16) -> i32 { let r = (a | b) == 0; r as i32 } @@ -27754,7 +29322,7 @@ pub unsafe fn _mm512_kortestz(a: __mmask16, b: __mmask16) -> i32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmd))] -pub unsafe fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { let and = _mm512_and_epi32(a, b); let zero = _mm512_setzero_si512(); _mm512_cmpneq_epi32_mask(and, zero) @@ -27767,7 +29335,7 @@ pub unsafe fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmd))] -pub unsafe fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { let and = _mm512_and_epi32(a, b); let zero = _mm512_setzero_si512(); _mm512_mask_cmpneq_epi32_mask(k, and, zero) @@ -27780,7 +29348,7 @@ pub unsafe fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmd))] -pub unsafe fn _mm256_test_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_test_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_cmpneq_epi32_mask(and, zero) @@ -27793,7 +29361,7 @@ pub unsafe fn _mm256_test_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmd))] -pub unsafe fn _mm256_mask_test_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_test_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_mask_cmpneq_epi32_mask(k, and, zero) @@ -27806,7 +29374,7 @@ pub unsafe fn _mm256_mask_test_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmd))] -pub unsafe fn _mm_test_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_test_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_cmpneq_epi32_mask(and, zero) @@ -27819,7 +29387,7 @@ pub unsafe fn _mm_test_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmd))] -pub unsafe fn _mm_mask_test_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_test_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_mask_cmpneq_epi32_mask(k, and, zero) @@ -27832,7 +29400,7 @@ pub unsafe fn _mm_mask_test_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmq))] -pub unsafe fn _mm512_test_epi64_mask(a: 
__m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { let and = _mm512_and_epi64(a, b); let zero = _mm512_setzero_si512(); _mm512_cmpneq_epi64_mask(and, zero) @@ -27845,7 +29413,7 @@ pub unsafe fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmq))] -pub unsafe fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { let and = _mm512_and_epi64(a, b); let zero = _mm512_setzero_si512(); _mm512_mask_cmpneq_epi64_mask(k, and, zero) @@ -27858,7 +29426,7 @@ pub unsafe fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmq))] -pub unsafe fn _mm256_test_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_test_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_cmpneq_epi64_mask(and, zero) @@ -27871,7 +29439,7 @@ pub unsafe fn _mm256_test_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmq))] -pub unsafe fn _mm256_mask_test_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_test_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_mask_cmpneq_epi64_mask(k, and, zero) @@ -27884,7 +29452,7 @@ pub unsafe fn _mm256_mask_test_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) - #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmq))] -pub unsafe fn _mm_test_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_test_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_cmpneq_epi64_mask(and, zero) @@ -27897,7 +29465,7 @@ pub unsafe fn _mm_test_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestmq))] -pub unsafe fn _mm_mask_test_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_test_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_mask_cmpneq_epi64_mask(k, and, zero) @@ -27910,7 +29478,7 @@ pub unsafe fn _mm_mask_test_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> _ #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmd))] -pub unsafe fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { let and = _mm512_and_epi32(a, b); let zero = _mm512_setzero_si512(); _mm512_cmpeq_epi32_mask(and, zero) @@ -27923,7 +29491,7 @@ pub unsafe fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, 
assert_instr(vptestnmd))] -pub unsafe fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { let and = _mm512_and_epi32(a, b); let zero = _mm512_setzero_si512(); _mm512_mask_cmpeq_epi32_mask(k, and, zero) @@ -27936,7 +29504,7 @@ pub unsafe fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmd))] -pub unsafe fn _mm256_testn_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_testn_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_cmpeq_epi32_mask(and, zero) @@ -27949,7 +29517,7 @@ pub unsafe fn _mm256_testn_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmd))] -pub unsafe fn _mm256_mask_testn_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_testn_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_mask_cmpeq_epi32_mask(k, and, zero) @@ -27962,7 +29530,7 @@ pub unsafe fn _mm256_mask_testn_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmd))] -pub unsafe fn _mm_testn_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_testn_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_cmpeq_epi32_mask(and, zero) @@ -27975,7 +29543,7 @@ pub unsafe fn _mm_testn_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmd))] -pub unsafe fn _mm_mask_testn_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_testn_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_mask_cmpeq_epi32_mask(k, and, zero) @@ -27988,7 +29556,7 @@ pub unsafe fn _mm_mask_testn_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmq))] -pub unsafe fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { let and = _mm512_and_epi64(a, b); let zero = _mm512_setzero_si512(); _mm512_cmpeq_epi64_mask(and, zero) @@ -28001,7 +29569,7 @@ pub unsafe fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmq))] -pub unsafe fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { let and = _mm512_and_epi64(a, b); let zero = _mm512_setzero_si512(); _mm512_mask_cmpeq_epi64_mask(k, and, zero) @@ -28014,7 +29582,7 @@ pub unsafe fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) 
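
Illustrative sketch, not part of the patch: once these `vptestm`/`vptestnm` helpers are safe, a function that itself enables `avx512f` can call them without an `unsafe` block. This assumes a nightly toolchain with `#![feature(stdarch_x86_avx512)]`; the function names `test_pattern`/`main` are made up for the example.

```rust
#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn test_pattern() {
    let a = _mm512_set1_epi32(0b0101);
    let b = _mm512_set1_epi32(0b0011);
    // test: mask bit i is set when (a AND b) is non-zero in lane i.
    let t: __mmask16 = _mm512_test_epi32_mask(a, b);
    // testn: mask bit i is set when (a AND b) is zero in lane i.
    let n: __mmask16 = _mm512_testn_epi32_mask(a, b);
    assert_eq!(t, 0xFFFF);
    assert_eq!(n, 0x0000);
    // kortestz reports whether the OR of two masks is all-zero.
    assert_eq!(_mm512_kortestz(n, n), 1);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        // Calling a `#[target_feature]` function from ordinary code still needs `unsafe`.
        unsafe { test_pattern() };
    }
}
```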
#[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmq))] -pub unsafe fn _mm256_testn_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_testn_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_cmpeq_epi64_mask(and, zero) @@ -28027,7 +29595,7 @@ pub unsafe fn _mm256_testn_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmq))] -pub unsafe fn _mm256_mask_testn_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_testn_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { let and = _mm256_and_si256(a, b); let zero = _mm256_setzero_si256(); _mm256_mask_cmpeq_epi64_mask(k, and, zero) @@ -28040,7 +29608,7 @@ pub unsafe fn _mm256_mask_testn_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmq))] -pub unsafe fn _mm_testn_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_testn_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_cmpeq_epi64_mask(and, zero) @@ -28053,7 +29621,7 @@ pub unsafe fn _mm_testn_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vptestnmq))] -pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { let and = _mm_and_si128(a, b); let zero = _mm_setzero_si128(); _mm_mask_cmpeq_epi64_mask(k, and, zero) @@ -28162,7 +29730,7 @@ pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set_ps( +pub fn _mm512_set_ps( e0: f32, e1: f32, e2: f32, @@ -28192,7 +29760,7 @@ pub unsafe fn _mm512_set_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_setr_ps( +pub fn _mm512_setr_ps( e0: f32, e1: f32, e2: f32, @@ -28210,10 +29778,12 @@ pub unsafe fn _mm512_setr_ps( e14: f32, e15: f32, ) -> __m512 { - let r = f32x16::new( - e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, - ); - transmute(r) + unsafe { + let r = f32x16::new( + e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, + ); + transmute(r) + } } /// Broadcast 64-bit float `a` to all elements of `dst`. @@ -28222,8 +29792,8 @@ pub unsafe fn _mm512_setr_ps( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d { - transmute(f64x8::splat(a)) +pub fn _mm512_set1_pd(a: f64) -> __m512d { + unsafe { transmute(f64x8::splat(a)) } } /// Broadcast 32-bit float `a` to all elements of `dst`. 
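
Illustrative sketch, not part of the patch: the vector constructors become plain safe functions, and `_mm512_setr_ps` takes its arguments in lane order (the first argument lands in lane 0), as the `f32x16::new` body above suggests. Assumes nightly with `#![feature(stdarch_x86_avx512)]`; `build_vectors` is a made-up helper name.

```rust
#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn build_vectors() {
    let ascending = _mm512_setr_ps(
        0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0,
        8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
    );
    let threshold = _mm512_set1_ps(8.0);
    // Lanes 0..=7 hold 0.0..=7.0, which are below the threshold, so the low 8 mask bits are set.
    let below: __mmask16 = _mm512_cmp_ps_mask::<_CMP_LT_OS>(ascending, threshold);
    assert_eq!(below, 0x00FF);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { build_vectors() };
    }
}
```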
@@ -28232,8 +29802,8 @@ pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 { - transmute(f32x16::splat(a)) +pub fn _mm512_set1_ps(a: f32) -> __m512 { + unsafe { transmute(f32x16::splat(a)) } } /// Sets packed 32-bit integers in `dst` with the supplied values. @@ -28242,7 +29812,7 @@ pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set_epi32( +pub fn _mm512_set_epi32( e15: i32, e14: i32, e13: i32, @@ -28271,8 +29841,8 @@ pub unsafe fn _mm512_set_epi32( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set1_epi8(a: i8) -> __m512i { - transmute(i8x64::splat(a)) +pub fn _mm512_set1_epi8(a: i8) -> __m512i { + unsafe { transmute(i8x64::splat(a)) } } /// Broadcast the low packed 16-bit integer from a to all elements of dst. @@ -28281,8 +29851,8 @@ pub unsafe fn _mm512_set1_epi8(a: i8) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set1_epi16(a: i16) -> __m512i { - transmute(i16x32::splat(a)) +pub fn _mm512_set1_epi16(a: i16) -> __m512i { + unsafe { transmute(i16x32::splat(a)) } } /// Broadcast 32-bit integer `a` to all elements of `dst`. @@ -28291,8 +29861,8 @@ pub unsafe fn _mm512_set1_epi16(a: i16) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i { - transmute(i32x16::splat(a)) +pub fn _mm512_set1_epi32(a: i32) -> __m512i { + unsafe { transmute(i32x16::splat(a)) } } /// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -28302,9 +29872,11 @@ pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastd))] -pub unsafe fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i { - let r = _mm512_set1_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) +pub fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i { + unsafe { + let r = _mm512_set1_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } } /// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -28314,9 +29886,11 @@ pub unsafe fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m5 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastd))] -pub unsafe fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i { - let r = _mm512_set1_epi32(a).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) +pub fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i { + unsafe { + let r = _mm512_set1_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
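
Illustrative sketch, not part of the patch: the writemask and zeromask broadcast variants differ only in what happens to lanes whose mask bit is clear. Assumes nightly with `#![feature(stdarch_x86_avx512)]`; `masked_broadcast` is a made-up helper name.

```rust
#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn masked_broadcast() {
    let src = _mm512_set1_epi32(-1);
    let k: __mmask16 = 0b0000_0000_0000_1111; // only the low four lanes are written
    // Writemask: lanes with a 0 mask bit keep the value from `src`.
    let merged = _mm512_mask_set1_epi32(src, k, 7);
    // Zeromask: lanes with a 0 mask bit are zeroed instead.
    let zeroed = _mm512_maskz_set1_epi32(k, 7);
    // Check the results with an equality compare against plain broadcasts.
    assert_eq!(_mm512_cmpeq_epi32_mask(merged, _mm512_set1_epi32(7)), 0x000F);
    assert_eq!(_mm512_cmpeq_epi32_mask(zeroed, _mm512_setzero_si512()), 0xFFF0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { masked_broadcast() };
    }
}
```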
@@ -28326,9 +29900,11 @@ pub unsafe fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastd))] -pub unsafe fn _mm256_mask_set1_epi32(src: __m256i, k: __mmask8, a: i32) -> __m256i { - let r = _mm256_set1_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) +pub fn _mm256_mask_set1_epi32(src: __m256i, k: __mmask8, a: i32) -> __m256i { + unsafe { + let r = _mm256_set1_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } } /// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -28338,9 +29914,11 @@ pub unsafe fn _mm256_mask_set1_epi32(src: __m256i, k: __mmask8, a: i32) -> __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastd))] -pub unsafe fn _mm256_maskz_set1_epi32(k: __mmask8, a: i32) -> __m256i { - let r = _mm256_set1_epi32(a).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) +pub fn _mm256_maskz_set1_epi32(k: __mmask8, a: i32) -> __m256i { + unsafe { + let r = _mm256_set1_epi32(a).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -28350,9 +29928,11 @@ pub unsafe fn _mm256_maskz_set1_epi32(k: __mmask8, a: i32) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastd))] -pub unsafe fn _mm_mask_set1_epi32(src: __m128i, k: __mmask8, a: i32) -> __m128i { - let r = _mm_set1_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) +pub fn _mm_mask_set1_epi32(src: __m128i, k: __mmask8, a: i32) -> __m128i { + unsafe { + let r = _mm_set1_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } } /// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -28362,9 +29942,11 @@ pub unsafe fn _mm_mask_set1_epi32(src: __m128i, k: __mmask8, a: i32) -> __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastd))] -pub unsafe fn _mm_maskz_set1_epi32(k: __mmask8, a: i32) -> __m128i { - let r = _mm_set1_epi32(a).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) +pub fn _mm_maskz_set1_epi32(k: __mmask8, a: i32) -> __m128i { + unsafe { + let r = _mm_set1_epi32(a).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Broadcast 64-bit integer `a` to all elements of `dst`. @@ -28373,8 +29955,8 @@ pub unsafe fn _mm_maskz_set1_epi32(k: __mmask8, a: i32) -> __m128i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { - transmute(i64x8::splat(a)) +pub fn _mm512_set1_epi64(a: i64) -> __m512i { + unsafe { transmute(i64x8::splat(a)) } } /// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -28384,9 +29966,11 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastq))] -pub unsafe fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i { - let r = _mm512_set1_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, r, src.as_i64x8())) +pub fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i { + unsafe { + let r = _mm512_set1_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) + } } /// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -28396,9 +29980,11 @@ pub unsafe fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m51 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastq))] -pub unsafe fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i { - let r = _mm512_set1_epi64(a).as_i64x8(); - transmute(simd_select_bitmask(k, r, i64x8::ZERO)) +pub fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i { + unsafe { + let r = _mm512_set1_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, r, i64x8::ZERO)) + } } /// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -28408,9 +29994,11 @@ pub unsafe fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastq))] -pub unsafe fn _mm256_mask_set1_epi64(src: __m256i, k: __mmask8, a: i64) -> __m256i { - let r = _mm256_set1_epi64x(a).as_i64x4(); - transmute(simd_select_bitmask(k, r, src.as_i64x4())) +pub fn _mm256_mask_set1_epi64(src: __m256i, k: __mmask8, a: i64) -> __m256i { + unsafe { + let r = _mm256_set1_epi64x(a).as_i64x4(); + transmute(simd_select_bitmask(k, r, src.as_i64x4())) + } } /// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -28420,9 +30008,11 @@ pub unsafe fn _mm256_mask_set1_epi64(src: __m256i, k: __mmask8, a: i64) -> __m25 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastq))] -pub unsafe fn _mm256_maskz_set1_epi64(k: __mmask8, a: i64) -> __m256i { - let r = _mm256_set1_epi64x(a).as_i64x4(); - transmute(simd_select_bitmask(k, r, i64x4::ZERO)) +pub fn _mm256_maskz_set1_epi64(k: __mmask8, a: i64) -> __m256i { + unsafe { + let r = _mm256_set1_epi64x(a).as_i64x4(); + transmute(simd_select_bitmask(k, r, i64x4::ZERO)) + } } /// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -28432,9 +30022,11 @@ pub unsafe fn _mm256_maskz_set1_epi64(k: __mmask8, a: i64) -> __m256i { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastq))] -pub unsafe fn _mm_mask_set1_epi64(src: __m128i, k: __mmask8, a: i64) -> __m128i { - let r = _mm_set1_epi64x(a).as_i64x2(); - transmute(simd_select_bitmask(k, r, src.as_i64x2())) +pub fn _mm_mask_set1_epi64(src: __m128i, k: __mmask8, a: i64) -> __m128i { + unsafe { + let r = _mm_set1_epi64x(a).as_i64x2(); + transmute(simd_select_bitmask(k, r, src.as_i64x2())) + } } /// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -28444,9 +30036,11 @@ pub unsafe fn _mm_mask_set1_epi64(src: __m128i, k: __mmask8, a: i64) -> __m128i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpbroadcastq))] -pub unsafe fn _mm_maskz_set1_epi64(k: __mmask8, a: i64) -> __m128i { - let r = _mm_set1_epi64x(a).as_i64x2(); - transmute(simd_select_bitmask(k, r, i64x2::ZERO)) +pub fn _mm_maskz_set1_epi64(k: __mmask8, a: i64) -> __m128i { + unsafe { + let r = _mm_set1_epi64x(a).as_i64x2(); + transmute(simd_select_bitmask(k, r, i64x2::ZERO)) + } } /// Set packed 64-bit integers in dst with the repeated 4 element sequence. @@ -28455,7 +30049,7 @@ pub unsafe fn _mm_maskz_set1_epi64(k: __mmask8, a: i64) -> __m128i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { +pub fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { _mm512_set_epi64(d, c, b, a, d, c, b, a) } @@ -28465,7 +30059,7 @@ pub unsafe fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { +pub fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { _mm512_set_epi64(a, b, c, d, a, b, c, d) } @@ -28476,7 +30070,7 @@ pub unsafe fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 { _mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b) } @@ -28487,7 +30081,7 @@ pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { _mm512_mask_cmp_ps_mask::<_CMP_LT_OS>(k1, a, b) } @@ -28498,7 +30092,7 @@ pub unsafe fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 
{ _mm512_cmp_ps_mask::<_CMP_NLT_US>(a, b) } @@ -28509,7 +30103,7 @@ pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { _mm512_mask_cmp_ps_mask::<_CMP_NLT_US>(k1, a, b) } @@ -28520,7 +30114,7 @@ pub unsafe fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { _mm512_cmp_ps_mask::<_CMP_LE_OS>(a, b) } @@ -28531,7 +30125,7 @@ pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { _mm512_mask_cmp_ps_mask::<_CMP_LE_OS>(k1, a, b) } @@ -28542,7 +30136,7 @@ pub unsafe fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 { _mm512_cmp_ps_mask::<_CMP_NLE_US>(a, b) } @@ -28553,7 +30147,7 @@ pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { _mm512_mask_cmp_ps_mask::<_CMP_NLE_US>(k1, a, b) } @@ -28564,7 +30158,7 @@ pub unsafe fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 { _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b) } @@ -28575,7 +30169,7 @@ pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { _mm512_mask_cmp_ps_mask::<_CMP_EQ_OQ>(k1, a, b) } @@ -28586,7 +30180,7 @@ pub unsafe fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] 
//should be vcmpps -pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 { _mm512_cmp_ps_mask::<_CMP_NEQ_UQ>(a, b) } @@ -28597,7 +30191,7 @@ pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { _mm512_mask_cmp_ps_mask::<_CMP_NEQ_UQ>(k1, a, b) } @@ -28609,13 +30203,15 @@ pub unsafe fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vcmpps(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() +pub fn _mm512_cmp_ps_mask(a: __m512, b: __m512) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let neg_one = -1; + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vcmpps(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -28626,16 +30222,14 @@ pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512) -> __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm512_mask_cmp_ps_mask( - k1: __mmask16, - a: __m512, - b: __m512, -) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vcmpps(a, b, IMM8, k1 as i16, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() +pub fn _mm512_mask_cmp_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vcmpps(a, b, IMM8, k1 as i16, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
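
Illustrative sketch, not part of the patch: the named predicates are shorthands for the generic `_mm512_cmp_ps_mask` with a `_CMP_*` constant, and the masked form ANDs the comparison result with `k1`. Assumes nightly with `#![feature(stdarch_x86_avx512)]`; `predicate_shorthands` is a made-up helper name.

```rust
#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn predicate_shorthands() {
    let a = _mm512_setr_ps(
        1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
        9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
    );
    let b = _mm512_set1_ps(4.0);
    // The named predicate performs the same operation as the generic form with _CMP_LE_OS.
    let le_named = _mm512_cmple_ps_mask(a, b);
    let le_generic = _mm512_cmp_ps_mask::<_CMP_LE_OS>(a, b);
    assert_eq!(le_named, le_generic);
    // The masked form only lets lanes allowed by k1 set their result bit.
    let k1: __mmask16 = 0b0101_0101_0101_0101;
    let le_masked = _mm512_mask_cmp_ps_mask::<_CMP_LE_OS>(k1, a, b);
    assert_eq!(le_masked, le_generic & k1);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { predicate_shorthands() };
    }
}
```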
@@ -28646,13 +30240,15 @@ pub unsafe fn _mm512_mask_cmp_ps_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm256_cmp_ps_mask(a: __m256, b: __m256) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let r = vcmpps256(a, b, IMM8, neg_one); - r.cast_unsigned() +pub fn _mm256_cmp_ps_mask(a: __m256, b: __m256) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let neg_one = -1; + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let r = vcmpps256(a, b, IMM8, neg_one); + r.cast_unsigned() + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -28663,16 +30259,14 @@ pub unsafe fn _mm256_cmp_ps_mask(a: __m256, b: __m256) -> __mma #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm256_mask_cmp_ps_mask( - k1: __mmask8, - a: __m256, - b: __m256, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f32x8(); - let b = b.as_f32x8(); - let r = vcmpps256(a, b, IMM8, k1 as i8); - r.cast_unsigned() +pub fn _mm256_mask_cmp_ps_mask(k1: __mmask8, a: __m256, b: __m256) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let a = a.as_f32x8(); + let b = b.as_f32x8(); + let r = vcmpps256(a, b, IMM8, k1 as i8); + r.cast_unsigned() + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -28683,13 +30277,15 @@ pub unsafe fn _mm256_mask_cmp_ps_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm_cmp_ps_mask(a: __m128, b: __m128) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vcmpps128(a, b, IMM8, neg_one); - r.cast_unsigned() +pub fn _mm_cmp_ps_mask(a: __m128, b: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let neg_one = -1; + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vcmpps128(a, b, IMM8, neg_one); + r.cast_unsigned() + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -28700,16 +30296,14 @@ pub unsafe fn _mm_cmp_ps_mask(a: __m128, b: __m128) -> __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm_mask_cmp_ps_mask( - k1: __mmask8, - a: __m128, - b: __m128, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vcmpps128(a, b, IMM8, k1 as i8); - r.cast_unsigned() +pub fn _mm_mask_cmp_ps_mask(k1: __mmask8, a: __m128, b: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vcmpps128(a, b, IMM8, k1 as i8); + r.cast_unsigned() + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\ @@ -28721,17 +30315,19 @@ pub unsafe fn _mm_mask_cmp_ps_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm512_cmp_round_ps_mask( +pub fn _mm512_cmp_round_ps_mask( a: __m512, b: __m512, ) -> __mmask16 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let neg_one = -1; - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vcmpps(a, b, IMM5, neg_one, SAE); - r.cast_unsigned() + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let neg_one = -1; + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vcmpps(a, b, IMM5, neg_one, SAE); + r.cast_unsigned() + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\ @@ -28743,17 +30339,19 @@ pub unsafe fn _mm512_cmp_round_ps_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm512_mask_cmp_round_ps_mask( +pub fn _mm512_mask_cmp_round_ps_mask( m: __mmask16, a: __m512, b: __m512, ) -> __mmask16 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x16(); - let b = b.as_f32x16(); - let r = vcmpps(a, b, IMM5, m as i16, SAE); - r.cast_unsigned() + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x16(); + let b = b.as_f32x16(); + let r = vcmpps(a, b, IMM5, m as i16, SAE); + r.cast_unsigned() + } } /// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k. 
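
Illustrative sketch, not part of the patch: the `_round` compare variants take a second const parameter for exception suppression, which per the `static_assert_mantissas_sae!` check above must be `_MM_FROUND_CUR_DIRECTION` or `_MM_FROUND_NO_EXC`. Assumes nightly with `#![feature(stdarch_x86_avx512)]`; `compare_with_sae` is a made-up helper name.

```rust
#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn compare_with_sae() {
    let a = _mm512_set1_ps(f32::NAN);
    let b = _mm512_set1_ps(1.0);
    // _MM_FROUND_NO_EXC suppresses floating-point exceptions for the comparison;
    // the unordered-quiet NEQ predicate still reports NaN lanes as "not equal".
    let m = _mm512_cmp_round_ps_mask::<_CMP_NEQ_UQ, _MM_FROUND_NO_EXC>(a, b);
    assert_eq!(m, 0xFFFF);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { compare_with_sae() };
    }
}
```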
@@ -28763,7 +30361,7 @@ pub unsafe fn _mm512_mask_cmp_round_ps_mask( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmps -pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 { _mm512_cmp_ps_mask::<_CMP_ORD_Q>(a, b) } @@ -28774,7 +30372,7 @@ pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { _mm512_mask_cmp_ps_mask::<_CMP_ORD_Q>(k1, a, b) } @@ -28785,7 +30383,7 @@ pub unsafe fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 { _mm512_cmp_ps_mask::<_CMP_UNORD_Q>(a, b) } @@ -28796,7 +30394,7 @@ pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps -pub unsafe fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { +pub fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 { _mm512_mask_cmp_ps_mask::<_CMP_UNORD_Q>(k1, a, b) } @@ -28807,7 +30405,7 @@ pub unsafe fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { _mm512_cmp_pd_mask::<_CMP_LT_OS>(a, b) } @@ -28818,7 +30416,7 @@ pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { _mm512_mask_cmp_pd_mask::<_CMP_LT_OS>(k1, a, b) } @@ -28829,7 +30427,7 @@ pub unsafe fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { _mm512_cmp_pd_mask::<_CMP_NLT_US>(a, b) } @@ -28840,7 +30438,7 @@ pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 
{ +pub fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { _mm512_mask_cmp_pd_mask::<_CMP_NLT_US>(m, a, b) } @@ -28851,7 +30449,7 @@ pub unsafe fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { _mm512_cmp_pd_mask::<_CMP_LE_OS>(a, b) } @@ -28862,7 +30460,7 @@ pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { _mm512_mask_cmp_pd_mask::<_CMP_LE_OS>(k1, a, b) } @@ -28873,7 +30471,7 @@ pub unsafe fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { _mm512_cmp_pd_mask::<_CMP_NLE_US>(a, b) } @@ -28884,7 +30482,7 @@ pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { _mm512_mask_cmp_pd_mask::<_CMP_NLE_US>(k1, a, b) } @@ -28895,7 +30493,7 @@ pub unsafe fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b) } @@ -28906,7 +30504,7 @@ pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { _mm512_mask_cmp_pd_mask::<_CMP_EQ_OQ>(k1, a, b) } @@ -28917,7 +30515,7 @@ pub unsafe fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { _mm512_cmp_pd_mask::<_CMP_NEQ_UQ>(a, b) } @@ -28928,7 +30526,7 @@ pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", 
issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { _mm512_mask_cmp_pd_mask::<_CMP_NEQ_UQ>(k1, a, b) } @@ -28940,13 +30538,15 @@ pub unsafe fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vcmppd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() +pub fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let neg_one = -1; + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vcmppd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -28957,16 +30557,14 @@ pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm512_mask_cmp_pd_mask( - k1: __mmask8, - a: __m512d, - b: __m512d, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vcmppd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() +pub fn _mm512_mask_cmp_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vcmppd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -28977,13 +30575,15 @@ pub unsafe fn _mm512_mask_cmp_pd_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm256_cmp_pd_mask(a: __m256d, b: __m256d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let r = vcmppd256(a, b, IMM8, neg_one); - r.cast_unsigned() +pub fn _mm256_cmp_pd_mask(a: __m256d, b: __m256d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let neg_one = -1; + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let r = vcmppd256(a, b, IMM8, neg_one); + r.cast_unsigned() + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -28994,16 +30594,14 @@ pub unsafe fn _mm256_cmp_pd_mask(a: __m256d, b: __m256d) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm256_mask_cmp_pd_mask( - k1: __mmask8, - a: __m256d, - b: __m256d, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f64x4(); - let b = b.as_f64x4(); - let r = vcmppd256(a, b, IMM8, k1 as i8); - r.cast_unsigned() +pub fn _mm256_mask_cmp_pd_mask(k1: __mmask8, a: __m256d, b: __m256d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let a = a.as_f64x4(); + let b = b.as_f64x4(); + let r = vcmppd256(a, b, IMM8, k1 as i8); + r.cast_unsigned() + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -29014,13 +30612,15 @@ pub unsafe fn _mm256_mask_cmp_pd_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm_cmp_pd_mask(a: __m128d, b: __m128d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vcmppd128(a, b, IMM8, neg_one); - r.cast_unsigned() +pub fn _mm_cmp_pd_mask(a: __m128d, b: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let neg_one = -1; + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vcmppd128(a, b, IMM8, neg_one); + r.cast_unsigned() + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29031,16 +30631,14 @@ pub unsafe fn _mm_cmp_pd_mask(a: __m128d, b: __m128d) -> __mmas #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm_mask_cmp_pd_mask( - k1: __mmask8, - a: __m128d, - b: __m128d, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vcmppd128(a, b, IMM8, k1 as i8); - r.cast_unsigned() +pub fn _mm_mask_cmp_pd_mask(k1: __mmask8, a: __m128d, b: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vcmppd128(a, b, IMM8, k1 as i8); + r.cast_unsigned() + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\ @@ -29052,17 +30650,19 @@ pub unsafe fn _mm_mask_cmp_pd_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm512_cmp_round_pd_mask( +pub fn _mm512_cmp_round_pd_mask( a: __m512d, b: __m512d, ) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let neg_one = -1; - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vcmppd(a, b, IMM5, neg_one, SAE); - r.cast_unsigned() + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let neg_one = -1; + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vcmppd(a, b, IMM5, neg_one, SAE); + r.cast_unsigned() + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\ @@ -29074,17 +30674,19 @@ pub unsafe fn _mm512_cmp_round_pd_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm512_mask_cmp_round_pd_mask( +pub fn _mm512_mask_cmp_round_pd_mask( k1: __mmask8, a: __m512d, b: __m512d, ) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x8(); - let b = b.as_f64x8(); - let r = vcmppd(a, b, IMM5, k1 as i8, SAE); - r.cast_unsigned() + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x8(); + let b = b.as_f64x8(); + let r = vcmppd(a, b, IMM5, k1 as i8, SAE); + r.cast_unsigned() + } } /// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k. 
@@ -29094,7 +30696,7 @@ pub unsafe fn _mm512_mask_cmp_round_pd_mask( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { _mm512_cmp_pd_mask::<_CMP_ORD_Q>(a, b) } @@ -29105,7 +30707,7 @@ pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { _mm512_mask_cmp_pd_mask::<_CMP_ORD_Q>(k1, a, b) } @@ -29116,7 +30718,7 @@ pub unsafe fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { _mm512_cmp_pd_mask::<_CMP_UNORD_Q>(a, b) } @@ -29127,7 +30729,7 @@ pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd -pub unsafe fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { +pub fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 { _mm512_mask_cmp_pd_mask::<_CMP_UNORD_Q>(k1, a, b) } @@ -29139,11 +30741,13 @@ pub unsafe fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let r = vcmpss(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() +pub fn _mm_cmp_ss_mask(a: __m128, b: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let neg_one = -1; + let r = vcmpss(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } } /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set). 
@@ -29154,14 +30758,12 @@ pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128) -> __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm_mask_cmp_ss_mask( - k1: __mmask8, - a: __m128, - b: __m128, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let r = vcmpss(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() +pub fn _mm_mask_cmp_ss_mask(k1: __mmask8, a: __m128, b: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let r = vcmpss(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } } /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\ @@ -29173,15 +30775,14 @@ pub unsafe fn _mm_mask_cmp_ss_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_cmp_round_ss_mask( - a: __m128, - b: __m128, -) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let neg_one = -1; - let r = vcmpss(a, b, IMM5, neg_one, SAE); - r.cast_unsigned() +pub fn _mm_cmp_round_ss_mask(a: __m128, b: __m128) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let neg_one = -1; + let r = vcmpss(a, b, IMM5, neg_one, SAE); + r.cast_unsigned() + } } /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not seti).\ @@ -29193,15 +30794,17 @@ pub unsafe fn _mm_cmp_round_ss_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_mask_cmp_round_ss_mask( +pub fn _mm_mask_cmp_round_ss_mask( k1: __mmask8, a: __m128, b: __m128, ) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let r = vcmpss(a, b, IMM5, k1 as i8, SAE); - r.cast_unsigned() + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let r = vcmpss(a, b, IMM5, k1 as i8, SAE); + r.cast_unsigned() + } } /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k. @@ -29212,11 +30815,13 @@ pub unsafe fn _mm_mask_cmp_round_ss_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let neg_one = -1; - let r = vcmpsd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() +pub fn _mm_cmp_sd_mask(a: __m128d, b: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let neg_one = -1; + let r = vcmpsd(a, b, IMM8, neg_one, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } } /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set). 
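
Illustrative sketch, not part of the patch: the scalar `ss`/`sd` compare masks only look at the lowest element, so the result is carried in bit 0 of the returned mask, and a zeromask `k1` with bit 0 clear forces the result to 0. Assumes nightly with `#![feature(stdarch_x86_avx512)]`; `scalar_compare` is a made-up helper name.

```rust
#![feature(stdarch_x86_avx512)]
use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn scalar_compare() {
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(2.0);
    // Only the lowest element is compared; the result lives in bit 0 of the mask.
    let lt: __mmask8 = _mm_cmp_ss_mask::<_CMP_LT_OS>(a, b);
    assert_eq!(lt, 1);
    // With a zeromask of 0, bit 0 is forced to 0 regardless of the comparison.
    let lt_masked = _mm_mask_cmp_ss_mask::<_CMP_LT_OS>(0, a, b);
    assert_eq!(lt_masked, 0);
}

fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { scalar_compare() };
    }
}
```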
@@ -29227,14 +30832,12 @@ pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d) -> __mmas #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))] -pub unsafe fn _mm_mask_cmp_sd_mask( - k1: __mmask8, - a: __m128d, - b: __m128d, -) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 5); - let r = vcmpsd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION); - r.cast_unsigned() +pub fn _mm_mask_cmp_sd_mask(k1: __mmask8, a: __m128d, b: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 5); + let r = vcmpsd(a, b, IMM8, k1 as i8, _MM_FROUND_CUR_DIRECTION); + r.cast_unsigned() + } } /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\ @@ -29246,15 +30849,14 @@ pub unsafe fn _mm_mask_cmp_sd_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_cmp_round_sd_mask( - a: __m128d, - b: __m128d, -) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let neg_one = -1; - let r = vcmpsd(a, b, IMM5, neg_one, SAE); - r.cast_unsigned() +pub fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let neg_one = -1; + let r = vcmpsd(a, b, IMM5, neg_one, SAE); + r.cast_unsigned() + } } /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\ @@ -29266,15 +30868,17 @@ pub unsafe fn _mm_cmp_round_sd_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_mask_cmp_round_sd_mask( +pub fn _mm_mask_cmp_round_sd_mask( k1: __mmask8, a: __m128d, b: __m128d, ) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let r = vcmpsd(a, b, IMM5, k1 as i8, SAE); - r.cast_unsigned() + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let r = vcmpsd(a, b, IMM5, k1 as i8, SAE); + r.cast_unsigned() + } } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29284,8 +30888,8 @@ pub unsafe fn _mm_mask_cmp_round_sd_mask( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_lt(a.as_u32x16(), b.as_u32x16())) +pub fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_u32x16(), b.as_u32x16())) } } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29295,7 +30899,7 @@ pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -29306,8 +30910,8 @@ pub unsafe fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_lt(a.as_u32x8(), b.as_u32x8())) +pub fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_u32x8(), b.as_u32x8())) } } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29317,7 +30921,7 @@ pub unsafe fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -29328,8 +30932,8 @@ pub unsafe fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_lt(a.as_u32x4(), b.as_u32x4())) +pub fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_u32x4(), b.as_u32x4())) } } /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29339,7 +30943,7 @@ pub unsafe fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -29350,8 +30954,8 @@ pub unsafe fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_gt(a.as_u32x16(), b.as_u32x16())) +pub fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x16(), b.as_u32x16())) } } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29361,7 +30965,7 @@ pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -29372,8 +30976,8 @@ pub unsafe fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_gt(a.as_u32x8(), b.as_u32x8())) +pub fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x8(), b.as_u32x8())) } } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29383,7 +30987,7 @@ pub unsafe fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -29394,8 +30998,8 @@ pub unsafe fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_gt(a.as_u32x4(), b.as_u32x4())) +pub fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_u32x4(), b.as_u32x4())) } } /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29405,7 +31009,7 @@ pub unsafe fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -29416,8 +31020,8 @@ pub unsafe fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_le(a.as_u32x16(), b.as_u32x16())) +pub fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_u32x16(), b.as_u32x16())) } } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
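Note (illustrative only): as the wrappers above show, the relational shorthands are thin aliases for the generic `cmp` intrinsic with a fixed `_MM_CMPINT_*` predicate. A sketch under the same nightly/AVX-512 assumptions as the earlier example; `gt_two_ways` is a made-up name.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
fn gt_two_ways(k1: __mmask8, a: __m256i, b: __m256i) -> (__mmask8, __mmask8) {
    let shorthand = _mm256_mask_cmpgt_epu32_mask(k1, a, b);
    let generic = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLE>(k1, a, b);
    (shorthand, generic) // always equal: the shorthand simply fixes the predicate to NLE
}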
@@ -29427,7 +31031,7 @@ pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -29438,8 +31042,8 @@ pub unsafe fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_le(a.as_u32x8(), b.as_u32x8())) +pub fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_u32x8(), b.as_u32x8())) } } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29449,7 +31053,7 @@ pub unsafe fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -29460,8 +31064,8 @@ pub unsafe fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_le(a.as_u32x4(), b.as_u32x4())) +pub fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_u32x4(), b.as_u32x4())) } } /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29471,7 +31075,7 @@ pub unsafe fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -29482,8 +31086,8 @@ pub unsafe fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_ge(a.as_u32x16(), b.as_u32x16())) +pub fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x16(), b.as_u32x16())) } } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29493,7 +31097,7 @@ pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -29504,8 +31108,8 @@ pub unsafe fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_ge(a.as_u32x8(), b.as_u32x8())) +pub fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x8(), b.as_u32x8())) } } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29515,7 +31119,7 @@ pub unsafe fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -29526,8 +31130,8 @@ pub unsafe fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_ge(a.as_u32x4(), b.as_u32x4())) +pub fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_u32x4(), b.as_u32x4())) } } /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29537,7 +31141,7 @@ pub unsafe fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -29548,8 +31152,8 @@ pub unsafe fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_eq(a.as_u32x16(), b.as_u32x16())) +pub fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x16(), b.as_u32x16())) } } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29559,7 +31163,7 @@ pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -29570,8 +31174,8 @@ pub unsafe fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_eq(a.as_u32x8(), b.as_u32x8())) +pub fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x8(), b.as_u32x8())) } } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29581,7 +31185,7 @@ pub unsafe fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -29592,8 +31196,8 @@ pub unsafe fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_eq(a.as_u32x4(), b.as_u32x4())) +pub fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_u32x4(), b.as_u32x4())) } } /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29603,7 +31207,7 @@ pub unsafe fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -29614,8 +31218,8 @@ pub unsafe fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_ne(a.as_u32x16(), b.as_u32x16())) +pub fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_u32x16(), b.as_u32x16())) } } /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29625,7 +31229,7 @@ pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -29636,8 +31240,8 @@ pub unsafe fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud -pub unsafe fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_ne(a.as_u32x8(), b.as_u32x8())) +pub fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_u32x8(), b.as_u32x8())) } } /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29647,7 +31251,7 @@ pub unsafe fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
-pub unsafe fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
+pub fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
     _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b)
 }

@@ -29658,8 +31262,8 @@ pub unsafe fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
-pub unsafe fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
-    simd_bitmask::<u32x4, _>(simd_ne(a.as_u32x4(), b.as_u32x4()))
+pub fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe { simd_bitmask::<u32x4, _>(simd_ne(a.as_u32x4(), b.as_u32x4())) }
 }

 /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -29669,7 +31273,7 @@ pub unsafe fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
-pub unsafe fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+pub fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
     _mm_mask_cmp_epu32_mask::<_MM_CMPINT_NE>(k1, a, b)
 }

@@ -29681,24 +31285,23 @@ pub unsafe fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[rustc_legacy_const_generics(2)]
 #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub unsafe fn _mm512_cmp_epu32_mask<const IMM3: i32>(
-    a: __m512i,
-    b: __m512i,
-) -> __mmask16 {
-    static_assert_uimm_bits!(IMM3, 3);
-    let a = a.as_u32x16();
-    let b = b.as_u32x16();
-    let r = match IMM3 {
-        0 => simd_eq(a, b),
-        1 => simd_lt(a, b),
-        2 => simd_le(a, b),
-        3 => i32x16::ZERO,
-        4 => simd_ne(a, b),
-        5 => simd_ge(a, b),
-        6 => simd_gt(a, b),
-        _ => i32x16::splat(-1),
-    };
-    simd_bitmask(r)
+pub fn _mm512_cmp_epu32_mask<const IMM3: i32>(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
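Note (illustrative only): the `IMM3` const generic selects one of eight predicates, with 3 (`_MM_CMPINT_FALSE`) always producing an empty mask and the fallthrough arm (`_MM_CMPINT_TRUE`) a full one, as the match arms above encode. A usage sketch under the same nightly/AVX-512F assumptions as the earlier examples; `predicate_demo` is a made-up name.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn predicate_demo(a: __m512i, b: __m512i) -> (__mmask16, __mmask16, __mmask16) {
    let le = _mm512_cmp_epu32_mask::<_MM_CMPINT_LE>(a, b); // lanes where a <= b (unsigned)
    let none = _mm512_cmp_epu32_mask::<_MM_CMPINT_FALSE>(a, b); // always 0
    let all = _mm512_cmp_epu32_mask::<_MM_CMPINT_TRUE>(a, b); // always 0xFFFF
    (le, none, all)
}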
@@ -29709,26 +31312,28 @@ pub unsafe fn _mm512_cmp_epu32_mask<const IMM3: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[rustc_legacy_const_generics(3)]
 #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub unsafe fn _mm512_mask_cmp_epu32_mask<const IMM3: i32>(
+pub fn _mm512_mask_cmp_epu32_mask<const IMM3: i32>(
     k1: __mmask16,
     a: __m512i,
     b: __m512i,
 ) -> __mmask16 {
-    static_assert_uimm_bits!(IMM3, 3);
-    let a = a.as_u32x16();
-    let b = b.as_u32x16();
-    let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
-    let r = match IMM3 {
-        0 => simd_and(k1, simd_eq(a, b)),
-        1 => simd_and(k1, simd_lt(a, b)),
-        2 => simd_and(k1, simd_le(a, b)),
-        3 => i32x16::ZERO,
-        4 => simd_and(k1, simd_ne(a, b)),
-        5 => simd_and(k1, simd_ge(a, b)),
-        6 => simd_and(k1, simd_gt(a, b)),
-        _ => k1,
-    };
-    simd_bitmask(r)
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x16();
+        let b = b.as_u32x16();
+        let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -29739,24 +31344,23 @@ pub unsafe fn _mm512_mask_cmp_epu32_mask<const IMM3: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[rustc_legacy_const_generics(2)]
 #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub unsafe fn _mm256_cmp_epu32_mask<const IMM3: i32>(
-    a: __m256i,
-    b: __m256i,
-) -> __mmask8 {
-    static_assert_uimm_bits!(IMM3, 3);
-    let a = a.as_u32x8();
-    let b = b.as_u32x8();
-    let r = match IMM3 {
-        0 => simd_eq(a, b),
-        1 => simd_lt(a, b),
-        2 => simd_le(a, b),
-        3 => i32x8::ZERO,
-        4 => simd_ne(a, b),
-        5 => simd_ge(a, b),
-        6 => simd_gt(a, b),
-        _ => i32x8::splat(-1),
-    };
-    simd_bitmask(r)
+pub fn _mm256_cmp_epu32_mask<const IMM3: i32>(a: __m256i, b: __m256i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_u32x8();
+        let b = b.as_u32x8();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x8::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x8::splat(-1),
+        };
+        simd_bitmask(r)
+    }
 }

 /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -29767,26 +31371,28 @@ pub unsafe fn _mm256_cmp_epu32_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm256_mask_cmp_epu32_mask( +pub fn _mm256_mask_cmp_epu32_mask( k1: __mmask8, a: __m256i, b: __m256i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u32x8(); - let b = b.as_u32x8(); - let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i32x8::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u32x8(); + let b = b.as_u32x8(); + let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -29797,21 +31403,23 @@ pub unsafe fn _mm256_mask_cmp_epu32_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm_cmp_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u32x4(); - let b = b.as_u32x4(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i32x4::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i32x4::splat(-1), - }; - simd_bitmask(r) +pub fn _mm_cmp_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u32x4(); + let b = b.as_u32x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x4::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x4::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29822,26 +31430,28 @@ pub unsafe fn _mm_cmp_epu32_mask(a: __m128i, b: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm_mask_cmp_epu32_mask( +pub fn _mm_mask_cmp_epu32_mask( k1: __mmask8, a: __m128i, b: __m128i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u32x4(); - let b = b.as_u32x4(); - let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i32x4::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u32x4(); + let b = b.as_u32x4(); + let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x4::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k. @@ -29851,8 +31461,8 @@ pub unsafe fn _mm_mask_cmp_epu32_mask( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_lt(a.as_i32x16(), b.as_i32x16())) } } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29862,7 +31472,7 @@ pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -29873,8 +31483,8 @@ pub unsafe fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_lt(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_i32x8(), b.as_i32x8())) } } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
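Note (illustrative only): in the `_mask_` variants, `k1` acts as a zeromask, so a result bit can only be set when the corresponding bit of `k1` is set and the per-lane predicate holds. A small sketch under the same nightly/AVX-512 assumptions as earlier; `masked_eq` is a made-up name.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f,avx512vl")]
fn masked_eq(a: __m128i, b: __m128i) -> __mmask8 {
    let k1: __mmask8 = 0b0101; // only lanes 0 and 2 participate
    // The result is always a subset of k1; lanes 1 and 3 are zeroed out.
    _mm_mask_cmp_epu32_mask::<_MM_CMPINT_EQ>(k1, a, b)
}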
@@ -29884,7 +31494,7 @@ pub unsafe fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -29895,8 +31505,8 @@ pub unsafe fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_lt(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_lt(a.as_i32x4(), b.as_i32x4())) } } /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29906,7 +31516,7 @@ pub unsafe fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -29917,8 +31527,8 @@ pub unsafe fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_gt(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_gt(a.as_i32x16(), b.as_i32x16())) } } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29928,7 +31538,7 @@ pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -29939,8 +31549,8 @@ pub unsafe fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_gt(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_i32x8(), b.as_i32x8())) } } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29950,7 +31560,7 @@ pub unsafe fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -29961,8 +31571,8 @@ pub unsafe fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_gt(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_gt(a.as_i32x4(), b.as_i32x4())) } } /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -29972,7 +31582,7 @@ pub unsafe fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -29983,8 +31593,8 @@ pub unsafe fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_le(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_le(a.as_i32x16(), b.as_i32x16())) } } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -29994,7 +31604,7 @@ pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -30005,8 +31615,8 @@ pub unsafe fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_le(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_i32x8(), b.as_i32x8())) } } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30016,7 +31626,7 @@ pub unsafe fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -30027,8 +31637,8 @@ pub unsafe fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_le(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_le(a.as_i32x4(), b.as_i32x4())) } } /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30038,7 +31648,7 @@ pub unsafe fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -30049,8 +31659,8 @@ pub unsafe fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_ge(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ge(a.as_i32x16(), b.as_i32x16())) } } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30060,7 +31670,7 @@ pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -30071,8 +31681,8 @@ pub unsafe fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_ge(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_i32x8(), b.as_i32x8())) } } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30082,7 +31692,7 @@ pub unsafe fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -30093,8 +31703,8 @@ pub unsafe fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_ge(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ge(a.as_i32x4(), b.as_i32x4())) } } /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30104,7 +31714,7 @@ pub unsafe fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -30115,8 +31725,8 @@ pub unsafe fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_eq(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_eq(a.as_i32x16(), b.as_i32x16())) } } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30126,7 +31736,7 @@ pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -30137,8 +31747,8 @@ pub unsafe fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_eq(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_i32x8(), b.as_i32x8())) } } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30148,7 +31758,7 @@ pub unsafe fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -30159,8 +31769,8 @@ pub unsafe fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_eq(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_eq(a.as_i32x4(), b.as_i32x4())) } } /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30170,7 +31780,7 @@ pub unsafe fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi32_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -30181,8 +31791,8 @@ pub unsafe fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { - simd_bitmask::(simd_ne(a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + unsafe { simd_bitmask::(simd_ne(a.as_i32x16(), b.as_i32x16())) } } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30192,7 +31802,7 @@ pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { +pub fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -30203,8 +31813,8 @@ pub unsafe fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512 #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::(simd_ne(a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_i32x8(), b.as_i32x8())) } } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30214,7 +31824,7 @@ pub unsafe fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -30225,8 +31835,8 @@ pub unsafe fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd -pub unsafe fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::(simd_ne(a.as_i32x4(), b.as_i32x4())) +pub fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::(simd_ne(a.as_i32x4(), b.as_i32x4())) } } /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
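Note (illustrative only): the epi32 family mirrors the epu32 family with signed semantics; the distinction matters because the same bit pattern orders differently under the two interpretations. A sketch under the same nightly/AVX-512F assumptions; `signedness_demo` is a made-up name.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn signedness_demo() -> (__mmask16, __mmask16) {
    let a = _mm512_set1_epi32(-1); // every lane is 0xFFFF_FFFF
    let b = _mm512_set1_epi32(0);
    let lt_signed = _mm512_cmplt_epi32_mask(a, b); // -1 < 0, so the mask is 0xFFFF
    let lt_unsigned = _mm512_cmplt_epu32_mask(a, b); // u32::MAX < 0 is false, so 0x0000
    (lt_signed, lt_unsigned)
}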
@@ -30236,7 +31846,7 @@ pub unsafe fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
 #[target_feature(enable = "avx512f,avx512vl")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
-pub unsafe fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
+pub fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
     _mm_mask_cmp_epi32_mask::<_MM_CMPINT_NE>(k1, a, b)
 }

@@ -30248,24 +31858,23 @@ pub unsafe fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[rustc_legacy_const_generics(2)]
 #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub unsafe fn _mm512_cmp_epi32_mask<const IMM3: i32>(
-    a: __m512i,
-    b: __m512i,
-) -> __mmask16 {
-    static_assert_uimm_bits!(IMM3, 3);
-    let a = a.as_i32x16();
-    let b = b.as_i32x16();
-    let r = match IMM3 {
-        0 => simd_eq(a, b),
-        1 => simd_lt(a, b),
-        2 => simd_le(a, b),
-        3 => i32x16::ZERO,
-        4 => simd_ne(a, b),
-        5 => simd_ge(a, b),
-        6 => simd_gt(a, b),
-        _ => i32x16::splat(-1),
-    };
-    simd_bitmask(r)
+pub fn _mm512_cmp_epi32_mask<const IMM3: i32>(a: __m512i, b: __m512i) -> __mmask16 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x16::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x16::splat(-1),
+        };
+        simd_bitmask(r)
+    }
 }

 /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -30276,26 +31885,28 @@ pub unsafe fn _mm512_cmp_epi32_mask<const IMM3: i32>(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[rustc_legacy_const_generics(3)]
 #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub unsafe fn _mm512_mask_cmp_epi32_mask<const IMM3: i32>(
+pub fn _mm512_mask_cmp_epi32_mask<const IMM3: i32>(
     k1: __mmask16,
     a: __m512i,
     b: __m512i,
 ) -> __mmask16 {
-    static_assert_uimm_bits!(IMM3, 3);
-    let a = a.as_i32x16();
-    let b = b.as_i32x16();
-    let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
-    let r = match IMM3 {
-        0 => simd_and(k1, simd_eq(a, b)),
-        1 => simd_and(k1, simd_lt(a, b)),
-        2 => simd_and(k1, simd_le(a, b)),
-        3 => i32x16::ZERO,
-        4 => simd_and(k1, simd_ne(a, b)),
-        5 => simd_and(k1, simd_ge(a, b)),
-        6 => simd_and(k1, simd_gt(a, b)),
-        _ => k1,
-    };
-    simd_bitmask(r)
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x16();
+        let b = b.as_i32x16();
+        let k1 = simd_select_bitmask(k1, i32x16::splat(-1), i32x16::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x16::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
 }

 /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -30306,24 +31917,23 @@ pub unsafe fn _mm512_mask_cmp_epi32_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm256_cmp_epi32_mask( - a: __m256i, - b: __m256i, -) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i32x8::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i32x8::splat(-1), - }; - simd_bitmask(r) +pub fn _mm256_cmp_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i32x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i32x8::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30334,26 +31944,28 @@ pub unsafe fn _mm256_cmp_epi32_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm256_mask_cmp_epi32_mask( +pub fn _mm256_mask_cmp_epi32_mask( k1: __mmask8, a: __m256i, b: __m256i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i32x8(); - let b = b.as_i32x8(); - let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i32x8::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i32x8(); + let b = b.as_i32x8(); + let k1 = simd_select_bitmask(k1, i32x8::splat(-1), i32x8::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i32x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. 
@@ -30364,21 +31976,23 @@ pub unsafe fn _mm_cmp_epi32_mask(a: __m128i, b: __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[rustc_legacy_const_generics(2)]
 #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub unsafe fn _mm_cmp_epi32_mask<const IMM3: i32>(a: __m128i, b: __m128i) -> __mmask8 {
-    static_assert_uimm_bits!(IMM3, 3);
-    let a = a.as_i32x4();
-    let b = b.as_i32x4();
-    let r = match IMM3 {
-        0 => simd_eq(a, b),
-        1 => simd_lt(a, b),
-        2 => simd_le(a, b),
-        3 => i32x4::ZERO,
-        4 => simd_ne(a, b),
-        5 => simd_ge(a, b),
-        6 => simd_gt(a, b),
-        _ => i32x4::splat(-1),
-    };
-    simd_bitmask(r)
+pub fn _mm_cmp_epi32_mask<const IMM3: i32>(a: __m128i, b: __m128i) -> __mmask8 {
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        let r = match IMM3 {
+            0 => simd_eq(a, b),
+            1 => simd_lt(a, b),
+            2 => simd_le(a, b),
+            3 => i32x4::ZERO,
+            4 => simd_ne(a, b),
+            5 => simd_ge(a, b),
+            6 => simd_gt(a, b),
+            _ => i32x4::splat(-1),
+        };
+        simd_bitmask(r)
+    }
 }

 /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -30389,26 +32003,28 @@ pub unsafe fn _mm_cmp_epi32_mask(a: __m128i, b: __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[rustc_legacy_const_generics(3)]
 #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
-pub unsafe fn _mm_mask_cmp_epi32_mask<const IMM3: i32>(
+pub fn _mm_mask_cmp_epi32_mask<const IMM3: i32>(
     k1: __mmask8,
     a: __m128i,
     b: __m128i,
 ) -> __mmask8 {
-    static_assert_uimm_bits!(IMM3, 3);
-    let a = a.as_i32x4();
-    let b = b.as_i32x4();
-    let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO);
-    let r = match IMM3 {
-        0 => simd_and(k1, simd_eq(a, b)),
-        1 => simd_and(k1, simd_lt(a, b)),
-        2 => simd_and(k1, simd_le(a, b)),
-        3 => i32x4::ZERO,
-        4 => simd_and(k1, simd_ne(a, b)),
-        5 => simd_and(k1, simd_ge(a, b)),
-        6 => simd_and(k1, simd_gt(a, b)),
-        _ => k1,
-    };
-    simd_bitmask(r)
+    unsafe {
+        static_assert_uimm_bits!(IMM3, 3);
+        let a = a.as_i32x4();
+        let b = b.as_i32x4();
+        let k1 = simd_select_bitmask(k1, i32x4::splat(-1), i32x4::ZERO);
+        let r = match IMM3 {
+            0 => simd_and(k1, simd_eq(a, b)),
+            1 => simd_and(k1, simd_lt(a, b)),
+            2 => simd_and(k1, simd_le(a, b)),
+            3 => i32x4::ZERO,
+            4 => simd_and(k1, simd_ne(a, b)),
+            5 => simd_and(k1, simd_ge(a, b)),
+            6 => simd_and(k1, simd_gt(a, b)),
+            _ => k1,
+        };
+        simd_bitmask(r)
+    }
 }

 /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
@@ -30418,8 +32034,8 @@ pub unsafe fn _mm_mask_cmp_epi32_mask<const IMM3: i32>(
 #[target_feature(enable = "avx512f")]
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
-pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
-    simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8()))
+pub fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+    unsafe { simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8())) }
 }

 /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -30429,7 +32045,7 @@ pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -30440,8 +32056,8 @@ pub unsafe fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_lt(a.as_u64x4(), b.as_u64x4())) +pub fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_lt(a.as_u64x4(), b.as_u64x4())) } } /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30451,7 +32067,7 @@ pub unsafe fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -30462,8 +32078,8 @@ pub unsafe fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_lt(a.as_u64x2(), b.as_u64x2())) +pub fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_lt(a.as_u64x2(), b.as_u64x2())) } } /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30473,7 +32089,7 @@ pub unsafe fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -30484,8 +32100,8 @@ pub unsafe fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8())) +pub fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8())) } } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30495,7 +32111,7 @@ pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -30506,8 +32122,8 @@ pub unsafe fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_gt(a.as_u64x4(), b.as_u64x4())) +pub fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_gt(a.as_u64x4(), b.as_u64x4())) } } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30517,7 +32133,7 @@ pub unsafe fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -30528,8 +32144,8 @@ pub unsafe fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_gt(a.as_u64x2(), b.as_u64x2())) +pub fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_gt(a.as_u64x2(), b.as_u64x2())) } } /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30539,7 +32155,7 @@ pub unsafe fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -30550,8 +32166,8 @@ pub unsafe fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8())) +pub fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8())) } } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30561,7 +32177,7 @@ pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -30572,8 +32188,8 @@ pub unsafe fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_le(a.as_u64x4(), b.as_u64x4())) +pub fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_le(a.as_u64x4(), b.as_u64x4())) } } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30583,7 +32199,7 @@ pub unsafe fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -30594,8 +32210,8 @@ pub unsafe fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_le(a.as_u64x2(), b.as_u64x2())) +pub fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_le(a.as_u64x2(), b.as_u64x2())) } } /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30605,7 +32221,7 @@ pub unsafe fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu64_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -30616,8 +32232,8 @@ pub unsafe fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8())) +pub fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8())) } } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30627,7 +32243,7 @@ pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -30638,8 +32254,8 @@ pub unsafe fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_ge(a.as_u64x4(), b.as_u64x4())) +pub fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ge(a.as_u64x4(), b.as_u64x4())) } } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30649,7 +32265,7 @@ pub unsafe fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -30660,8 +32276,8 @@ pub unsafe fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_ge(a.as_u64x2(), b.as_u64x2())) +pub fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ge(a.as_u64x2(), b.as_u64x2())) } } /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30671,7 +32287,7 @@ pub unsafe fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -30682,8 +32298,8 @@ pub unsafe fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8())) +pub fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8())) } } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30693,7 +32309,7 @@ pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -30704,8 +32320,8 @@ pub unsafe fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_eq(a.as_u64x4(), b.as_u64x4())) +pub fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_eq(a.as_u64x4(), b.as_u64x4())) } } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30715,7 +32331,7 @@ pub unsafe fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -30726,8 +32342,8 @@ pub unsafe fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_eq(a.as_u64x2(), b.as_u64x2())) +pub fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_eq(a.as_u64x2(), b.as_u64x2())) } } /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30737,7 +32353,7 @@ pub unsafe fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu64_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -30748,8 +32364,8 @@ pub unsafe fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) +pub fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8())) } } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30759,7 +32375,7 @@ pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -30770,8 +32386,8 @@ pub unsafe fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_ne(a.as_u64x4(), b.as_u64x4())) +pub fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ne(a.as_u64x4(), b.as_u64x4())) } } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -30781,7 +32397,7 @@ pub unsafe fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -30792,8 +32408,8 @@ pub unsafe fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_ne(a.as_u64x2(), b.as_u64x2())) +pub fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ne(a.as_u64x2(), b.as_u64x2())) } } /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30803,7 +32419,7 @@ pub unsafe fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq -pub unsafe fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epu64_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -30815,24 +32431,23 @@ pub unsafe fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm512_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>( - a: __m512i, - b: __m512i, -) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x8(); - let b = b.as_u64x8(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x8::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x8::splat(-1), - }; - simd_bitmask(r) +pub fn _mm512_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -30843,26 +32458,28 @@ pub unsafe fn _mm512_cmp_epu64_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm512_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>( +pub fn _mm512_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>( k1: __mmask8, a: __m512i, b: __m512i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x8(); - let b = b.as_u64x8(); - let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x8::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x8(); + let b = b.as_u64x8(); + let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30873,24 +32490,23 @@ pub unsafe fn _mm512_mask_cmp_epu64_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm256_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>( - a: __m256i, - b: __m256i, -) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x4(); - let b = b.as_u64x4(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x4::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x4::splat(-1), - }; - simd_bitmask(r) +pub fn _mm256_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x4::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -30901,26 +32517,28 @@ pub unsafe fn _mm256_cmp_epu64_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm256_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>( +pub fn _mm256_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>( k1: __mmask8, a: __m256i, b: __m256i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x4(); - let b = b.as_u64x4(); - let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x4::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x4(); + let b = b.as_u64x4(); + let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x4::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k. @@ -30931,21 +32549,23 @@ pub unsafe fn _mm256_mask_cmp_epu64_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x2(); - let b = b.as_u64x2(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x2::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x2::splat(-1), - }; - simd_bitmask(r) +pub fn _mm_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x2::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -30956,26 +32576,28 @@ pub unsafe fn _mm_cmp_epu64_mask(a: __m128i, b: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>( +pub fn _mm_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>( k1: __mmask8, a: __m128i, b: __m128i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_u64x2(); - let b = b.as_u64x2(); - let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x2::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_u64x2(); + let b = b.as_u64x2(); + let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x2::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k. @@ -30985,8 +32607,8 @@ pub unsafe fn _mm_mask_cmp_epu64_mask( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8())) } } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -30996,7 +32618,7 @@ pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -31007,8 +32629,8 @@ pub unsafe fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_lt(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_lt(a.as_i64x4(), b.as_i64x4())) } } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
@@ -31018,7 +32640,7 @@ pub unsafe fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -31029,8 +32651,8 @@ pub unsafe fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_lt(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_lt(a.as_i64x2(), b.as_i64x2())) } } /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31040,7 +32662,7 @@ pub unsafe fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LT>(k1, a, b) } @@ -31051,8 +32673,8 @@ pub unsafe fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8())) } } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -31062,7 +32684,7 @@ pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -31073,8 +32695,8 @@ pub unsafe fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) } } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31084,7 +32706,7 @@ pub unsafe fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -31095,8 +32717,8 @@ pub unsafe fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_gt(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_gt(a.as_i64x2(), b.as_i64x2())) } } /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -31106,7 +32728,7 @@ pub unsafe fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLE>(k1, a, b) } @@ -31117,8 +32739,8 @@ pub unsafe fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8())) } } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31128,7 +32750,7 @@ pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -31139,8 +32761,8 @@ pub unsafe fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_le(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_le(a.as_i64x4(), b.as_i64x4())) } } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -31150,7 +32772,7 @@ pub unsafe fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -31161,8 +32783,8 @@ pub unsafe fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_le(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_le(a.as_i64x2(), b.as_i64x2())) } } /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31172,7 +32794,7 @@ pub unsafe fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi64_mask::<_MM_CMPINT_LE>(k1, a, b) } @@ -31183,8 +32805,8 @@ pub unsafe fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8())) } } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -31194,7 +32816,7 @@ pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -31205,8 +32827,8 @@ pub unsafe fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_ge(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ge(a.as_i64x4(), b.as_i64x4())) } } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31216,7 +32838,7 @@ pub unsafe fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -31227,8 +32849,8 @@ pub unsafe fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_ge(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ge(a.as_i64x2(), b.as_i64x2())) } } /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -31238,7 +32860,7 @@ pub unsafe fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NLT>(k1, a, b) } @@ -31249,8 +32871,8 @@ pub unsafe fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8())) } } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31260,7 +32882,7 @@ pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -31271,8 +32893,8 @@ pub unsafe fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) } } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -31282,7 +32904,7 @@ pub unsafe fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -31293,8 +32915,8 @@ pub unsafe fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_eq(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_eq(a.as_i64x2(), b.as_i64x2())) } } /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31304,7 +32926,7 @@ pub unsafe fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi64_mask::<_MM_CMPINT_EQ>(k1, a, b) } @@ -31315,8 +32937,8 @@ pub unsafe fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { - simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) +pub fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8())) } } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -31326,7 +32948,7 @@ pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { +pub fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { _mm512_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -31337,8 +32959,8 @@ pub unsafe fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { - simd_bitmask::<__m256i, _>(simd_ne(a.as_i64x4(), b.as_i64x4())) +pub fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { simd_bitmask::<__m256i, _>(simd_ne(a.as_i64x4(), b.as_i64x4())) } } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31348,7 +32970,7 @@ pub unsafe fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { +pub fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 { _mm256_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -31359,8 +32981,8 @@ pub unsafe fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - simd_bitmask::<__m128i, _>(simd_ne(a.as_i64x2(), b.as_i64x2())) +pub fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { simd_bitmask::<__m128i, _>(simd_ne(a.as_i64x2(), b.as_i64x2())) } } /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). 
@@ -31370,7 +32992,7 @@ pub unsafe fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { #[target_feature(enable = "avx512f,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq -pub unsafe fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { +pub fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 { _mm_mask_cmp_epi64_mask::<_MM_CMPINT_NE>(k1, a, b) } @@ -31382,24 +33004,23 @@ pub unsafe fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm512_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>( - a: __m512i, - b: __m512i, -) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x8::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x8::splat(-1), - }; - simd_bitmask(r) +pub fn _mm512_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m512i, b: __m512i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x8::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x8::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31410,26 +33031,28 @@ pub unsafe fn _mm512_cmp_epi64_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm512_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>( +pub fn _mm512_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>( k1: __mmask8, a: __m512i, b: __m512i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x8(); - let b = b.as_i64x8(); - let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x8::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x8(); + let b = b.as_i64x8(); + let k1 = simd_select_bitmask(k1, i64x8::splat(-1), i64x8::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x8::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -31440,24 +33063,23 @@ pub unsafe fn _mm512_mask_cmp_epi64_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm256_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>( - a: __m256i, - b: __m256i, -) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x4::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x4::splat(-1), - }; - simd_bitmask(r) +pub fn _mm256_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m256i, b: __m256i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x4::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x4::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31468,26 +33090,28 @@ pub unsafe fn _mm256_cmp_epi64_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm256_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>( +pub fn _mm256_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>( k1: __mmask8, a: __m256i, b: __m256i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x4(); - let b = b.as_i64x4(); - let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x4::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x4(); + let b = b.as_i64x4(); + let k1 = simd_select_bitmask(k1, i64x4::splat(-1), i64x4::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x4::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
@@ -31498,21 +33122,23 @@ pub unsafe fn _mm256_mask_cmp_epi64_mask( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(2)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm_cmp_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let r = match IMM3 { - 0 => simd_eq(a, b), - 1 => simd_lt(a, b), - 2 => simd_le(a, b), - 3 => i64x2::ZERO, - 4 => simd_ne(a, b), - 5 => simd_ge(a, b), - 6 => simd_gt(a, b), - _ => i64x2::splat(-1), - }; - simd_bitmask(r) +pub fn _mm_cmp_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let r = match IMM3 { + 0 => simd_eq(a, b), + 1 => simd_lt(a, b), + 2 => simd_le(a, b), + 3 => i64x2::ZERO, + 4 => simd_ne(a, b), + 5 => simd_ge(a, b), + 6 => simd_gt(a, b), + _ => i64x2::splat(-1), + }; + simd_bitmask(r) + } } /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set). @@ -31523,26 +33149,28 @@ pub unsafe fn _mm_cmp_epi64_mask(a: __m128i, b: __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[rustc_legacy_const_generics(3)] #[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))] -pub unsafe fn _mm_mask_cmp_epi64_mask( +pub fn _mm_mask_cmp_epi64_mask( k1: __mmask8, a: __m128i, b: __m128i, ) -> __mmask8 { - static_assert_uimm_bits!(IMM3, 3); - let a = a.as_i64x2(); - let b = b.as_i64x2(); - let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO); - let r = match IMM3 { - 0 => simd_and(k1, simd_eq(a, b)), - 1 => simd_and(k1, simd_lt(a, b)), - 2 => simd_and(k1, simd_le(a, b)), - 3 => i64x2::ZERO, - 4 => simd_and(k1, simd_ne(a, b)), - 5 => simd_and(k1, simd_ge(a, b)), - 6 => simd_and(k1, simd_gt(a, b)), - _ => k1, - }; - simd_bitmask(r) + unsafe { + static_assert_uimm_bits!(IMM3, 3); + let a = a.as_i64x2(); + let b = b.as_i64x2(); + let k1 = simd_select_bitmask(k1, i64x2::splat(-1), i64x2::ZERO); + let r = match IMM3 { + 0 => simd_and(k1, simd_eq(a, b)), + 1 => simd_and(k1, simd_lt(a, b)), + 2 => simd_and(k1, simd_le(a, b)), + 3 => i64x2::ZERO, + 4 => simd_and(k1, simd_ne(a, b)), + 5 => simd_and(k1, simd_ge(a, b)), + 6 => simd_and(k1, simd_gt(a, b)), + _ => k1, + }; + simd_bitmask(r) + } } /// Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a. @@ -31551,8 +33179,8 @@ pub unsafe fn _mm_mask_cmp_epi64_mask( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_add_epi32(a: __m512i) -> i32 { - simd_reduce_add_unordered(a.as_i32x16()) +pub fn _mm512_reduce_add_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_add_unordered(a.as_i32x16()) } } /// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a. 
@@ -31561,8 +33189,8 @@ pub unsafe fn _mm512_reduce_add_epi32(a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 { - simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) +pub fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) } } /// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a. @@ -31571,8 +33199,8 @@ pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_add_epi64(a: __m512i) -> i64 { - simd_reduce_add_unordered(a.as_i64x8()) +pub fn _mm512_reduce_add_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_add_unordered(a.as_i64x8()) } } /// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a. @@ -31581,8 +33209,8 @@ pub unsafe fn _mm512_reduce_add_epi64(a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 { - simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) +pub fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_add_unordered(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) } } /// Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a. @@ -31591,15 +33219,17 @@ pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 { - // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ - let a = _mm256_add_ps( - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - ); - let a = _mm_add_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); - let a = _mm_add_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); - simd_extract::<_, f32>(a, 0) + simd_extract::<_, f32>(a, 1) +pub fn _mm512_reduce_add_ps(a: __m512) -> f32 { + unsafe { + // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ + let a = _mm256_add_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_add_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_add_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + simd_extract::<_, f32>(a, 0) + simd_extract::<_, f32>(a, 1) + } } /// Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a. 
@@ -31608,8 +33238,8 @@ pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 { - _mm512_reduce_add_ps(simd_select_bitmask(k, a, _mm512_setzero_ps())) +pub fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 { + unsafe { _mm512_reduce_add_ps(simd_select_bitmask(k, a, _mm512_setzero_ps())) } } /// Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a. @@ -31618,13 +33248,15 @@ pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 { - let a = _mm256_add_pd( - _mm512_extractf64x4_pd::<0>(a), - _mm512_extractf64x4_pd::<1>(a), - ); - let a = _mm_add_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); - simd_extract::<_, f64>(a, 0) + simd_extract::<_, f64>(a, 1) +pub fn _mm512_reduce_add_pd(a: __m512d) -> f64 { + unsafe { + let a = _mm256_add_pd( + _mm512_extractf64x4_pd::<0>(a), + _mm512_extractf64x4_pd::<1>(a), + ); + let a = _mm_add_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); + simd_extract::<_, f64>(a, 0) + simd_extract::<_, f64>(a, 1) + } } /// Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a. @@ -31633,8 +33265,8 @@ pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 { - _mm512_reduce_add_pd(simd_select_bitmask(k, a, _mm512_setzero_pd())) +pub fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 { + unsafe { _mm512_reduce_add_pd(simd_select_bitmask(k, a, _mm512_setzero_pd())) } } /// Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a. @@ -31643,8 +33275,8 @@ pub unsafe fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 { - simd_reduce_mul_unordered(a.as_i32x16()) +pub fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_mul_unordered(a.as_i32x16()) } } /// Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. @@ -31653,12 +33285,14 @@ pub unsafe fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 { - simd_reduce_mul_unordered(simd_select_bitmask( - k, - a.as_i32x16(), - _mm512_set1_epi32(1).as_i32x16(), - )) +pub fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { + simd_reduce_mul_unordered(simd_select_bitmask( + k, + a.as_i32x16(), + _mm512_set1_epi32(1).as_i32x16(), + )) + } } /// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a. 
@@ -31667,8 +33301,8 @@ pub unsafe fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 { - simd_reduce_mul_unordered(a.as_i64x8()) +pub fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_mul_unordered(a.as_i64x8()) } } /// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. @@ -31677,12 +33311,14 @@ pub unsafe fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 { - simd_reduce_mul_unordered(simd_select_bitmask( - k, - a.as_i64x8(), - _mm512_set1_epi64(1).as_i64x8(), - )) +pub fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { + simd_reduce_mul_unordered(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(1).as_i64x8(), + )) + } } /// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. @@ -31691,15 +33327,17 @@ pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 { - // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ - let a = _mm256_mul_ps( - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - ); - let a = _mm_mul_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); - let a = _mm_mul_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); - simd_extract::<_, f32>(a, 0) * simd_extract::<_, f32>(a, 1) +pub fn _mm512_reduce_mul_ps(a: __m512) -> f32 { + unsafe { + // we have to use `simd_shuffle` here because `_mm512_extractf32x8_ps` is in AVX512DQ + let a = _mm256_mul_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_mul_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_mul_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + simd_extract::<_, f32>(a, 0) * simd_extract::<_, f32>(a, 1) + } } /// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a. @@ -31708,8 +33346,8 @@ pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 { - _mm512_reduce_mul_ps(simd_select_bitmask(k, a, _mm512_set1_ps(1.))) +pub fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 { + unsafe { _mm512_reduce_mul_ps(simd_select_bitmask(k, a, _mm512_set1_ps(1.))) } } /// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. 
@@ -31718,13 +33356,15 @@ pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 { - let a = _mm256_mul_pd( - _mm512_extractf64x4_pd::<0>(a), - _mm512_extractf64x4_pd::<1>(a), - ); - let a = _mm_mul_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); - simd_extract::<_, f64>(a, 0) * simd_extract::<_, f64>(a, 1) +pub fn _mm512_reduce_mul_pd(a: __m512d) -> f64 { + unsafe { + let a = _mm256_mul_pd( + _mm512_extractf64x4_pd::<0>(a), + _mm512_extractf64x4_pd::<1>(a), + ); + let a = _mm_mul_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); + simd_extract::<_, f64>(a, 0) * simd_extract::<_, f64>(a, 1) + } } /// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a. @@ -31733,8 +33373,8 @@ pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 { - _mm512_reduce_mul_pd(simd_select_bitmask(k, a, _mm512_set1_pd(1.))) +pub fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 { + unsafe { _mm512_reduce_mul_pd(simd_select_bitmask(k, a, _mm512_set1_pd(1.))) } } /// Reduce the packed signed 32-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -31743,8 +33383,8 @@ pub unsafe fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_max_epi32(a: __m512i) -> i32 { - simd_reduce_max(a.as_i32x16()) +pub fn _mm512_reduce_max_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_max(a.as_i32x16()) } } /// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -31753,12 +33393,14 @@ pub unsafe fn _mm512_reduce_max_epi32(a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 { - simd_reduce_max(simd_select_bitmask( - k, - a.as_i32x16(), - i32x16::splat(i32::MIN), - )) +pub fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { + simd_reduce_max(simd_select_bitmask( + k, + a.as_i32x16(), + i32x16::splat(i32::MIN), + )) + } } /// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -31767,8 +33409,8 @@ pub unsafe fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_max_epi64(a: __m512i) -> i64 { - simd_reduce_max(a.as_i64x8()) +pub fn _mm512_reduce_max_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_max(a.as_i64x8()) } } /// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. 
@@ -31777,8 +33419,8 @@ pub unsafe fn _mm512_reduce_max_epi64(a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 { - simd_reduce_max(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MIN))) +pub fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MIN))) } } /// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -31787,8 +33429,8 @@ pub unsafe fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_max_epu32(a: __m512i) -> u32 { - simd_reduce_max(a.as_u32x16()) +pub fn _mm512_reduce_max_epu32(a: __m512i) -> u32 { + unsafe { simd_reduce_max(a.as_u32x16()) } } /// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -31797,8 +33439,8 @@ pub unsafe fn _mm512_reduce_max_epu32(a: __m512i) -> u32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 { - simd_reduce_max(simd_select_bitmask(k, a.as_u32x16(), u32x16::ZERO)) +pub fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u32x16(), u32x16::ZERO)) } } /// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a. @@ -31807,8 +33449,8 @@ pub unsafe fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_max_epu64(a: __m512i) -> u64 { - simd_reduce_max(a.as_u64x8()) +pub fn _mm512_reduce_max_epu64(a: __m512i) -> u64 { + unsafe { simd_reduce_max(a.as_u64x8()) } } /// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -31817,8 +33459,8 @@ pub unsafe fn _mm512_reduce_max_epu64(a: __m512i) -> u64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 { - simd_reduce_max(simd_select_bitmask(k, a.as_u64x8(), u64x8::ZERO)) +pub fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 { + unsafe { simd_reduce_max(simd_select_bitmask(k, a.as_u64x8(), u64x8::ZERO)) } } /// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. 
@@ -31827,14 +33469,16 @@ pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 { - let a = _mm256_max_ps( - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - ); - let a = _mm_max_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); - let a = _mm_max_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); - _mm_cvtss_f32(_mm_max_ss(a, _mm_movehdup_ps(a))) +pub fn _mm512_reduce_max_ps(a: __m512) -> f32 { + unsafe { + let a = _mm256_max_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_max_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_max_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + _mm_cvtss_f32(_mm_max_ss(a, _mm_movehdup_ps(a))) + } } /// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -31843,7 +33487,7 @@ pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 { +pub fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 { _mm512_reduce_max_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MIN), k, a)) } @@ -31853,13 +33497,15 @@ pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 { - let a = _mm256_max_pd( - _mm512_extractf64x4_pd::<0>(a), - _mm512_extractf64x4_pd::<1>(a), - ); - let a = _mm_max_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); - _mm_cvtsd_f64(_mm_max_sd(a, simd_shuffle!(a, a, [1, 0]))) +pub fn _mm512_reduce_max_pd(a: __m512d) -> f64 { + unsafe { + let a = _mm256_max_pd( + _mm512_extractf64x4_pd::<0>(a), + _mm512_extractf64x4_pd::<1>(a), + ); + let a = _mm_max_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); + _mm_cvtsd_f64(_mm_max_sd(a, simd_shuffle!(a, a, [1, 0]))) + } } /// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a. @@ -31868,7 +33514,7 @@ pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 { +pub fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 { _mm512_reduce_max_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MIN), k, a)) } @@ -31878,8 +33524,8 @@ pub unsafe fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_min_epi32(a: __m512i) -> i32 { - simd_reduce_min(a.as_i32x16()) +pub fn _mm512_reduce_min_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_min(a.as_i32x16()) } } /// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the minimum of all active elements in a. 
@@ -31888,12 +33534,14 @@ pub unsafe fn _mm512_reduce_min_epi32(a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 { - simd_reduce_min(simd_select_bitmask( - k, - a.as_i32x16(), - i32x16::splat(i32::MAX), - )) +pub fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { + simd_reduce_min(simd_select_bitmask( + k, + a.as_i32x16(), + i32x16::splat(i32::MAX), + )) + } } /// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -31902,8 +33550,8 @@ pub unsafe fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_min_epi64(a: __m512i) -> i64 { - simd_reduce_min(a.as_i64x8()) +pub fn _mm512_reduce_min_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_min(a.as_i64x8()) } } /// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the minimum of all active elements in a. @@ -31912,8 +33560,8 @@ pub unsafe fn _mm512_reduce_min_epi64(a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 { - simd_reduce_min(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MAX))) +pub fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(i64::MAX))) } } /// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -31922,8 +33570,8 @@ pub unsafe fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_min_epu32(a: __m512i) -> u32 { - simd_reduce_min(a.as_u32x16()) +pub fn _mm512_reduce_min_epu32(a: __m512i) -> u32 { + unsafe { simd_reduce_min(a.as_u32x16()) } } /// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the minimum of all active elements in a. @@ -31932,12 +33580,14 @@ pub unsafe fn _mm512_reduce_min_epu32(a: __m512i) -> u32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 { - simd_reduce_min(simd_select_bitmask( - k, - a.as_u32x16(), - u32x16::splat(u32::MAX), - )) +pub fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 { + unsafe { + simd_reduce_min(simd_select_bitmask( + k, + a.as_u32x16(), + u32x16::splat(u32::MAX), + )) + } } /// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a. @@ -31946,8 +33596,8 @@ pub unsafe fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_min_epu64(a: __m512i) -> u64 { - simd_reduce_min(a.as_u64x8()) +pub fn _mm512_reduce_min_epu64(a: __m512i) -> u64 { + unsafe { simd_reduce_min(a.as_u64x8()) } } /// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the minimum of all active elements in a. 
@@ -31956,8 +33606,8 @@ pub unsafe fn _mm512_reduce_min_epu64(a: __m512i) -> u64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 { - simd_reduce_min(simd_select_bitmask(k, a.as_u64x8(), u64x8::splat(u64::MAX))) +pub fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 { + unsafe { simd_reduce_min(simd_select_bitmask(k, a.as_u64x8(), u64x8::splat(u64::MAX))) } } /// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a. @@ -31966,14 +33616,16 @@ pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 { - let a = _mm256_min_ps( - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), - simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), - ); - let a = _mm_min_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); - let a = _mm_min_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); - _mm_cvtss_f32(_mm_min_ss(a, _mm_movehdup_ps(a))) +pub fn _mm512_reduce_min_ps(a: __m512) -> f32 { + unsafe { + let a = _mm256_min_ps( + simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]), + simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]), + ); + let a = _mm_min_ps(_mm256_extractf128_ps::<0>(a), _mm256_extractf128_ps::<1>(a)); + let a = _mm_min_ps(a, simd_shuffle!(a, a, [2, 3, 0, 1])); + _mm_cvtss_f32(_mm_min_ss(a, _mm_movehdup_ps(a))) + } } /// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the minimum of all active elements in a. @@ -31982,7 +33634,7 @@ pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 { +pub fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 { _mm512_reduce_min_ps(_mm512_mask_mov_ps(_mm512_set1_ps(f32::MAX), k, a)) } @@ -31992,13 +33644,15 @@ pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 { - let a = _mm256_min_pd( - _mm512_extractf64x4_pd::<0>(a), - _mm512_extractf64x4_pd::<1>(a), - ); - let a = _mm_min_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); - _mm_cvtsd_f64(_mm_min_sd(a, simd_shuffle!(a, a, [1, 0]))) +pub fn _mm512_reduce_min_pd(a: __m512d) -> f64 { + unsafe { + let a = _mm256_min_pd( + _mm512_extractf64x4_pd::<0>(a), + _mm512_extractf64x4_pd::<1>(a), + ); + let a = _mm_min_pd(_mm256_extractf128_pd::<0>(a), _mm256_extractf128_pd::<1>(a)); + _mm_cvtsd_f64(_mm_min_sd(a, simd_shuffle!(a, a, [1, 0]))) + } } /// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the minimum of all active elements in a. 
@@ -32007,7 +33661,7 @@ pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 { +pub fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 { _mm512_reduce_min_pd(_mm512_mask_mov_pd(_mm512_set1_pd(f64::MAX), k, a)) } @@ -32017,8 +33671,8 @@ pub unsafe fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_and_epi32(a: __m512i) -> i32 { - simd_reduce_and(a.as_i32x16()) +pub fn _mm512_reduce_and_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_and(a.as_i32x16()) } } /// Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. @@ -32027,8 +33681,8 @@ pub unsafe fn _mm512_reduce_and_epi32(a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 { - simd_reduce_and(simd_select_bitmask(k, a.as_i32x16(), i32x16::splat(-1))) +pub fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { simd_reduce_and(simd_select_bitmask(k, a.as_i32x16(), i32x16::splat(-1))) } } /// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. @@ -32037,8 +33691,8 @@ pub unsafe fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_and_epi64(a: __m512i) -> i64 { - simd_reduce_and(a.as_i64x8()) +pub fn _mm512_reduce_and_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_and(a.as_i64x8()) } } /// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a. @@ -32047,8 +33701,8 @@ pub unsafe fn _mm512_reduce_and_epi64(a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 { - simd_reduce_and(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(-1))) +pub fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_and(simd_select_bitmask(k, a.as_i64x8(), i64x8::splat(-1))) } } /// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. @@ -32057,8 +33711,8 @@ pub unsafe fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_or_epi32(a: __m512i) -> i32 { - simd_reduce_or(a.as_i32x16()) +pub fn _mm512_reduce_or_epi32(a: __m512i) -> i32 { + unsafe { simd_reduce_or(a.as_i32x16()) } } /// Reduce the packed 32-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. 
@@ -32067,8 +33721,8 @@ pub unsafe fn _mm512_reduce_or_epi32(a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 { - simd_reduce_or(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) +pub fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i32x16(), i32x16::ZERO)) } } /// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. @@ -32077,8 +33731,8 @@ pub unsafe fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_reduce_or_epi64(a: __m512i) -> i64 { - simd_reduce_or(a.as_i64x8()) +pub fn _mm512_reduce_or_epi64(a: __m512i) -> i64 { + unsafe { simd_reduce_or(a.as_i64x8()) } } /// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a. @@ -32087,8 +33741,8 @@ pub unsafe fn _mm512_reduce_or_epi64(a: __m512i) -> i64 { #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 { - simd_reduce_or(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) +pub fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 { + unsafe { simd_reduce_or(simd_select_bitmask(k, a.as_i64x8(), i64x8::ZERO)) } } /// Returns vector of type `__m512d` with indeterminate elements. @@ -32100,8 +33754,8 @@ pub unsafe fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] // This intrinsic has no corresponding instruction. -pub unsafe fn _mm512_undefined_pd() -> __m512d { - const { mem::zeroed() } +pub fn _mm512_undefined_pd() -> __m512d { + unsafe { const { mem::zeroed() } } } /// Returns vector of type `__m512` with indeterminate elements. @@ -32113,8 +33767,8 @@ pub unsafe fn _mm512_undefined_pd() -> __m512d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] // This intrinsic has no corresponding instruction. -pub unsafe fn _mm512_undefined_ps() -> __m512 { - const { mem::zeroed() } +pub fn _mm512_undefined_ps() -> __m512 { + unsafe { const { mem::zeroed() } } } /// Return vector of type __m512i with indeterminate elements. @@ -32126,8 +33780,8 @@ pub unsafe fn _mm512_undefined_ps() -> __m512 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] // This intrinsic has no corresponding instruction. -pub unsafe fn _mm512_undefined_epi32() -> __m512i { - const { mem::zeroed() } +pub fn _mm512_undefined_epi32() -> __m512i { + unsafe { const { mem::zeroed() } } } /// Return vector of type __m512 with indeterminate elements. @@ -32139,8 +33793,8 @@ pub unsafe fn _mm512_undefined_epi32() -> __m512i { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] // This intrinsic has no corresponding instruction. -pub unsafe fn _mm512_undefined() -> __m512 { - const { mem::zeroed() } +pub fn _mm512_undefined() -> __m512 { + unsafe { const { mem::zeroed() } } } /// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. 
mem_addr does not need to be aligned on any particular boundary. @@ -34384,7 +36038,7 @@ pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_setr_pd( +pub fn _mm512_setr_pd( e0: f64, e1: f64, e2: f64, @@ -34394,8 +36048,10 @@ pub unsafe fn _mm512_setr_pd( e6: f64, e7: f64, ) -> __m512d { - let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); - transmute(r) + unsafe { + let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7); + transmute(r) + } } /// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values. @@ -34404,7 +36060,7 @@ pub unsafe fn _mm512_setr_pd( #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm512_set_pd( +pub fn _mm512_set_pd( e0: f64, e1: f64, e2: f64, @@ -34424,13 +36080,15 @@ pub unsafe fn _mm512_set_pd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovss))] -pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract!(src, 0); - let mut mov: f32 = extractsrc; - if (k & 0b00000001) != 0 { - mov = simd_extract!(b, 0); +pub fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut mov: f32 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) } - simd_insert!(a, 0, mov) } /// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34440,12 +36098,14 @@ pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovss))] -pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut mov: f32 = 0.; - if (k & 0b00000001) != 0 { - mov = simd_extract!(b, 0); +pub fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut mov: f32 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) } - simd_insert!(a, 0, mov) } /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34455,13 +36115,15 @@ pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsd))] -pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract!(src, 0); - let mut mov: f64 = extractsrc; - if (k & 0b00000001) != 0 { - mov = simd_extract!(b, 0); +pub fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut mov: f64 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) } - simd_insert!(a, 0, mov) } /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34471,12 +36133,14 @@ pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsd))] -pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut mov: f64 = 0.; - if (k & 0b00000001) != 0 { - mov = simd_extract!(b, 0); +pub fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut mov: f64 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) } - simd_insert!(a, 0, mov) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34486,15 +36150,17 @@ pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddss))] -pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract!(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta + extractb; +pub fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -34504,14 +36170,16 @@ pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddss))] -pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta + extractb; +pub fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34521,15 +36189,17 @@ pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddsd))] -pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract!(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta + extractb; +pub fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34539,14 +36209,16 @@ pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddsd))] -pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta + extractb; +pub fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta + extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -34556,15 +36228,17 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubss))] -pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract!(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta - extractb; +pub fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34574,14 +36248,16 @@ pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubss))] -pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta - extractb; +pub fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34591,15 +36267,17 @@ pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubsd))] -pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract!(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta - extractb; +pub fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34609,14 +36287,16 @@ pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubsd))] -pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta - extractb; +pub fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta - extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -34626,15 +36306,17 @@ pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulss))] -pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract!(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta * extractb; +pub fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34644,14 +36326,16 @@ pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulss))] -pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta * extractb; +pub fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34661,15 +36345,17 @@ pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulsd))] -pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract!(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta * extractb; +pub fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34679,14 +36365,16 @@ pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulsd))] -pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta * extractb; +pub fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta * extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34696,15 +36384,17 @@ pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivss))] -pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract!(src, 0); - let mut add: f32 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta / extractb; +pub fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let extractsrc: f32 = simd_extract!(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -34714,14 +36404,16 @@ pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivss))] -pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - let mut add: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - add = extracta / extractb; +pub fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34731,15 +36423,17 @@ pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivsd))] -pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract!(src, 0); - let mut add: f64 = extractsrc; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta / extractb; +pub fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let extractsrc: f64 = simd_extract!(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34749,14 +36443,16 @@ pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivsd))] -pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let mut add: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - add = extracta / extractb; +pub fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + add = extracta / extractb; + } + simd_insert!(a, 0, add) } - simd_insert!(a, 0, add) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -34766,14 +36462,16 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxss))] -pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vmaxss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34783,14 +36481,16 @@ pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxss))] -pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vmaxss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34800,14 +36500,16 @@ pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxsd))] -pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vmaxsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34817,14 +36519,16 @@ pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxsd))] -pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vmaxsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34834,14 +36538,16 @@ pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminss))] -pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vminss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34851,14 +36557,16 @@ pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminss))] -pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vminss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34868,14 +36576,16 @@ pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminsd))] -pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vminsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34885,14 +36595,16 @@ pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminsd))] -pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vminsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34902,8 +36614,8 @@ pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtss))] -pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - vsqrtss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { vsqrtss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34913,8 +36625,8 @@ pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtss))] -pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - vsqrtss(a, b, _mm_setzero_ps(), k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { vsqrtss(a, b, _mm_setzero_ps(), k, _MM_FROUND_CUR_DIRECTION) } } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34924,8 +36636,8 @@ pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd))] -pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - vsqrtsd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { vsqrtsd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34935,8 +36647,8 @@ pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd))] -pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - vsqrtsd(a, b, _mm_setzero_pd(), k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { vsqrtsd(a, b, _mm_setzero_pd(), k, _MM_FROUND_CUR_DIRECTION) } } /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. @@ -34946,8 +36658,8 @@ pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) +pub fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) } } /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. @@ -34957,8 +36669,8 @@ pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +pub fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } } /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
@@ -34968,8 +36680,8 @@ pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14ss))] -pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) +pub fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } } /// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. @@ -34979,8 +36691,8 @@ pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) +pub fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) } } /// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. @@ -34990,8 +36702,8 @@ pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +pub fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } } /// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. @@ -35001,8 +36713,8 @@ pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m1 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrsqrt14sd))] -pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) +pub fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } } /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
The maximum relative error for this approximation is less than 2^-14. @@ -35012,8 +36724,8 @@ pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) +pub fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, 0b1)) } } /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. @@ -35023,8 +36735,8 @@ pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +pub fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) } } /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. @@ -35034,8 +36746,8 @@ pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14ss))] -pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) +pub fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), f32x4::ZERO, k)) } } /// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. @@ -35045,8 +36757,8 @@ pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) +pub fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, 0b1)) } } /// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
The maximum relative error for this approximation is less than 2^-14. @@ -35056,8 +36768,8 @@ pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +pub fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) } } /// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. @@ -35067,8 +36779,8 @@ pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrcp14sd))] -pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) +pub fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), f64x2::ZERO, k)) } } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. @@ -35078,14 +36790,16 @@ pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - 0b1, - _MM_FROUND_NO_EXC, - )) +pub fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + 0b1, + _MM_FROUND_NO_EXC, + )) + } } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. 
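For callers that cannot enable avx512f statically, the usual pattern around the rcp14/rsqrt14 approximations above is runtime detection; the call still needs an `unsafe` block there because the caller itself does not carry the target feature. A sketch under the same toolchain assumptions; the helper names are hypothetical.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn recip_lane0(a: __m128, b: __m128) -> __m128 {
    // Lower lane: approximate 1/b with relative error below 2^-14;
    // the upper lanes are copied from `a`.
    _mm_rcp14_ss(a, b)
}

fn recip_approx(a: __m128, b: __m128) -> Option<__m128> {
    if std::arch::is_x86_feature_detected!("avx512f") {
        // SAFETY: avx512f support was just verified at runtime.
        Some(unsafe { recip_lane0(a, b) })
    } else {
        None
    }
}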
@@ -35095,14 +36809,16 @@ pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - src.as_f32x4(), - k, - _MM_FROUND_NO_EXC, - )) +pub fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) + } } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. @@ -35112,14 +36828,16 @@ pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpss))] -pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vgetexpss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - k, - _MM_FROUND_NO_EXC, - )) +pub fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_NO_EXC, + )) + } } /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. @@ -35129,14 +36847,16 @@ pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - 0b1, - _MM_FROUND_NO_EXC, - )) +pub fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + 0b1, + _MM_FROUND_NO_EXC, + )) + } } /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. 
@@ -35146,14 +36866,16 @@ pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_NO_EXC, - )) +pub fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) + } } /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. @@ -35163,14 +36885,16 @@ pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpsd))] -pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vgetexpsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - k, - _MM_FROUND_NO_EXC, - )) +pub fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_NO_EXC, + )) + } } /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -35191,26 +36915,25 @@ pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_getmant_ss< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm_getmant_ss( a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetmantss( - a, - b, - SIGN << 2 | NORM, - f32x4::ZERO, - 0b1, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss( + a, + b, + SIGN << 2 | NORM, + f32x4::ZERO, + 0b1, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -35231,7 +36954,7 @@ pub unsafe fn _mm_getmant_ss< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_getmant_ss< +pub fn _mm_mask_getmant_ss< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -35240,13 +36963,15 @@ pub unsafe fn _mm_mask_getmant_ss< a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -35267,7 +36992,7 @@ pub unsafe fn _mm_mask_getmant_ss< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_maskz_getmant_ss< +pub fn _mm_maskz_getmant_ss< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -35275,19 +37000,21 @@ pub unsafe fn _mm_maskz_getmant_ss< a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetmantss( - a, - b, - SIGN << 2 | NORM, - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss( + a, + b, + SIGN << 2 | NORM, + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -35308,26 +37035,25 @@ pub unsafe fn _mm_maskz_getmant_ss< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_getmant_sd< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm_getmant_sd( a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetmantsd( - a, - b, - SIGN << 2 | NORM, - f64x2::ZERO, - 0b1, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd( + a, + b, + SIGN << 2 | NORM, + f64x2::ZERO, + 0b1, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -35348,7 +37074,7 @@ pub unsafe fn _mm_getmant_sd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_getmant_sd< +pub fn _mm_mask_getmant_sd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -35357,13 +37083,15 @@ pub unsafe fn _mm_mask_getmant_sd< a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -35384,7 +37112,7 @@ pub unsafe fn _mm_mask_getmant_sd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_maskz_getmant_sd< +pub fn _mm_maskz_getmant_sd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -35392,19 +37120,21 @@ pub unsafe fn _mm_maskz_getmant_sd< a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetmantsd( - a, - b, - SIGN << 2 | NORM, - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd( + a, + b, + SIGN << 2 | NORM, + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35421,19 +37151,21 @@ pub unsafe fn _mm_maskz_getmant_sd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vrndscaless( - a, - b, - f32x4::ZERO, - 0b11111111, - IMM8, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) +pub fn _mm_roundscale_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless( + a, + b, + f32x4::ZERO, + 0b11111111, + IMM8, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -35450,18 +37182,20 @@ pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_roundscale_ss( +pub fn _mm_mask_roundscale_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vrndscaless(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper 
elements of dst.\ @@ -35478,16 +37212,14 @@ pub unsafe fn _mm_mask_roundscale_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_roundscale_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub fn _mm_maskz_roundscale_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -35504,19 +37236,21 @@ pub unsafe fn _mm_maskz_roundscale_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vrndscalesd( - a, - b, - f64x2::ZERO, - 0b11111111, - IMM8, - _MM_FROUND_CUR_DIRECTION, - ); - transmute(r) +pub fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd( + a, + b, + f64x2::ZERO, + 0b11111111, + IMM8, + _MM_FROUND_CUR_DIRECTION, + ); + transmute(r) + } } /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -35533,18 +37267,20 @@ pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d) -> __m1 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_roundscale_sd( +pub fn _mm_mask_roundscale_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vrndscalesd(a, b, src, k, IMM8, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -35561,16 +37297,14 @@ pub unsafe fn _mm_mask_roundscale_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_roundscale_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = 
a.as_f64x2(); - let b = b.as_f64x2(); - let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION); - transmute(r) +pub fn _mm_maskz_roundscale_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, _MM_FROUND_CUR_DIRECTION); + transmute(r) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35580,16 +37314,18 @@ pub unsafe fn _mm_maskz_roundscale_sd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - transmute(vscalefss( - a, - b, - f32x4::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + let a = a.as_f32x4(); + let b = b.as_f32x4(); + transmute(vscalefss( + a, + b, + f32x4::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35599,11 +37335,13 @@ pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)) +pub fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + transmute(vscalefss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35613,14 +37351,16 @@ pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefss))] -pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - transmute(vscalefss( - a.as_f32x4(), - b.as_f32x4(), - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
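The getmant and roundscale intrinsics above keep their immediates as const generics (with `#[rustc_legacy_const_generics]` preserving the C-style call shape), so a safe caller supplies them via turbofish. A sketch under the same toolchain assumptions as before; the constant chosen here is only illustrative.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn round_lane0(a: __m128, b: __m128) -> __m128 {
    // IMM8 = 0: round lane 0 of `b` to 0 fraction bits (an integer value);
    // lanes 1..4 are copied from `a`.
    _mm_roundscale_ss::<0>(a, b)
}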
@@ -35630,14 +37370,16 @@ pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - 0b11111111, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35647,14 +37389,16 @@ pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35664,14 +37408,16 @@ pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m12 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefsd))] -pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - transmute(vscalefsd( - a.as_f64x2(), - b.as_f64x2(), - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
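As a scalar reference for the scalef family above: setting aside NaN, infinity, overflow, and denormal edge cases, the lower-lane operation computes a * 2^floor(b). A plain-Rust sketch of that core behaviour (hypothetical helper, not part of the patch):

fn scalef_lane0_ref(a: f64, b: f64) -> f64 {
    // Core lane-0 behaviour of _mm_scalef_sd, edge cases aside.
    a * b.floor().exp2()
}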
@@ -35681,14 +37427,16 @@ pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] -pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fmadd: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fmadd = fmaf32(fmadd, extractb, extractc); +pub fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = fmaf32(fmadd, extractb, extractc); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35698,15 +37446,17 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] -pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fmadd = fmaf32(extracta, extractb, extractc); +pub fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
@@ -35716,14 +37466,16 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] -pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fmadd: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - fmadd = fmaf32(extracta, extractb, fmadd); +pub fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + fmadd = fmaf32(extracta, extractb, fmadd); + } + simd_insert!(c, 0, fmadd) } - simd_insert!(c, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35733,14 +37485,16 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] -pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fmadd: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fmadd = fmaf64(fmadd, extractb, extractc); +pub fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fmadd: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fmadd = fmaf64(fmadd, extractb, extractc); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -35750,15 +37504,17 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] -pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fmadd: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fmadd = fmaf64(extracta, extractb, extractc); +pub fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fmadd = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -35768,14 +37524,16 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd))] -pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fmadd: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - fmadd = fmaf64(extracta, extractb, fmadd); +pub fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { + let mut fmadd: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + fmadd = fmaf64(extracta, extractb, fmadd); + } + simd_insert!(c, 0, fmadd) } - simd_insert!(c, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
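The masked scalar FMA intrinsics above all follow the same lane-0 pattern; a plain-Rust restatement of the lower lane of `_mm_mask_fmadd_ss` (hypothetical helper), using the fused `mul_add` just as the `fmaf32` call in the patch does:

fn mask_fmadd_lane0_ref(a: f32, k: u8, b: f32, c: f32) -> f32 {
    // Lane 0 of the result: fused a*b + c when mask bit 0 is set, else `a`.
    if k & 1 != 0 { a.mul_add(b, c) } else { a }
}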
@@ -35785,15 +37543,17 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] -pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fmsub: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = fmaf32(fmsub, extractb, extractc); +pub fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmsub: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf32(fmsub, extractb, extractc); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35803,16 +37563,18 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] -pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fmsub: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = fmaf32(extracta, extractb, extractc); +pub fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
@@ -35822,15 +37584,17 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] -pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fmsub: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc = -fmsub; - fmsub = fmaf32(extracta, extractb, extractc); +pub fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(c, 0, fmsub) } - simd_insert!(c, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35840,15 +37604,17 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] -pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fmsub: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = fmaf64(fmsub, extractb, extractc); +pub fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf64(fmsub, extractb, extractc); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -35858,16 +37624,18 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] -pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fmsub: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = fmaf64(extracta, extractb, extractc); +pub fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -35877,15 +37645,17 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub))] -pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fmsub: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc = -fmsub; - fmsub = fmaf64(extracta, extractb, extractc); +pub fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { + let mut fmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(c, 0, fmsub) } - simd_insert!(c, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
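As the bodies above show, fmsub is lowered to the fma primitive by negating the addend, so lane 0 computes fma(a, b, -c), i.e. a * b - c with a single rounding. A one-line plain-Rust equivalent (illustration only):

fn fmsub_via_fma(a: f64, b: f64, c: f64) -> f64 {
    a.mul_add(b, -c) // a * b - c, single rounding, as the diff does via fmaf64
}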
@@ -35895,15 +37665,17 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] -pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fnmadd: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fnmadd = fmaf32(extracta, extractb, extractc); +pub fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fnmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35913,16 +37685,18 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] -pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fnmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fnmadd = fmaf32(extracta, extractb, extractc); +pub fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fnmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
@@ -35932,15 +37706,17 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] -pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fnmadd: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - fnmadd = fmaf32(extracta, extractb, fnmadd); +pub fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fnmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + fnmadd = fmaf32(extracta, extractb, fnmadd); + } + simd_insert!(c, 0, fnmadd) } - simd_insert!(c, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35950,15 +37726,17 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] -pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fnmadd: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fnmadd = fmaf64(extracta, extractb, extractc); +pub fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fnmadd: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fnmadd = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -35968,16 +37746,18 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] -pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fnmadd: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fnmadd = fmaf64(extracta, extractb, extractc); +pub fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fnmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fnmadd = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -35987,15 +37767,17 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd))] -pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fnmadd: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - fnmadd = fmaf64(extracta, extractb, fnmadd); +pub fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { + let mut fnmadd: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + fnmadd = fmaf64(extracta, extractb, fnmadd); + } + simd_insert!(c, 0, fnmadd) } - simd_insert!(c, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
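fnmadd follows the same scheme with the product negated instead of the addend: lane 0 is fma(-a, b, c), i.e. c - a * b with a single rounding. Plain-Rust equivalent (illustration only):

fn fnmadd_via_fma(a: f64, b: f64, c: f64) -> f64 {
    (-a).mul_add(b, c) // c - a * b, single rounding
}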
@@ -36005,16 +37787,18 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] -pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fnmsub: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = fmaf32(extracta, extractb, extractc); +pub fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fnmsub: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -36024,17 +37808,19 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] -pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { - let mut fnmsub: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = fmaf32(extracta, extractb, extractc); +pub fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + let mut fnmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
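fnmsub negates both the product and the addend, so lane 0 is fma(-a, b, -c), i.e. -(a * b) - c with a single rounding. Plain-Rust equivalent (illustration only):

fn fnmsub_via_fma(a: f64, b: f64, c: f64) -> f64 {
    (-a).mul_add(b, -c) // -(a * b) - c, single rounding
}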
@@ -36044,16 +37830,18 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] -pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fnmsub: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc = -fnmsub; - fnmsub = fmaf32(extracta, extractb, extractc); +pub fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + unsafe { + let mut fnmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fnmsub; + fnmsub = fmaf32(extracta, extractb, extractc); + } + simd_insert!(c, 0, fnmsub) } - simd_insert!(c, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -36063,16 +37851,18 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] -pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fnmsub: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = fmaf64(extracta, extractb, extractc); +pub fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fnmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
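A concrete check of the merge behaviour for the sd form above, under the same nightly/unstable-feature assumptions as the earlier sketch; the values are arbitrary and chosen only to make the arithmetic easy to follow.

#[target_feature(enable = "avx512f")]
fn fnmsub_sd_demo() {
    use core::arch::x86_64::*;
    let a = _mm_set_pd(9.0, 1.0); // lane 0 = 1.0
    let b = _mm_set_pd(0.0, 5.0); // lane 0 = 5.0
    let c = _mm_set_pd(0.0, 2.0); // lane 0 = 2.0
    // mask bit set: lane 0 = -(1.0 * 5.0) - 2.0 = -7.0, lane 1 copied from a
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmsub_sd(a, 0b1, b, c)), -7.0);
    // mask bit clear: lane 0 keeps a's value, as the diffed code does
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmsub_sd(a, 0b0, b, c)), 1.0);
}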
@@ -36082,17 +37872,19 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] -pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { - let mut fnmsub: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = fmaf64(extracta, extractb, extractc); +pub fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + let mut fnmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. @@ -36102,16 +37894,18 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub))] -pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fnmsub: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc = -fnmsub; - fnmsub = fmaf64(extracta, extractb, extractc); +pub fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + unsafe { + let mut fnmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fnmsub; + fnmsub = fmaf64(extracta, extractb, extractc); + } + simd_insert!(c, 0, fnmsub) } - simd_insert!(c, 0, fnmsub) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36129,12 +37923,14 @@ pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vaddss(a, b, f32x4::ZERO, 0b1, ROUNDING); - transmute(r) +pub fn _mm_add_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vaddss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src 
when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36152,18 +37948,20 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_add_round_ss( +pub fn _mm_mask_add_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vaddss(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vaddss(a, b, src, k, ROUNDING); + transmute(r) + } } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36181,16 +37979,14 @@ pub unsafe fn _mm_mask_add_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_add_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vaddss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vaddss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36208,12 +38004,14 @@ pub unsafe fn _mm_maskz_add_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vaddsd(a, b, f64x2::ZERO, 0b1, ROUNDING); - transmute(r) +pub fn _mm_add_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vaddsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36231,18 +38029,20 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_add_round_sd( +pub fn _mm_mask_add_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vaddsd(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = 
b.as_f64x2(); + let src = src.as_f64x2(); + let r = vaddsd(a, b, src, k, ROUNDING); + transmute(r) + } } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36260,16 +38060,14 @@ pub unsafe fn _mm_mask_add_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_add_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vaddsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_add_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vaddsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36287,12 +38085,14 @@ pub unsafe fn _mm_maskz_add_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vsubss(a, b, f32x4::ZERO, 0b1, ROUNDING); - transmute(r) +pub fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vsubss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36310,18 +38110,20 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sub_round_ss( +pub fn _mm_mask_sub_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vsubss(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vsubss(a, b, src, k, ROUNDING); + transmute(r) + } } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36339,16 +38141,14 @@ pub unsafe fn _mm_mask_sub_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, 
assert_instr(vsubss, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sub_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vsubss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vsubss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36366,12 +38166,14 @@ pub unsafe fn _mm_maskz_sub_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vsubsd(a, b, f64x2::ZERO, 0b1, ROUNDING); - transmute(r) +pub fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vsubsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36389,18 +38191,20 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sub_round_sd( +pub fn _mm_mask_sub_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vsubsd(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vsubsd(a, b, src, k, ROUNDING); + transmute(r) + } } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36418,16 +38222,14 @@ pub unsafe fn _mm_mask_sub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sub_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vsubsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_sub_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vsubsd(a, b, f64x2::ZERO, k, 
ROUNDING); + transmute(r) + } } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36445,12 +38247,14 @@ pub unsafe fn _mm_maskz_sub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vmulss(a, b, f32x4::ZERO, 0b1, ROUNDING); - transmute(r) +pub fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmulss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36468,18 +38272,20 @@ pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_mul_round_ss( +pub fn _mm_mask_mul_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vmulss(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vmulss(a, b, src, k, ROUNDING); + transmute(r) + } } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36497,16 +38303,14 @@ pub unsafe fn _mm_mask_mul_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_mul_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vmulss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmulss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36524,12 +38328,14 @@ pub unsafe fn _mm_maskz_mul_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vmulsd(a, b, f64x2::ZERO, 0b1, ROUNDING); - transmute(r) +pub fn _mm_mul_round_sd(a: __m128d, b: 
__m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmulsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36547,18 +38353,20 @@ pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_mul_round_sd( +pub fn _mm_mask_mul_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vmulsd(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vmulsd(a, b, src, k, ROUNDING); + transmute(r) + } } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36576,16 +38384,14 @@ pub unsafe fn _mm_mask_mul_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_mul_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vmulsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_mul_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmulsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36603,12 +38409,14 @@ pub unsafe fn _mm_maskz_mul_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vdivss(a, b, f32x4::ZERO, 0b1, ROUNDING); - transmute(r) +pub fn _mm_div_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vdivss(a, b, f32x4::ZERO, 0b1, ROUNDING); + transmute(r) + } } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36626,18 +38434,20 @@ pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128) -> __m #[unstable(feature = 
"stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_div_round_ss( +pub fn _mm_mask_div_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vdivss(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vdivss(a, b, src, k, ROUNDING); + transmute(r) + } } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36655,16 +38465,14 @@ pub unsafe fn _mm_mask_div_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_div_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vdivss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vdivss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36682,12 +38490,14 @@ pub unsafe fn _mm_maskz_div_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vdivsd(a, b, f64x2::ZERO, 0b1, ROUNDING); - transmute(r) +pub fn _mm_div_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vdivsd(a, b, f64x2::ZERO, 0b1, ROUNDING); + transmute(r) + } } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36705,18 +38515,20 @@ pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_div_round_sd( +pub fn _mm_mask_div_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vdivsd(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = 
src.as_f64x2(); + let r = vdivsd(a, b, src, k, ROUNDING); + transmute(r) + } } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36734,16 +38546,14 @@ pub unsafe fn _mm_mask_div_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_div_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vdivsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_div_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vdivsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36755,12 +38565,14 @@ pub unsafe fn _mm_maskz_div_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vmaxss(a, b, f32x4::ZERO, 0b1, SAE); - transmute(r) +pub fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmaxss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36772,18 +38584,20 @@ pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_max_round_ss( +pub fn _mm_mask_max_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vmaxss(a, b, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vmaxss(a, b, src, k, SAE); + transmute(r) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36795,12 +38609,14 @@ pub unsafe fn _mm_mask_max_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxss, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = 
a.as_f32x4(); - let b = b.as_f32x4(); - let r = vmaxss(a, b, f32x4::ZERO, k, SAE); - transmute(r) +pub fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vmaxss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -36812,12 +38628,14 @@ pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vmaxsd(a, b, f64x2::ZERO, 0b1, SAE); - transmute(r) +pub fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmaxsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36829,18 +38647,20 @@ pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_max_round_sd( +pub fn _mm_mask_max_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vmaxsd(a, b, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vmaxsd(a, b, src, k, SAE); + transmute(r) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36852,16 +38672,14 @@ pub unsafe fn _mm_mask_max_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_max_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vmaxsd(a, b, f64x2::ZERO, k, SAE); - transmute(r) +pub fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vmaxsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36873,12 +38691,14 @@ pub unsafe fn _mm_maskz_max_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminss, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn 
_mm_min_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vminss(a, b, f32x4::ZERO, 0b1, SAE); - transmute(r) +pub fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vminss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36890,18 +38710,20 @@ pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminss, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_min_round_ss( +pub fn _mm_mask_min_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vminss(a, b, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vminss(a, b, src, k, SAE); + transmute(r) + } } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36913,12 +38735,14 @@ pub unsafe fn _mm_mask_min_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminss, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vminss(a, b, f32x4::ZERO, k, SAE); - transmute(r) +pub fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vminss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.\ @@ -36930,12 +38754,14 @@ pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminsd, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vminsd(a, b, f64x2::ZERO, 0b1, SAE); - transmute(r) +pub fn _mm_min_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vminsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36947,18 +38773,20 @@ pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d) -> 
__m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminsd, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_min_round_sd( +pub fn _mm_mask_min_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vminsd(a, b, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vminsd(a, b, src, k, SAE); + transmute(r) + } } /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -36970,16 +38798,14 @@ pub unsafe fn _mm_mask_min_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vminsd, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_min_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vminsd(a, b, f64x2::ZERO, k, SAE); - transmute(r) +pub fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vminsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -36997,9 +38823,11 @@ pub unsafe fn _mm_maskz_min_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - vsqrtss(a, b, _mm_setzero_ps(), 0b1, ROUNDING) +pub fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, _mm_setzero_ps(), 0b1, ROUNDING) + } } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37017,14 +38845,16 @@ pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sqrt_round_ss( +pub fn _mm_mask_sqrt_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - vsqrtss(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, src, k, ROUNDING) + } } /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37042,13 +38872,11 @@ pub unsafe fn _mm_mask_sqrt_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] 
#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sqrt_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - vsqrtss(a, b, _mm_setzero_ps(), k, ROUNDING) +pub fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtss(a, b, _mm_setzero_ps(), k, ROUNDING) + } } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -37066,9 +38894,11 @@ pub unsafe fn _mm_maskz_sqrt_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - vsqrtsd(a, b, _mm_setzero_pd(), 0b1, ROUNDING) +pub fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, _mm_setzero_pd(), 0b1, ROUNDING) + } } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37086,14 +38916,16 @@ pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_sqrt_round_sd( +pub fn _mm_mask_sqrt_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - vsqrtsd(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, src, k, ROUNDING) + } } /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37111,13 +38943,15 @@ pub unsafe fn _mm_mask_sqrt_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_sqrt_round_sd( +pub fn _mm_maskz_sqrt_round_sd( k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - vsqrtsd(a, b, _mm_setzero_pd(), k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsd(a, b, _mm_setzero_pd(), k, ROUNDING) + } } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
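A sketch of selecting an explicit rounding mode for the now-safe scalar square root, under the same toolchain assumptions as the sketch above (the wrapper name is illustrative):

use core::arch::x86_64::*;

// Any of the _MM_FROUND_TO_* modes must be OR-ed with _MM_FROUND_NO_EXC;
// _MM_FROUND_CUR_DIRECTION is also accepted by the rounding assertion.
#[target_feature(enable = "avx512f")]
fn sqrt_low_lane_to_nearest(a: __m128, b: __m128) -> __m128 {
    // Square root of b's low lane, rounded to nearest; upper lanes come from a.
    _mm_sqrt_round_ss::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
}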
This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -37129,12 +38963,14 @@ pub unsafe fn _mm_maskz_sqrt_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetexpss(a, b, f32x4::ZERO, 0b1, SAE); - transmute(r) +pub fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetexpss(a, b, f32x4::ZERO, 0b1, SAE); + transmute(r) + } } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -37146,18 +38982,20 @@ pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128) -> __m12 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_getexp_round_ss( +pub fn _mm_mask_getexp_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetexpss(a, b, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetexpss(a, b, src, k, SAE); + transmute(r) + } } /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -37169,16 +39007,14 @@ pub unsafe fn _mm_mask_getexp_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_getexp_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetexpss(a, b, f32x4::ZERO, k, SAE); - transmute(r) +pub fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetexpss(a, b, f32x4::ZERO, k, SAE); + transmute(r) + } } /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -37190,12 +39026,14 @@ pub unsafe fn _mm_maskz_getexp_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetexpsd(a, b, f64x2::ZERO, 0b1, SAE); - transmute(r) +pub fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetexpsd(a, b, f64x2::ZERO, 0b1, SAE); + transmute(r) + } } /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -37207,18 +39045,20 @@ pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_getexp_round_sd( +pub fn _mm_mask_getexp_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetexpsd(a, b, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetexpsd(a, b, src, k, SAE); + transmute(r) + } } /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\ @@ -37230,16 +39070,14 @@ pub unsafe fn _mm_mask_getexp_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_getexp_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetexpsd(a, b, f64x2::ZERO, k, SAE); - transmute(r) +pub fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetexpsd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } } /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
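An illustrative use of the now-safe `_mm_getexp_round_ss` (same assumptions as the earlier sketches; the wrapper name is hypothetical):

use core::arch::x86_64::*;

// The low lane of the result is floor(log2(x)) of b's low lane as an f32,
// e.g. 8.0 gives 3.0; exceptions are suppressed via the SAE constant.
#[target_feature(enable = "avx512f")]
fn exponent_of_low_lane(a: __m128, b: __m128) -> __m128 {
    _mm_getexp_round_ss::<_MM_FROUND_NO_EXC>(a, b)
}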
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -37260,7 +39098,7 @@ pub unsafe fn _mm_maskz_getexp_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(2, 3, 4)] -pub unsafe fn _mm_getmant_round_ss< +pub fn _mm_getmant_round_ss< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -37268,13 +39106,15 @@ pub unsafe fn _mm_getmant_round_ss< a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, 0b1, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, 0b1, SAE); + transmute(r) + } } /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -37295,7 +39135,7 @@ pub unsafe fn _mm_getmant_round_ss< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(4, 5, 6)] -pub unsafe fn _mm_mask_getmant_round_ss< +pub fn _mm_mask_getmant_round_ss< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -37305,14 +39145,16 @@ pub unsafe fn _mm_mask_getmant_round_ss< a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } } /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -37333,7 +39175,7 @@ pub unsafe fn _mm_mask_getmant_round_ss< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(3, 4, 5)] -pub unsafe fn _mm_maskz_getmant_round_ss< +pub fn _mm_maskz_getmant_round_ss< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -37342,13 +39184,15 @@ pub unsafe fn _mm_maskz_getmant_round_ss< a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vgetmantss(a, b, SIGN << 2 | NORM, f32x4::ZERO, k, SAE); + transmute(r) + } } /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -37369,7 +39213,7 @@ pub unsafe fn _mm_maskz_getmant_round_ss< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(2, 3, 4)] -pub unsafe fn _mm_getmant_round_sd< +pub fn _mm_getmant_round_sd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -37377,13 +39221,15 @@ pub unsafe fn _mm_getmant_round_sd< a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, 0b1, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, 0b1, SAE); + transmute(r) + } } /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
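A sketch of the now-safe `_mm_getmant_round_sd` with the mantissa-normalization enums (illustrative, same toolchain assumptions):

use core::arch::x86_64::*;

// Normalize b's low mantissa into [1, 2) while keeping the source sign,
// without raising exceptions; NORM/SIGN come from the _MM_MANTISSA_*_ENUM
// constants already used by these intrinsics.
#[target_feature(enable = "avx512f")]
fn mantissa_of_low_lane(a: __m128d, b: __m128d) -> __m128d {
    _mm_getmant_round_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC, _MM_FROUND_NO_EXC>(a, b)
}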
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -37404,7 +39250,7 @@ pub unsafe fn _mm_getmant_round_sd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(4, 5, 6)] -pub unsafe fn _mm_mask_getmant_round_sd< +pub fn _mm_mask_getmant_round_sd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -37414,14 +39260,16 @@ pub unsafe fn _mm_mask_getmant_round_sd< a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, src, k, SAE); + transmute(r) + } } /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\ @@ -37442,7 +39290,7 @@ pub unsafe fn _mm_mask_getmant_round_sd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))] #[rustc_legacy_const_generics(3, 4, 5)] -pub unsafe fn _mm_maskz_getmant_round_sd< +pub fn _mm_maskz_getmant_round_sd< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -37451,13 +39299,15 @@ pub unsafe fn _mm_maskz_getmant_round_sd< a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, k, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vgetmantsd(a, b, SIGN << 2 | NORM, f64x2::ZERO, k, SAE); + transmute(r) + } } /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37475,16 +39325,15 @@ pub unsafe fn _mm_maskz_getmant_round_sd< #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_roundscale_round_ss( - a: __m128, - b: __m128, -) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vrndscaless(a, b, f32x4::ZERO, 0b11111111, IMM8, SAE); - transmute(r) +pub fn _mm_roundscale_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + 
static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless(a, b, f32x4::ZERO, 0b11111111, IMM8, SAE); + transmute(r) + } } /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37502,19 +39351,21 @@ pub unsafe fn _mm_roundscale_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_roundscale_round_ss( +pub fn _mm_mask_roundscale_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vrndscaless(a, b, src, k, IMM8, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vrndscaless(a, b, src, k, IMM8, SAE); + transmute(r) + } } /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37532,17 +39383,19 @@ pub unsafe fn _mm_mask_roundscale_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_maskz_roundscale_round_ss( +pub fn _mm_maskz_roundscale_round_ss( k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vrndscaless(a, b, f32x4::ZERO, k, IMM8, SAE); + transmute(r) + } } /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -37560,16 +39413,15 @@ pub unsafe fn _mm_maskz_roundscale_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_roundscale_round_sd( - a: __m128d, - b: __m128d, -) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vrndscalesd(a, b, f64x2::ZERO, 0b11111111, IMM8, SAE); - transmute(r) +pub fn _mm_roundscale_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd(a, b, f64x2::ZERO, 0b11111111, IMM8, SAE); + transmute(r) + } } /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by 
imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37587,19 +39439,21 @@ pub unsafe fn _mm_roundscale_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_roundscale_round_sd( +pub fn _mm_mask_roundscale_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vrndscalesd(a, b, src, k, IMM8, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vrndscalesd(a, b, src, k, IMM8, SAE); + transmute(r) + } } /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37617,17 +39471,19 @@ pub unsafe fn _mm_mask_roundscale_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_maskz_roundscale_round_sd( +pub fn _mm_maskz_roundscale_round_sd( k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, SAE); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vrndscalesd(a, b, f64x2::ZERO, k, IMM8, SAE); + transmute(r) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37645,12 +39501,14 @@ pub unsafe fn _mm_maskz_roundscale_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vscalefss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); - transmute(r) +pub fn _mm_scalef_round_ss(a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vscalefss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); + transmute(r) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37668,18 +39526,20 @@ pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe 
fn _mm_mask_scalef_round_ss( +pub fn _mm_mask_scalef_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let src = src.as_f32x4(); - let r = vscalefss(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let src = src.as_f32x4(); + let r = vscalefss(a, b, src, k, ROUNDING); + transmute(r) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37697,16 +39557,14 @@ pub unsafe fn _mm_mask_scalef_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_scalef_round_ss( - k: __mmask8, - a: __m128, - b: __m128, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let r = vscalefss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_scalef_round_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let r = vscalefss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -37724,12 +39582,14 @@ pub unsafe fn _mm_maskz_scalef_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vscalefsd(a, b, f64x2::ZERO, 0b11111111, ROUNDING); - transmute(r) +pub fn _mm_scalef_round_sd(a: __m128d, b: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vscalefsd(a, b, f64x2::ZERO, 0b11111111, ROUNDING); + transmute(r) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37747,17 +39607,19 @@ pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_scalef_round_sd( +pub fn _mm_mask_scalef_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let src = src.as_f64x2(); - let r = vscalefsd(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let src = src.as_f64x2(); + let r = vscalefsd(a, b, src, k, ROUNDING); + transmute(r) + } } /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask 
bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37775,16 +39637,18 @@ pub unsafe fn _mm_mask_scalef_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_scalef_round_sd( +pub fn _mm_maskz_scalef_round_sd( k: __mmask8, a: __m128d, b: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let r = vscalefsd(a, b, f64x2::ZERO, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let r = vscalefsd(a, b, f64x2::ZERO, k, ROUNDING); + transmute(r) + } } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37802,13 +39666,15 @@ pub unsafe fn _mm_maskz_scalef_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let r = vfmaddssround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, r) +pub fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let r = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, r) + } } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37826,20 +39692,22 @@ pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fmadd_round_ss( +pub fn _mm_mask_fmadd_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fmadd: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmaddssround(fmadd, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = vfmaddssround(fmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
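A sketch of the now-safe `_mm_scalef_round_sd` (illustrative, same assumptions):

use core::arch::x86_64::*;

// The low lane becomes a[0] * 2^floor(b[0]); e.g. 3.0 and 4.0 give 48.0.
// Here the result is rounded to nearest with exceptions suppressed.
#[target_feature(enable = "avx512f")]
fn scale_low_lane(a: __m128d, b: __m128d) -> __m128d {
    _mm_scalef_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
}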
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -37857,21 +39725,23 @@ pub unsafe fn _mm_mask_fmadd_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fmadd_round_ss( +pub fn _mm_maskz_fmadd_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -37889,20 +39759,22 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask3_fmadd_round_ss( +pub fn _mm_mask3_fmadd_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fmadd: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - fmadd = vfmaddssround(extracta, extractb, fmadd, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + fmadd = vfmaddssround(extracta, extractb, fmadd, ROUNDING); + } + simd_insert!(c, 0, fmadd) } - simd_insert!(c, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
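A sketch of how the writemask behaves on the now-safe scalar fused multiply-add (illustrative, same assumptions):

use core::arch::x86_64::*;

// The writemask only governs the low lane: with k's bit 0 clear the FMA is
// skipped and the low lane falls back to a's low lane (for the mask3 variant
// it would fall back to c's low lane instead).
#[target_feature(enable = "avx512f")]
fn fma_low_lane_if(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    _mm_mask_fmadd_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}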
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -37920,17 +39792,15 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fmadd_round_sd( - a: __m128d, - b: __m128d, - c: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fmadd) +pub fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmadd) + } } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37948,20 +39818,22 @@ pub unsafe fn _mm_fmadd_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fmadd_round_sd( +pub fn _mm_mask_fmadd_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fmadd: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmaddsdround(fmadd, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fmadd = vfmaddsdround(fmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -37979,21 +39851,23 @@ pub unsafe fn _mm_mask_fmadd_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fmadd_round_sd( +pub fn _mm_maskz_fmadd_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fmadd: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -38011,20 +39885,22 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask3_fmadd_round_sd( +pub fn _mm_mask3_fmadd_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fmadd: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - fmadd = vfmaddsdround(extracta, extractb, fmadd, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + fmadd = vfmaddsdround(extracta, extractb, fmadd, ROUNDING); + } + simd_insert!(c, 0, fmadd) } - simd_insert!(c, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38042,14 +39918,16 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - let fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fmsub) +pub fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + let fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmsub) + } } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38067,21 +39945,23 @@ pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fmsub_round_ss( +pub fn _mm_mask_fmsub_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fmsub: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = vfmaddssround(fmsub, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddssround(fmsub, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38099,22 +39979,24 @@ pub unsafe fn _mm_mask_fmsub_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fmsub_round_ss( +pub fn _mm_maskz_fmsub_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fmsub: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -38132,21 +40014,23 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask3_fmsub_round_ss( +pub fn _mm_mask3_fmsub_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fmsub: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extractb: f32 = simd_extract!(b, 0); - let extractc = -fmsub; - fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fmsub) } - simd_insert!(c, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
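For orientation, a sketch contrasting the scalar FMA sign conventions via the now-safe `_mm_fmsub_round_ss` (illustrative, same assumptions):

use core::arch::x86_64::*;

// The four shapes differ only in signs on the low lane:
// fmadd = a*b + c, fmsub = a*b - c, fnmadd = -(a*b) + c, fnmsub = -(a*b) - c.
#[target_feature(enable = "avx512f")]
fn mul_sub_low_lane(a: __m128, b: __m128, c: __m128) -> __m128 {
    _mm_fmsub_round_ss::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b, c)
}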
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -38164,18 +40048,16 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fmsub_round_sd( - a: __m128d, - b: __m128d, - c: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - let fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fmsub) +pub fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + let fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fmsub) + } } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38193,21 +40075,23 @@ pub unsafe fn _mm_fmsub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fmsub_round_sd( +pub fn _mm_mask_fmsub_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fmsub: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = vfmaddsdround(fmsub, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddsdround(fmsub, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38225,22 +40109,24 @@ pub unsafe fn _mm_mask_fmsub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fmsub_round_sd( +pub fn _mm_maskz_fmsub_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fmsub: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -38258,21 +40144,23 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask3_fmsub_round_sd( +pub fn _mm_mask3_fmsub_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fmsub: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extractb: f64 = simd_extract!(b, 0); - let extractc = -fmsub; - fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fmsub; + fmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fmsub) } - simd_insert!(c, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38290,14 +40178,16 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fnmadd) +pub fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fnmadd) + } } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38315,21 +40205,23 @@ pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fnmadd_round_ss( +pub fn _mm_mask_fnmadd_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38347,22 +40239,24 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fnmadd_round_ss( +pub fn _mm_maskz_fnmadd_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + fnmadd = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -38380,21 +40274,23 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask3_fnmadd_round_ss( +pub fn _mm_mask3_fnmadd_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - fnmadd = vfmaddssround(extracta, extractb, fnmadd, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + fnmadd = vfmaddssround(extracta, extractb, fnmadd, ROUNDING); + } + simd_insert!(c, 0, fnmadd) } - simd_insert!(c, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
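A sketch of the zero-masked variant, now safe (illustrative, same assumptions):

use core::arch::x86_64::*;

// The maskz variant zeroes the low lane when bit 0 of k is clear, rather than
// falling back to a source operand; the upper lanes are still copied from a.
#[target_feature(enable = "avx512f")]
fn neg_mul_add_or_zero(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    // Low lane: -(a[0] * b[0]) + c[0] when k[0] == 1, else 0.0.
    _mm_maskz_fnmadd_round_ss::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}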
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -38412,18 +40308,16 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fnmadd_round_sd( - a: __m128d, - b: __m128d, - c: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fnmadd) +pub fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fnmadd) + } } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38441,21 +40335,23 @@ pub unsafe fn _mm_fnmadd_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fnmadd_round_sd( +pub fn _mm_mask_fnmadd_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmadd; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmadd; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38473,22 +40369,24 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fnmadd_round_sd( +pub fn _mm_maskz_fnmadd_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + fnmadd = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -38506,21 +40404,23 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask3_fnmadd_round_sd( +pub fn _mm_mask3_fnmadd_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - fnmadd = vfmaddsdround(extracta, extractb, fnmadd, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + fnmadd = vfmaddsdround(extracta, extractb, fnmadd, ROUNDING); + } + simd_insert!(c, 0, fnmadd) } - simd_insert!(c, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38538,15 +40438,17 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { - static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - let fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fnmsub) +pub fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: 
__m128) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + let fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fnmsub) + } } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38564,22 +40466,24 @@ pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fnmsub_round_ss( +pub fn _mm_mask_fnmsub_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f32 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f32 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38597,23 +40501,25 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fnmsub_round_ss( +pub fn _mm_maskz_fnmsub_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f32 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc: f32 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -38631,22 +40537,24 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask3_fnmsub_round_ss( +pub fn _mm_mask3_fnmsub_round_ss( a: __m128, b: __m128, c: __m128, k: __mmask8, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f32 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f32 = simd_extract!(b, 0); - let extractc = -fnmsub; - fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f32 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f32 = simd_extract!(b, 0); + let extractc = -fnmsub; + fnmsub = vfmaddssround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fnmsub) } - simd_insert!(c, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -38664,19 +40572,17 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fnmsub_round_sd( - a: __m128d, - b: __m128d, - c: __m128d, -) -> __m128d { - static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - let fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, fnmsub) +pub fn _mm_fnmsub_round_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + let fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, fnmsub) + } } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38694,22 +40600,24 @@ pub unsafe fn _mm_fnmsub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fnmsub_round_sd( +pub fn _mm_mask_fnmsub_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f64 = simd_extract!(a, 0); - if (k & 0b00000001) != 0 { - let extracta = -fnmsub; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f64 = simd_extract!(a, 0); + if (k & 0b00000001) != 0 { + let extracta = -fnmsub; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38727,23 +40635,25 @@ pub unsafe fn _mm_mask_fnmsub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fnmsub_round_sd( +pub fn _mm_maskz_fnmsub_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128d, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f64 = 0.; - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc: f64 = simd_extract!(c, 0); - let extractc = -extractc; - fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); + let extractc = -extractc; + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -38761,22 +40671,24 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask3_fnmsub_round_sd( +pub fn _mm_mask3_fnmsub_round_sd( a: __m128d, b: __m128d, c: __m128d, k: __mmask8, ) -> __m128d { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f64 = simd_extract!(c, 0); - if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract!(a, 0); - let extracta = -extracta; - let extractb: f64 = simd_extract!(b, 0); - let extractc = -fnmsub; - fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f64 = simd_extract!(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract!(a, 0); + let extracta = -extracta; + let extractb: f64 = simd_extract!(b, 0); + let extractc = -fnmsub; + fnmsub = vfmaddsdround(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(c, 0, fnmsub) } - simd_insert!(c, 0, fnmsub) } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. @@ -38787,15 +40699,17 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fixupimm_ss(a: __m128, b: __m128, c: __m128i) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmss(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) +pub fn _mm_fixupimm_ss(a: __m128, b: __m128, c: __m128i) -> __m128 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. 
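With the signatures above now safe, a caller that itself enables `avx512f` can use these rounding FMA intrinsics without an `unsafe` block. A minimal sketch, assuming a nightly toolchain with `#![feature(stdarch_x86_avx512)]`; the helper name is hypothetical:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn fnmadd_lower(a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    // Lane 0 computes -(a[0] * b[0]) + c[0] with exceptions suppressed;
    // the upper lane is copied from `a`, as documented for this intrinsic.
    _mm_fnmadd_round_sd::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c)
}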
@@ -38806,20 +40720,22 @@ pub unsafe fn _mm_fixupimm_ss(a: __m128, b: __m128, c: __m128i) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fixupimm_ss( +pub fn _mm_mask_fixupimm_ss( a: __m128, k: __mmask8, b: __m128, c: __m128i, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let fixupimm = vfixupimmss(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let fixupimm = vfixupimmss(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. @@ -38830,20 +40746,22 @@ pub unsafe fn _mm_mask_fixupimm_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fixupimm_ss( +pub fn _mm_maskz_fixupimm_ss( k: __mmask8, a: __m128, b: __m128, c: __m128i, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let fixupimm = vfixupimmssz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let fixupimm = vfixupimmssz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f32 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. 
@@ -38854,15 +40772,17 @@ pub unsafe fn _mm_maskz_fixupimm_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_fixupimm_sd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let fixupimm = vfixupimmsd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) +pub fn _mm_fixupimm_sd(a: __m128d, b: __m128d, c: __m128i) -> __m128d { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. @@ -38873,20 +40793,22 @@ pub unsafe fn _mm_fixupimm_sd(a: __m128d, b: __m128d, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_fixupimm_sd( +pub fn _mm_mask_fixupimm_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128i, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let fixupimm = vfixupimmsd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting. 
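A minimal calling sketch for the scalar fixup intrinsics above, under the same nightly assumptions; the helper name and the all-zero fixup table are illustrative only (with an all-zero table each input class selects the pass-through token, so the lower element is left unchanged):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn fixup_lower_sd(a: __m128d, b: __m128d) -> __m128d {
    // The per-class fixup table is passed as an integer vector; IMM8 = 0
    // requests no additional exception-flag reporting.
    let table = _mm_set1_epi64x(0);
    _mm_fixupimm_sd::<0>(a, b, table)
}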
@@ -38897,20 +40819,22 @@ pub unsafe fn _mm_mask_fixupimm_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_maskz_fixupimm_sd( +pub fn _mm_maskz_fixupimm_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128i, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let fixupimm = vfixupimmsdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract!(fixupimm, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let fixupimm = vfixupimmsdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ @@ -38922,20 +40846,22 @@ pub unsafe fn _mm_maskz_fixupimm_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_fixupimm_round_ss( +pub fn _mm_fixupimm_round_ss( a: __m128, b: __m128, c: __m128i, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmss(a, b, c, IMM8, 0b11111111, SAE); - let fixupimm: f32 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, 0b11111111, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
imm8 is used to set the required flags reporting.\ @@ -38947,21 +40873,23 @@ pub unsafe fn _mm_fixupimm_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_fixupimm_round_ss( +pub fn _mm_mask_fixupimm_round_ss( a: __m128, k: __mmask8, b: __m128, c: __m128i, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmss(a, b, c, IMM8, k, SAE); - let fixupimm: f32 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmss(a, b, c, IMM8, k, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\ @@ -38973,21 +40901,23 @@ pub unsafe fn _mm_mask_fixupimm_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_maskz_fixupimm_round_ss( +pub fn _mm_maskz_fixupimm_round_ss( k: __mmask8, a: __m128, b: __m128, c: __m128i, ) -> __m128 { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - let c = c.as_i32x4(); - let r = vfixupimmssz(a, b, c, IMM8, k, SAE); - let fixupimm: f32 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + let c = c.as_i32x4(); + let r = vfixupimmssz(a, b, c, IMM8, k, SAE); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
imm8 is used to set the required flags reporting.\ @@ -38999,20 +40929,22 @@ pub unsafe fn _mm_maskz_fixupimm_round_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] -pub unsafe fn _mm_fixupimm_round_sd( +pub fn _mm_fixupimm_round_sd( a: __m128d, b: __m128d, c: __m128i, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmsd(a, b, c, IMM8, 0b11111111, SAE); - let fixupimm: f64 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsd(a, b, c, IMM8, 0b11111111, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\ @@ -39024,21 +40956,23 @@ pub unsafe fn _mm_fixupimm_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_mask_fixupimm_round_sd( +pub fn _mm_mask_fixupimm_round_sd( a: __m128d, k: __mmask8, b: __m128d, c: __m128i, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmsd(a, b, c, IMM8, k, SAE); - let fixupimm: f64 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsd(a, b, c, IMM8, k, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
imm8 is used to set the required flags reporting.\ @@ -39050,21 +40984,23 @@ pub unsafe fn _mm_mask_fixupimm_round_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] -pub unsafe fn _mm_maskz_fixupimm_round_sd( +pub fn _mm_maskz_fixupimm_round_sd( k: __mmask8, a: __m128d, b: __m128d, c: __m128i, ) -> __m128d { - static_assert_uimm_bits!(IMM8, 8); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - let c = c.as_i64x2(); - let r = vfixupimmsdz(a, b, c, IMM8, k, SAE); - let fixupimm: f64 = simd_extract!(r, 0); - let r = simd_insert!(a, 0, fixupimm); - transmute(r) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + let c = c.as_i64x2(); + let r = vfixupimmsdz(a, b, c, IMM8, k, SAE); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); + transmute(r) + } } /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -39074,31 +41010,35 @@ pub unsafe fn _mm_maskz_fixupimm_round_sd( #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2sd))] -pub unsafe fn _mm_mask_cvtss_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128) -> __m128d { - transmute(vcvtss2sd( - a.as_f64x2(), - b.as_f32x4(), - src.as_f64x2(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_mask_cvtss_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + transmute(vcvtss2sd( + a.as_f64x2(), + b.as_f32x4(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } -/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvtss_sd&expand=1897) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2sd))] -pub unsafe fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d { - transmute(vcvtss2sd( - a.as_f64x2(), - b.as_f32x4(), - f64x2::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + transmute(vcvtss2sd( + a.as_f64x2(), + b.as_f32x4(), + f64x2::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -39108,14 +41048,16 @@ pub unsafe fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2ss))] -pub unsafe fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) -> __m128 { - transmute(vcvtsd2ss( - a.as_f32x4(), - b.as_f64x2(), - src.as_f32x4(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + transmute(vcvtsd2ss( + a.as_f32x4(), + b.as_f64x2(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -39125,75 +41067,79 @@ pub unsafe fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2ss))] -pub unsafe fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { - transmute(vcvtsd2ss( - a.as_f32x4(), - b.as_f64x2(), - f32x4::ZERO, - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + transmute(vcvtsd2ss( + a.as_f32x4(), + b.as_f64x2(), + f32x4::ZERO, + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_cvt_roundss_sd&expand=1371) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundss_sd(a: __m128d, b: __m128) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f32x4(); - let r = vcvtss2sd(a, b, f64x2::ZERO, 0b11111111, SAE); - transmute(r) +pub fn _mm_cvt_roundss_sd(a: __m128d, b: __m128) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let r = vcvtss2sd(a, b, f64x2::ZERO, 0b11111111, SAE); + transmute(r) + } } /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. -/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mask_cvt_roundss_sd&expand=1372) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_cvt_roundss_sd( +pub fn _mm_mask_cvt_roundss_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128, ) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f32x4(); - let src = src.as_f64x2(); - let r = vcvtss2sd(a, b, src, k, SAE); - transmute(r) + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let src = src.as_f64x2(); + let r = vcvtss2sd(a, b, src, k, SAE); + transmute(r) + } } /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
-/// +/// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_maskz_cvt_roundss_sd&expand=1373) #[inline] #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_cvt_roundss_sd( - k: __mmask8, - a: __m128d, - b: __m128, -) -> __m128d { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f32x4(); - let r = vcvtss2sd(a, b, f64x2::ZERO, k, SAE); - transmute(r) +pub fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f32x4(); + let r = vcvtss2sd(a, b, f64x2::ZERO, k, SAE); + transmute(r) + } } /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -39210,12 +41156,14 @@ pub unsafe fn _mm_maskz_cvt_roundss_sd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f64x2(); - let r = vcvtsd2ss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let r = vcvtsd2ss(a, b, f32x4::ZERO, 0b11111111, ROUNDING); + transmute(r) + } } /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -39232,18 +41180,20 @@ pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d) -> #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_cvt_roundsd_ss( +pub fn _mm_mask_cvt_roundsd_ss( src: __m128, k: __mmask8, a: __m128, b: __m128d, ) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = b.as_f64x2(); - let src = src.as_f32x4(); - let r = vcvtsd2ss(a, b, src, k, ROUNDING); - transmute(r) + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let src = src.as_f32x4(); + let r = vcvtsd2ss(a, b, src, k, ROUNDING); + transmute(r) + } } /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -39260,16 +41210,14 @@ pub unsafe fn _mm_mask_cvt_roundsd_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_cvt_roundsd_ss( - k: __mmask8, - a: __m128, - b: __m128d, -) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let b = 
b.as_f64x2(); - let r = vcvtsd2ss(a, b, f32x4::ZERO, k, ROUNDING); - transmute(r) +pub fn _mm_maskz_cvt_roundsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let b = b.as_f64x2(); + let r = vcvtsd2ss(a, b, f32x4::ZERO, k, ROUNDING); + transmute(r) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -39286,10 +41234,12 @@ pub unsafe fn _mm_maskz_cvt_roundsd_ss( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundss_si32(a: __m128) -> i32 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2si(a, ROUNDING) +pub fn _mm_cvt_roundss_si32(a: __m128) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si(a, ROUNDING) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -39306,10 +41256,12 @@ pub unsafe fn _mm_cvt_roundss_si32(a: __m128) -> i32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundss_i32(a: __m128) -> i32 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2si(a, ROUNDING) +pub fn _mm_cvt_roundss_i32(a: __m128) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si(a, ROUNDING) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ @@ -39326,10 +41278,12 @@ pub unsafe fn _mm_cvt_roundss_i32(a: __m128) -> i32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundss_u32(a: __m128) -> u32 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2usi(a, ROUNDING) +pub fn _mm_cvt_roundss_u32(a: __m128) -> u32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2usi(a, ROUNDING) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst. @@ -39339,8 +41293,8 @@ pub unsafe fn _mm_cvt_roundss_u32(a: __m128) -> u32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2si))] -pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 { - vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtss_i32(a: __m128) -> i32 { + unsafe { vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. 
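The scalar float-to-integer conversions above follow the same pattern; a sketch assuming nightly with `#![feature(stdarch_x86_avx512)]`, with a hypothetical helper name:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn lower_lane_to_ints(a: __m128) -> (i32, u32) {
    // Signed conversion with an explicit round-toward-zero mode, plus the
    // unsigned conversion using the current rounding direction.
    let i = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
    let u = _mm_cvtss_u32(a);
    (i, u)
}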
@@ -39350,8 +41304,8 @@ pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2usi))] -pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { - vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtss_u32(a: __m128) -> u32 { + unsafe { vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -39368,10 +41322,12 @@ pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - vcvtsd2si(a, ROUNDING) +pub fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si(a, ROUNDING) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\ @@ -39388,10 +41344,12 @@ pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d) -> i32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - vcvtsd2si(a, ROUNDING) +pub fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si(a, ROUNDING) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\ @@ -39408,10 +41366,12 @@ pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d) -> i32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - vcvtsd2usi(a, ROUNDING) +pub fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2usi(a, ROUNDING) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst. @@ -39421,8 +41381,8 @@ pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d) -> u32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2si))] -pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 { - vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtsd_i32(a: __m128d) -> i32 { + unsafe { vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst. 
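The double-precision direction works analogously; a sketch under the same assumptions, helper name hypothetical:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn lower_sd_to_u32_ceil(a: __m128d) -> u32 {
    // Round toward positive infinity while converting lane 0 to unsigned.
    _mm_cvt_roundsd_u32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a)
}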
@@ -39432,8 +41392,8 @@ pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] -pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 { - vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtsd_u32(a: __m128d) -> u32 { + unsafe { vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -39451,11 +41411,13 @@ pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtsi2ss(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundi32_ss(a: __m128, b: i32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss(a, b, ROUNDING); + transmute(r) + } } /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -39473,11 +41435,13 @@ pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtsi2ss(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundsi32_ss(a: __m128, b: i32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss(a, b, ROUNDING); + transmute(r) + } } /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -39494,11 +41458,13 @@ pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtusi2ss(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtusi2ss(a, b, ROUNDING); + transmute(r) + } } /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
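For the integer-to-float direction, a sketch under the same assumptions (nightly toolchain, hypothetical helper name):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn write_int_into_lane0(a: __m128, v: i32) -> __m128 {
    // Lane 0 becomes `v` converted to f32, rounding toward negative infinity;
    // the upper three lanes are copied from `a`.
    _mm_cvt_roundsi32_ss::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, v)
}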
@@ -39508,9 +41474,11 @@ pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2ss))] -pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { - let b = b as f32; - simd_insert!(a, 0, b) +pub fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } } /// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -39520,9 +41488,11 @@ pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2sd))] -pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { - let b = b as f64; - simd_insert!(a, 0, b) +pub fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -39534,10 +41504,12 @@ pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - vcvttss2si(a, SAE) +pub fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si(a, SAE) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -39549,10 +41521,12 @@ pub unsafe fn _mm_cvtt_roundss_si32(a: __m128) -> i32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - vcvttss2si(a, SAE) +pub fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si(a, SAE) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ @@ -39564,10 +41538,12 @@ pub unsafe fn _mm_cvtt_roundss_i32(a: __m128) -> i32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - vcvttss2usi(a, SAE) +pub fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2usi(a, SAE) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. 
@@ -39577,8 +41553,8 @@ pub unsafe fn _mm_cvtt_roundss_u32(a: __m128) -> u32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2si))] -pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 { - vcvttss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttss_i32(a: __m128) -> i32 { + unsafe { vcvttss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. @@ -39588,8 +41564,8 @@ pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2usi))] -pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { - vcvttss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttss_u32(a: __m128) -> u32 { + unsafe { vcvttss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -39601,10 +41577,12 @@ pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - vcvttsd2si(a, SAE) +pub fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si(a, SAE) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -39616,10 +41594,12 @@ pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d) -> i32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - vcvttsd2si(a, SAE) +pub fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si(a, SAE) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\ @@ -39631,10 +41611,12 @@ pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d) -> i32 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - vcvttsd2usi(a, SAE) +pub fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2usi(a, SAE) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst. 
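A sketch of the truncating conversions above (nightly assumed, helper name hypothetical):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn truncate_lower_sd(a: __m128d) -> i32 {
    // Truncating conversion of lane 0 with floating-point exceptions suppressed.
    _mm_cvtt_roundsd_si32::<{ _MM_FROUND_NO_EXC }>(a)
}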
@@ -39644,8 +41626,8 @@ pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d) -> u32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2si))] -pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 { - vcvttsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttsd_i32(a: __m128d) -> i32 { + unsafe { vcvttsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst. @@ -39655,8 +41637,8 @@ pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2usi))] -pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { - vcvttsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttsd_u32(a: __m128d) -> u32 { + unsafe { vcvttsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -39666,9 +41648,11 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtusi2ss))] -pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { - let b = b as f32; - simd_insert!(a, 0, b) +pub fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } } /// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
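A round-trip sketch combining the truncating and widening scalar conversions (assumptions as above; helper name hypothetical):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
fn roundtrip_lane0_u32(a: __m128) -> __m128 {
    // Truncate lane 0 to an unsigned integer, then write it back as f32,
    // keeping the upper three lanes of `a`.
    let u = _mm_cvttss_u32(a);
    _mm_cvtu32_ss(a, u)
}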
@@ -39678,9 +41662,11 @@ pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtusi2sd))] -pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { - let b = b as f64; - simd_insert!(a, 0, b) +pub fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } } /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ @@ -39692,12 +41678,14 @@ pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomiss #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_comi_round_ss(a: __m128, b: __m128) -> i32 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let a = a.as_f32x4(); - let b = b.as_f32x4(); - vcomiss(a, b, IMM5, SAE) +pub fn _mm_comi_round_ss(a: __m128, b: __m128) -> i32 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let a = a.as_f32x4(); + let b = b.as_f32x4(); + vcomiss(a, b, IMM5, SAE) + } } /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ @@ -39709,12 +41697,14 @@ pub unsafe fn _mm_comi_round_ss(a: __m128, b: _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] //should be vcomisd #[rustc_legacy_const_generics(2, 3)] -pub unsafe fn _mm_comi_round_sd(a: __m128d, b: __m128d) -> i32 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_mantissas_sae!(SAE); - let a = a.as_f64x2(); - let b = b.as_f64x2(); - vcomisd(a, b, IMM5, SAE) +pub fn _mm_comi_round_sd(a: __m128d, b: __m128d) -> i32 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_mantissas_sae!(SAE); + let a = a.as_f64x2(); + let b = b.as_f64x2(); + vcomisd(a, b, IMM5, SAE) + } } /// Equal diff --git a/crates/core_arch/src/x86/avx512fp16.rs b/crates/core_arch/src/x86/avx512fp16.rs index 20dace5e9c..e9dc88f87f 100644 --- a/crates/core_arch/src/x86/avx512fp16.rs +++ b/crates/core_arch/src/x86/avx512fp16.rs @@ -9,7 +9,7 @@ use crate::ptr; #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_set_ph( +pub fn _mm_set_ph( e7: f16, e6: f16, e5: f16, @@ -28,7 +28,7 @@ pub unsafe fn _mm_set_ph( #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_set_ph( +pub fn _mm256_set_ph( e15: f16, e14: f16, e13: f16, @@ -57,7 +57,7 @@ pub unsafe fn _mm256_set_ph( #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_set_ph( +pub fn _mm512_set_ph( e31: f16, e30: f16, e29: f16, @@ -104,7 +104,7 @@ pub unsafe fn _mm512_set_ph( #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_set_sh(a: f16) -> __m128h { +pub fn _mm_set_sh(a: f16) -> __m128h { __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) } @@ -114,8 +114,8 @@ pub unsafe fn _mm_set_sh(a: f16) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16")] 
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_set1_ph(a: f16) -> __m128h { - transmute(f16x8::splat(a)) +pub fn _mm_set1_ph(a: f16) -> __m128h { + unsafe { transmute(f16x8::splat(a)) } } /// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. @@ -124,8 +124,8 @@ pub unsafe fn _mm_set1_ph(a: f16) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_set1_ph(a: f16) -> __m256h { - transmute(f16x16::splat(a)) +pub fn _mm256_set1_ph(a: f16) -> __m256h { + unsafe { transmute(f16x16::splat(a)) } } /// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst. @@ -134,8 +134,8 @@ pub unsafe fn _mm256_set1_ph(a: f16) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_set1_ph(a: f16) -> __m512h { - transmute(f16x32::splat(a)) +pub fn _mm512_set1_ph(a: f16) -> __m512h { + unsafe { transmute(f16x32::splat(a)) } } /// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order. @@ -144,7 +144,7 @@ pub unsafe fn _mm512_set1_ph(a: f16) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_setr_ph( +pub fn _mm_setr_ph( e0: f16, e1: f16, e2: f16, @@ -163,7 +163,7 @@ pub unsafe fn _mm_setr_ph( #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_setr_ph( +pub fn _mm256_setr_ph( e0: f16, e1: f16, e2: f16, @@ -192,7 +192,7 @@ pub unsafe fn _mm256_setr_ph( #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_setr_ph( +pub fn _mm512_setr_ph( e0: f16, e1: f16, e2: f16, @@ -238,8 +238,8 @@ pub unsafe fn _mm512_setr_ph( #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_setzero_ph() -> __m128h { - transmute(f16x8::ZERO) +pub fn _mm_setzero_ph() -> __m128h { + unsafe { transmute(f16x8::ZERO) } } /// Return vector of type __m256h with all elements set to zero. @@ -248,8 +248,8 @@ pub unsafe fn _mm_setzero_ph() -> __m128h { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_setzero_ph() -> __m256h { - transmute(f16x16::ZERO) +pub fn _mm256_setzero_ph() -> __m256h { + unsafe { transmute(f16x16::ZERO) } } /// Return vector of type __m512h with all elements set to zero. @@ -258,8 +258,8 @@ pub unsafe fn _mm256_setzero_ph() -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_setzero_ph() -> __m512h { - transmute(f16x32::ZERO) +pub fn _mm512_setzero_ph() -> __m512h { + unsafe { transmute(f16x32::ZERO) } } /// Return vector of type `__m128h` with undefined elements. 
In practice, this returns the all-zero @@ -269,8 +269,8 @@ pub unsafe fn _mm512_setzero_ph() -> __m512h { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_undefined_ph() -> __m128h { - transmute(f16x8::ZERO) +pub fn _mm_undefined_ph() -> __m128h { + unsafe { transmute(f16x8::ZERO) } } /// Return vector of type `__m256h` with undefined elements. In practice, this returns the all-zero @@ -280,8 +280,8 @@ pub unsafe fn _mm_undefined_ph() -> __m128h { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_undefined_ph() -> __m256h { - transmute(f16x16::ZERO) +pub fn _mm256_undefined_ph() -> __m256h { + unsafe { transmute(f16x16::ZERO) } } /// Return vector of type `__m512h` with undefined elements. In practice, this returns the all-zero @@ -291,8 +291,8 @@ pub unsafe fn _mm256_undefined_ph() -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_undefined_ph() -> __m512h { - transmute(f16x32::ZERO) +pub fn _mm512_undefined_ph() -> __m512h { + unsafe { transmute(f16x32::ZERO) } } /// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and @@ -302,8 +302,8 @@ pub unsafe fn _mm512_undefined_ph() -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_castpd_ph(a: __m128d) -> __m128h { - transmute(a) +pub fn _mm_castpd_ph(a: __m128d) -> __m128h { + unsafe { transmute(a) } } /// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and @@ -313,8 +313,8 @@ pub unsafe fn _mm_castpd_ph(a: __m128d) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_castpd_ph(a: __m256d) -> __m256h { - transmute(a) +pub fn _mm256_castpd_ph(a: __m256d) -> __m256h { + unsafe { transmute(a) } } /// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and @@ -324,8 +324,8 @@ pub unsafe fn _mm256_castpd_ph(a: __m256d) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castpd_ph(a: __m512d) -> __m512h { - transmute(a) +pub fn _mm512_castpd_ph(a: __m512d) -> __m512h { + unsafe { transmute(a) } } /// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and @@ -335,8 +335,8 @@ pub unsafe fn _mm512_castpd_ph(a: __m512d) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_castph_pd(a: __m128h) -> __m128d { - transmute(a) +pub fn _mm_castph_pd(a: __m128h) -> __m128d { + unsafe { transmute(a) } } /// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and @@ -346,8 +346,8 @@ pub unsafe fn _mm_castph_pd(a: __m128h) -> __m128d { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_castph_pd(a: __m256h) -> __m256d { - transmute(a) +pub fn _mm256_castph_pd(a: __m256h) -> __m256d { + unsafe { transmute(a) } } /// Cast vector of type `__m512h` to type `__m512d`. 
This intrinsic is only used for compilation and @@ -357,8 +357,8 @@ pub unsafe fn _mm256_castph_pd(a: __m256h) -> __m256d { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castph_pd(a: __m512h) -> __m512d { - transmute(a) +pub fn _mm512_castph_pd(a: __m512h) -> __m512d { + unsafe { transmute(a) } } /// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and @@ -368,8 +368,8 @@ pub unsafe fn _mm512_castph_pd(a: __m512h) -> __m512d { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_castps_ph(a: __m128) -> __m128h { - transmute(a) +pub fn _mm_castps_ph(a: __m128) -> __m128h { + unsafe { transmute(a) } } /// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and @@ -379,8 +379,8 @@ pub unsafe fn _mm_castps_ph(a: __m128) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_castps_ph(a: __m256) -> __m256h { - transmute(a) +pub fn _mm256_castps_ph(a: __m256) -> __m256h { + unsafe { transmute(a) } } /// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and @@ -390,8 +390,8 @@ pub unsafe fn _mm256_castps_ph(a: __m256) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castps_ph(a: __m512) -> __m512h { - transmute(a) +pub fn _mm512_castps_ph(a: __m512) -> __m512h { + unsafe { transmute(a) } } /// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and @@ -401,8 +401,8 @@ pub unsafe fn _mm512_castps_ph(a: __m512) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_castph_ps(a: __m128h) -> __m128 { - transmute(a) +pub fn _mm_castph_ps(a: __m128h) -> __m128 { + unsafe { transmute(a) } } /// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and @@ -412,8 +412,8 @@ pub unsafe fn _mm_castph_ps(a: __m128h) -> __m128 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_castph_ps(a: __m256h) -> __m256 { - transmute(a) +pub fn _mm256_castph_ps(a: __m256h) -> __m256 { + unsafe { transmute(a) } } /// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and @@ -423,8 +423,8 @@ pub unsafe fn _mm256_castph_ps(a: __m256h) -> __m256 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castph_ps(a: __m512h) -> __m512 { - transmute(a) +pub fn _mm512_castph_ps(a: __m512h) -> __m512 { + unsafe { transmute(a) } } /// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and @@ -434,8 +434,8 @@ pub unsafe fn _mm512_castph_ps(a: __m512h) -> __m512 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_castsi128_ph(a: __m128i) -> __m128h { - transmute(a) +pub fn _mm_castsi128_ph(a: __m128i) -> __m128h { + unsafe { transmute(a) } } /// Cast vector of type `__m256i` to type `__m256h`. 
This intrinsic is only used for compilation and @@ -445,8 +445,8 @@ pub unsafe fn _mm_castsi128_ph(a: __m128i) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_castsi256_ph(a: __m256i) -> __m256h { - transmute(a) +pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h { + unsafe { transmute(a) } } /// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and @@ -456,8 +456,8 @@ pub unsafe fn _mm256_castsi256_ph(a: __m256i) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castsi512_ph(a: __m512i) -> __m512h { - transmute(a) +pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h { + unsafe { transmute(a) } } /// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and @@ -467,8 +467,8 @@ pub unsafe fn _mm512_castsi512_ph(a: __m512i) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_castph_si128(a: __m128h) -> __m128i { - transmute(a) +pub fn _mm_castph_si128(a: __m128h) -> __m128i { + unsafe { transmute(a) } } /// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and @@ -478,8 +478,8 @@ pub unsafe fn _mm_castph_si128(a: __m128h) -> __m128i { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_castph_si256(a: __m256h) -> __m256i { - transmute(a) +pub fn _mm256_castph_si256(a: __m256h) -> __m256i { + unsafe { transmute(a) } } /// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and @@ -489,8 +489,8 @@ pub unsafe fn _mm256_castph_si256(a: __m256h) -> __m256i { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castph_si512(a: __m512h) -> __m512i { - transmute(a) +pub fn _mm512_castph_si512(a: __m512h) -> __m512i { + unsafe { transmute(a) } } /// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and @@ -500,8 +500,8 @@ pub unsafe fn _mm512_castph_si512(a: __m512h) -> __m512i { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_castph256_ph128(a: __m256h) -> __m128h { - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) +pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } } /// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and @@ -511,8 +511,8 @@ pub unsafe fn _mm256_castph256_ph128(a: __m256h) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castph512_ph128(a: __m512h) -> __m128h { - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) +pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) } } /// Cast vector of type `__m512h` to type `__m256h`. 
This intrinsic is only used for compilation and @@ -522,8 +522,8 @@ pub unsafe fn _mm512_castph512_ph128(a: __m512h) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castph512_ph256(a: __m512h) -> __m256h { - simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) +pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h { + unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) } } /// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined. @@ -534,12 +534,14 @@ pub unsafe fn _mm512_castph512_ph256(a: __m512h) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_castph128_ph256(a: __m128h) -> __m256h { - simd_shuffle!( - a, - _mm_undefined_ph(), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] - ) +pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ph(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] + ) + } } /// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined. @@ -550,15 +552,17 @@ pub unsafe fn _mm256_castph128_ph256(a: __m128h) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castph128_ph512(a: __m128h) -> __m512h { - simd_shuffle!( - a, - _mm_undefined_ph(), - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8 - ] - ) +pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm_undefined_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8 + ] + ) + } } /// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined. @@ -569,15 +573,17 @@ pub unsafe fn _mm512_castph128_ph512(a: __m128h) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_castph256_ph512(a: __m256h) -> __m512h { - simd_shuffle!( - a, - _mm256_undefined_ph(), - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16 - ] - ) +pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm256_undefined_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16 + ] + ) + } } /// Cast vector of type `__m256h` to type `__m128h`. The upper 8 elements of the result are zeroed. @@ -588,12 +594,14 @@ pub unsafe fn _mm512_castph256_ph512(a: __m256h) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_zextph128_ph256(a: __m128h) -> __m256h { - simd_shuffle!( - a, - _mm_setzero_ph(), - [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] - ) +pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h { + unsafe { + simd_shuffle!( + a, + _mm_setzero_ph(), + [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8] + ) + } } /// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed. 
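The `zext` variants above differ from the plain `cast` variants only in what happens to the newly created upper lanes. An illustrative sketch of the distinction, assuming a nightly toolchain with the unstable `stdarch_x86_avx512_f16` feature and an AVX512-FP16 target:

#![feature(stdarch_x86_avx512_f16)]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16")]
fn widen(a: __m128h) -> __m256h {
    // The lower 8 half-precision lanes are copied from `a`, the upper 8 are zeroed.
    // `_mm256_castph128_ph256(a)` would instead leave the upper lanes unspecified.
    _mm256_zextph128_ph256(a)
}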
@@ -604,15 +612,17 @@ pub unsafe fn _mm256_zextph128_ph256(a: __m128h) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h { - simd_shuffle!( - a, - _mm256_setzero_ph(), - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16 - ] - ) +pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm256_setzero_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16 + ] + ) + } } /// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed. @@ -623,15 +633,17 @@ pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_zextph128_ph512(a: __m128h) -> __m512h { - simd_shuffle!( - a, - _mm_setzero_ph(), - [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8 - ] - ) +pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h { + unsafe { + simd_shuffle!( + a, + _mm_setzero_ph(), + [ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8 + ] + ) + } } macro_rules! cmp_asm { // FIXME: use LLVM intrinsics @@ -670,9 +682,11 @@ macro_rules! cmp_asm { // FIXME: use LLVM intrinsics #[target_feature(enable = "avx512fp16,avx512vl")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cmp_ph_mask(a: __m128h, b: __m128h) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask8, xmm_reg, a, b) +pub fn _mm_cmp_ph_mask(a: __m128h, b: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + cmp_asm!(__mmask8, xmm_reg, a, b) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison @@ -684,13 +698,11 @@ pub unsafe fn _mm_cmp_ph_mask(a: __m128h, b: __m128h) -> __mmas #[target_feature(enable = "avx512fp16,avx512vl")] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cmp_ph_mask( - k1: __mmask8, - a: __m128h, - b: __m128h, -) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask8, k1, xmm_reg, a, b) +pub fn _mm_mask_cmp_ph_mask(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + cmp_asm!(__mmask8, k1, xmm_reg, a, b) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison @@ -701,9 +713,11 @@ pub unsafe fn _mm_mask_cmp_ph_mask( #[target_feature(enable = "avx512fp16,avx512vl")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cmp_ph_mask(a: __m256h, b: __m256h) -> __mmask16 { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask16, ymm_reg, a, b) +pub fn _mm256_cmp_ph_mask(a: __m256h, b: __m256h) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + cmp_asm!(__mmask16, ymm_reg, a, b) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison @@ -715,13 +729,15 @@ pub unsafe fn _mm256_cmp_ph_mask(a: __m256h, b: __m256h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] 
#[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cmp_ph_mask( +pub fn _mm256_mask_cmp_ph_mask( k1: __mmask16, a: __m256h, b: __m256h, ) -> __mmask16 { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask16, k1, ymm_reg, a, b) + unsafe { + static_assert_uimm_bits!(IMM5, 5); + cmp_asm!(__mmask16, k1, ymm_reg, a, b) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison @@ -732,9 +748,11 @@ pub unsafe fn _mm256_mask_cmp_ph_mask( #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cmp_ph_mask(a: __m512h, b: __m512h) -> __mmask32 { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask32, zmm_reg, a, b) +pub fn _mm512_cmp_ph_mask(a: __m512h, b: __m512h) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + cmp_asm!(__mmask32, zmm_reg, a, b) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison @@ -746,13 +764,15 @@ pub unsafe fn _mm512_cmp_ph_mask(a: __m512h, b: __m512h) -> __m #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cmp_ph_mask( +pub fn _mm512_mask_cmp_ph_mask( k1: __mmask32, a: __m512h, b: __m512h, ) -> __mmask32 { - static_assert_uimm_bits!(IMM5, 5); - cmp_asm!(__mmask32, k1, zmm_reg, a, b) + unsafe { + static_assert_uimm_bits!(IMM5, 5); + cmp_asm!(__mmask32, k1, zmm_reg, a, b) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison @@ -765,25 +785,27 @@ pub unsafe fn _mm512_mask_cmp_ph_mask( #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cmp_round_ph_mask( +pub fn _mm512_cmp_round_ph_mask( a: __m512h, b: __m512h, ) -> __mmask32 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_sae!(SAE); - if SAE == _MM_FROUND_NO_EXC { - let dst: __mmask32; - asm!( - "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}", - k = lateout(kreg) dst, - a = in(zmm_reg) a, - b = in(zmm_reg) b, - imm8 = const IMM5, - options(pure, nomem, nostack) - ); - dst - } else { - cmp_asm!(__mmask32, zmm_reg, a, b) + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_sae!(SAE); + if SAE == _MM_FROUND_NO_EXC { + let dst: __mmask32; + asm!( + "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}", + k = lateout(kreg) dst, + a = in(zmm_reg) a, + b = in(zmm_reg) b, + imm8 = const IMM5, + options(pure, nomem, nostack) + ); + dst + } else { + cmp_asm!(__mmask32, zmm_reg, a, b) + } } } @@ -798,27 +820,29 @@ pub unsafe fn _mm512_cmp_round_ph_mask( #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cmp_round_ph_mask( +pub fn _mm512_mask_cmp_round_ph_mask( k1: __mmask32, a: __m512h, b: __m512h, ) -> __mmask32 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_sae!(SAE); - if SAE == _MM_FROUND_NO_EXC { - let dst: __mmask32; - asm!( - "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}", - k = lateout(kreg) dst, - k1 = in(kreg) k1, - a = in(zmm_reg) a, - b = in(zmm_reg) b, - imm8 = const IMM5, - options(pure, nomem, nostack) - ); - dst - } else { - cmp_asm!(__mmask32, k1, zmm_reg, a, b) + unsafe { + static_assert_uimm_bits!(IMM5, 5); 
+ static_assert_sae!(SAE); + if SAE == _MM_FROUND_NO_EXC { + let dst: __mmask32; + asm!( + "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}", + k = lateout(kreg) dst, + k1 = in(kreg) k1, + a = in(zmm_reg) a, + b = in(zmm_reg) b, + imm8 = const IMM5, + options(pure, nomem, nostack) + ); + dst + } else { + cmp_asm!(__mmask32, k1, zmm_reg, a, b) + } } } @@ -831,10 +855,7 @@ pub unsafe fn _mm512_mask_cmp_round_ph_mask( #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cmp_round_sh_mask( - a: __m128h, - b: __m128h, -) -> __mmask8 { +pub fn _mm_cmp_round_sh_mask(a: __m128h, b: __m128h) -> __mmask8 { static_assert_uimm_bits!(IMM5, 5); static_assert_sae!(SAE); _mm_mask_cmp_round_sh_mask::(0xff, a, b) @@ -849,14 +870,16 @@ pub unsafe fn _mm_cmp_round_sh_mask( #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cmp_round_sh_mask( +pub fn _mm_mask_cmp_round_sh_mask( k1: __mmask8, a: __m128h, b: __m128h, ) -> __mmask8 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_sae!(SAE); - vcmpsh(a, b, IMM5, k1, SAE) + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_sae!(SAE); + vcmpsh(a, b, IMM5, k1, SAE) + } } /// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison @@ -867,7 +890,7 @@ pub unsafe fn _mm_mask_cmp_round_sh_mask( #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cmp_sh_mask(a: __m128h, b: __m128h) -> __mmask8 { +pub fn _mm_cmp_sh_mask(a: __m128h, b: __m128h) -> __mmask8 { static_assert_uimm_bits!(IMM5, 5); _mm_cmp_round_sh_mask::(a, b) } @@ -880,11 +903,7 @@ pub unsafe fn _mm_cmp_sh_mask(a: __m128h, b: __m128h) -> __mmas #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cmp_sh_mask( - k1: __mmask8, - a: __m128h, - b: __m128h, -) -> __mmask8 { +pub fn _mm_mask_cmp_sh_mask(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 { static_assert_uimm_bits!(IMM5, 5); _mm_mask_cmp_round_sh_mask::(k1, a, b) } @@ -898,10 +917,12 @@ pub unsafe fn _mm_mask_cmp_sh_mask( #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_comi_round_sh(a: __m128h, b: __m128h) -> i32 { - static_assert_uimm_bits!(IMM5, 5); - static_assert_sae!(SAE); - vcomish(a, b, IMM5, SAE) +pub fn _mm_comi_round_sh(a: __m128h, b: __m128h) -> i32 { + unsafe { + static_assert_uimm_bits!(IMM5, 5); + static_assert_sae!(SAE); + vcomish(a, b, IMM5, SAE) + } } /// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison @@ -912,7 +933,7 @@ pub unsafe fn _mm_comi_round_sh(a: __m128h, b: #[target_feature(enable = "avx512fp16")] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_comi_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_comi_sh(a: __m128h, b: __m128h) -> i32 { static_assert_uimm_bits!(IMM5, 5); _mm_comi_round_sh::(a, b) } @@ -924,7 +945,7 @@ pub unsafe fn _mm_comi_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] 
-pub unsafe fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_EQ_OS>(a, b) } @@ -935,7 +956,7 @@ pub unsafe fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_GE_OS>(a, b) } @@ -946,7 +967,7 @@ pub unsafe fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_GT_OS>(a, b) } @@ -957,7 +978,7 @@ pub unsafe fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_LE_OS>(a, b) } @@ -968,7 +989,7 @@ pub unsafe fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_LT_OS>(a, b) } @@ -979,7 +1000,7 @@ pub unsafe fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_NEQ_OS>(a, b) } @@ -990,7 +1011,7 @@ pub unsafe fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_EQ_OQ>(a, b) } @@ -1001,7 +1022,7 @@ pub unsafe fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_GE_OQ>(a, b) } @@ -1012,7 +1033,7 @@ pub unsafe fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_GT_OQ>(a, b) } @@ -1023,7 +1044,7 @@ pub unsafe fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_LE_OQ>(a, b) } @@ -1034,7 +1055,7 @@ pub unsafe fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_ucomilt_sh(a: 
__m128h, b: __m128h) -> i32 { +pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_LT_OQ>(a, b) } @@ -1045,7 +1066,7 @@ pub unsafe fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 { +pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 { _mm_comi_sh::<_CMP_NEQ_OQ>(a, b) } @@ -1172,12 +1193,14 @@ pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let mut mov: f16 = simd_extract!(src, 0); - if (k & 1) != 0 { - mov = simd_extract!(b, 0); +pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mut mov: f16 = simd_extract!(src, 0); + if (k & 1) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) } - simd_insert!(a, 0, mov) } /// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst @@ -1188,12 +1211,14 @@ pub unsafe fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let mut mov: f16 = 0.; - if (k & 1) != 0 { - mov = simd_extract!(b, 0); +pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mut mov: f16 = 0.; + if (k & 1) != 0 { + mov = simd_extract!(b, 0); + } + simd_insert!(a, 0, mov) } - simd_insert!(a, 0, mov) } /// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst, @@ -1203,9 +1228,11 @@ pub unsafe fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h { - let mov: f16 = simd_extract!(b, 0); - simd_insert!(a, 0, mov) +pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h { + unsafe { + let mov: f16 = simd_extract!(b, 0); + simd_insert!(a, 0, mov) + } } /// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory. 
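A plain-Rust model of the `_mm_mask_move_sh` body shown above, using `[f32; 8]` as a stand-in for the eight `f16` lanes of `__m128h` (illustrative only):

fn mask_move_sh_model(src: [f32; 8], k: u8, a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    // Upper seven lanes always come from `a`; lane 0 is `b[0]` when the mask bit
    // is set and `src[0]` otherwise, matching the simd_extract/simd_insert body.
    let mut dst = a;
    dst[0] = if k & 1 != 0 { b[0] } else { src[0] };
    dst
}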
@@ -1307,8 +1334,8 @@ pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h { - simd_add(a, b) +pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_add(a, b) } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1319,9 +1346,11 @@ pub unsafe fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let r = _mm_add_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_add_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1332,9 +1361,11 @@ pub unsafe fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let r = _mm_add_ph(a, b); - simd_select_bitmask(k, r, _mm_setzero_ph()) +pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_add_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
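The masked forms above follow the usual merge-masking (`_mm_mask_add_ph`) and zero-masking (`_mm_maskz_add_ph`) pattern built on `simd_select_bitmask`; a plain-Rust model over eight stand-in lanes (illustrative only):

fn mask_add_model(src: [f32; 8], k: u8, a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    // Merge-masking: unselected lanes keep the corresponding lane of `src`.
    core::array::from_fn(|i| if (k >> i) & 1 != 0 { a[i] + b[i] } else { src[i] })
}

fn maskz_add_model(k: u8, a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
    // Zero-masking: unselected lanes are forced to zero.
    core::array::from_fn(|i| if (k >> i) & 1 != 0 { a[i] + b[i] } else { 0.0 })
}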
@@ -1344,8 +1375,8 @@ pub unsafe fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h { - simd_add(a, b) +pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_add(a, b) } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1356,9 +1387,11 @@ pub unsafe fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - let r = _mm256_add_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_add_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1369,9 +1402,11 @@ pub unsafe fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - let r = _mm256_add_ph(a, b); - simd_select_bitmask(k, r, _mm256_setzero_ph()) +pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_add_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
@@ -1381,8 +1416,8 @@ pub unsafe fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m25 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h { - simd_add(a, b) +pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_add(a, b) } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1393,9 +1428,11 @@ pub unsafe fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - let r = _mm512_add_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_add_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1406,9 +1443,11 @@ pub unsafe fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vaddph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - let r = _mm512_add_ph(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) +pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_add_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
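The next hunks convert the explicitly-rounded `_round` forms, whose rounding mode is a const generic checked by `static_assert_rounding!`. An illustrative call sketch, assuming nightly `stdarch_x86_avx512_f16` and an AVX512-FP16 target:

#![feature(stdarch_x86_avx512_f16)]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16")]
fn add_round_up(a: __m512h, b: __m512h) -> __m512h {
    // Round toward +infinity and suppress exceptions; any other combination
    // accepted by static_assert_rounding! is passed the same way.
    _mm512_add_round_ph::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b)
}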
@@ -1426,9 +1465,11 @@ pub unsafe fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m51 #[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h { - static_assert_rounding!(ROUNDING); - vaddph(a, b, ROUNDING) +pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vaddph(a, b, ROUNDING) + } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1447,15 +1488,17 @@ pub unsafe fn _mm512_add_round_ph(a: __m512h, b: __m512h) - #[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_add_round_ph<const ROUNDING: i32>( +pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>( src: __m512h, k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r = _mm512_add_round_ph::<ROUNDING>(a, b); - simd_select_bitmask(k, r, src) + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_add_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, src) + } } /// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1473,14 +1516,16 @@ pub unsafe fn _mm512_mask_add_round_ph( #[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_add_round_ph<const ROUNDING: i32>( +pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>( k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r = _mm512_add_round_ph::<ROUNDING>(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_add_round_ph::<ROUNDING>(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } } /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -1499,7 +1544,7 @@ pub unsafe fn _mm512_maskz_add_round_ph( #[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_add_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b) } @@ -1521,14 +1566,16 @@ pub unsafe fn _mm_add_round_sh(a: __m128h, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_add_round_sh<const ROUNDING: i32>( +pub fn _mm_mask_add_round_sh<const ROUNDING: i32>( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - vaddsh(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vaddsh(a, b, src, k, ROUNDING) + } } /// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -1548,11 +1595,7 @@ pub unsafe fn _mm_mask_add_round_sh( #[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_add_round_sh<const ROUNDING: i32>( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
static_assert_rounding!(ROUNDING); _mm_mask_add_round_sh::(_mm_setzero_ph(), k, a, b) } @@ -1565,7 +1608,7 @@ pub unsafe fn _mm_maskz_add_round_sh( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vaddsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h { _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) } @@ -1578,7 +1621,7 @@ pub unsafe fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vaddsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -1591,7 +1634,7 @@ pub unsafe fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vaddsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) } @@ -1602,8 +1645,8 @@ pub unsafe fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h { - simd_sub(a, b) +pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_sub(a, b) } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using @@ -1614,9 +1657,11 @@ pub unsafe fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let r = _mm_sub_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using @@ -1627,9 +1672,11 @@ pub unsafe fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let r = _mm_sub_ph(a, b); - simd_select_bitmask(k, r, _mm_setzero_ph()) +pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_sub_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. 
@@ -1639,8 +1686,8 @@ pub unsafe fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h { - simd_sub(a, b) +pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_sub(a, b) } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using @@ -1651,9 +1698,11 @@ pub unsafe fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - let r = _mm256_sub_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using @@ -1664,9 +1713,11 @@ pub unsafe fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - let r = _mm256_sub_ph(a, b); - simd_select_bitmask(k, r, _mm256_setzero_ph()) +pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_sub_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. 
@@ -1676,8 +1727,8 @@ pub unsafe fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m25 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h { - simd_sub(a, b) +pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_sub(a, b) } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using @@ -1688,9 +1739,11 @@ pub unsafe fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - let r = _mm512_sub_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_sub_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using @@ -1701,9 +1754,11 @@ pub unsafe fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsubph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - let r = _mm512_sub_ph(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) +pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_sub_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst. 
@@ -1721,9 +1776,11 @@ pub unsafe fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m51 #[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_sub_round_ph(a: __m512h, b: __m512h) -> __m512h { - static_assert_rounding!(ROUNDING); - vsubph(a, b, ROUNDING) +pub fn _mm512_sub_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vsubph(a, b, ROUNDING) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using @@ -1742,15 +1799,17 @@ pub unsafe fn _mm512_sub_round_ph(a: __m512h, b: __m512h) - #[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_sub_round_ph( +pub fn _mm512_mask_sub_round_ph( src: __m512h, k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r = _mm512_sub_round_ph::(a, b); - simd_select_bitmask(k, r, src) + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_sub_round_ph::(a, b); + simd_select_bitmask(k, r, src) + } } /// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using @@ -1769,14 +1828,16 @@ pub unsafe fn _mm512_mask_sub_round_ph( #[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_sub_round_ph( +pub fn _mm512_maskz_sub_round_ph( k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r = _mm512_sub_round_ph::(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_sub_round_ph::(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } } /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the @@ -1795,7 +1856,7 @@ pub unsafe fn _mm512_maskz_sub_round_ph( #[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_sub_round_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_sub_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_sub_round_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -1817,14 +1878,16 @@ pub unsafe fn _mm_sub_round_sh(a: __m128h, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_sub_round_sh( +pub fn _mm_mask_sub_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - vsubsh(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vsubsh(a, b, src, k, ROUNDING) + } } /// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the @@ -1844,11 +1907,7 @@ pub unsafe fn _mm_mask_sub_round_sh( #[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_sub_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_maskz_sub_round_sh(k: __mmask8, a: __m128h, 
b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_sub_round_sh::(_mm_setzero_ph(), k, a, b) } @@ -1861,7 +1920,7 @@ pub unsafe fn _mm_maskz_sub_round_sh( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsubsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h { _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) } @@ -1874,7 +1933,7 @@ pub unsafe fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsubsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -1887,7 +1946,7 @@ pub unsafe fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsubsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) } @@ -1898,8 +1957,8 @@ pub unsafe fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h { - simd_mul(a, b) +pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_mul(a, b) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1910,9 +1969,11 @@ pub unsafe fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let r = _mm_mul_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1923,9 +1984,11 @@ pub unsafe fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let r = _mm_mul_ph(a, b); - simd_select_bitmask(k, r, _mm_setzero_ph()) +pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_mul_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
@@ -1935,8 +1998,8 @@ pub unsafe fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h { - simd_mul(a, b) +pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_mul(a, b) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1947,9 +2010,11 @@ pub unsafe fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - let r = _mm256_mul_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1960,9 +2025,11 @@ pub unsafe fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - let r = _mm256_mul_ph(a, b); - simd_select_bitmask(k, r, _mm256_setzero_ph()) +pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_mul_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
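Dropping `unsafe` from these signatures does not drop the `#[target_feature]` requirement: a caller that cannot guarantee the feature statically still has to prove it, for example through runtime detection. A rough sketch of that caller-side pattern (illustrative only; the helper names are invented, and it assumes the `avx512fp16`/`avx512vl` detection strings are available to `is_x86_feature_detected!` on the toolchain in use):

    #![feature(stdarch_x86_avx512_f16)]
    use std::arch::x86_64::*;

    // Safe to call the intrinsic here because the enabled features match its gate.
    #[target_feature(enable = "avx512fp16,avx512vl")]
    fn mul_ph_256(a: __m256h, b: __m256h) -> __m256h {
        _mm256_mul_ph(a, b)
    }

    fn mul_ph_256_checked(a: __m256h, b: __m256h) -> Option<__m256h> {
        if std::arch::is_x86_feature_detected!("avx512fp16")
            && std::arch::is_x86_feature_detected!("avx512vl")
        {
            // SAFETY: the required target features were detected at runtime.
            Some(unsafe { mul_ph_256(a, b) })
        } else {
            None
        }
    }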
@@ -1972,8 +2039,8 @@ pub unsafe fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m25 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h { - simd_mul(a, b) +pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_mul(a, b) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1984,9 +2051,11 @@ pub unsafe fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - let r = _mm512_mul_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_mul_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -1997,9 +2066,11 @@ pub unsafe fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmulph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - let r = _mm512_mul_ph(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) +pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_mul_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst. 
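As a quick cross-check of the signatures above, the mask width tracks the lane count of the vector (vector bits divided by the 16-bit element size), which the following compile-time asserts restate:

    // Lane-count sanity check (illustrative only).
    const _: () = assert!(512 / 16 == 32); // 32 lanes -> __mmask32 for _mm512_mask_mul_ph
    const _: () = assert!(256 / 16 == 16); // 16 lanes -> __mmask16 for _mm256_mask_mul_ph
    const _: () = assert!(128 / 16 == 8);  //  8 lanes -> __mmask8  for _mm_mask_mul_ph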
@@ -2017,9 +2088,11 @@ pub unsafe fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m51 #[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mul_round_ph(a: __m512h, b: __m512h) -> __m512h { - static_assert_rounding!(ROUNDING); - vmulph(a, b, ROUNDING) +pub fn _mm512_mul_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vmulph(a, b, ROUNDING) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -2038,15 +2111,17 @@ pub unsafe fn _mm512_mul_round_ph(a: __m512h, b: __m512h) - #[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_mul_round_ph( +pub fn _mm512_mask_mul_round_ph( src: __m512h, k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r = _mm512_mul_round_ph::(a, b); - simd_select_bitmask(k, r, src) + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_mul_round_ph::(a, b); + simd_select_bitmask(k, r, src) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using @@ -2065,14 +2140,16 @@ pub unsafe fn _mm512_mask_mul_round_ph( #[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_mul_round_ph( +pub fn _mm512_maskz_mul_round_ph( k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r = _mm512_mul_round_ph::(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_mul_round_ph::(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -2091,7 +2168,7 @@ pub unsafe fn _mm512_maskz_mul_round_ph( #[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mul_round_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mul_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_mul_round_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -2113,14 +2190,16 @@ pub unsafe fn _mm_mul_round_sh(a: __m128h, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_mul_round_sh( +pub fn _mm_mask_mul_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - vmulsh(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vmulsh(a, b, src, k, ROUNDING) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the @@ -2140,11 +2219,7 @@ pub unsafe fn _mm_mask_mul_round_sh( #[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_mul_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_maskz_mul_round_sh(k: __mmask8, a: __m128h, b: 
__m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_mul_round_sh::(_mm_setzero_ph(), k, a, b) } @@ -2157,7 +2232,7 @@ pub unsafe fn _mm_maskz_mul_round_sh( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmulsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h { _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) } @@ -2170,7 +2245,7 @@ pub unsafe fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmulsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -2183,7 +2258,7 @@ pub unsafe fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmulsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) } @@ -2194,8 +2269,8 @@ pub unsafe fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h { - simd_div(a, b) +pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_div(a, b) } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using @@ -2206,9 +2281,11 @@ pub unsafe fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let r = _mm_div_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_div_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using @@ -2219,9 +2296,11 @@ pub unsafe fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - let r = _mm_div_ph(a, b); - simd_select_bitmask(k, r, _mm_setzero_ph()) +pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { + let r = _mm_div_ph(a, b); + simd_select_bitmask(k, r, _mm_setzero_ph()) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. 
@@ -2231,8 +2310,8 @@ pub unsafe fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h { - simd_div(a, b) +pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_div(a, b) } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using @@ -2243,9 +2322,11 @@ pub unsafe fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - let r = _mm256_div_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_div_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using @@ -2256,9 +2337,11 @@ pub unsafe fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - let r = _mm256_div_ph(a, b); - simd_select_bitmask(k, r, _mm256_setzero_ph()) +pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { + let r = _mm256_div_ph(a, b); + simd_select_bitmask(k, r, _mm256_setzero_ph()) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. 
@@ -2268,8 +2351,8 @@ pub unsafe fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m25 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h { - simd_div(a, b) +pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_div(a, b) } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using @@ -2280,9 +2363,11 @@ pub unsafe fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - let r = _mm512_div_ph(a, b); - simd_select_bitmask(k, r, src) +pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_div_ph(a, b); + simd_select_bitmask(k, r, src) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using @@ -2293,9 +2378,11 @@ pub unsafe fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vdivph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - let r = _mm512_div_ph(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) +pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + let r = _mm512_div_ph(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst. 
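For the division forms in particular, the zero-masked variant is a convenient way to compute quotients only in lanes known to hold useful divisors and force the remaining lanes to zero. A minimal sketch (illustration only; the wrapper name is hypothetical, nightly `stdarch_x86_avx512_f16` assumed):

    #![feature(stdarch_x86_avx512_f16)]
    use core::arch::x86_64::*;

    // Lanes selected by `valid` receive num / den; all other lanes are written
    // as zero, so a later horizontal reduction effectively ignores them.
    #[target_feature(enable = "avx512fp16")]
    fn div_selected(valid: __mmask32, num: __m512h, den: __m512h) -> __m512h {
        _mm512_maskz_div_ph(valid, num, den)
    }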
@@ -2313,9 +2400,11 @@ pub unsafe fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m51 #[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_div_round_ph(a: __m512h, b: __m512h) -> __m512h { - static_assert_rounding!(ROUNDING); - vdivph(a, b, ROUNDING) +pub fn _mm512_div_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vdivph(a, b, ROUNDING) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using @@ -2334,15 +2423,17 @@ pub unsafe fn _mm512_div_round_ph(a: __m512h, b: __m512h) - #[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_div_round_ph( +pub fn _mm512_mask_div_round_ph( src: __m512h, k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r = _mm512_div_round_ph::(a, b); - simd_select_bitmask(k, r, src) + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_div_round_ph::(a, b); + simd_select_bitmask(k, r, src) + } } /// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using @@ -2361,14 +2452,16 @@ pub unsafe fn _mm512_mask_div_round_ph( #[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_div_round_ph( +pub fn _mm512_maskz_div_round_ph( k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r = _mm512_div_round_ph::(a, b); - simd_select_bitmask(k, r, _mm512_setzero_ph()) + unsafe { + static_assert_rounding!(ROUNDING); + let r = _mm512_div_round_ph::(a, b); + simd_select_bitmask(k, r, _mm512_setzero_ph()) + } } /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the @@ -2387,7 +2480,7 @@ pub unsafe fn _mm512_maskz_div_round_ph( #[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_div_round_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_div_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_div_round_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -2409,14 +2502,16 @@ pub unsafe fn _mm_div_round_sh(a: __m128h, b: __m128h) -> _ #[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_div_round_sh( +pub fn _mm_mask_div_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - vdivsh(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vdivsh(a, b, src, k, ROUNDING) + } } /// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the @@ -2436,11 +2531,7 @@ pub unsafe fn _mm_mask_div_round_sh( #[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_div_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_maskz_div_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> 
__m128h { static_assert_rounding!(ROUNDING); _mm_mask_div_round_sh::(_mm_setzero_ph(), k, a, b) } @@ -2453,7 +2544,7 @@ pub unsafe fn _mm_maskz_div_round_sh( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vdivsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h { _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b) } @@ -2466,7 +2557,7 @@ pub unsafe fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vdivsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -2479,7 +2570,7 @@ pub unsafe fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vdivsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b) } @@ -2492,7 +2583,7 @@ pub unsafe fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h { _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b) } @@ -2505,8 +2596,8 @@ pub unsafe fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) +pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) } } /// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element @@ -2518,7 +2609,7 @@ pub unsafe fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b) } @@ -2531,7 +2622,7 @@ pub unsafe fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h { _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b) } @@ -2544,8 +2635,8 @@ pub unsafe fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h { 
#[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) +pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) } } /// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element @@ -2557,7 +2648,7 @@ pub unsafe fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b) } @@ -2570,7 +2661,7 @@ pub unsafe fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m25 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h { _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b) } @@ -2583,7 +2674,7 @@ pub unsafe fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -2596,7 +2687,7 @@ pub unsafe fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b) } @@ -2618,7 +2709,7 @@ pub unsafe fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m5 #[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mul_round_pch(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_mul_round_pch(a: __m512h, b: __m512h) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_mask_mul_round_pch::(_mm512_undefined_ph(), 0xffff, a, b) } @@ -2641,20 +2732,22 @@ pub unsafe fn _mm512_mul_round_pch(a: __m512h, b: __m512h) #[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_mul_round_pch( +pub fn _mm512_mask_mul_round_pch( src: __m512h, k: __mmask16, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - transmute(vfmulcph_512( - transmute(a), - transmute(b), - transmute(src), - k, - ROUNDING, - )) + unsafe { + 
static_assert_rounding!(ROUNDING); + transmute(vfmulcph_512( + transmute(a), + transmute(b), + transmute(src), + k, + ROUNDING, + )) + } } /// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element @@ -2675,7 +2768,7 @@ pub unsafe fn _mm512_mask_mul_round_pch( #[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_mul_round_pch( +pub fn _mm512_maskz_mul_round_pch( k: __mmask16, a: __m512h, b: __m512h, @@ -2694,7 +2787,7 @@ pub unsafe fn _mm512_maskz_mul_round_pch( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { _mm_mask_mul_sch(_mm_undefined_ph(), 0xff, a, b) } @@ -2708,7 +2801,7 @@ pub unsafe fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -2722,7 +2815,7 @@ pub unsafe fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_mul_sch(_mm_setzero_ph(), k, a, b) } @@ -2745,7 +2838,7 @@ pub unsafe fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mul_round_sch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mul_round_sch(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_mul_round_sch::(_mm_undefined_ph(), 0xff, a, b) } @@ -2769,20 +2862,22 @@ pub unsafe fn _mm_mul_round_sch(a: __m128h, b: __m128h) -> #[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_mul_round_sch( +pub fn _mm_mask_mul_round_sch( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - transmute(vfmulcsh( - transmute(a), - transmute(b), - transmute(src), - k, - ROUNDING, - )) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfmulcsh( + transmute(a), + transmute(b), + transmute(src), + k, + ROUNDING, + )) + } } /// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using @@ -2804,7 +2899,7 @@ pub unsafe fn _mm_mask_mul_round_sch( #[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_mul_round_sch( +pub fn _mm_maskz_mul_round_sch( k: __mmask8, a: __m128h, b: __m128h, @@ -2822,7 +2917,7 @@ pub unsafe fn _mm_maskz_mul_round_sch( 
#[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h { _mm_mul_pch(a, b) } @@ -2835,7 +2930,7 @@ pub unsafe fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_mul_pch(src, k, a, b) } @@ -2848,7 +2943,7 @@ pub unsafe fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_mul_pch(k, a, b) } @@ -2861,7 +2956,7 @@ pub unsafe fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h { _mm256_mul_pch(a, b) } @@ -2874,7 +2969,7 @@ pub unsafe fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { _mm256_mask_mul_pch(src, k, a, b) } @@ -2887,7 +2982,7 @@ pub unsafe fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { _mm256_maskz_mul_pch(k, a, b) } @@ -2899,7 +2994,7 @@ pub unsafe fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m2 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h { _mm512_mul_pch(a, b) } @@ -2912,7 +3007,7 @@ pub unsafe fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { _mm512_mask_mul_pch(src, k, a, b) } @@ -2925,7 +3020,7 @@ pub unsafe fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __ 
#[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { _mm512_maskz_mul_pch(k, a, b) } @@ -2945,7 +3040,7 @@ pub unsafe fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m #[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmul_round_pch(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_fmul_round_pch(a: __m512h, b: __m512h) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_mul_round_pch::(a, b) } @@ -2967,7 +3062,7 @@ pub unsafe fn _mm512_fmul_round_pch(a: __m512h, b: __m512h) #[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmul_round_pch( +pub fn _mm512_mask_fmul_round_pch( src: __m512h, k: __mmask16, a: __m512h, @@ -2994,7 +3089,7 @@ pub unsafe fn _mm512_mask_fmul_round_pch( #[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmul_round_pch( +pub fn _mm512_maskz_fmul_round_pch( k: __mmask16, a: __m512h, b: __m512h, @@ -3012,7 +3107,7 @@ pub unsafe fn _mm512_maskz_fmul_round_pch( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h { _mm_mul_sch(a, b) } @@ -3025,7 +3120,7 @@ pub unsafe fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_mul_sch(src, k, a, b) } @@ -3038,7 +3133,7 @@ pub unsafe fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_mul_sch(k, a, b) } @@ -3059,7 +3154,7 @@ pub unsafe fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmul_round_sch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_fmul_round_sch(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mul_round_sch::(a, b) } @@ -3082,7 +3177,7 @@ pub unsafe fn _mm_fmul_round_sch(a: __m128h, b: __m128h) -> #[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmul_round_sch( +pub fn _mm_mask_fmul_round_sch( src: __m128h, k: 
__mmask8, a: __m128h, @@ -3110,7 +3205,7 @@ pub unsafe fn _mm_mask_fmul_round_sch( #[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmul_round_sch( +pub fn _mm_maskz_fmul_round_sch( k: __mmask8, a: __m128h, b: __m128h, @@ -3129,7 +3224,7 @@ pub unsafe fn _mm_maskz_fmul_round_sch( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h { _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b) } @@ -3143,8 +3238,8 @@ pub unsafe fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) +pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and @@ -3157,7 +3252,7 @@ pub unsafe fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b) } @@ -3171,7 +3266,7 @@ pub unsafe fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h { _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b) } @@ -3185,8 +3280,8 @@ pub unsafe fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { - transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) +pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { + unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and @@ -3199,7 +3294,7 @@ pub unsafe fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b) } @@ 
-3213,7 +3308,7 @@ pub unsafe fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m2 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h { _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b) } @@ -3227,7 +3322,7 @@ pub unsafe fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -3241,7 +3336,7 @@ pub unsafe fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b) } @@ -3264,7 +3359,7 @@ pub unsafe fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m #[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cmul_round_pch(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_cmul_round_pch(a: __m512h, b: __m512h) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_mask_cmul_round_pch::(_mm512_undefined_ph(), 0xffff, a, b) } @@ -3288,20 +3383,22 @@ pub unsafe fn _mm512_cmul_round_pch(a: __m512h, b: __m512h) #[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cmul_round_pch( +pub fn _mm512_mask_cmul_round_pch( src: __m512h, k: __mmask16, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - transmute(vfcmulcph_512( - transmute(a), - transmute(b), - transmute(src), - k, - ROUNDING, - )) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmulcph_512( + transmute(a), + transmute(b), + transmute(src), + k, + ROUNDING, + )) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and @@ -3323,7 +3420,7 @@ pub unsafe fn _mm512_mask_cmul_round_pch( #[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cmul_round_pch( +pub fn _mm512_maskz_cmul_round_pch( k: __mmask16, a: __m512h, b: __m512h, @@ -3341,7 +3438,7 @@ pub unsafe fn _mm512_maskz_cmul_round_pch( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h { _mm_mask_cmul_sch(_mm_undefined_ph(), 0xff, a, b) } @@ -3355,7 +3452,7 @@ pub unsafe fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = 
"avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -3369,7 +3466,7 @@ pub unsafe fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_cmul_sch(_mm_setzero_ph(), k, a, b) } @@ -3391,7 +3488,7 @@ pub unsafe fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cmul_round_sch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_cmul_round_sch(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_cmul_round_sch::(_mm_undefined_ph(), 0xff, a, b) } @@ -3415,20 +3512,22 @@ pub unsafe fn _mm_cmul_round_sch(a: __m128h, b: __m128h) -> #[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cmul_round_sch( +pub fn _mm_mask_cmul_round_sch( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - transmute(vfcmulcsh( - transmute(a), - transmute(b), - transmute(src), - k, - ROUNDING, - )) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmulcsh( + transmute(a), + transmute(b), + transmute(src), + k, + ROUNDING, + )) + } } /// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b, @@ -3450,7 +3549,7 @@ pub unsafe fn _mm_mask_cmul_round_sch( #[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cmul_round_sch( +pub fn _mm_maskz_cmul_round_sch( k: __mmask8, a: __m128h, b: __m128h, @@ -3469,7 +3568,7 @@ pub unsafe fn _mm_maskz_cmul_round_sch( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h { _mm_cmul_pch(a, b) } @@ -3483,7 +3582,7 @@ pub unsafe fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_cmul_pch(src, k, a, b) } @@ -3497,7 +3596,7 @@ pub unsafe fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m12 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn 
_mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_cmul_pch(k, a, b) } @@ -3511,7 +3610,7 @@ pub unsafe fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h { _mm256_cmul_pch(a, b) } @@ -3525,7 +3624,7 @@ pub unsafe fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h { _mm256_mask_cmul_pch(src, k, a, b) } @@ -3539,7 +3638,7 @@ pub unsafe fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h { _mm256_maskz_cmul_pch(k, a, b) } @@ -3553,7 +3652,7 @@ pub unsafe fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h { _mm512_cmul_pch(a, b) } @@ -3567,7 +3666,7 @@ pub unsafe fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h { _mm512_mask_cmul_pch(src, k, a, b) } @@ -3581,7 +3680,7 @@ pub unsafe fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: _ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h { _mm512_maskz_cmul_pch(k, a, b) } @@ -3603,7 +3702,7 @@ pub unsafe fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __ #[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fcmul_round_pch(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_fcmul_round_pch(a: __m512h, b: __m512h) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_cmul_round_pch::(a, b) } @@ -3627,7 +3726,7 @@ pub unsafe fn _mm512_fcmul_round_pch(a: __m512h, b: __m512h #[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = 
"stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fcmul_round_pch( +pub fn _mm512_mask_fcmul_round_pch( src: __m512h, k: __mmask16, a: __m512h, @@ -3656,7 +3755,7 @@ pub unsafe fn _mm512_mask_fcmul_round_pch( #[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fcmul_round_pch( +pub fn _mm512_maskz_fcmul_round_pch( k: __mmask16, a: __m512h, b: __m512h, @@ -3675,7 +3774,7 @@ pub unsafe fn _mm512_maskz_fcmul_round_pch( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h { _mm_cmul_sch(a, b) } @@ -3689,7 +3788,7 @@ pub unsafe fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_cmul_sch(src, k, a, b) } @@ -3703,7 +3802,7 @@ pub unsafe fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m12 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmulcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_maskz_cmul_sch(k, a, b) } @@ -3725,7 +3824,7 @@ pub unsafe fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128 #[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fcmul_round_sch(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_fcmul_round_sch(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_cmul_round_sch::(a, b) } @@ -3749,7 +3848,7 @@ pub unsafe fn _mm_fcmul_round_sch(a: __m128h, b: __m128h) - #[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fcmul_round_sch( +pub fn _mm_mask_fcmul_round_sch( src: __m128h, k: __mmask8, a: __m128h, @@ -3778,7 +3877,7 @@ pub unsafe fn _mm_mask_fcmul_round_sch( #[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fcmul_round_sch( +pub fn _mm_maskz_fcmul_round_sch( k: __mmask8, a: __m128h, b: __m128h, @@ -3794,8 +3893,8 @@ pub unsafe fn _mm_maskz_fcmul_round_sch( #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_abs_ph(v2: __m128h) -> __m128h { - transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) +pub fn _mm_abs_ph(v2: __m128h) -> __m128h { + unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) } } /// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing @@ -3805,8 +3904,8 @@ pub unsafe fn _mm_abs_ph(v2: __m128h) -> __m128h { #[inline] 
#[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_abs_ph(v2: __m256h) -> __m256h { - transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) +pub fn _mm256_abs_ph(v2: __m256h) -> __m256h { + unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) } } /// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing @@ -3816,8 +3915,8 @@ pub unsafe fn _mm256_abs_ph(v2: __m256h) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_abs_ph(v2: __m512h) -> __m512h { - transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) +pub fn _mm512_abs_ph(v2: __m512h) -> __m512h { + unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) } } /// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex @@ -3829,8 +3928,8 @@ pub unsafe fn _mm512_abs_ph(v2: __m512h) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_conj_pch(a: __m128h) -> __m128h { - transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) +pub fn _mm_conj_pch(a: __m128h) -> __m128h { + unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) } } /// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k @@ -3842,9 +3941,11 @@ pub unsafe fn _mm_conj_pch(a: __m128h) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - let r: __m128 = transmute(_mm_conj_pch(a)); - transmute(simd_select_bitmask(k, r, transmute(src))) +pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { + let r: __m128 = transmute(_mm_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } } /// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k @@ -3856,7 +3957,7 @@ pub unsafe fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128 #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h { +pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h { _mm_mask_conj_pch(_mm_setzero_ph(), k, a) } @@ -3868,8 +3969,8 @@ pub unsafe fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_conj_pch(a: __m256h) -> __m256h { - transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) +pub fn _mm256_conj_pch(a: __m256h) -> __m256h { + unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) } } /// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k @@ -3881,9 +3982,11 @@ pub unsafe fn _mm256_conj_pch(a: __m256h) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_conj_pch(src: __m256h, k: 
__mmask8, a: __m256h) -> __m256h { - let r: __m256 = transmute(_mm256_conj_pch(a)); - transmute(simd_select_bitmask(k, r, transmute(src))) +pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } } /// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k @@ -3895,7 +3998,7 @@ pub unsafe fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h { +pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h { _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a) } @@ -3907,8 +4010,8 @@ pub unsafe fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_conj_pch(a: __m512h) -> __m512h { - transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) +pub fn _mm512_conj_pch(a: __m512h) -> __m512h { + unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) } } /// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k @@ -3920,9 +4023,11 @@ pub unsafe fn _mm512_conj_pch(a: __m512h) -> __m512h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h { - let r: __m512 = transmute(_mm512_conj_pch(a)); - transmute(simd_select_bitmask(k, r, transmute(src))) +pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h { + unsafe { + let r: __m512 = transmute(_mm512_conj_pch(a)); + transmute(simd_select_bitmask(k, r, transmute(src))) + } } /// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k @@ -3934,7 +4039,7 @@ pub unsafe fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __ #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h { +pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h { _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a) } @@ -3947,7 +4052,7 @@ pub unsafe fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { +pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { _mm_mask3_fmadd_pch(a, b, c, 0xff) } @@ -3961,9 +4066,11 @@ pub unsafe fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) +pub 
fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } } /// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, @@ -3976,13 +4083,15 @@ pub unsafe fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - transmute(vfmaddcph_mask3_128( - transmute(a), - transmute(b), - transmute(c), - k, - )) +pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + transmute(vfmaddcph_mask3_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } } /// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, @@ -3995,13 +4104,15 @@ pub unsafe fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - transmute(vfmaddcph_maskz_128( - transmute(a), - transmute(b), - transmute(c), - k, - )) +pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + transmute(vfmaddcph_maskz_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } } /// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, @@ -4013,7 +4124,7 @@ pub unsafe fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { +pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { _mm256_mask3_fmadd_pch(a, b, c, 0xff) } @@ -4027,9 +4138,11 @@ pub unsafe fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { - let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) +pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } } /// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, @@ -4042,13 +4155,15 @@ pub unsafe fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = 
"127213")] -pub unsafe fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { - transmute(vfmaddcph_mask3_256( - transmute(a), - transmute(b), - transmute(c), - k, - )) +pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { + unsafe { + transmute(vfmaddcph_mask3_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } } /// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, @@ -4061,13 +4176,15 @@ pub unsafe fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mm #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - transmute(vfmaddcph_maskz_256( - transmute(a), - transmute(b), - transmute(c), - k, - )) +pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { + transmute(vfmaddcph_maskz_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } } /// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, @@ -4079,7 +4196,7 @@ pub unsafe fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { +pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) } @@ -4093,7 +4210,7 @@ pub unsafe fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { +pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) } @@ -4107,7 +4224,7 @@ pub unsafe fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { +pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) } @@ -4121,7 +4238,7 @@ pub unsafe fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mm #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h { +pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h { _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) } @@ -4143,11 +4260,7 @@ pub unsafe fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __ #[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub 
unsafe fn _mm512_fmadd_round_pch( - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { +pub fn _mm512_fmadd_round_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_mask3_fmadd_round_pch::(a, b, c, 0xffff) } @@ -4171,15 +4284,17 @@ pub unsafe fn _mm512_fmadd_round_pch( #[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmadd_round_pch( +pub fn _mm512_mask_fmadd_round_pch( a: __m512h, k: __mmask16, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) + unsafe { + static_assert_rounding!(ROUNDING); + let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } } /// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, @@ -4201,20 +4316,22 @@ pub unsafe fn _mm512_mask_fmadd_round_pch( #[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmadd_round_pch( +pub fn _mm512_mask3_fmadd_round_pch( a: __m512h, b: __m512h, c: __m512h, k: __mmask16, ) -> __m512h { - static_assert_rounding!(ROUNDING); - transmute(vfmaddcph_mask3_512( - transmute(a), - transmute(b), - transmute(c), - k, - ROUNDING, - )) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfmaddcph_mask3_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } } /// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c, @@ -4236,20 +4353,22 @@ pub unsafe fn _mm512_mask3_fmadd_round_pch( #[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmadd_round_pch( +pub fn _mm512_maskz_fmadd_round_pch( k: __mmask16, a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - transmute(vfmaddcph_maskz_512( - transmute(a), - transmute(b), - transmute(c), - k, - ROUNDING, - )) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfmaddcph_maskz_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } } /// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and @@ -4262,7 +4381,7 @@ pub unsafe fn _mm512_maskz_fmadd_round_pch( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { +pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) } @@ -4277,7 +4396,7 @@ pub unsafe fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { +pub fn _mm_mask_fmadd_sch(a: 
__m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) } @@ -4292,7 +4411,7 @@ pub unsafe fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { +pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) } @@ -4307,7 +4426,7 @@ pub unsafe fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { +pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) } @@ -4329,19 +4448,17 @@ pub unsafe fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128 #[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmadd_round_sch( - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - transmute(vfmaddcsh_mask( - transmute(a), - transmute(b), - transmute(c), - 0xff, - ROUNDING, - )) +pub fn _mm_fmadd_round_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfmaddcsh_mask( + transmute(a), + transmute(b), + transmute(c), + 0xff, + ROUNDING, + )) + } } /// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and @@ -4364,16 +4481,18 @@ pub unsafe fn _mm_fmadd_round_sch( #[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmadd_round_sch( +pub fn _mm_mask_fmadd_round_sch( a: __m128h, k: __mmask8, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let a = transmute(a); - let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what CLang does - transmute(_mm_mask_move_ss(a, k, a, r)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = transmute(a); + let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what CLang does + transmute(_mm_mask_move_ss(a, k, a, r)) + } } /// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and @@ -4396,16 +4515,18 @@ pub unsafe fn _mm_mask_fmadd_round_sch( #[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmadd_round_sch( +pub fn _mm_mask3_fmadd_round_sch( a: __m128h, b: __m128h, c: __m128h, k: __mmask8, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let c = transmute(c); - let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING); - transmute(_mm_move_ss(c, r)) + unsafe { + static_assert_rounding!(ROUNDING); + let c = transmute(c); + let r = 
vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING); + transmute(_mm_move_ss(c, r)) + } } /// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and @@ -4428,16 +4549,18 @@ pub unsafe fn _mm_mask3_fmadd_round_sch( #[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmadd_round_sch( +pub fn _mm_maskz_fmadd_round_sch( k: __mmask8, a: __m128h, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let a = transmute(a); - let r = vfmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING); - transmute(_mm_move_ss(a, r)) // FIXME: If `k == 0`, then LLVM optimized `vfmaddcsh_maskz` to output an all-zero vector, which is incorrect + unsafe { + static_assert_rounding!(ROUNDING); + let a = transmute(a); + let r = vfmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING); + transmute(_mm_move_ss(a, r)) // FIXME: If `k == 0`, then LLVM optimized `vfmaddcsh_maskz` to output an all-zero vector, which is incorrect + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate @@ -4450,7 +4573,7 @@ pub unsafe fn _mm_maskz_fmadd_round_sch( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { +pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { _mm_mask3_fcmadd_pch(a, b, c, 0xff) } @@ -4465,9 +4588,11 @@ pub unsafe fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) +pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate @@ -4481,13 +4606,15 @@ pub unsafe fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - transmute(vfcmaddcph_mask3_128( - transmute(a), - transmute(b), - transmute(c), - k, - )) +pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + transmute(vfcmaddcph_mask3_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate @@ -4501,13 +4628,15 @@ pub unsafe fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmas #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, 
assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - transmute(vfcmaddcph_maskz_128( - transmute(a), - transmute(b), - transmute(c), - k, - )) +pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + transmute(vfcmaddcph_maskz_128( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate @@ -4520,7 +4649,7 @@ pub unsafe fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m12 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { +pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { _mm256_mask3_fcmadd_pch(a, b, c, 0xff) } @@ -4535,9 +4664,11 @@ pub unsafe fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { - let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) +pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h { + unsafe { + let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate @@ -4551,13 +4682,15 @@ pub unsafe fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { - transmute(vfcmaddcph_mask3_256( - transmute(a), - transmute(b), - transmute(c), - k, - )) +pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h { + unsafe { + transmute(vfcmaddcph_mask3_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate @@ -4571,13 +4704,15 @@ pub unsafe fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - transmute(vfcmaddcph_maskz_256( - transmute(a), - transmute(b), - transmute(c), - k, - )) +pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { + transmute(vfcmaddcph_maskz_256( + transmute(a), + transmute(b), + transmute(c), + k, + )) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, 
accumulate @@ -4590,7 +4725,7 @@ pub unsafe fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { +pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) } @@ -4605,7 +4740,7 @@ pub unsafe fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { +pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h { _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) } @@ -4620,7 +4755,7 @@ pub unsafe fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { +pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h { _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) } @@ -4635,7 +4770,7 @@ pub unsafe fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmaddcph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h { +pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h { _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) } @@ -4658,11 +4793,7 @@ pub unsafe fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: _ #[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fcmadd_round_pch( - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { +pub fn _mm512_fcmadd_round_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_mask3_fcmadd_round_pch::(a, b, c, 0xffff) } @@ -4687,15 +4818,17 @@ pub unsafe fn _mm512_fcmadd_round_pch( #[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fcmadd_round_pch( +pub fn _mm512_mask_fcmadd_round_pch( a: __m512h, k: __mmask16, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does - transmute(simd_select_bitmask(k, r, transmute(a))) + unsafe { + static_assert_rounding!(ROUNDING); + let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::(a, b, c, k)); // using `0xffff` would have been fine here, but this is what CLang does + transmute(simd_select_bitmask(k, r, transmute(a))) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate @@ 
-4718,20 +4851,22 @@ pub unsafe fn _mm512_mask_fcmadd_round_pch( #[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fcmadd_round_pch( +pub fn _mm512_mask3_fcmadd_round_pch( a: __m512h, b: __m512h, c: __m512h, k: __mmask16, ) -> __m512h { - static_assert_rounding!(ROUNDING); - transmute(vfcmaddcph_mask3_512( - transmute(a), - transmute(b), - transmute(c), - k, - ROUNDING, - )) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcph_mask3_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } } /// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate @@ -4754,20 +4889,22 @@ pub unsafe fn _mm512_mask3_fcmadd_round_pch( #[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fcmadd_round_pch( +pub fn _mm512_maskz_fcmadd_round_pch( k: __mmask16, a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - transmute(vfcmaddcph_maskz_512( - transmute(a), - transmute(b), - transmute(c), - k, - ROUNDING, - )) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcph_maskz_512( + transmute(a), + transmute(b), + transmute(c), + k, + ROUNDING, + )) + } } /// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, @@ -4781,7 +4918,7 @@ pub unsafe fn _mm512_maskz_fcmadd_round_pch( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmaddcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { +pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c) } @@ -4797,7 +4934,7 @@ pub unsafe fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmaddcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { +pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c) } @@ -4813,7 +4950,7 @@ pub unsafe fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmaddcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { +pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k) } @@ -4829,7 +4966,7 @@ pub unsafe fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmas #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfcmaddcsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { +pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c) } @@ -4853,19 +4990,17 @@ pub unsafe 
fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m12 #[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fcmadd_round_sch( - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - transmute(vfcmaddcsh_mask( - transmute(a), - transmute(b), - transmute(c), - 0xff, - ROUNDING, - )) +pub fn _mm_fcmadd_round_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vfcmaddcsh_mask( + transmute(a), + transmute(b), + transmute(c), + 0xff, + ROUNDING, + )) + } } /// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, @@ -4889,16 +5024,18 @@ pub unsafe fn _mm_fcmadd_round_sch( #[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fcmadd_round_sch( +pub fn _mm_mask_fcmadd_round_sch( a: __m128h, k: __mmask8, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let a = transmute(a); - let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); - transmute(_mm_mask_move_ss(a, k, a, r)) + unsafe { + static_assert_rounding!(ROUNDING); + let a = transmute(a); + let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); + transmute(_mm_mask_move_ss(a, k, a, r)) + } } /// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, @@ -4922,16 +5059,18 @@ pub unsafe fn _mm_mask_fcmadd_round_sch( #[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fcmadd_round_sch( +pub fn _mm_mask3_fcmadd_round_sch( a: __m128h, b: __m128h, c: __m128h, k: __mmask8, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let c = transmute(c); - let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING); - transmute(_mm_move_ss(c, r)) + unsafe { + static_assert_rounding!(ROUNDING); + let c = transmute(c); + let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING); + transmute(_mm_move_ss(c, r)) + } } /// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b, @@ -4955,16 +5094,18 @@ pub unsafe fn _mm_mask3_fcmadd_round_sch( #[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fcmadd_round_sch( +pub fn _mm_maskz_fcmadd_round_sch( k: __mmask8, a: __m128h, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let a = transmute(a); - let r = vfcmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING); - transmute(_mm_move_ss(a, r)) // FIXME: If `k == 0`, then LLVM optimized `vfcmaddcsh_maskz` to output an all-zero vector, which is incorrect + unsafe { + static_assert_rounding!(ROUNDING); + let a = transmute(a); + let r = vfcmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING); + transmute(_mm_move_ss(a, r)) // FIXME: If `k == 0`, then LLVM optimized `vfcmaddcsh_maskz` to output an all-zero vector, which is incorrect + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -4975,8 +5116,8 @@ pub unsafe fn _mm_maskz_fcmadd_round_sch( 
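// [Editor's note - illustrative sketch, not part of the patch] What the vfmaddcph /
// vfcmaddcph families above compute per complex lane (one lane = two adjacent f16s,
// real then imaginary), written out on ordinary f32 pairs: `fmadd_pch` is a*b + c,
// while `fcmadd_pch` multiplies a by the complex conjugate of b before accumulating.
// The helper names below are hypothetical, not stdarch APIs.
fn complex_fmadd((ar, ai): (f32, f32), (br, bi): (f32, f32), (cr, ci): (f32, f32)) -> (f32, f32) {
    // (ar + i*ai) * (br + i*bi) + (cr + i*ci)
    (ar * br - ai * bi + cr, ar * bi + ai * br + ci)
}

fn complex_fcmadd((ar, ai): (f32, f32), (br, bi): (f32, f32), (cr, ci): (f32, f32)) -> (f32, f32) {
    // (ar + i*ai) * (br - i*bi) + (cr + i*ci)
    (ar * br + ai * bi + cr, ai * br - ar * bi + ci)
}

fn main() {
    // (1+2i)*(3+4i) + (5+6i) = 0 + 16i
    assert_eq!(complex_fmadd((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)), (0.0, 16.0));
    // (1+2i)*(3-4i) + (5+6i) = 16 + 8i
    assert_eq!(complex_fcmadd((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)), (16.0, 8.0));
}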
#[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_fma(a, b, c) +pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_fma(a, b, c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -4988,8 +5129,8 @@ pub unsafe fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) +pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5001,8 +5142,8 @@ pub unsafe fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) +pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5014,8 +5155,8 @@ pub unsafe fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) +pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5026,8 +5167,8 @@ pub unsafe fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - simd_fma(a, b, c) +pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(a, b, c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5039,8 +5180,8 @@ pub unsafe fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) +pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { 
simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5052,8 +5193,8 @@ pub unsafe fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) +pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5065,8 +5206,8 @@ pub unsafe fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mma #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) +pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5077,8 +5218,8 @@ pub unsafe fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - simd_fma(a, b, c) +pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(a, b, c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5090,8 +5231,8 @@ pub unsafe fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) +pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5103,8 +5244,8 @@ pub unsafe fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { - simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) +pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5116,8 +5257,8 @@ pub unsafe fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mma #[target_feature(enable = 
"avx512fp16")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) +pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5137,13 +5278,11 @@ pub unsafe fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmadd_round_ph( - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { - static_assert_rounding!(ROUNDING); - vfmaddph_512(a, b, c, ROUNDING) +pub fn _mm512_fmadd_round_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddph_512(a, b, c, ROUNDING) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5164,14 +5303,16 @@ pub unsafe fn _mm512_fmadd_round_ph( #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmadd_round_ph( +pub fn _mm512_mask_fmadd_round_ph( a: __m512h, k: __mmask32, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fmadd_round_ph::(a, b, c), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmadd_round_ph::(a, b, c), a) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5192,14 +5333,16 @@ pub unsafe fn _mm512_mask_fmadd_round_ph( #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmadd_round_ph( +pub fn _mm512_mask3_fmadd_round_ph( a: __m512h, b: __m512h, c: __m512h, k: __mmask32, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fmadd_round_ph::(a, b, c), c) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmadd_round_ph::(a, b, c), c) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate @@ -5220,18 +5363,20 @@ pub unsafe fn _mm512_mask3_fmadd_round_ph( #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmadd_round_ph( +pub fn _mm512_maskz_fmadd_round_ph( k: __mmask32, a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask( - k, - _mm512_fmadd_round_ph::(a, b, c), - _mm512_setzero_ph(), - ) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask( + k, + _mm512_fmadd_round_ph::(a, b, c), + _mm512_setzero_ph(), + ) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate @@ -5243,12 +5388,14 @@ pub unsafe fn _mm512_maskz_fmadd_round_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = 
"stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = fmaf16(extracta, extractb, extractc); - simd_insert!(a, 0, r) +pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(extracta, extractb, extractc); + simd_insert!(a, 0, r) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate @@ -5261,14 +5408,16 @@ pub unsafe fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - let mut fmadd: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmadd = fmaf16(fmadd, extractb, extractc); +pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmadd: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmadd = fmaf16(fmadd, extractb, extractc); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate @@ -5281,14 +5430,16 @@ pub unsafe fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - let mut fmadd: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fmadd = fmaf16(extracta, extractb, fmadd); +pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fmadd: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fmadd = fmaf16(extracta, extractb, fmadd); + } + simd_insert!(c, 0, fmadd) } - simd_insert!(c, 0, fmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate @@ -5301,15 +5452,17 @@ pub unsafe fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - let mut fmadd: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmadd = fmaf16(extracta, extractb, extractc); +pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmadd: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = 
simd_extract!(c, 0); + fmadd = fmaf16(extracta, extractb, extractc); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate @@ -5330,17 +5483,15 @@ pub unsafe fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmadd_round_sh( - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = vfmaddsh(extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, r) +pub fn _mm_fmadd_round_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = vfmaddsh(extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, r) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate @@ -5362,20 +5513,22 @@ pub unsafe fn _mm_fmadd_round_sh( #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmadd_round_sh( +pub fn _mm_mask_fmadd_round_sh( a: __m128h, k: __mmask8, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fmadd: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate @@ -5397,20 +5550,22 @@ pub unsafe fn _mm_mask_fmadd_round_sh( #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmadd_round_sh( +pub fn _mm_mask3_fmadd_round_sh( a: __m128h, b: __m128h, c: __m128h, k: __mmask8, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fmadd: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING); + } + simd_insert!(c, 0, fmadd) } - simd_insert!(c, 0, fmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate @@ -5432,21 +5587,23 @@ pub unsafe fn _mm_mask3_fmadd_round_sh( #[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] 
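// [Editor's note - illustrative sketch, not part of the patch] The scalar `_sh`
// variants above consult only bit 0 of the mask: when it is set, the low element
// becomes fma(a0, b0, c0); when it is clear, the low element falls back to a0 (mask),
// c0 (mask3) or 0.0 (maskz), and the upper lanes are always copied from `a` (or `c`
// for mask3). The same selection on plain f32, using hypothetical helper names:
fn mask_fmadd_scalar(k: u8, a0: f32, b0: f32, c0: f32) -> f32 {
    if k & 1 != 0 { a0.mul_add(b0, c0) } else { a0 }
}

fn maskz_fmadd_scalar(k: u8, a0: f32, b0: f32, c0: f32) -> f32 {
    if k & 1 != 0 { a0.mul_add(b0, c0) } else { 0.0 }
}

fn main() {
    assert_eq!(mask_fmadd_scalar(1, 2.0, 3.0, 4.0), 10.0);
    assert_eq!(mask_fmadd_scalar(0, 2.0, 3.0, 4.0), 2.0);   // mask off: keep a0
    assert_eq!(maskz_fmadd_scalar(0b10, 2.0, 3.0, 4.0), 0.0); // only bit 0 matters
}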
-pub unsafe fn _mm_maskz_fmadd_round_sh( +pub fn _mm_maskz_fmadd_round_sh( k: __mmask8, a: __m128h, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fmadd: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmadd: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fmadd) } - simd_insert!(a, 0, fmadd) } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5458,8 +5615,8 @@ pub unsafe fn _mm_maskz_fmadd_round_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_fma(a, b, simd_neg(c)) +pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_fma(a, b, simd_neg(c)) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5471,8 +5628,8 @@ pub unsafe fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) +pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5484,8 +5641,8 @@ pub unsafe fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) +pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5497,8 +5654,8 @@ pub unsafe fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) +pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5509,8 +5666,8 @@ pub unsafe fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h #[target_feature(enable = 
"avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - simd_fma(a, b, simd_neg(c)) +pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(a, b, simd_neg(c)) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5522,8 +5679,8 @@ pub unsafe fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) +pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5535,8 +5692,8 @@ pub unsafe fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) +pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5548,8 +5705,8 @@ pub unsafe fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mma #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) +pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5560,8 +5717,8 @@ pub unsafe fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - simd_fma(a, b, simd_neg(c)) +pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(a, b, simd_neg(c)) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5573,8 +5730,8 @@ pub unsafe fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) +pub fn _mm512_mask_fmsub_ph(a: __m512h, k: 
__mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5586,8 +5743,8 @@ pub unsafe fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { - simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) +pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5599,8 +5756,8 @@ pub unsafe fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mma #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) +pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5620,13 +5777,11 @@ pub unsafe fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmsub_round_ph( - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { - static_assert_rounding!(ROUNDING); - vfmaddph_512(a, b, simd_neg(c), ROUNDING) +pub fn _mm512_fmsub_round_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddph_512(a, b, simd_neg(c), ROUNDING) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5647,14 +5802,16 @@ pub unsafe fn _mm512_fmsub_round_ph( #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmsub_round_ph( +pub fn _mm512_mask_fmsub_round_ph( a: __m512h, k: __mmask32, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fmsub_round_ph::(a, b, c), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmsub_round_ph::(a, b, c), a) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5675,14 +5832,16 @@ pub unsafe fn _mm512_mask_fmsub_round_ph( #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmsub_round_ph( +pub fn _mm512_mask3_fmsub_round_ph( a: __m512h, b: __m512h, c: __m512h, k: __mmask32, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fmsub_round_ph::(a, b, c), c) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, 
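// Illustrative sketch, not part of the patch: the three masking flavours of the packed
// fused multiply-subtract. Unselected lanes come from `a` (mask), from `c` (mask3), or are
// zeroed (maskz). `fmsub_variants` is a hypothetical helper; assumes nightly with
// `#![feature(stdarch_x86_avx512_f16)]`.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
fn fmsub_variants(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> (__m512h, __m512h, __m512h) {
    let merge_a = _mm512_mask_fmsub_ph(a, k, b, c); // lane i: k[i] ? a[i]*b[i] - c[i] : a[i]
    let merge_c = _mm512_mask3_fmsub_ph(a, b, c, k); // lane i: k[i] ? a[i]*b[i] - c[i] : c[i]
    let zeroed = _mm512_maskz_fmsub_ph(k, a, b, c); // lane i: k[i] ? a[i]*b[i] - c[i] : 0.0
    (merge_a, merge_c, zeroed)
}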
_mm512_fmsub_round_ph::(a, b, c), c) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -5703,18 +5862,20 @@ pub unsafe fn _mm512_mask3_fmsub_round_ph( #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmsub_round_ph( +pub fn _mm512_maskz_fmsub_round_ph( k: __mmask32, a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask( - k, - _mm512_fmsub_round_ph::(a, b, c), - _mm512_setzero_ph(), - ) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask( + k, + _mm512_fmsub_round_ph::(a, b, c), + _mm512_setzero_ph(), + ) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements @@ -5726,12 +5887,14 @@ pub unsafe fn _mm512_maskz_fmsub_round_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = fmaf16(extracta, extractb, -extractc); - simd_insert!(a, 0, r) +pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(extracta, extractb, -extractc); + simd_insert!(a, 0, r) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements @@ -5744,14 +5907,16 @@ pub unsafe fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - let mut fmsub: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmsub = fmaf16(fmsub, extractb, -extractc); +pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmsub: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmsub = fmaf16(fmsub, extractb, -extractc); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements @@ -5764,14 +5929,16 @@ pub unsafe fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - let mut fmsub: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fmsub = fmaf16(extracta, extractb, -fmsub); +pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fmsub: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let 
extractb: f16 = simd_extract!(b, 0); + fmsub = fmaf16(extracta, extractb, -fmsub); + } + simd_insert!(c, 0, fmsub) } - simd_insert!(c, 0, fmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements @@ -5784,15 +5951,17 @@ pub unsafe fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - let mut fmsub: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmsub = fmaf16(extracta, extractb, -extractc); +pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fmsub: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmsub = fmaf16(extracta, extractb, -extractc); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements @@ -5813,17 +5982,15 @@ pub unsafe fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmsub_round_sh( - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING); - simd_insert!(a, 0, r) +pub fn _mm_fmsub_round_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING); + simd_insert!(a, 0, r) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements @@ -5845,20 +6012,22 @@ pub unsafe fn _mm_fmsub_round_sh( #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmsub_round_sh( +pub fn _mm_mask_fmsub_round_sh( a: __m128h, k: __mmask8, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fmsub: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements @@ -5880,20 +6049,22 @@ pub unsafe fn _mm_mask_fmsub_round_sh( #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] 
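// Illustrative sketch, not part of the patch: the scalar `_sh` forms only touch element 0
// and pass the upper seven half-precision lanes through from `a`. `fmsub_low_or_keep` is a
// hypothetical helper; assumes nightly with `#![feature(stdarch_x86_avx512_f16)]`.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
fn fmsub_low_or_keep(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    // Lane 0: if bit 0 of `k` is set, a[0] * b[0] - c[0], otherwise a[0]. Lanes 1..8 from `a`.
    _mm_mask_fmsub_sh(a, k, b, c)
}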
#[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmsub_round_sh( +pub fn _mm_mask3_fmsub_round_sh( a: __m128h, b: __m128h, c: __m128h, k: __mmask8, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fmsub: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING); + } + simd_insert!(c, 0, fmsub) } - simd_insert!(c, 0, fmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract packed elements @@ -5907,21 +6078,23 @@ pub unsafe fn _mm_mask3_fmsub_round_sh( #[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmsub_round_sh( +pub fn _mm_maskz_fmsub_round_sh( k: __mmask8, a: __m128h, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fmsub: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fmsub: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING); + } + simd_insert!(a, 0, fmsub) } - simd_insert!(a, 0, fmsub) } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -5932,8 +6105,8 @@ pub unsafe fn _mm_maskz_fmsub_round_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_fma(simd_neg(a), b, c) +pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_fma(simd_neg(a), b, c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -5945,8 +6118,8 @@ pub unsafe fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) +pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -5958,8 +6131,8 @@ pub unsafe fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> 
__m128h { - simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) +pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -5971,8 +6144,8 @@ pub unsafe fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) +pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -5983,8 +6156,8 @@ pub unsafe fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - simd_fma(simd_neg(a), b, c) +pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(simd_neg(a), b, c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -5996,8 +6169,8 @@ pub unsafe fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) +pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6009,8 +6182,8 @@ pub unsafe fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) +pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6022,8 +6195,8 @@ pub unsafe fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mm #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) +pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, 
_mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6034,8 +6207,8 @@ pub unsafe fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - simd_fma(simd_neg(a), b, c) +pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(simd_neg(a), b, c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6047,8 +6220,8 @@ pub unsafe fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) +pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6060,8 +6233,8 @@ pub unsafe fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { - simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) +pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6073,8 +6246,8 @@ pub unsafe fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mm #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) +pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6094,13 +6267,11 @@ pub unsafe fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __ #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fnmadd_round_ph( - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { - static_assert_rounding!(ROUNDING); - vfmaddph_512(simd_neg(a), b, c, ROUNDING) +pub fn _mm512_fnmadd_round_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddph_512(simd_neg(a), b, c, ROUNDING) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6121,14 +6292,16 @@ pub 
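// Illustrative sketch, not part of the patch: fnmadd negates the product, so every lane
// computes c - a * b; the `_round` form takes the rounding mode as a const generic.
// `neg_mul_add` is a hypothetical helper; assumes nightly with `#![feature(stdarch_x86_avx512_f16)]`.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
fn neg_mul_add(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Explicit round-to-nearest with floating-point exceptions suppressed.
    _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c)
}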
unsafe fn _mm512_fnmadd_round_ph( #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fnmadd_round_ph( +pub fn _mm512_mask_fnmadd_round_ph( a: __m512h, k: __mmask32, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fnmadd_round_ph::(a, b, c), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fnmadd_round_ph::(a, b, c), a) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6149,14 +6322,16 @@ pub unsafe fn _mm512_mask_fnmadd_round_ph( #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fnmadd_round_ph( +pub fn _mm512_mask3_fnmadd_round_ph( a: __m512h, b: __m512h, c: __m512h, k: __mmask32, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fnmadd_round_ph::(a, b, c), c) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fnmadd_round_ph::(a, b, c), c) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate @@ -6177,18 +6352,20 @@ pub unsafe fn _mm512_mask3_fnmadd_round_ph( #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fnmadd_round_ph( +pub fn _mm512_maskz_fnmadd_round_ph( k: __mmask32, a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask( - k, - _mm512_fnmadd_round_ph::(a, b, c), - _mm512_setzero_ph(), - ) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask( + k, + _mm512_fnmadd_round_ph::(a, b, c), + _mm512_setzero_ph(), + ) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6200,12 +6377,14 @@ pub unsafe fn _mm512_maskz_fnmadd_round_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = fmaf16(-extracta, extractb, extractc); - simd_insert!(a, 0, r) +pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(-extracta, extractb, extractc); + simd_insert!(a, 0, r) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6218,14 +6397,16 @@ pub unsafe fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - let mut fnmadd: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmadd = fmaf16(-fnmadd, extractb, extractc); 
+pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fnmadd: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmadd = fmaf16(-fnmadd, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6238,14 +6419,16 @@ pub unsafe fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - let mut fnmadd: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fnmadd = fmaf16(-extracta, extractb, fnmadd); +pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fnmadd: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fnmadd = fmaf16(-extracta, extractb, fnmadd); + } + simd_insert!(c, 0, fnmadd) } - simd_insert!(c, 0, fnmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6258,15 +6441,17 @@ pub unsafe fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - let mut fnmadd: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmadd = fmaf16(-extracta, extractb, extractc); +pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fnmadd: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmadd = fmaf16(-extracta, extractb, extractc); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6287,17 +6472,15 @@ pub unsafe fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128 #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fnmadd_round_sh( - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING); - simd_insert!(a, 0, r) +pub fn _mm_fnmadd_round_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING); + simd_insert!(a, 0, r) + } } /// Multiply the lower 
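// Illustrative sketch, not part of the patch: unlike the `mask`/`maskz` scalar forms, the
// `mask3` form builds its result in the `c` register, so the upper lanes come from `c`.
// `fnmadd_low_into_c` is a hypothetical helper; assumes nightly with `#![feature(stdarch_x86_avx512_f16)]`.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
fn fnmadd_low_into_c(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    // Lane 0: if bit 0 of `k` is set, -(a[0] * b[0]) + c[0], otherwise c[0]. Lanes 1..8 from `c`.
    _mm_mask3_fnmadd_sh(a, b, c, k)
}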
half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6319,20 +6502,22 @@ pub unsafe fn _mm_fnmadd_round_sh( #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fnmadd_round_sh( +pub fn _mm_mask_fnmadd_round_sh( a: __m128h, k: __mmask8, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6354,20 +6539,22 @@ pub unsafe fn _mm_mask_fnmadd_round_sh( #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fnmadd_round_sh( +pub fn _mm_mask3_fnmadd_round_sh( a: __m128h, b: __m128h, c: __m128h, k: __mmask8, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING); + } + simd_insert!(c, 0, fnmadd) } - simd_insert!(c, 0, fnmadd) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6389,21 +6576,23 @@ pub unsafe fn _mm_mask3_fnmadd_round_sh( #[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fnmadd_round_sh( +pub fn _mm_maskz_fnmadd_round_sh( k: __mmask8, a: __m128h, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fnmadd: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmadd: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING); + } + simd_insert!(a, 0, fnmadd) } - simd_insert!(a, 0, fnmadd) } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6414,8 +6603,8 @@ pub unsafe fn _mm_maskz_fnmadd_round_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> 
__m128h { - simd_fma(simd_neg(a), b, simd_neg(c)) +pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6427,8 +6616,8 @@ pub unsafe fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) +pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6440,8 +6629,8 @@ pub unsafe fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) +pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6453,8 +6642,8 @@ pub unsafe fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) +pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6465,8 +6654,8 @@ pub unsafe fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - simd_fma(simd_neg(a), b, simd_neg(c)) +pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6478,8 +6667,8 @@ pub unsafe fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) +pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) 
floating-point elements in a and b, subtract packed elements @@ -6491,8 +6680,8 @@ pub unsafe fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { - simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) +pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6504,8 +6693,8 @@ pub unsafe fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mm #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) +pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6516,8 +6705,8 @@ pub unsafe fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { - simd_fma(simd_neg(a), b, simd_neg(c)) +pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6529,8 +6718,8 @@ pub unsafe fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) +pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6542,8 +6731,8 @@ pub unsafe fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { - simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) +pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6555,8 +6744,8 @@ pub unsafe fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mm #[target_feature(enable = 
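// Illustrative sketch, not part of the patch: fnmsub negates both the product and the addend,
// so each selected lane computes -(a[i] * b[i]) - c[i]; with `maskz`, unselected lanes are zeroed.
// `neg_mul_sub_zero_masked` is a hypothetical helper; assumes nightly with `#![feature(stdarch_x86_avx512_f16)]`.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16,avx512vl")]
fn neg_mul_sub_zero_masked(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    _mm256_maskz_fnmsub_ph(k, a, b, c)
}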
"avx512fp16")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) +pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6576,13 +6765,11 @@ pub unsafe fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __ #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fnmsub_round_ph( - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { - static_assert_rounding!(ROUNDING); - vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING) +pub fn _mm512_fnmsub_round_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6603,14 +6790,16 @@ pub unsafe fn _mm512_fnmsub_round_ph( #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fnmsub_round_ph( +pub fn _mm512_mask_fnmsub_round_ph( a: __m512h, k: __mmask32, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fnmsub_round_ph::(a, b, c), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fnmsub_round_ph::(a, b, c), a) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6631,14 +6820,16 @@ pub unsafe fn _mm512_mask_fnmsub_round_ph( #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fnmsub_round_ph( +pub fn _mm512_mask3_fnmsub_round_ph( a: __m512h, b: __m512h, c: __m512h, k: __mmask32, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fnmsub_round_ph::(a, b, c), c) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fnmsub_round_ph::(a, b, c), c) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements @@ -6659,18 +6850,20 @@ pub unsafe fn _mm512_mask3_fnmsub_round_ph( #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fnmsub_round_ph( +pub fn _mm512_maskz_fnmsub_round_ph( k: __mmask32, a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask( - k, - _mm512_fnmsub_round_ph::(a, b, c), - _mm512_setzero_ph(), - ) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask( + k, + _mm512_fnmsub_round_ph::(a, b, c), + _mm512_setzero_ph(), + ) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6682,12 +6875,14 @@ pub unsafe fn _mm512_maskz_fnmsub_round_ph( #[target_feature(enable = 
"avx512fp16")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = fmaf16(-extracta, extractb, -extractc); - simd_insert!(a, 0, r) +pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = fmaf16(-extracta, extractb, -extractc); + simd_insert!(a, 0, r) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6700,14 +6895,16 @@ pub unsafe fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - let mut fnmsub: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmsub = fmaf16(-fnmsub, extractb, -extractc); +pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fnmsub: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmsub = fmaf16(-fnmsub, extractb, -extractc); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6720,14 +6917,16 @@ pub unsafe fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - let mut fnmsub: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fnmsub = fmaf16(-extracta, extractb, -fnmsub); +pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { + let mut fnmsub: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fnmsub = fmaf16(-extracta, extractb, -fnmsub); + } + simd_insert!(c, 0, fnmsub) } - simd_insert!(c, 0, fnmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6740,15 +6939,17 @@ pub unsafe fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfnmsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - let mut fnmsub: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmsub = fmaf16(-extracta, extractb, -extractc); +pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + let mut fnmsub: f16 = 
0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmsub = fmaf16(-extracta, extractb, -extractc); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6769,17 +6970,15 @@ pub unsafe fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128 #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fnmsub_round_sh( - a: __m128h, - b: __m128h, - c: __m128h, -) -> __m128h { - static_assert_rounding!(ROUNDING); - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING); - simd_insert!(a, 0, r) +pub fn _mm_fnmsub_round_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING); + simd_insert!(a, 0, r) + } } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6801,20 +7000,22 @@ pub unsafe fn _mm_fnmsub_round_sh( #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fnmsub_round_sh( +pub fn _mm_mask_fnmsub_round_sh( a: __m128h, k: __mmask8, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f16 = simd_extract!(a, 0); - if k & 1 != 0 { - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f16 = simd_extract!(a, 0); + if k & 1 != 0 { + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6836,20 +7037,22 @@ pub unsafe fn _mm_mask_fnmsub_round_sh( #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fnmsub_round_sh( +pub fn _mm_mask3_fnmsub_round_sh( a: __m128h, b: __m128h, c: __m128h, k: __mmask8, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f16 = simd_extract!(c, 0); - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f16 = simd_extract!(c, 0); + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING); + } + simd_insert!(c, 0, fnmsub) } - simd_insert!(c, 0, fnmsub) } /// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate @@ -6871,21 +7074,23 @@ 
pub unsafe fn _mm_mask3_fnmsub_round_sh( #[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fnmsub_round_sh( +pub fn _mm_maskz_fnmsub_round_sh( k: __mmask8, a: __m128h, b: __m128h, c: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - let mut fnmsub: f16 = 0.0; - if k & 1 != 0 { - let extracta: f16 = simd_extract!(a, 0); - let extractb: f16 = simd_extract!(b, 0); - let extractc: f16 = simd_extract!(c, 0); - fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING); + unsafe { + static_assert_rounding!(ROUNDING); + let mut fnmsub: f16 = 0.0; + if k & 1 != 0 { + let extracta: f16 = simd_extract!(a, 0); + let extractb: f16 = simd_extract!(b, 0); + let extractc: f16 = simd_extract!(c, 0); + fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING); + } + simd_insert!(a, 0, fnmsub) } - simd_insert!(a, 0, fnmsub) } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -6896,8 +7101,8 @@ pub unsafe fn _mm_maskz_fnmsub_round_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - vfmaddsubph_128(a, b, c) +pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { vfmaddsubph_128(a, b, c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -6909,8 +7114,8 @@ pub unsafe fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) +pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -6922,8 +7127,8 @@ pub unsafe fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m12 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) +pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -6935,8 +7140,8 @@ pub unsafe fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mma #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) +pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), 
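// Illustrative sketch, not part of the patch: fmaddsub alternates by lane index, with
// even lanes computing a[i] * b[i] - c[i] and odd lanes computing a[i] * b[i] + c[i].
// `mul_alt_sub_add` is a hypothetical helper; assumes nightly with `#![feature(stdarch_x86_avx512_f16)]`.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16,avx512vl")]
fn mul_alt_sub_add(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    _mm_fmaddsub_ph(a, b, c)
}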
_mm_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -6947,8 +7152,8 @@ pub unsafe fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m1 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - vfmaddsubph_256(a, b, c) +pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { vfmaddsubph_256(a, b, c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -6960,8 +7165,8 @@ pub unsafe fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) +pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -6973,13 +7178,8 @@ pub unsafe fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: _ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask3_fmaddsub_ph( - a: __m256h, - b: __m256h, - c: __m256h, - k: __mmask16, -) -> __m256h { - simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) +pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -6991,13 +7191,8 @@ pub unsafe fn _mm256_mask3_fmaddsub_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fmaddsub_ph( - k: __mmask16, - a: __m256h, - b: __m256h, - c: __m256h, -) -> __m256h { - simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) +pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -7008,7 +7203,7 @@ pub unsafe fn _mm256_maskz_fmaddsub_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { +pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c) } @@ -7021,8 +7216,8 @@ pub unsafe fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, 
b: __m512h, c: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) +pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -7034,13 +7229,8 @@ pub unsafe fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: _ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmaddsub_ph( - a: __m512h, - b: __m512h, - c: __m512h, - k: __mmask32, -) -> __m512h { - simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) +pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -7052,13 +7242,8 @@ pub unsafe fn _mm512_mask3_fmaddsub_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmaddsub))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmaddsub_ph( - k: __mmask32, - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { - simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) +pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -7078,13 +7263,15 @@ pub unsafe fn _mm512_maskz_fmaddsub_ph( #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmaddsub_round_ph( +pub fn _mm512_fmaddsub_round_ph( a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - vfmaddsubph_512(a, b, c, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubph_512(a, b, c, ROUNDING) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -7105,14 +7292,16 @@ pub unsafe fn _mm512_fmaddsub_round_ph( #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmaddsub_round_ph( +pub fn _mm512_mask_fmaddsub_round_ph( a: __m512h, k: __mmask32, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fmaddsub_round_ph::(a, b, c), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmaddsub_round_ph::(a, b, c), a) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -7133,14 +7322,16 @@ pub unsafe fn _mm512_mask_fmaddsub_round_ph( #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmaddsub_round_ph( +pub fn _mm512_mask3_fmaddsub_round_ph( a: __m512h, b: __m512h, c: __m512h, k: __mmask32, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fmaddsub_round_ph::(a, b, c), c) + unsafe { + 
static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmaddsub_round_ph::(a, b, c), c) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and @@ -7161,18 +7352,20 @@ pub unsafe fn _mm512_mask3_fmaddsub_round_ph( #[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmaddsub_round_ph( +pub fn _mm512_maskz_fmaddsub_round_ph( k: __mmask32, a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask( - k, - _mm512_fmaddsub_round_ph::(a, b, c), - _mm512_setzero_ph(), - ) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask( + k, + _mm512_fmaddsub_round_ph::(a, b, c), + _mm512_setzero_ph(), + ) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7183,8 +7376,8 @@ pub unsafe fn _mm512_maskz_fmaddsub_round_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { - vfmaddsubph_128(a, b, simd_neg(c)) +pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { vfmaddsubph_128(a, b, simd_neg(c)) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7196,8 +7389,8 @@ pub unsafe fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) +pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7209,8 +7402,8 @@ pub unsafe fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m12 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { - simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) +pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7222,8 +7415,8 @@ pub unsafe fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mma #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) +pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point 
elements in a and b, alternatively subtract @@ -7234,8 +7427,8 @@ pub unsafe fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m1 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { - vfmaddsubph_256(a, b, simd_neg(c)) +pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { vfmaddsubph_256(a, b, simd_neg(c)) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7247,8 +7440,8 @@ pub unsafe fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) +pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7260,13 +7453,8 @@ pub unsafe fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: _ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask3_fmsubadd_ph( - a: __m256h, - b: __m256h, - c: __m256h, - k: __mmask16, -) -> __m256h { - simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) +pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7278,13 +7466,8 @@ pub unsafe fn _mm256_mask3_fmsubadd_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_fmsubadd_ph( - k: __mmask16, - a: __m256h, - b: __m256h, - c: __m256h, -) -> __m256h { - simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) +pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7295,7 +7478,7 @@ pub unsafe fn _mm256_maskz_fmsubadd_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { +pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h { _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c) } @@ -7308,8 +7491,8 @@ pub unsafe fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { - 
simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) +pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7321,13 +7504,8 @@ pub unsafe fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: _ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmsubadd_ph( - a: __m512h, - b: __m512h, - c: __m512h, - k: __mmask32, -) -> __m512h { - simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) +pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7339,13 +7517,8 @@ pub unsafe fn _mm512_mask3_fmsubadd_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vfmsubadd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmsubadd_ph( - k: __mmask32, - a: __m512h, - b: __m512h, - c: __m512h, -) -> __m512h { - simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) +pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7365,13 +7538,15 @@ pub unsafe fn _mm512_maskz_fmsubadd_ph( #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fmsubadd_round_ph( +pub fn _mm512_fmsubadd_round_ph( a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - vfmaddsubph_512(a, b, simd_neg(c), ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vfmaddsubph_512(a, b, simd_neg(c), ROUNDING) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7392,14 +7567,16 @@ pub unsafe fn _mm512_fmsubadd_round_ph( #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fmsubadd_round_ph( +pub fn _mm512_mask_fmsubadd_round_ph( a: __m512h, k: __mmask32, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fmsubadd_round_ph::(a, b, c), a) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmsubadd_round_ph::(a, b, c), a) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7420,14 +7597,16 @@ pub unsafe fn _mm512_mask_fmsubadd_round_ph( #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask3_fmsubadd_round_ph( +pub fn _mm512_mask3_fmsubadd_round_ph( a: __m512h, b: __m512h, c: __m512h, k: __mmask32, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_fmsubadd_round_ph::(a, b, c), c) + unsafe { + 
static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_fmsubadd_round_ph::(a, b, c), c) + } } /// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract @@ -7448,18 +7627,20 @@ pub unsafe fn _mm512_mask3_fmsubadd_round_ph( #[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_fmsubadd_round_ph( +pub fn _mm512_maskz_fmsubadd_round_ph( k: __mmask32, a: __m512h, b: __m512h, c: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask( - k, - _mm512_fmsubadd_round_ph::(a, b, c), - _mm512_setzero_ph(), - ) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask( + k, + _mm512_fmsubadd_round_ph::(a, b, c), + _mm512_setzero_ph(), + ) + } } /// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst`. @@ -7470,7 +7651,7 @@ pub unsafe fn _mm512_maskz_fmsubadd_round_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_rcp_ph(a: __m128h) -> __m128h { +pub fn _mm_rcp_ph(a: __m128h) -> __m128h { _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a) } @@ -7483,8 +7664,8 @@ pub unsafe fn _mm_rcp_ph(a: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - vrcpph_128(a, src, k) +pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { vrcpph_128(a, src, k) } } /// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` @@ -7496,7 +7677,7 @@ pub unsafe fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h { +pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h { _mm_mask_rcp_ph(_mm_setzero_ph(), k, a) } @@ -7508,7 +7689,7 @@ pub unsafe fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_rcp_ph(a: __m256h) -> __m256h { +pub fn _mm256_rcp_ph(a: __m256h) -> __m256h { _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a) } @@ -7521,8 +7702,8 @@ pub unsafe fn _mm256_rcp_ph(a: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { - vrcpph_256(a, src, k) +pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vrcpph_256(a, src, k) } } /// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` @@ -7534,7 +7715,7 @@ pub unsafe fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = 
"stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h { +pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h { _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a) } @@ -7546,7 +7727,7 @@ pub unsafe fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_rcp_ph(a: __m512h) -> __m512h { +pub fn _mm512_rcp_ph(a: __m512h) -> __m512h { _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a) } @@ -7559,8 +7740,8 @@ pub unsafe fn _mm512_rcp_ph(a: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { - vrcpph_512(a, src, k) +pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + unsafe { vrcpph_512(a, src, k) } } /// Compute the approximate reciprocal of packed 16-bit floating-point elements in `a` and stores the results in `dst` @@ -7572,7 +7753,7 @@ pub unsafe fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrcpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h { +pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h { _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a) } @@ -7586,7 +7767,7 @@ pub unsafe fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrcpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h { _mm_mask_rcp_sh(_mm_undefined_ph(), 0xff, a, b) } @@ -7600,8 +7781,8 @@ pub unsafe fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrcpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - vrcpsh(a, b, src, k) +pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vrcpsh(a, b, src, k) } } /// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b, @@ -7614,7 +7795,7 @@ pub unsafe fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrcpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_rcp_sh(_mm_setzero_ph(), k, a, b) } @@ -7627,7 +7808,7 @@ pub unsafe fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_rsqrt_ph(a: __m128h) -> __m128h { +pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h { _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a) } @@ -7641,8 +7822,8 @@ pub unsafe fn _mm_rsqrt_ph(a: __m128h) 
-> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - vrsqrtph_128(a, src, k) +pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { vrsqrtph_128(a, src, k) } } /// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point @@ -7655,7 +7836,7 @@ pub unsafe fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h { +pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h { _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a) } @@ -7668,7 +7849,7 @@ pub unsafe fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_rsqrt_ph(a: __m256h) -> __m256h { +pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h { _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a) } @@ -7682,8 +7863,8 @@ pub unsafe fn _mm256_rsqrt_ph(a: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { - vrsqrtph_256(a, src, k) +pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vrsqrtph_256(a, src, k) } } /// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point @@ -7696,7 +7877,7 @@ pub unsafe fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vrsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h { +pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h { _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a) } @@ -7709,7 +7890,7 @@ pub unsafe fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_rsqrt_ph(a: __m512h) -> __m512h { +pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h { _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a) } @@ -7723,8 +7904,8 @@ pub unsafe fn _mm512_rsqrt_ph(a: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { - vrsqrtph_512(a, src, k) +pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + unsafe { vrsqrtph_512(a, src, k) } } /// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point @@ -7737,7 +7918,7 @@ pub unsafe fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrsqrtph))] 
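
Illustrative sketch, not part of the patch: once these reciprocal and reciprocal-square-root wrappers are safe, a caller that already enables the required target features can invoke them without an `unsafe` block. The helper names below are hypothetical, and the snippet assumes a nightly toolchain because the fp16 intrinsics and vector types are still unstable.

    #![feature(stdarch_x86_avx512_f16)]
    use core::arch::x86_64::*;

    // Hypothetical helper: lanes selected by `k` get the ~1/a approximation,
    // the remaining lanes are passed through from `src`. No `unsafe` block is
    // needed because the enclosing function already enables the features that
    // the intrinsic's `#[target_feature]` attribute asks for.
    #[target_feature(enable = "avx512fp16,avx512vl")]
    fn masked_recip(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
        _mm_mask_rcp_ph(src, k, a)
    }

    // Hypothetical helper: approximate reciprocal square root of all 32 lanes.
    #[target_feature(enable = "avx512fp16")]
    fn approx_rsqrt(a: __m512h) -> __m512h {
        _mm512_rsqrt_ph(a)
    }
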
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h { +pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h { _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a) } @@ -7751,7 +7932,7 @@ pub unsafe fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h { _mm_mask_rsqrt_sh(_mm_undefined_ph(), 0xff, a, b) } @@ -7765,8 +7946,8 @@ pub unsafe fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - vrsqrtsh(a, b, src, k) +pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vrsqrtsh(a, b, src, k) } } /// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point @@ -7779,7 +7960,7 @@ pub unsafe fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vrsqrtsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_rsqrt_sh(_mm_setzero_ph(), k, a, b) } @@ -7791,8 +7972,8 @@ pub unsafe fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_sqrt_ph(a: __m128h) -> __m128h { - simd_fsqrt(a) +pub fn _mm_sqrt_ph(a: __m128h) -> __m128h { + unsafe { simd_fsqrt(a) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7803,8 +7984,8 @@ pub unsafe fn _mm_sqrt_ph(a: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_sqrt_ph(a), src) +pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7815,8 +7996,8 @@ pub unsafe fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) +pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7827,8 +8008,8 @@ pub unsafe fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> 
__m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_sqrt_ph(a: __m256h) -> __m256h { - simd_fsqrt(a) +pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h { + unsafe { simd_fsqrt(a) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7839,8 +8020,8 @@ pub unsafe fn _mm256_sqrt_ph(a: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_sqrt_ph(a), src) +pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7851,8 +8032,8 @@ pub unsafe fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) +pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7863,8 +8044,8 @@ pub unsafe fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_sqrt_ph(a: __m512h) -> __m512h { - simd_fsqrt(a) +pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h { + unsafe { simd_fsqrt(a) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7875,8 +8056,8 @@ pub unsafe fn _mm512_sqrt_ph(a: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_sqrt_ph(a), src) +pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7887,8 +8068,8 @@ pub unsafe fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vsqrtph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) +pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) } } /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the @@ -7907,9 +8088,11 @@ pub unsafe fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h { 
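
Another illustrative sketch under the same assumptions (hypothetical helper names, nightly-only unstable feature): the square-root wrappers, including the explicit-rounding form whose hunk follows, become callable from safe code in a feature-enabled context.

    #![feature(stdarch_x86_avx512_f16)]
    use core::arch::x86_64::*;

    // Hypothetical helper: square roots only in the lanes selected by `k`,
    // with the unselected lanes zeroed.
    #[target_feature(enable = "avx512fp16")]
    fn sqrt_selected(k: __mmask32, a: __m512h) -> __m512h {
        _mm512_maskz_sqrt_ph(k, a)
    }

    // Hypothetical helper: the same operation with an explicit rounding mode
    // (round to nearest, exceptions suppressed) passed as a const generic.
    #[target_feature(enable = "avx512fp16")]
    fn sqrt_nearest(a: __m512h) -> __m512h {
        _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
    }
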
 #[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
 #[rustc_legacy_const_generics(1)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    vsqrtph_512(a, ROUNDING)
+pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        vsqrtph_512(a, ROUNDING)
+    }
 }
 
 /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
@@ -7928,13 +8111,15 @@ pub unsafe fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
 #[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
 #[rustc_legacy_const_generics(3)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
+pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
     src: __m512h,
     k: __mmask32,
     a: __m512h,
 ) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
+    }
 }
 
 /// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
@@ -7953,9 +8138,11 @@ pub unsafe fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
 #[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
-    static_assert_rounding!(ROUNDING);
-    simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
+pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
+    unsafe {
+        static_assert_rounding!(ROUNDING);
+        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
+    }
 }
 
 /// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
@@ -7967,7 +8154,7 @@ pub unsafe fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: _
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vsqrtsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
+pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
     _mm_mask_sqrt_sh(_mm_undefined_ph(), 0xff, a, b)
 }
 
@@ -7980,7 +8167,7 @@ pub unsafe fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vsqrtsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
     _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
 }
 
@@ -7993,7 +8180,7 @@ pub unsafe fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h
 #[target_feature(enable = "avx512fp16")]
 #[cfg_attr(test, assert_instr(vsqrtsh))]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
+pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
     _mm_mask_sqrt_sh(_mm_setzero_ph(), k, a, b)
 }
 
@@ -8014,7 +8201,7 @@ pub unsafe fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h
 #[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
 #[rustc_legacy_const_generics(2)]
 #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
-pub unsafe fn _mm_sqrt_round_sh(a: 
__m128h, b: __m128h) -> __m128h { +pub fn _mm_sqrt_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_sqrt_round_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -8036,14 +8223,16 @@ pub unsafe fn _mm_sqrt_round_sh(a: __m128h, b: __m128h) -> #[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_sqrt_round_sh( +pub fn _mm_mask_sqrt_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - vsqrtsh(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vsqrtsh(a, b, src, k, ROUNDING) + } } /// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store @@ -8063,7 +8252,7 @@ pub unsafe fn _mm_mask_sqrt_round_sh( #[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_sqrt_round_sh( +pub fn _mm_maskz_sqrt_round_sh( k: __mmask8, a: __m128h, b: __m128h, @@ -8081,8 +8270,8 @@ pub unsafe fn _mm_maskz_sqrt_round_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h { - vmaxph_128(a, b) +pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { vmaxph_128(a, b) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8095,8 +8284,8 @@ pub unsafe fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_max_ph(a, b), src) +pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8109,8 +8298,8 @@ pub unsafe fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) +pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8122,8 +8311,8 @@ pub unsafe fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h { - vmaxph_256(a, b) +pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { vmaxph_256(a, b) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8136,8 +8325,8 @@ pub unsafe fn _mm256_max_ph(a: __m256h, b: __m256h) -> 
__m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_max_ph(a, b), src) +pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8150,8 +8339,8 @@ pub unsafe fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) +pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8163,7 +8352,7 @@ pub unsafe fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m25 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h { _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b) } @@ -8177,8 +8366,8 @@ pub unsafe fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_max_ph(a, b), src) +pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8191,8 +8380,8 @@ pub unsafe fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vmaxph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) +pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8206,9 +8395,11 @@ pub unsafe fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m51 #[cfg_attr(test, assert_instr(vmaxph, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_max_round_ph(a: __m512h, b: __m512h) -> __m512h { - static_assert_sae!(SAE); - vmaxph_512(a, b, SAE) +pub fn _mm512_max_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_sae!(SAE); + vmaxph_512(a, b, SAE) + } } /// Compare packed half-precision (16-bit) 
floating-point elements in a and b, and store packed maximum @@ -8222,14 +8413,16 @@ pub unsafe fn _mm512_max_round_ph(a: __m512h, b: __m512h) -> __m #[cfg_attr(test, assert_instr(vmaxph, SAE = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_max_round_ph( +pub fn _mm512_mask_max_round_ph( src: __m512h, k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_sae!(SAE); - simd_select_bitmask(k, _mm512_max_round_ph::(a, b), src) + unsafe { + static_assert_sae!(SAE); + simd_select_bitmask(k, _mm512_max_round_ph::(a, b), src) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum @@ -8243,13 +8436,11 @@ pub unsafe fn _mm512_mask_max_round_ph( #[cfg_attr(test, assert_instr(vmaxph, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_max_round_ph( - k: __mmask32, - a: __m512h, - b: __m512h, -) -> __m512h { - static_assert_sae!(SAE); - simd_select_bitmask(k, _mm512_max_round_ph::(a, b), _mm512_setzero_ph()) +pub fn _mm512_maskz_max_round_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_sae!(SAE); + simd_select_bitmask(k, _mm512_max_round_ph::(a, b), _mm512_setzero_ph()) + } } /// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum @@ -8262,7 +8453,7 @@ pub unsafe fn _mm512_maskz_max_round_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h { _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b) } @@ -8276,7 +8467,7 @@ pub unsafe fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -8290,7 +8481,7 @@ pub unsafe fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vmaxsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_max_sh(_mm_setzero_ph(), k, a, b) } @@ -8305,7 +8496,7 @@ pub unsafe fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_max_round_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_max_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); _mm_mask_max_round_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -8322,14 +8513,16 @@ pub unsafe fn _mm_max_round_sh(a: __m128h, b: __m128h) -> __m128 #[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_max_round_sh( +pub fn 
_mm_mask_max_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_sae!(SAE); - vmaxsh(a, b, src, k, SAE) + unsafe { + static_assert_sae!(SAE); + vmaxsh(a, b, src, k, SAE) + } } /// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value @@ -8344,11 +8537,7 @@ pub unsafe fn _mm_mask_max_round_sh( #[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_max_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_maskz_max_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); _mm_mask_max_round_sh::(_mm_setzero_ph(), k, a, b) } @@ -8362,8 +8551,8 @@ pub unsafe fn _mm_maskz_max_round_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h { - vminph_128(a, b) +pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h { + unsafe { vminph_128(a, b) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8376,8 +8565,8 @@ pub unsafe fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_min_ph(a, b), src) +pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8390,8 +8579,8 @@ pub unsafe fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) +pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8403,8 +8592,8 @@ pub unsafe fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h { - vminph_256(a, b) +pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h { + unsafe { vminph_256(a, b) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8417,8 +8606,8 @@ pub unsafe fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_min_ph(a, b), src) +pub fn _mm256_mask_min_ph(src: __m256h, k: 
__mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8431,8 +8620,8 @@ pub unsafe fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m2 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) +pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8444,7 +8633,7 @@ pub unsafe fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m25 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h { _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b) } @@ -8458,8 +8647,8 @@ pub unsafe fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_min_ph(a, b), src) +pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8472,8 +8661,8 @@ pub unsafe fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vminph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) +pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8486,9 +8675,11 @@ pub unsafe fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m51 #[cfg_attr(test, assert_instr(vminph, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_min_round_ph(a: __m512h, b: __m512h) -> __m512h { - static_assert_sae!(SAE); - vminph_512(a, b, SAE) +pub fn _mm512_min_round_ph(a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_sae!(SAE); + vminph_512(a, b, SAE) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8502,14 +8693,16 @@ pub unsafe fn _mm512_min_round_ph(a: __m512h, b: __m512h) -> __m #[cfg_attr(test, assert_instr(vminph, SAE = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_min_round_ph( +pub fn 
_mm512_mask_min_round_ph( src: __m512h, k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_sae!(SAE); - simd_select_bitmask(k, _mm512_min_round_ph::(a, b), src) + unsafe { + static_assert_sae!(SAE); + simd_select_bitmask(k, _mm512_min_round_ph::(a, b), src) + } } /// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum @@ -8523,13 +8716,11 @@ pub unsafe fn _mm512_mask_min_round_ph( #[cfg_attr(test, assert_instr(vminph, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_min_round_ph( - k: __mmask32, - a: __m512h, - b: __m512h, -) -> __m512h { - static_assert_sae!(SAE); - simd_select_bitmask(k, _mm512_min_round_ph::(a, b), _mm512_setzero_ph()) +pub fn _mm512_maskz_min_round_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { + static_assert_sae!(SAE); + simd_select_bitmask(k, _mm512_min_round_ph::(a, b), _mm512_setzero_ph()) + } } /// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum @@ -8542,7 +8733,7 @@ pub unsafe fn _mm512_maskz_min_round_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h { _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b) } @@ -8556,7 +8747,7 @@ pub unsafe fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -8570,7 +8761,7 @@ pub unsafe fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vminsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_min_sh(_mm_setzero_ph(), k, a, b) } @@ -8585,7 +8776,7 @@ pub unsafe fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { #[cfg_attr(test, assert_instr(vminsh, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_min_round_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_min_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); _mm_mask_min_round_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -8602,14 +8793,16 @@ pub unsafe fn _mm_min_round_sh(a: __m128h, b: __m128h) -> __m128 #[cfg_attr(test, assert_instr(vminsh, SAE = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_min_round_sh( +pub fn _mm_mask_min_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_sae!(SAE); - vminsh(a, b, src, k, SAE) + unsafe { + static_assert_sae!(SAE); + vminsh(a, b, src, k, SAE) + } } /// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value @@ -8624,11 +8817,7 @@ pub unsafe fn 
_mm_mask_min_round_sh( #[cfg_attr(test, assert_instr(vminsh, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_min_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_maskz_min_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); _mm_mask_min_round_sh::(_mm_setzero_ph(), k, a, b) } @@ -8642,7 +8831,7 @@ pub unsafe fn _mm_maskz_min_round_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_getexp_ph(a: __m128h) -> __m128h { +pub fn _mm_getexp_ph(a: __m128h) -> __m128h { _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a) } @@ -8656,8 +8845,8 @@ pub unsafe fn _mm_getexp_ph(a: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { - vgetexpph_128(a, src, k) +pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { vgetexpph_128(a, src, k) } } /// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision @@ -8670,7 +8859,7 @@ pub unsafe fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m12 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h { +pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h { _mm_mask_getexp_ph(_mm_setzero_ph(), k, a) } @@ -8683,7 +8872,7 @@ pub unsafe fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_getexp_ph(a: __m256h) -> __m256h { +pub fn _mm256_getexp_ph(a: __m256h) -> __m256h { _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a) } @@ -8697,8 +8886,8 @@ pub unsafe fn _mm256_getexp_ph(a: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { - vgetexpph_256(a, src, k) +pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { vgetexpph_256(a, src, k) } } /// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision @@ -8711,7 +8900,7 @@ pub unsafe fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> _ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h { +pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h { _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a) } @@ -8724,7 +8913,7 @@ pub unsafe fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn 
_mm512_getexp_ph(a: __m512h) -> __m512h { +pub fn _mm512_getexp_ph(a: __m512h) -> __m512h { _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a) } @@ -8738,7 +8927,7 @@ pub unsafe fn _mm512_getexp_ph(a: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { +pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a) } @@ -8752,7 +8941,7 @@ pub unsafe fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> _ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vgetexpph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h { +pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h { _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a) } @@ -8767,7 +8956,7 @@ pub unsafe fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h { #[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_getexp_round_ph(a: __m512h) -> __m512h { +pub fn _mm512_getexp_round_ph(a: __m512h) -> __m512h { static_assert_sae!(SAE); _mm512_mask_getexp_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) } @@ -8783,13 +8972,15 @@ pub unsafe fn _mm512_getexp_round_ph(a: __m512h) -> __m512h { #[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_getexp_round_ph( +pub fn _mm512_mask_getexp_round_ph( src: __m512h, k: __mmask32, a: __m512h, ) -> __m512h { - static_assert_sae!(SAE); - vgetexpph_512(a, src, k, SAE) + unsafe { + static_assert_sae!(SAE); + vgetexpph_512(a, src, k, SAE) + } } /// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision @@ -8803,7 +8994,7 @@ pub unsafe fn _mm512_mask_getexp_round_ph( #[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_getexp_round_ph(k: __mmask32, a: __m512h) -> __m512h { +pub fn _mm512_maskz_getexp_round_ph(k: __mmask32, a: __m512h) -> __m512h { static_assert_sae!(SAE); _mm512_mask_getexp_round_ph::(_mm512_setzero_ph(), k, a) } @@ -8818,7 +9009,7 @@ pub unsafe fn _mm512_maskz_getexp_round_ph(k: __mmask32, a: __m5 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vgetexpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h { _mm_mask_getexp_sh(_mm_undefined_ph(), 0xff, a, b) } @@ -8833,7 +9024,7 @@ pub unsafe fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vgetexpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -8848,7 +9039,7 @@ 
pub unsafe fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m12 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vgetexpsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_getexp_sh(_mm_setzero_ph(), k, a, b) } @@ -8864,7 +9055,7 @@ pub unsafe fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128 #[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_getexp_round_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_getexp_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); _mm_mask_getexp_round_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -8881,14 +9072,16 @@ pub unsafe fn _mm_getexp_round_sh(a: __m128h, b: __m128h) -> __m #[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_getexp_round_sh( +pub fn _mm_mask_getexp_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_sae!(SAE); - vgetexpsh(a, b, src, k, SAE) + unsafe { + static_assert_sae!(SAE); + vgetexpsh(a, b, src, k, SAE) + } } /// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision @@ -8903,11 +9096,7 @@ pub unsafe fn _mm_mask_getexp_round_sh( #[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_getexp_round_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_maskz_getexp_round_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_sae!(SAE); _mm_mask_getexp_round_sh::(_mm_setzero_ph(), k, a, b) } @@ -8935,10 +9124,7 @@ pub unsafe fn _mm_maskz_getexp_round_sh( #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm_getmant_ph( a: __m128h, ) -> __m128h { static_assert_uimm_bits!(NORM, 4); @@ -8970,7 +9156,7 @@ pub unsafe fn _mm_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_getmant_ph< +pub fn _mm_mask_getmant_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -8978,9 +9164,11 @@ pub unsafe fn _mm_mask_getmant_ph< k: __mmask8, a: __m128h, ) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - vgetmantph_128(a, (SIGN << 2) | NORM, src, k) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + vgetmantph_128(a, (SIGN << 2) | NORM, src, k) + } } /// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store @@ -9007,7 +9195,7 @@ pub unsafe fn _mm_mask_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_getmant_ph< +pub fn 
_mm_maskz_getmant_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -9042,10 +9230,7 @@ pub unsafe fn _mm_maskz_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm256_getmant_ph( a: __m256h, ) -> __m256h { static_assert_uimm_bits!(NORM, 4); @@ -9077,7 +9262,7 @@ pub unsafe fn _mm256_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_getmant_ph< +pub fn _mm256_mask_getmant_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -9085,9 +9270,11 @@ pub unsafe fn _mm256_mask_getmant_ph< k: __mmask16, a: __m256h, ) -> __m256h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - vgetmantph_256(a, (SIGN << 2) | NORM, src, k) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + vgetmantph_256(a, (SIGN << 2) | NORM, src, k) + } } /// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store @@ -9114,7 +9301,7 @@ pub unsafe fn _mm256_mask_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_getmant_ph< +pub fn _mm256_maskz_getmant_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -9149,10 +9336,7 @@ pub unsafe fn _mm256_maskz_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(1, 2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_getmant_ph< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +pub fn _mm512_getmant_ph( a: __m512h, ) -> __m512h { static_assert_uimm_bits!(NORM, 4); @@ -9184,7 +9368,7 @@ pub unsafe fn _mm512_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_getmant_ph< +pub fn _mm512_mask_getmant_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -9221,7 +9405,7 @@ pub unsafe fn _mm512_mask_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_getmant_ph< +pub fn _mm512_maskz_getmant_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -9259,7 +9443,7 @@ pub unsafe fn _mm512_maskz_getmant_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] #[rustc_legacy_const_generics(1, 2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_getmant_round_ph< +pub fn _mm512_getmant_round_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -9299,7 +9483,7 @@ pub unsafe fn _mm512_getmant_round_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4, 5)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub 
unsafe fn _mm512_mask_getmant_round_ph< +pub fn _mm512_mask_getmant_round_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -9308,10 +9492,12 @@ pub unsafe fn _mm512_mask_getmant_round_ph< k: __mmask32, a: __m512h, ) -> __m512h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_sae!(SAE); - vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE) + } } /// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store @@ -9341,7 +9527,7 @@ pub unsafe fn _mm512_mask_getmant_round_ph< #[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_getmant_round_ph< +pub fn _mm512_maskz_getmant_round_ph< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -9378,11 +9564,8 @@ pub unsafe fn _mm512_maskz_getmant_round_ph< #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(2, 3)] -#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_getmant_sh< - const NORM: _MM_MANTISSA_NORM_ENUM, - const SIGN: _MM_MANTISSA_SIGN_ENUM, ->( +#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] +pub fn _mm_getmant_sh( a: __m128h, b: __m128h, ) -> __m128h { @@ -9416,7 +9599,7 @@ pub unsafe fn _mm_getmant_sh< #[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_getmant_sh< +pub fn _mm_mask_getmant_sh< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -9455,7 +9638,7 @@ pub unsafe fn _mm_mask_getmant_sh< #[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_getmant_sh< +pub fn _mm_maskz_getmant_sh< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, >( @@ -9495,7 +9678,7 @@ pub unsafe fn _mm_maskz_getmant_sh< #[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_getmant_round_sh< +pub fn _mm_getmant_round_sh< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -9536,7 +9719,7 @@ pub unsafe fn _mm_getmant_round_sh< #[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5, 6)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_getmant_round_sh< +pub fn _mm_mask_getmant_round_sh< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -9546,10 +9729,12 @@ pub unsafe fn _mm_mask_getmant_round_sh< a: __m128h, b: __m128h, ) -> __m128h { - static_assert_uimm_bits!(NORM, 4); - static_assert_uimm_bits!(SIGN, 2); - static_assert_sae!(SAE); - vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE) + unsafe { + static_assert_uimm_bits!(NORM, 4); + static_assert_uimm_bits!(SIGN, 2); + static_assert_sae!(SAE); + vgetmantsh(a, b, (SIGN << 
2) | NORM, src, k, SAE) + } } /// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store @@ -9579,7 +9764,7 @@ pub unsafe fn _mm_mask_getmant_round_sh< #[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4, 5)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_getmant_round_sh< +pub fn _mm_maskz_getmant_round_sh< const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM, const SAE: i32, @@ -9611,7 +9796,7 @@ pub unsafe fn _mm_maskz_getmant_round_sh< #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_roundscale_ph(a: __m128h) -> __m128h { +pub fn _mm_roundscale_ph(a: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); _mm_mask_roundscale_ph::(_mm_undefined_ph(), 0xff, a) } @@ -9634,13 +9819,11 @@ pub unsafe fn _mm_roundscale_ph(a: __m128h) -> __m128h { #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_roundscale_ph( - src: __m128h, - k: __mmask8, - a: __m128h, -) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - vrndscaleph_128(a, IMM8, src, k) +pub fn _mm_mask_roundscale_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vrndscaleph_128(a, IMM8, src, k) + } } /// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits @@ -9661,7 +9844,7 @@ pub unsafe fn _mm_mask_roundscale_ph( #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_roundscale_ph(k: __mmask8, a: __m128h) -> __m128h { +pub fn _mm_maskz_roundscale_ph(k: __mmask8, a: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); _mm_mask_roundscale_ph::(_mm_setzero_ph(), k, a) } @@ -9683,7 +9866,7 @@ pub unsafe fn _mm_maskz_roundscale_ph(k: __mmask8, a: __m128h) #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_roundscale_ph(a: __m256h) -> __m256h { +pub fn _mm256_roundscale_ph(a: __m256h) -> __m256h { static_assert_uimm_bits!(IMM8, 8); _mm256_mask_roundscale_ph::(_mm256_undefined_ph(), 0xffff, a) } @@ -9706,13 +9889,15 @@ pub unsafe fn _mm256_roundscale_ph(a: __m256h) -> __m256h { #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_roundscale_ph( +pub fn _mm256_mask_roundscale_ph( src: __m256h, k: __mmask16, a: __m256h, ) -> __m256h { - static_assert_uimm_bits!(IMM8, 8); - vrndscaleph_256(a, IMM8, src, k) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vrndscaleph_256(a, IMM8, src, k) + } } /// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits @@ -9733,7 +9918,7 @@ pub unsafe fn _mm256_mask_roundscale_ph( #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_roundscale_ph(k: __mmask16, a: __m256h) -> __m256h { +pub fn _mm256_maskz_roundscale_ph(k: __mmask16, a: __m256h) -> __m256h { 
static_assert_uimm_bits!(IMM8, 8); _mm256_mask_roundscale_ph::(_mm256_setzero_ph(), k, a) } @@ -9755,7 +9940,7 @@ pub unsafe fn _mm256_maskz_roundscale_ph(k: __mmask16, a: __m25 #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_roundscale_ph(a: __m512h) -> __m512h { +pub fn _mm512_roundscale_ph(a: __m512h) -> __m512h { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_roundscale_ph::(_mm512_undefined_ph(), 0xffffffff, a) } @@ -9778,7 +9963,7 @@ pub unsafe fn _mm512_roundscale_ph(a: __m512h) -> __m512h { #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_roundscale_ph( +pub fn _mm512_mask_roundscale_ph( src: __m512h, k: __mmask32, a: __m512h, @@ -9805,7 +9990,7 @@ pub unsafe fn _mm512_mask_roundscale_ph( #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_roundscale_ph(k: __mmask32, a: __m512h) -> __m512h { +pub fn _mm512_maskz_roundscale_ph(k: __mmask32, a: __m512h) -> __m512h { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_roundscale_ph::(_mm512_setzero_ph(), k, a) } @@ -9828,7 +10013,7 @@ pub unsafe fn _mm512_maskz_roundscale_ph(k: __mmask32, a: __m51 #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(1, 2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_roundscale_round_ph(a: __m512h) -> __m512h { +pub fn _mm512_roundscale_round_ph(a: __m512h) -> __m512h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); _mm512_mask_roundscale_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) @@ -9853,14 +10038,16 @@ pub unsafe fn _mm512_roundscale_round_ph(a: __m #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_roundscale_round_ph( +pub fn _mm512_mask_roundscale_round_ph( src: __m512h, k: __mmask32, a: __m512h, ) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - vrndscaleph_512(a, IMM8, src, k, SAE) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vrndscaleph_512(a, IMM8, src, k, SAE) + } } /// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits @@ -9881,7 +10068,7 @@ pub unsafe fn _mm512_mask_roundscale_round_ph( #[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_roundscale_round_ph( +pub fn _mm512_maskz_roundscale_round_ph( k: __mmask32, a: __m512h, ) -> __m512h { @@ -9908,7 +10095,7 @@ pub unsafe fn _mm512_maskz_roundscale_round_ph( #[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_roundscale_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_roundscale_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); _mm_mask_roundscale_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -9931,7 +10118,7 @@ pub unsafe fn _mm_roundscale_sh(a: __m128h, b: __m128h) -> __m1 #[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 
0))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_roundscale_sh( +pub fn _mm_mask_roundscale_sh( src: __m128h, k: __mmask8, a: __m128h, @@ -9959,11 +10146,7 @@ pub unsafe fn _mm_mask_roundscale_sh( #[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_roundscale_sh( - k: __mmask8, - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_maskz_roundscale_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); _mm_mask_roundscale_sh::(_mm_setzero_ph(), k, a, b) } @@ -9988,10 +10171,7 @@ pub unsafe fn _mm_maskz_roundscale_sh( #[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_roundscale_round_sh( - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_roundscale_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); _mm_mask_roundscale_round_sh::(_mm_undefined_ph(), 0xff, a, b) @@ -10017,15 +10197,17 @@ pub unsafe fn _mm_roundscale_round_sh( #[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_roundscale_round_sh( +pub fn _mm_mask_roundscale_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - vrndscalesh(a, b, src, k, IMM8, SAE) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vrndscalesh(a, b, src, k, IMM8, SAE) + } } /// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits @@ -10048,7 +10230,7 @@ pub unsafe fn _mm_mask_roundscale_round_sh( #[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_roundscale_round_sh( +pub fn _mm_maskz_roundscale_round_sh( k: __mmask8, a: __m128h, b: __m128h, @@ -10066,7 +10248,7 @@ pub unsafe fn _mm_maskz_roundscale_round_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h { _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b) } @@ -10078,8 +10260,8 @@ pub unsafe fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - vscalefph_128(a, b, src, k) +pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { vscalefph_128(a, b, src, k) } } /// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store @@ -10090,7 +10272,7 @@ pub unsafe fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m12 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = 
"127213")] -pub unsafe fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b) } @@ -10102,7 +10284,7 @@ pub unsafe fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h { _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b) } @@ -10114,8 +10296,8 @@ pub unsafe fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - vscalefph_256(a, b, src, k) +pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { vscalefph_256(a, b, src, k) } } /// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store @@ -10126,7 +10308,7 @@ pub unsafe fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: _ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { +pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b) } @@ -10138,7 +10320,7 @@ pub unsafe fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h { _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b) } @@ -10150,7 +10332,7 @@ pub unsafe fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h { _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -10162,7 +10344,7 @@ pub unsafe fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: _ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vscalefph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { +pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b) } @@ -10183,7 +10365,7 @@ pub unsafe fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __ #[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_scalef_round_ph(a: __m512h, b: 
__m512h) -> __m512h { +pub fn _mm512_scalef_round_ph(a: __m512h, b: __m512h) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_mask_scalef_round_ph::(_mm512_undefined_ph(), 0xffffffff, a, b) } @@ -10205,14 +10387,16 @@ pub unsafe fn _mm512_scalef_round_ph(a: __m512h, b: __m512h #[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_scalef_round_ph( +pub fn _mm512_mask_scalef_round_ph( src: __m512h, k: __mmask32, a: __m512h, b: __m512h, ) -> __m512h { - static_assert_rounding!(ROUNDING); - vscalefph_512(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vscalefph_512(a, b, src, k, ROUNDING) + } } /// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store @@ -10232,7 +10416,7 @@ pub unsafe fn _mm512_mask_scalef_round_ph( #[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_scalef_round_ph( +pub fn _mm512_maskz_scalef_round_ph( k: __mmask32, a: __m512h, b: __m512h, @@ -10250,7 +10434,7 @@ pub unsafe fn _mm512_maskz_scalef_round_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vscalefsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h { _mm_mask_scalef_sh(_mm_undefined_ph(), 0xff, a, b) } @@ -10263,7 +10447,7 @@ pub unsafe fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vscalefsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b) } @@ -10276,7 +10460,7 @@ pub unsafe fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m12 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vscalefsh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { _mm_mask_scalef_sh(_mm_setzero_ph(), k, a, b) } @@ -10298,7 +10482,7 @@ pub unsafe fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128 #[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_scalef_round_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_scalef_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_scalef_round_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -10321,14 +10505,16 @@ pub unsafe fn _mm_scalef_round_sh(a: __m128h, b: __m128h) - #[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_scalef_round_sh( +pub fn _mm_mask_scalef_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_rounding!(ROUNDING); - vscalefsh(a, b, src, k, ROUNDING) + unsafe { + 
static_assert_rounding!(ROUNDING); + vscalefsh(a, b, src, k, ROUNDING) + } } /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store @@ -10349,7 +10535,7 @@ pub unsafe fn _mm_mask_scalef_round_sh( #[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_scalef_round_sh( +pub fn _mm_maskz_scalef_round_sh( k: __mmask8, a: __m128h, b: __m128h, @@ -10375,7 +10561,7 @@ pub unsafe fn _mm_maskz_scalef_round_sh( #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_reduce_ph(a: __m128h) -> __m128h { +pub fn _mm_reduce_ph(a: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_ph::(_mm_undefined_ph(), 0xff, a) } @@ -10398,13 +10584,11 @@ pub unsafe fn _mm_reduce_ph(a: __m128h) -> __m128h { #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_reduce_ph( - src: __m128h, - k: __mmask8, - a: __m128h, -) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - vreduceph_128(a, IMM8, src, k) +pub fn _mm_mask_reduce_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vreduceph_128(a, IMM8, src, k) + } } /// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the @@ -10425,7 +10609,7 @@ pub unsafe fn _mm_mask_reduce_ph( #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_reduce_ph(k: __mmask8, a: __m128h) -> __m128h { +pub fn _mm_maskz_reduce_ph(k: __mmask8, a: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_ph::(_mm_setzero_ph(), k, a) } @@ -10447,7 +10631,7 @@ pub unsafe fn _mm_maskz_reduce_ph(k: __mmask8, a: __m128h) -> _ #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_reduce_ph(a: __m256h) -> __m256h { +pub fn _mm256_reduce_ph(a: __m256h) -> __m256h { static_assert_uimm_bits!(IMM8, 8); _mm256_mask_reduce_ph::(_mm256_undefined_ph(), 0xffff, a) } @@ -10470,13 +10654,11 @@ pub unsafe fn _mm256_reduce_ph(a: __m256h) -> __m256h { #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_reduce_ph( - src: __m256h, - k: __mmask16, - a: __m256h, -) -> __m256h { - static_assert_uimm_bits!(IMM8, 8); - vreduceph_256(a, IMM8, src, k) +pub fn _mm256_mask_reduce_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vreduceph_256(a, IMM8, src, k) + } } /// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the @@ -10497,7 +10679,7 @@ pub unsafe fn _mm256_mask_reduce_ph( #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_reduce_ph(k: __mmask16, a: __m256h) -> __m256h { +pub fn _mm256_maskz_reduce_ph(k: __mmask16, a: __m256h) -> __m256h { static_assert_uimm_bits!(IMM8, 8); 
_mm256_mask_reduce_ph::(_mm256_setzero_ph(), k, a) } @@ -10519,7 +10701,7 @@ pub unsafe fn _mm256_maskz_reduce_ph(k: __mmask16, a: __m256h) #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_reduce_ph(a: __m512h) -> __m512h { +pub fn _mm512_reduce_ph(a: __m512h) -> __m512h { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_reduce_ph::(_mm512_undefined_ph(), 0xffffffff, a) } @@ -10542,11 +10724,7 @@ pub unsafe fn _mm512_reduce_ph(a: __m512h) -> __m512h { #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_reduce_ph( - src: __m512h, - k: __mmask32, - a: __m512h, -) -> __m512h { +pub fn _mm512_mask_reduce_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_reduce_round_ph::(src, k, a) } @@ -10569,7 +10747,7 @@ pub unsafe fn _mm512_mask_reduce_ph( #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_reduce_ph(k: __mmask32, a: __m512h) -> __m512h { +pub fn _mm512_maskz_reduce_ph(k: __mmask32, a: __m512h) -> __m512h { static_assert_uimm_bits!(IMM8, 8); _mm512_mask_reduce_ph::(_mm512_setzero_ph(), k, a) } @@ -10593,7 +10771,7 @@ pub unsafe fn _mm512_maskz_reduce_ph(k: __mmask32, a: __m512h) #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(1, 2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_reduce_round_ph(a: __m512h) -> __m512h { +pub fn _mm512_reduce_round_ph(a: __m512h) -> __m512h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); _mm512_mask_reduce_round_ph::(_mm512_undefined_ph(), 0xffffffff, a) @@ -10619,14 +10797,16 @@ pub unsafe fn _mm512_reduce_round_ph(a: __m512h #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_reduce_round_ph( +pub fn _mm512_mask_reduce_round_ph( src: __m512h, k: __mmask32, a: __m512h, ) -> __m512h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - vreduceph_512(a, IMM8, src, k, SAE) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vreduceph_512(a, IMM8, src, k, SAE) + } } /// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the @@ -10649,7 +10829,7 @@ pub unsafe fn _mm512_mask_reduce_round_ph( #[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_reduce_round_ph( +pub fn _mm512_maskz_reduce_round_ph( k: __mmask32, a: __m512h, ) -> __m512h { @@ -10676,7 +10856,7 @@ pub unsafe fn _mm512_maskz_reduce_round_ph( #[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_reduce_sh(a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_reduce_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -10700,7 +10880,7 @@ pub unsafe fn _mm_reduce_sh(a: __m128h, b: __m128h) -> __m128h #[cfg_attr(test, 
assert_instr(vreducesh, IMM8 = 0))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_reduce_sh( +pub fn _mm_mask_reduce_sh( src: __m128h, k: __mmask8, a: __m128h, @@ -10729,7 +10909,7 @@ pub unsafe fn _mm_mask_reduce_sh( #[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { +pub fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); _mm_mask_reduce_sh::(_mm_setzero_ph(), k, a, b) } @@ -10754,10 +10934,7 @@ pub unsafe fn _mm_maskz_reduce_sh(k: __mmask8, a: __m128h, b: _ #[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(2, 3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_reduce_round_sh( - a: __m128h, - b: __m128h, -) -> __m128h { +pub fn _mm_reduce_round_sh(a: __m128h, b: __m128h) -> __m128h { static_assert_uimm_bits!(IMM8, 8); static_assert_sae!(SAE); _mm_mask_reduce_round_sh::(_mm_undefined_ph(), 0xff, a, b) @@ -10784,15 +10961,17 @@ pub unsafe fn _mm_reduce_round_sh( #[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(4, 5)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_reduce_round_sh( +pub fn _mm_mask_reduce_round_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128h, ) -> __m128h { - static_assert_uimm_bits!(IMM8, 8); - static_assert_sae!(SAE); - vreducesh(a, b, src, k, IMM8, SAE) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + static_assert_sae!(SAE); + vreducesh(a, b, src, k, IMM8, SAE) + } } /// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by @@ -10816,7 +10995,7 @@ pub unsafe fn _mm_mask_reduce_round_sh( #[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))] #[rustc_legacy_const_generics(3, 4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_reduce_round_sh( +pub fn _mm_maskz_reduce_round_sh( k: __mmask8, a: __m128h, b: __m128h, @@ -10833,12 +11012,14 @@ pub unsafe fn _mm_maskz_reduce_round_sh( #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_reduce_add_ph(a: __m128h) -> f16 { - let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); - let a = _mm_add_ph(a, b); - let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); - let a = _mm_add_ph(a, b); - simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1) +pub fn _mm_reduce_add_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_add_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_add_ph(a, b); + simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. 
Returns the @@ -10848,10 +11029,12 @@ pub unsafe fn _mm_reduce_add_ph(a: __m128h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_reduce_add_ph(a: __m256h) -> f16 { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - _mm_reduce_add_ph(_mm_add_ph(p, q)) +pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_add_ph(_mm_add_ph(p, q)) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the @@ -10861,16 +11044,18 @@ pub unsafe fn _mm256_reduce_add_ph(a: __m256h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_reduce_add_ph(a: __m512h) -> f16 { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - let q = simd_shuffle!( - a, - a, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ] - ); - _mm256_reduce_add_ph(_mm256_add_ph(p, q)) +pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_add_ph(_mm256_add_ph(p, q)) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns @@ -10880,12 +11065,14 @@ pub unsafe fn _mm512_reduce_add_ph(a: __m512h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_reduce_mul_ph(a: __m128h) -> f16 { - let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); - let a = _mm_mul_ph(a, b); - let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); - let a = _mm_mul_ph(a, b); - simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1) +pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_mul_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_mul_ph(a, b); + simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns @@ -10895,10 +11082,12 @@ pub unsafe fn _mm_reduce_mul_ph(a: __m128h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_reduce_mul_ph(a: __m256h) -> f16 { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - _mm_reduce_mul_ph(_mm_mul_ph(p, q)) +pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_mul_ph(_mm_mul_ph(p, q)) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. 
Returns @@ -10909,15 +11098,17 @@ pub unsafe fn _mm256_reduce_mul_ph(a: __m256h) -> f16 { #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] pub unsafe fn _mm512_reduce_mul_ph(a: __m512h) -> f16 { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - let q = simd_shuffle!( - a, - a, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ] - ); - _mm256_reduce_mul_ph(_mm256_mul_ph(p, q)) + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_mul_ph(_mm256_mul_ph(p, q)) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the @@ -10927,13 +11118,15 @@ pub unsafe fn _mm512_reduce_mul_ph(a: __m512h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_reduce_min_ph(a: __m128h) -> f16 { - let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); - let a = _mm_min_ph(a, b); - let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); - let a = _mm_min_ph(a, b); - let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); - simd_extract!(_mm_min_sh(a, b), 0) +pub fn _mm_reduce_min_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_min_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_min_ph(a, b); + let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); + simd_extract!(_mm_min_sh(a, b), 0) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the @@ -10943,10 +11136,12 @@ pub unsafe fn _mm_reduce_min_ph(a: __m128h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_reduce_min_ph(a: __m256h) -> f16 { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - _mm_reduce_min_ph(_mm_min_ph(p, q)) +pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_min_ph(_mm_min_ph(p, q)) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the @@ -10956,16 +11151,18 @@ pub unsafe fn _mm256_reduce_min_ph(a: __m256h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_reduce_min_ph(a: __m512h) -> f16 { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - let q = simd_shuffle!( - a, - a, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ] - ); - _mm256_reduce_min_ph(_mm256_min_ph(p, q)) +pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_min_ph(_mm256_min_ph(p, q)) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. 
Returns the @@ -10975,13 +11172,15 @@ pub unsafe fn _mm512_reduce_min_ph(a: __m512h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_reduce_max_ph(a: __m128h) -> f16 { - let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); - let a = _mm_max_ph(a, b); - let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); - let a = _mm_max_ph(a, b); - let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); - simd_extract!(_mm_max_sh(a, b), 0) +pub fn _mm_reduce_max_ph(a: __m128h) -> f16 { + unsafe { + let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]); + let a = _mm_max_ph(a, b); + let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]); + let a = _mm_max_ph(a, b); + let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]); + simd_extract!(_mm_max_sh(a, b), 0) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the @@ -10991,10 +11190,12 @@ pub unsafe fn _mm_reduce_max_ph(a: __m128h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_reduce_max_ph(a: __m256h) -> f16 { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); - let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); - _mm_reduce_max_ph(_mm_max_ph(p, q)) +pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); + let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]); + _mm_reduce_max_ph(_mm_max_ph(p, q)) + } } /// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the @@ -11004,16 +11205,18 @@ pub unsafe fn _mm256_reduce_max_ph(a: __m256h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_reduce_max_ph(a: __m512h) -> f16 { - let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); - let q = simd_shuffle!( - a, - a, - [ - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ] - ); - _mm256_reduce_max_ph(_mm256_max_ph(p, q)) +pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 { + unsafe { + let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); + let q = simd_shuffle!( + a, + a, + [ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ] + ); + _mm256_reduce_max_ph(_mm256_max_ph(p, q)) + } } macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics @@ -11061,9 +11264,11 @@ macro_rules! 
fpclass_asm { // FIXME: use LLVM intrinsics #[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fpclass_ph_mask(a: __m128h) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask8, xmm_reg, a) +pub fn _mm_fpclass_ph_mask(a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask8, xmm_reg, a) + } } /// Test packed half-precision (16-bit) floating-point elements in a for special categories specified @@ -11086,9 +11291,11 @@ pub unsafe fn _mm_fpclass_ph_mask(a: __m128h) -> __mmask8 { #[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fpclass_ph_mask(k1: __mmask8, a: __m128h) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask8, k1, xmm_reg, a) +pub fn _mm_mask_fpclass_ph_mask(k1: __mmask8, a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask8, k1, xmm_reg, a) + } } /// Test packed half-precision (16-bit) floating-point elements in a for special categories specified @@ -11110,9 +11317,11 @@ pub unsafe fn _mm_mask_fpclass_ph_mask(k1: __mmask8, a: __m128h #[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_fpclass_ph_mask(a: __m256h) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask16, ymm_reg, a) +pub fn _mm256_fpclass_ph_mask(a: __m256h) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask16, ymm_reg, a) + } } /// Test packed half-precision (16-bit) floating-point elements in a for special categories specified @@ -11135,9 +11344,11 @@ pub unsafe fn _mm256_fpclass_ph_mask(a: __m256h) -> __mmask16 { #[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_fpclass_ph_mask(k1: __mmask16, a: __m256h) -> __mmask16 { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask16, k1, ymm_reg, a) +pub fn _mm256_mask_fpclass_ph_mask(k1: __mmask16, a: __m256h) -> __mmask16 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask16, k1, ymm_reg, a) + } } /// Test packed half-precision (16-bit) floating-point elements in a for special categories specified @@ -11159,9 +11370,11 @@ pub unsafe fn _mm256_mask_fpclass_ph_mask(k1: __mmask16, a: __m #[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_fpclass_ph_mask(a: __m512h) -> __mmask32 { - static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask32, zmm_reg, a) +pub fn _mm512_fpclass_ph_mask(a: __m512h) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask32, zmm_reg, a) + } } /// Test packed half-precision (16-bit) floating-point elements in a for special categories specified @@ -11184,9 +11397,11 @@ pub unsafe fn _mm512_fpclass_ph_mask(a: __m512h) -> __mmask32 { #[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_fpclass_ph_mask(k1: __mmask32, a: __m512h) -> __mmask32 { - 
static_assert_uimm_bits!(IMM8, 8); - fpclass_asm!(__mmask32, k1, zmm_reg, a) +pub fn _mm512_mask_fpclass_ph_mask(k1: __mmask32, a: __m512h) -> __mmask32 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + fpclass_asm!(__mmask32, k1, zmm_reg, a) + } } /// Test the lower half-precision (16-bit) floating-point element in a for special categories specified @@ -11208,7 +11423,7 @@ pub unsafe fn _mm512_mask_fpclass_ph_mask(k1: __mmask32, a: __m #[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_fpclass_sh_mask(a: __m128h) -> __mmask8 { +pub fn _mm_fpclass_sh_mask(a: __m128h) -> __mmask8 { _mm_mask_fpclass_sh_mask::(0xff, a) } @@ -11232,9 +11447,11 @@ pub unsafe fn _mm_fpclass_sh_mask(a: __m128h) -> __mmask8 { #[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_fpclass_sh_mask(k1: __mmask8, a: __m128h) -> __mmask8 { - static_assert_uimm_bits!(IMM8, 8); - vfpclasssh(a, IMM8, k1) +pub fn _mm_mask_fpclass_sh_mask(k1: __mmask8, a: __m128h) -> __mmask8 { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + vfpclasssh(a, IMM8, k1) + } } /// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, @@ -11244,8 +11461,8 @@ pub unsafe fn _mm_mask_fpclass_sh_mask(k1: __mmask8, a: __m128h #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { - simd_select_bitmask(k, b, a) +pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h { + unsafe { simd_select_bitmask(k, b, a) } } /// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, @@ -11255,8 +11472,8 @@ pub unsafe fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { - simd_select_bitmask(k, b, a) +pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h { + unsafe { simd_select_bitmask(k, b, a) } } /// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k, @@ -11266,8 +11483,8 @@ pub unsafe fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m2 #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { - simd_select_bitmask(k, b, a) +pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h { + unsafe { simd_select_bitmask(k, b, a) } } /// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector @@ -11277,7 +11494,7 @@ pub unsafe fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m5 #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h { +pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h { _mm_castsi128_ph(_mm_permutex2var_epi16( _mm_castph_si128(a), idx, @@ -11292,7 
+11509,7 @@ pub unsafe fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m12 #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h { +pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h { _mm256_castsi256_ph(_mm256_permutex2var_epi16( _mm256_castph_si256(a), idx, @@ -11307,7 +11524,7 @@ pub unsafe fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __ #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h { +pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h { _mm512_castsi512_ph(_mm512_permutex2var_epi16( _mm512_castph_si512(a), idx, @@ -11322,7 +11539,7 @@ pub unsafe fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __ #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h { +pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h { _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a))) } @@ -11333,7 +11550,7 @@ pub unsafe fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h { #[inline] #[target_feature(enable = "avx512fp16,avx512vl")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h { +pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h { _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a))) } @@ -11344,7 +11561,7 @@ pub unsafe fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h { +pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h { _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a))) } @@ -11356,8 +11573,8 @@ pub unsafe fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtepi16_ph(a: __m128i) -> __m128h { - vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h { + unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11369,8 +11586,8 @@ pub unsafe fn _mm_cvtepi16_ph(a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) +pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) } } /// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11381,7 +11598,7 @@ pub unsafe fn _mm_mask_cvtepi16_ph(src: __m128h, k: 
__mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h { +pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h { _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a) } @@ -11393,8 +11610,8 @@ pub unsafe fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h { - vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h { + unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11406,8 +11623,8 @@ pub unsafe fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { - simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) +pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) } } /// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11418,7 +11635,7 @@ pub unsafe fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h { +pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h { _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a) } @@ -11430,8 +11647,8 @@ pub unsafe fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h { - vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h { + unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11443,8 +11660,8 @@ pub unsafe fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h { - simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) +pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) } } /// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11455,7 +11672,7 @@ pub unsafe fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = 
"127213")] -pub unsafe fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h { +pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h { _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a) } @@ -11476,9 +11693,11 @@ pub unsafe fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h { #[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundepi16_ph(a: __m512i) -> __m512h { - static_assert_rounding!(ROUNDING); - vcvtw2ph_512(a.as_i16x32(), ROUNDING) +pub fn _mm512_cvt_roundepi16_ph(a: __m512i) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtw2ph_512(a.as_i16x32(), ROUNDING) + } } /// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11499,13 +11718,15 @@ pub unsafe fn _mm512_cvt_roundepi16_ph(a: __m512i) -> __m51 #[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundepi16_ph( +pub fn _mm512_mask_cvt_roundepi16_ph( src: __m512h, k: __mmask32, a: __m512i, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::(a), src) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::(a), src) + } } /// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11525,10 +11746,7 @@ pub unsafe fn _mm512_mask_cvt_roundepi16_ph( #[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundepi16_ph( - k: __mmask32, - a: __m512i, -) -> __m512h { +pub fn _mm512_maskz_cvt_roundepi16_ph(k: __mmask32, a: __m512i) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundepi16_ph::(_mm512_setzero_ph(), k, a) } @@ -11541,8 +11759,8 @@ pub unsafe fn _mm512_maskz_cvt_roundepi16_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtepu16_ph(a: __m128i) -> __m128h { - vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h { + unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11554,8 +11772,8 @@ pub unsafe fn _mm_cvtepu16_ph(a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) +pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) } } /// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11566,7 +11784,7 @@ pub unsafe fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn 
_mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h { +pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h { _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a) } @@ -11578,8 +11796,8 @@ pub unsafe fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h { - vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h { + unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11591,8 +11809,8 @@ pub unsafe fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { - simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) +pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) } } /// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11603,7 +11821,7 @@ pub unsafe fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h { +pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h { _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a) } @@ -11615,8 +11833,8 @@ pub unsafe fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h { - vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h { + unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11628,8 +11846,8 @@ pub unsafe fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h { - simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) +pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) } } /// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11640,7 +11858,7 @@ pub unsafe fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtuw2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h { +pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h { 
_mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a) } @@ -11661,9 +11879,11 @@ pub unsafe fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h { #[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundepu16_ph(a: __m512i) -> __m512h { - static_assert_rounding!(ROUNDING); - vcvtuw2ph_512(a.as_u16x32(), ROUNDING) +pub fn _mm512_cvt_roundepu16_ph(a: __m512i) -> __m512h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtuw2ph_512(a.as_u16x32(), ROUNDING) + } } /// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11684,13 +11904,15 @@ pub unsafe fn _mm512_cvt_roundepu16_ph(a: __m512i) -> __m51 #[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundepu16_ph( +pub fn _mm512_mask_cvt_roundepu16_ph( src: __m512h, k: __mmask32, a: __m512i, ) -> __m512h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::(a), src) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::(a), src) + } } /// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11710,10 +11932,7 @@ pub unsafe fn _mm512_mask_cvt_roundepu16_ph( #[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundepu16_ph( - k: __mmask32, - a: __m512i, -) -> __m512h { +pub fn _mm512_maskz_cvt_roundepu16_ph(k: __mmask32, a: __m512i) -> __m512h { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundepu16_ph::(_mm512_setzero_ph(), k, a) } @@ -11726,7 +11945,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepu16_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtepi32_ph(a: __m128i) -> __m128h { +pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h { _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a) } @@ -11739,8 +11958,8 @@ pub unsafe fn _mm_cvtepi32_ph(a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - vcvtdq2ph_128(a.as_i32x4(), src, k) +pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) } } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11752,7 +11971,7 @@ pub unsafe fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h { +pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h { _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a) } @@ -11764,8 +11983,8 @@ pub unsafe fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] 
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h { - vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h { + unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11777,8 +11996,8 @@ pub unsafe fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { - simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) +pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) } } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11789,7 +12008,7 @@ pub unsafe fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h { +pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h { _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a) } @@ -11801,8 +12020,8 @@ pub unsafe fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h { - vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h { + unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11814,8 +12033,8 @@ pub unsafe fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { - simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) +pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) } } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11826,7 +12045,7 @@ pub unsafe fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtdq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h { +pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h { _mm512_mask_cvtepi32_ph(_mm256_setzero_ph(), k, a) } @@ -11847,9 +12066,11 @@ pub unsafe fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h { #[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundepi32_ph(a: __m512i) -> __m256h { - 
static_assert_rounding!(ROUNDING); - vcvtdq2ph_512(a.as_i32x16(), ROUNDING) +pub fn _mm512_cvt_roundepi32_ph(a: __m512i) -> __m256h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtdq2ph_512(a.as_i32x16(), ROUNDING) + } } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11870,13 +12091,15 @@ pub unsafe fn _mm512_cvt_roundepi32_ph(a: __m512i) -> __m25 #[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundepi32_ph( +pub fn _mm512_mask_cvt_roundepi32_ph( src: __m256h, k: __mmask16, a: __m512i, ) -> __m256h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::(a), src) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::(a), src) + } } /// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11896,10 +12119,7 @@ pub unsafe fn _mm512_mask_cvt_roundepi32_ph( #[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundepi32_ph( - k: __mmask16, - a: __m512i, -) -> __m256h { +pub fn _mm512_maskz_cvt_roundepi32_ph(k: __mmask16, a: __m512i) -> __m256h { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundepi32_ph::(_mm256_setzero_ph(), k, a) } @@ -11913,8 +12133,8 @@ pub unsafe fn _mm512_maskz_cvt_roundepi32_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsi2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h { - vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h { + unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) } } /// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the @@ -11935,9 +12155,11 @@ pub unsafe fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h { #[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundi32_sh(a: __m128h, b: i32) -> __m128h { - static_assert_rounding!(ROUNDING); - vcvtsi2sh(a, b, ROUNDING) +pub fn _mm_cvt_roundi32_sh(a: __m128h, b: i32) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsi2sh(a, b, ROUNDING) + } } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11948,7 +12170,7 @@ pub unsafe fn _mm_cvt_roundi32_sh(a: __m128h, b: i32) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtepu32_ph(a: __m128i) -> __m128h { +pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h { _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a) } @@ -11961,8 +12183,8 @@ pub unsafe fn _mm_cvtepu32_ph(a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - vcvtudq2ph_128(a.as_u32x4(), src, k) +pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe 
{ vcvtudq2ph_128(a.as_u32x4(), src, k) } } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11974,7 +12196,7 @@ pub unsafe fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h { +pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h { _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a) } @@ -11986,8 +12208,8 @@ pub unsafe fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h { - vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h { + unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -11999,8 +12221,8 @@ pub unsafe fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { - simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) +pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) } } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12011,7 +12233,7 @@ pub unsafe fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h { +pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h { _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a) } @@ -12023,8 +12245,8 @@ pub unsafe fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h { - vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h { + unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12036,8 +12258,8 @@ pub unsafe fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { - simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) +pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) } } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, 
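// A minimal caller-side sketch of the mask/maskz pair above, assuming a nightly toolchain
// with `#![feature(stdarch_x86_avx512_f16)]`; the fn name and inputs are hypothetical.
// Per the doc comments, the `mask` form copies inactive lanes from `src`, while the
// `maskz` form zeroes them.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
fn cvtepu32_demo(src: __m256h, k: __mmask16, a: __m512i) -> (__m256h, __m256h) {
    // Lanes whose mask bit is 0 are taken from `src`.
    let merged = _mm512_mask_cvtepu32_ph(src, k, a);
    // Lanes whose mask bit is 0 are zeroed.
    let zeroed = _mm512_maskz_cvtepu32_ph(k, a);
    (merged, zeroed)
}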
@@ -12048,7 +12270,7 @@ pub unsafe fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtudq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h { +pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h { _mm512_mask_cvtepu32_ph(_mm256_setzero_ph(), k, a) } @@ -12069,9 +12291,11 @@ pub unsafe fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h { #[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundepu32_ph(a: __m512i) -> __m256h { - static_assert_rounding!(ROUNDING); - vcvtudq2ph_512(a.as_u32x16(), ROUNDING) +pub fn _mm512_cvt_roundepu32_ph(a: __m512i) -> __m256h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtudq2ph_512(a.as_u32x16(), ROUNDING) + } } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12092,13 +12316,15 @@ pub unsafe fn _mm512_cvt_roundepu32_ph(a: __m512i) -> __m25 #[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundepu32_ph( +pub fn _mm512_mask_cvt_roundepu32_ph( src: __m256h, k: __mmask16, a: __m512i, ) -> __m256h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::(a), src) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::(a), src) + } } /// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12118,10 +12344,7 @@ pub unsafe fn _mm512_mask_cvt_roundepu32_ph( #[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundepu32_ph( - k: __mmask16, - a: __m512i, -) -> __m256h { +pub fn _mm512_maskz_cvt_roundepu32_ph(k: __mmask16, a: __m512i) -> __m256h { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundepu32_ph::(_mm256_setzero_ph(), k, a) } @@ -12135,8 +12358,8 @@ pub unsafe fn _mm512_maskz_cvt_roundepu32_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtusi2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h { - vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h { + unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) } } /// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the @@ -12157,9 +12380,11 @@ pub unsafe fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h { #[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundu32_sh(a: __m128h, b: u32) -> __m128h { - static_assert_rounding!(ROUNDING); - vcvtusi2sh(a, b, ROUNDING) +pub fn _mm_cvt_roundu32_sh(a: __m128h, b: u32) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtusi2sh(a, b, ROUNDING) + } } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12170,7 +12395,7 @@ pub unsafe fn 
_mm_cvt_roundu32_sh(a: __m128h, b: u32) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtepi64_ph(a: __m128i) -> __m128h { +pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h { _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a) } @@ -12183,8 +12408,8 @@ pub unsafe fn _mm_cvtepi64_ph(a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - vcvtqq2ph_128(a.as_i64x2(), src, k) +pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) } } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12196,7 +12421,7 @@ pub unsafe fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h { +pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h { _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) } @@ -12208,7 +12433,7 @@ pub unsafe fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h { +pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h { _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a) } @@ -12221,8 +12446,8 @@ pub unsafe fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { - vcvtqq2ph_256(a.as_i64x4(), src, k) +pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) } } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12234,7 +12459,7 @@ pub unsafe fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h { +pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h { _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) } @@ -12246,8 +12471,8 @@ pub unsafe fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h { - vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h { + unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12259,8 +12484,8 @@ pub unsafe fn 
_mm512_cvtepi64_ph(a: __m512i) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { - simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) +pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) } } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12271,7 +12496,7 @@ pub unsafe fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h { +pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h { _mm512_mask_cvtepi64_ph(_mm_setzero_ph(), k, a) } @@ -12292,9 +12517,11 @@ pub unsafe fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h { #[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundepi64_ph(a: __m512i) -> __m128h { - static_assert_rounding!(ROUNDING); - vcvtqq2ph_512(a.as_i64x8(), ROUNDING) +pub fn _mm512_cvt_roundepi64_ph(a: __m512i) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtqq2ph_512(a.as_i64x8(), ROUNDING) + } } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12315,13 +12542,15 @@ pub unsafe fn _mm512_cvt_roundepi64_ph(a: __m512i) -> __m12 #[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundepi64_ph( +pub fn _mm512_mask_cvt_roundepi64_ph( src: __m128h, k: __mmask8, a: __m512i, ) -> __m128h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::(a), src) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::(a), src) + } } /// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12341,10 +12570,7 @@ pub unsafe fn _mm512_mask_cvt_roundepi64_ph( #[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundepi64_ph( - k: __mmask8, - a: __m512i, -) -> __m128h { +pub fn _mm512_maskz_cvt_roundepi64_ph(k: __mmask8, a: __m512i) -> __m128h { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundepi64_ph::(_mm_setzero_ph(), k, a) } @@ -12357,7 +12583,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepi64_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtepu64_ph(a: __m128i) -> __m128h { +pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h { _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a) } @@ -12370,8 +12596,8 @@ pub unsafe fn _mm_cvtepu64_ph(a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn 
_mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { - vcvtuqq2ph_128(a.as_u64x2(), src, k) +pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h { + unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) } } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12383,7 +12609,7 @@ pub unsafe fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h { +pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h { _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) } @@ -12395,7 +12621,7 @@ pub unsafe fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h { +pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h { _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a) } @@ -12408,8 +12634,8 @@ pub unsafe fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { - vcvtuqq2ph_256(a.as_u64x4(), src, k) +pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h { + unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) } } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12421,7 +12647,7 @@ pub unsafe fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h { +pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h { _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) } @@ -12433,8 +12659,8 @@ pub unsafe fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h { - vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h { + unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) } } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12446,8 +12672,8 @@ pub unsafe fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { - simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) +pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h { + unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) } } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12458,7 
+12684,7 @@ pub unsafe fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtuqq2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h { +pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h { _mm512_mask_cvtepu64_ph(_mm_setzero_ph(), k, a) } @@ -12479,9 +12705,11 @@ pub unsafe fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h { #[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundepu64_ph(a: __m512i) -> __m128h { - static_assert_rounding!(ROUNDING); - vcvtuqq2ph_512(a.as_u64x8(), ROUNDING) +pub fn _mm512_cvt_roundepu64_ph(a: __m512i) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtuqq2ph_512(a.as_u64x8(), ROUNDING) + } } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12502,13 +12730,15 @@ pub unsafe fn _mm512_cvt_roundepu64_ph(a: __m512i) -> __m12 #[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundepu64_ph( +pub fn _mm512_mask_cvt_roundepu64_ph( src: __m128h, k: __mmask8, a: __m512i, ) -> __m128h { - static_assert_rounding!(ROUNDING); - simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::(a), src) + unsafe { + static_assert_rounding!(ROUNDING); + simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::(a), src) + } } /// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements, @@ -12528,10 +12758,7 @@ pub unsafe fn _mm512_mask_cvt_roundepu64_ph( #[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundepu64_ph( - k: __mmask8, - a: __m512i, -) -> __m128h { +pub fn _mm512_maskz_cvt_roundepu64_ph(k: __mmask8, a: __m512i) -> __m128h { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundepu64_ph::(_mm_setzero_ph(), k, a) } @@ -12544,7 +12771,7 @@ pub unsafe fn _mm512_maskz_cvt_roundepu64_ph( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtxps_ph(a: __m128) -> __m128h { +pub fn _mm_cvtxps_ph(a: __m128) -> __m128h { _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a) } @@ -12557,8 +12784,8 @@ pub unsafe fn _mm_cvtxps_ph(a: __m128) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h { - vcvtps2phx_128(a, src, k) +pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h { + unsafe { vcvtps2phx_128(a, src, k) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12570,7 +12797,7 @@ pub unsafe fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn 
_mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h { +pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h { _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a) } @@ -12582,7 +12809,7 @@ pub unsafe fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtxps_ph(a: __m256) -> __m128h { +pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h { _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a) } @@ -12595,8 +12822,8 @@ pub unsafe fn _mm256_cvtxps_ph(a: __m256) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h { - vcvtps2phx_256(a, src, k) +pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h { + unsafe { vcvtps2phx_256(a, src, k) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12608,7 +12835,7 @@ pub unsafe fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h { +pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h { _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a) } @@ -12620,7 +12847,7 @@ pub unsafe fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtxps_ph(a: __m512) -> __m256h { +pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h { _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), 0xffff, a) } @@ -12633,8 +12860,8 @@ pub unsafe fn _mm512_cvtxps_ph(a: __m512) -> __m256h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h { - vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h { + unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12646,7 +12873,7 @@ pub unsafe fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtps2phx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h { +pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h { _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), k, a) } @@ -12667,7 +12894,7 @@ pub unsafe fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h { #[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtx_roundps_ph(a: __m512) -> __m256h { +pub fn _mm512_cvtx_roundps_ph(a: __m512) -> __m256h { static_assert_rounding!(ROUNDING); 
_mm512_mask_cvtx_roundps_ph::(_mm256_setzero_ph(), 0xffff, a) } @@ -12690,13 +12917,15 @@ pub unsafe fn _mm512_cvtx_roundps_ph(a: __m512) -> __m256h #[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtx_roundps_ph( +pub fn _mm512_mask_cvtx_roundps_ph( src: __m256h, k: __mmask16, a: __m512, ) -> __m256h { - static_assert_rounding!(ROUNDING); - vcvtps2phx_512(a, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vcvtps2phx_512(a, src, k, ROUNDING) + } } /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12717,10 +12946,7 @@ pub unsafe fn _mm512_mask_cvtx_roundps_ph( #[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtx_roundps_ph( - k: __mmask16, - a: __m512, -) -> __m256h { +pub fn _mm512_maskz_cvtx_roundps_ph(k: __mmask16, a: __m512) -> __m256h { static_assert_rounding!(ROUNDING); _mm512_mask_cvtx_roundps_ph::(_mm256_setzero_ph(), k, a) } @@ -12734,7 +12960,7 @@ pub unsafe fn _mm512_maskz_cvtx_roundps_ph( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtss2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h { +pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h { _mm_mask_cvtss_sh(_mm_undefined_ph(), 0xff, a, b) } @@ -12748,8 +12974,8 @@ pub unsafe fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtss2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h { - vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h { + unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -12762,7 +12988,7 @@ pub unsafe fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtss2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h { +pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h { _mm_mask_cvtss_sh(_mm_setzero_ph(), k, a, b) } @@ -12784,7 +13010,7 @@ pub unsafe fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h #[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundss_sh(a: __m128h, b: __m128) -> __m128h { +pub fn _mm_cvt_roundss_sh(a: __m128h, b: __m128) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_cvt_roundss_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -12808,14 +13034,16 @@ pub unsafe fn _mm_cvt_roundss_sh(a: __m128h, b: __m128) -> #[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvt_roundss_sh( +pub fn _mm_mask_cvt_roundss_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128, ) 
-> __m128h { - static_assert_rounding!(ROUNDING); - vcvtss2sh(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vcvtss2sh(a, b, src, k, ROUNDING) + } } /// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit) @@ -12837,7 +13065,7 @@ pub unsafe fn _mm_mask_cvt_roundss_sh( #[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvt_roundss_sh( +pub fn _mm_maskz_cvt_roundss_sh( k: __mmask8, a: __m128h, b: __m128, @@ -12854,7 +13082,7 @@ pub unsafe fn _mm_maskz_cvt_roundss_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtpd_ph(a: __m128d) -> __m128h { +pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h { _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) } @@ -12867,8 +13095,8 @@ pub unsafe fn _mm_cvtpd_ph(a: __m128d) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h { - vcvtpd2ph_128(a, src, k) +pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h { + unsafe { vcvtpd2ph_128(a, src, k) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12880,7 +13108,7 @@ pub unsafe fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h { +pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h { _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a) } @@ -12892,7 +13120,7 @@ pub unsafe fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtpd_ph(a: __m256d) -> __m128h { +pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h { _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) } @@ -12905,8 +13133,8 @@ pub unsafe fn _mm256_cvtpd_ph(a: __m256d) -> __m128h { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h { - vcvtpd2ph_256(a, src, k) +pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h { + unsafe { vcvtpd2ph_256(a, src, k) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12918,7 +13146,7 @@ pub unsafe fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h { +pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h { _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a) } @@ -12930,7 +13158,7 @@ pub unsafe fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h { 
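// A minimal sketch of calling one of the `*_round*` variants above, which take the
// rounding mode as the const generic checked by `static_assert_rounding!`. Assumes a
// nightly toolchain with `#![feature(stdarch_x86_avx512_f16)]`; the fn name and inputs
// are hypothetical.
use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
fn round_demo(a: __m128h, b: __m128) -> __m128h {
    // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` evaluates to 8, the same value the
    // `assert_instr(..., ROUNDING = 8)` attributes above test against.
    _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
}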
#[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtpd_ph(a: __m512d) -> __m128h { +pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h { _mm512_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a) } @@ -12943,8 +13171,8 @@ pub unsafe fn _mm512_cvtpd_ph(a: __m512d) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h { - vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h { + unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -12956,7 +13184,7 @@ pub unsafe fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtpd2ph))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h { +pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h { _mm512_mask_cvtpd_ph(_mm_setzero_ph(), k, a) } @@ -12977,7 +13205,7 @@ pub unsafe fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h { #[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundpd_ph(a: __m512d) -> __m128h { +pub fn _mm512_cvt_roundpd_ph(a: __m512d) -> __m128h { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundpd_ph::(_mm_setzero_ph(), 0xff, a) } @@ -13000,13 +13228,15 @@ pub unsafe fn _mm512_cvt_roundpd_ph(a: __m512d) -> __m128h #[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundpd_ph( +pub fn _mm512_mask_cvt_roundpd_ph( src: __m128h, k: __mmask8, a: __m512d, ) -> __m128h { - static_assert_rounding!(ROUNDING); - vcvtpd2ph_512(a, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vcvtpd2ph_512(a, src, k, ROUNDING) + } } /// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit) @@ -13027,7 +13257,7 @@ pub unsafe fn _mm512_mask_cvt_roundpd_ph( #[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundpd_ph(k: __mmask8, a: __m512d) -> __m128h { +pub fn _mm512_maskz_cvt_roundpd_ph(k: __mmask8, a: __m512d) -> __m128h { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundpd_ph::(_mm_setzero_ph(), k, a) } @@ -13041,7 +13271,7 @@ pub unsafe fn _mm512_maskz_cvt_roundpd_ph(k: __mmask8, a: _ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsd2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h { +pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h { _mm_mask_cvtsd_sh(_mm_undefined_ph(), 0xff, a, b) } @@ -13055,8 +13285,8 @@ pub unsafe fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsd2sh))] 
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h { - vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h { + unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13069,7 +13299,7 @@ pub unsafe fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsd2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h { +pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h { _mm_mask_cvtsd_sh(_mm_setzero_ph(), k, a, b) } @@ -13091,7 +13321,7 @@ pub unsafe fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h #[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundsd_sh(a: __m128h, b: __m128d) -> __m128h { +pub fn _mm_cvt_roundsd_sh(a: __m128h, b: __m128d) -> __m128h { static_assert_rounding!(ROUNDING); _mm_mask_cvt_roundsd_sh::(_mm_undefined_ph(), 0xff, a, b) } @@ -13115,14 +13345,16 @@ pub unsafe fn _mm_cvt_roundsd_sh(a: __m128h, b: __m128d) -> #[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvt_roundsd_sh( +pub fn _mm_mask_cvt_roundsd_sh( src: __m128h, k: __mmask8, a: __m128h, b: __m128d, ) -> __m128h { - static_assert_rounding!(ROUNDING); - vcvtsd2sh(a, b, src, k, ROUNDING) + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsd2sh(a, b, src, k, ROUNDING) + } } /// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit) @@ -13144,7 +13376,7 @@ pub unsafe fn _mm_mask_cvt_roundsd_sh( #[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvt_roundsd_sh( +pub fn _mm_maskz_cvt_roundsd_sh( k: __mmask8, a: __m128h, b: __m128d, @@ -13161,7 +13393,7 @@ pub unsafe fn _mm_maskz_cvt_roundsd_sh( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtph_epi16(a: __m128h) -> __m128i { +pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i { _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a) } @@ -13174,8 +13406,8 @@ pub unsafe fn _mm_cvtph_epi16(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvtph2w_128(a, src.as_i16x8(), k)) +pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and @@ -13186,7 +13418,7 @@ pub unsafe fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m #[target_feature(enable = 
"avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a) } @@ -13198,7 +13430,7 @@ pub unsafe fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtph_epi16(a: __m256h) -> __m256i { +pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i { _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a) } @@ -13211,8 +13443,8 @@ pub unsafe fn _mm256_cvtph_epi16(a: __m256h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { - transmute(vcvtph2w_256(a, src.as_i16x16(), k)) +pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and @@ -13223,7 +13455,7 @@ pub unsafe fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i { +pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i { _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a) } @@ -13235,7 +13467,7 @@ pub unsafe fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtph_epi16(a: __m512h) -> __m512i { +pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i { _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a) } @@ -13248,13 +13480,15 @@ pub unsafe fn _mm512_cvtph_epi16(a: __m512h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { - transmute(vcvtph2w_512( - a, - src.as_i16x32(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvtph2w_512( + a, + src.as_i16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and @@ -13265,7 +13499,7 @@ pub unsafe fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i { +pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i { _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a) } @@ -13286,7 +13520,7 @@ pub unsafe fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i { 
#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i { +pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a) }
@@ -13309,13 +13543,15 @@ pub unsafe fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m51 #[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>( +pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>( src: __m512i, k: __mmask32, a: __m512h, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
@@ -13335,10 +13571,7 @@ pub unsafe fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>( #[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>( - k: __mmask32, - a: __m512h, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a) }
@@ -13351,7 +13584,7 @@ pub unsafe fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtph_epu16(a: __m128h) -> __m128i { +pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i { _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a) }
@@ -13364,8 +13597,8 @@ pub unsafe fn _mm_cvtph_epu16(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) +pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
@@ -13376,7 +13609,7 @@ pub unsafe fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a) }
@@ -13388,7 +13621,7 @@ pub unsafe fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtph_epu16(a: __m256h) -> __m256i { +pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i { _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a) }
@@ -13401,8 +13634,8 @@ pub unsafe fn _mm256_cvtph_epu16(a: __m256h) ->
__m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { - transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) +pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, @@ -13413,7 +13646,7 @@ pub unsafe fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i { +pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i { _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a) } @@ -13425,7 +13658,7 @@ pub unsafe fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtph_epu16(a: __m512h) -> __m512i { +pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i { _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a) } @@ -13438,13 +13671,15 @@ pub unsafe fn _mm512_cvtph_epu16(a: __m512h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { - transmute(vcvtph2uw_512( - a, - src.as_u16x32(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvtph2uw_512( + a, + src.as_u16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, @@ -13455,7 +13690,7 @@ pub unsafe fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i { +pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i { _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a) } @@ -13476,7 +13711,7 @@ pub unsafe fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i { #[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundph_epu16(a: __m512h) -> __m512i { +pub fn _mm512_cvt_roundph_epu16(a: __m512h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epu16::(_mm512_undefined_epi32(), 0xffffffff, a) } @@ -13499,13 +13734,15 @@ pub unsafe fn _mm512_cvt_roundph_epu16(a: __m512h) -> __m51 #[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundph_epu16( +pub fn _mm512_mask_cvt_roundph_epu16( src: __m512i, k: __mmask32, a: __m512h, ) -> __m512i { - 
static_assert_rounding!(ROUNDING); - transmute(vcvtph2uw_512(a, src.as_u16x32(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtph2uw_512(a, src.as_u16x32(), k, ROUNDING)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers, @@ -13525,10 +13762,7 @@ pub unsafe fn _mm512_mask_cvt_roundph_epu16( #[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundph_epu16( - k: __mmask32, - a: __m512h, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundph_epu16(k: __mmask32, a: __m512h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epu16::(_mm512_setzero_si512(), k, a) } @@ -13541,7 +13775,7 @@ pub unsafe fn _mm512_maskz_cvt_roundph_epu16( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttph_epi16(a: __m128h) -> __m128i { +pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i { _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a) } @@ -13554,8 +13788,8 @@ pub unsafe fn _mm_cvttph_epi16(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvttph2w_128(a, src.as_i16x8(), k)) +pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with @@ -13567,7 +13801,7 @@ pub unsafe fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a) } @@ -13579,7 +13813,7 @@ pub unsafe fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvttph_epi16(a: __m256h) -> __m256i { +pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i { _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a) } @@ -13592,8 +13826,8 @@ pub unsafe fn _mm256_cvttph_epi16(a: __m256h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { - transmute(vcvttph2w_256(a, src.as_i16x16(), k)) +pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with @@ -13605,7 +13839,7 @@ pub unsafe fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) - #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, 
assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i { +pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i { _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a) } @@ -13617,7 +13851,7 @@ pub unsafe fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvttph_epi16(a: __m512h) -> __m512i { +pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i { _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a) } @@ -13630,13 +13864,15 @@ pub unsafe fn _mm512_cvttph_epi16(a: __m512h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { - transmute(vcvttph2w_512( - a, - src.as_i16x32(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvttph2w_512( + a, + src.as_i16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with @@ -13648,7 +13884,7 @@ pub unsafe fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) - #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2w))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i { +pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i { _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a) } @@ -13663,7 +13899,7 @@ pub unsafe fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtt_roundph_epi16(a: __m512h) -> __m512i { +pub fn _mm512_cvtt_roundph_epi16(a: __m512h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epi16::(_mm512_undefined_epi32(), 0xffffffff, a) } @@ -13680,13 +13916,15 @@ pub unsafe fn _mm512_cvtt_roundph_epi16(a: __m512h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtt_roundph_epi16( +pub fn _mm512_mask_cvtt_roundph_epi16( src: __m512i, k: __mmask32, a: __m512h, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with @@ -13701,7 +13939,7 @@ pub unsafe fn _mm512_mask_cvtt_roundph_epi16( #[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtt_roundph_epi16(k: __mmask32, a: __m512h) -> __m512i { +pub fn _mm512_maskz_cvtt_roundph_epi16(k: __mmask32, a: __m512h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epi16::(_mm512_setzero_si512(), k, a) } @@ -13714,7 
+13952,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundph_epi16(k: __mmask32, a: _ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttph_epu16(a: __m128h) -> __m128i { +pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i { _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a) } @@ -13727,8 +13965,8 @@ pub unsafe fn _mm_cvttph_epu16(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) +pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with @@ -13740,7 +13978,7 @@ pub unsafe fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a) } @@ -13752,7 +13990,7 @@ pub unsafe fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvttph_epu16(a: __m256h) -> __m256i { +pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i { _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a) } @@ -13765,8 +14003,8 @@ pub unsafe fn _mm256_cvttph_epu16(a: __m256h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { - transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) +pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i { + unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with @@ -13778,7 +14016,7 @@ pub unsafe fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) - #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i { +pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i { _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a) } @@ -13790,7 +14028,7 @@ pub unsafe fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvttph_epu16(a: __m512h) -> __m512i { +pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i { _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a) } @@ -13803,13 +14041,15 @@ pub unsafe fn _mm512_cvttph_epu16(a: 
__m512h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { - transmute(vcvttph2uw_512( - a, - src.as_u16x32(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i { + unsafe { + transmute(vcvttph2uw_512( + a, + src.as_u16x32(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with @@ -13821,7 +14061,7 @@ pub unsafe fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) - #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2uw))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i { +pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i { _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a) } @@ -13836,7 +14076,7 @@ pub unsafe fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtt_roundph_epu16(a: __m512h) -> __m512i { +pub fn _mm512_cvtt_roundph_epu16(a: __m512h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epu16::(_mm512_undefined_epi32(), 0xffffffff, a) } @@ -13853,13 +14093,15 @@ pub unsafe fn _mm512_cvtt_roundph_epu16(a: __m512h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtt_roundph_epu16( +pub fn _mm512_mask_cvtt_roundph_epu16( src: __m512i, k: __mmask32, a: __m512h, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with @@ -13874,7 +14116,7 @@ pub unsafe fn _mm512_mask_cvtt_roundph_epu16( #[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtt_roundph_epu16(k: __mmask32, a: __m512h) -> __m512i { +pub fn _mm512_maskz_cvtt_roundph_epu16(k: __mmask32, a: __m512h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epu16::(_mm512_setzero_si512(), k, a) } @@ -13887,7 +14129,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundph_epu16(k: __mmask32, a: _ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtph_epi32(a: __m128h) -> __m128i { +pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i { _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a) } @@ -13899,8 +14141,8 @@ pub unsafe fn _mm_cvtph_epi32(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) 
+pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the @@ -13911,7 +14153,7 @@ pub unsafe fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a) } @@ -13923,7 +14165,7 @@ pub unsafe fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtph_epi32(a: __m128h) -> __m256i { +pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i { _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a) } @@ -13935,8 +14177,8 @@ pub unsafe fn _mm256_cvtph_epi32(a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) +pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the @@ -13947,7 +14189,7 @@ pub unsafe fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i { +pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i { _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a) } @@ -13959,7 +14201,7 @@ pub unsafe fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtph_epi32(a: __m256h) -> __m512i { +pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i { _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a) } @@ -13971,13 +14213,15 @@ pub unsafe fn _mm512_cvtph_epi32(a: __m256h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { - transmute(vcvtph2dq_512( - a, - src.as_i32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvtph2dq_512( + a, + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the @@ -13988,7 +14232,7 @@ pub unsafe fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> #[target_feature(enable = "avx512fp16")] 
#[cfg_attr(test, assert_instr(vcvtph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i { +pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i { _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a) } @@ -14009,7 +14253,7 @@ pub unsafe fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i { #[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundph_epi32(a: __m256h) -> __m512i { +pub fn _mm512_cvt_roundph_epi32(a: __m256h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epi32::(_mm512_undefined_epi32(), 0xffff, a) } @@ -14031,13 +14275,15 @@ pub unsafe fn _mm512_cvt_roundph_epi32(a: __m256h) -> __m51 #[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundph_epi32( +pub fn _mm512_mask_cvt_roundph_epi32( src: __m512i, k: __mmask16, a: __m256h, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the @@ -14057,10 +14303,7 @@ pub unsafe fn _mm512_mask_cvt_roundph_epi32( #[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundph_epi32( - k: __mmask16, - a: __m256h, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundph_epi32(k: __mmask16, a: __m256h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epi32::(_mm512_setzero_si512(), k, a) } @@ -14073,8 +14316,8 @@ pub unsafe fn _mm512_maskz_cvt_roundph_epi32( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2si))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsh_i32(a: __m128h) -> i32 { - vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtsh_i32(a: __m128h) -> i32 { + unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store @@ -14094,9 +14337,11 @@ pub unsafe fn _mm_cvtsh_i32(a: __m128h) -> i32 { #[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundsh_i32(a: __m128h) -> i32 { - static_assert_rounding!(ROUNDING); - vcvtsh2si32(a, ROUNDING) +pub fn _mm_cvt_roundsh_i32(a: __m128h) -> i32 { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsh2si32(a, ROUNDING) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the @@ -14107,7 +14352,7 @@ pub unsafe fn _mm_cvt_roundsh_i32(a: __m128h) -> i32 { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtph_epu32(a: __m128h) -> __m128i { +pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i { _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a) } @@ -14119,8 +14364,8 @@ pub unsafe 
fn _mm_cvtph_epu32(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) +pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store @@ -14131,7 +14376,7 @@ pub unsafe fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a) } @@ -14143,7 +14388,7 @@ pub unsafe fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtph_epu32(a: __m128h) -> __m256i { +pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i { _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a) } @@ -14155,8 +14400,8 @@ pub unsafe fn _mm256_cvtph_epu32(a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) +pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store @@ -14167,7 +14412,7 @@ pub unsafe fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i { +pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i { _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a) } @@ -14179,7 +14424,7 @@ pub unsafe fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtph_epu32(a: __m256h) -> __m512i { +pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i { _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a) } @@ -14191,13 +14436,15 @@ pub unsafe fn _mm512_cvtph_epu32(a: __m256h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { - transmute(vcvtph2udq_512( - a, - src.as_u32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + 
transmute(vcvtph2udq_512( + a, + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store @@ -14208,7 +14455,7 @@ pub unsafe fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i { +pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i { _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a) } @@ -14229,7 +14476,7 @@ pub unsafe fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i { #[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundph_epu32(a: __m256h) -> __m512i { +pub fn _mm512_cvt_roundph_epu32(a: __m256h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epu32::(_mm512_undefined_epi32(), 0xffff, a) } @@ -14251,13 +14498,15 @@ pub unsafe fn _mm512_cvt_roundph_epu32(a: __m256h) -> __m51 #[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundph_epu32( +pub fn _mm512_mask_cvt_roundph_epu32( src: __m512i, k: __mmask16, a: __m256h, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store @@ -14277,10 +14526,7 @@ pub unsafe fn _mm512_mask_cvt_roundph_epu32( #[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundph_epu32( - k: __mmask16, - a: __m256h, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundph_epu32(k: __mmask16, a: __m256h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epu32::(_mm512_setzero_si512(), k, a) } @@ -14293,8 +14539,8 @@ pub unsafe fn _mm512_maskz_cvt_roundph_epu32( #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2usi))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsh_u32(a: __m128h) -> u32 { - vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtsh_u32(a: __m128h) -> u32 { + unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store @@ -14314,9 +14560,11 @@ pub unsafe fn _mm_cvtsh_u32(a: __m128h) -> u32 { #[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundsh_u32(a: __m128h) -> u32 { - static_assert_rounding!(ROUNDING); - vcvtsh2usi32(a, ROUNDING) +pub fn _mm_cvt_roundsh_u32(a: __m128h) -> u32 { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsh2usi32(a, ROUNDING) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and @@ -14327,7 +14575,7 @@ pub unsafe fn 
_mm_cvt_roundsh_u32(a: __m128h) -> u32 { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttph_epi32(a: __m128h) -> __m128i { +pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i { _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a) } @@ -14339,8 +14587,8 @@ pub unsafe fn _mm_cvttph_epi32(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) +pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and @@ -14351,7 +14599,7 @@ pub unsafe fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a) } @@ -14363,7 +14611,7 @@ pub unsafe fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvttph_epi32(a: __m128h) -> __m256i { +pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i { _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a) } @@ -14375,8 +14623,8 @@ pub unsafe fn _mm256_cvttph_epi32(a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) +pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and @@ -14387,7 +14635,7 @@ pub unsafe fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i { +pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i { _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a) } @@ -14399,7 +14647,7 @@ pub unsafe fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvttph_epi32(a: __m256h) -> __m512i { +pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i { _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a) } @@ -14411,13 +14659,15 @@ pub unsafe fn _mm512_cvttph_epi32(a: __m256h) -> __m512i { 
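For illustration only, not part of this patch: the `_cvt_round*` intrinsics keep their rounding mode as a const generic, so a safe caller passes it via turbofish. A minimal sketch under the same toolchain assumptions as the earlier aside; the helper name is made up:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512fp16")]
fn ph_to_i32_nearest(a: __m256h) -> __m512i {
    // static_assert_rounding! accepts a rounding mode OR'ed with
    // _MM_FROUND_NO_EXC, or _MM_FROUND_CUR_DIRECTION.
    _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a)
}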
#[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { - transmute(vcvttph2dq_512( - a, - src.as_i32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvttph2dq_512( + a, + src.as_i32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and @@ -14428,7 +14678,7 @@ pub unsafe fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) - #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2dq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i { +pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i { _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a) } @@ -14443,7 +14693,7 @@ pub unsafe fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtt_roundph_epi32(a: __m256h) -> __m512i { +pub fn _mm512_cvtt_roundph_epi32(a: __m256h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epi32::(_mm512_undefined_epi32(), 0xffff, a) } @@ -14459,13 +14709,15 @@ pub unsafe fn _mm512_cvtt_roundph_epi32(a: __m256h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtt_roundph_epi32( +pub fn _mm512_mask_cvtt_roundph_epi32( src: __m512i, k: __mmask16, a: __m256h, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and @@ -14479,7 +14731,7 @@ pub unsafe fn _mm512_mask_cvtt_roundph_epi32( #[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtt_roundph_epi32(k: __mmask16, a: __m256h) -> __m512i { +pub fn _mm512_maskz_cvtt_roundph_epi32(k: __mmask16, a: __m256h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epi32::(_mm512_setzero_si512(), k, a) } @@ -14492,8 +14744,8 @@ pub unsafe fn _mm512_maskz_cvtt_roundph_epi32(k: __mmask16, a: _ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttsh2si))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttsh_i32(a: __m128h) -> i32 { - vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttsh_i32(a: __m128h) -> i32 { + unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store @@ -14507,9 +14759,11 @@ pub unsafe fn _mm_cvttsh_i32(a: __m128h) -> i32 { #[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe 
fn _mm_cvtt_roundsh_i32(a: __m128h) -> i32 { - static_assert_sae!(SAE); - vcvttsh2si32(a, SAE) +pub fn _mm_cvtt_roundsh_i32(a: __m128h) -> i32 { + unsafe { + static_assert_sae!(SAE); + vcvttsh2si32(a, SAE) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and @@ -14520,7 +14774,7 @@ pub unsafe fn _mm_cvtt_roundsh_i32(a: __m128h) -> i32 { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttph_epu32(a: __m128h) -> __m128i { +pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i { _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a) } @@ -14532,8 +14786,8 @@ pub unsafe fn _mm_cvttph_epu32(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) +pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and @@ -14544,7 +14798,7 @@ pub unsafe fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a) } @@ -14556,7 +14810,7 @@ pub unsafe fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvttph_epu32(a: __m128h) -> __m256i { +pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i { _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a) } @@ -14568,8 +14822,8 @@ pub unsafe fn _mm256_cvttph_epu32(a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) +pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and @@ -14580,7 +14834,7 @@ pub unsafe fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i { +pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i { _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a) } @@ -14592,7 +14846,7 @@ pub unsafe fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i { 
#[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvttph_epu32(a: __m256h) -> __m512i { +pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i { _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a) } @@ -14604,13 +14858,15 @@ pub unsafe fn _mm512_cvttph_epu32(a: __m256h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { - transmute(vcvttph2udq_512( - a, - src.as_u32x16(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i { + unsafe { + transmute(vcvttph2udq_512( + a, + src.as_u32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and @@ -14621,7 +14877,7 @@ pub unsafe fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) - #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2udq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i { +pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i { _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a) } @@ -14636,7 +14892,7 @@ pub unsafe fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtt_roundph_epu32(a: __m256h) -> __m512i { +pub fn _mm512_cvtt_roundph_epu32(a: __m256h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epu32::(_mm512_undefined_epi32(), 0xffff, a) } @@ -14652,13 +14908,15 @@ pub unsafe fn _mm512_cvtt_roundph_epu32(a: __m256h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtt_roundph_epu32( +pub fn _mm512_mask_cvtt_roundph_epu32( src: __m512i, k: __mmask16, a: __m256h, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and @@ -14672,7 +14930,7 @@ pub unsafe fn _mm512_mask_cvtt_roundph_epu32( #[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtt_roundph_epu32(k: __mmask16, a: __m256h) -> __m512i { +pub fn _mm512_maskz_cvtt_roundph_epu32(k: __mmask16, a: __m256h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epu32::(_mm512_setzero_si512(), k, a) } @@ -14685,8 +14943,8 @@ pub unsafe fn _mm512_maskz_cvtt_roundph_epu32(k: __mmask16, a: _ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttsh2usi))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttsh_u32(a: __m128h) -> u32 { - vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) +pub fn 
_mm_cvttsh_u32(a: __m128h) -> u32 { + unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store @@ -14700,9 +14958,11 @@ pub unsafe fn _mm_cvttsh_u32(a: __m128h) -> u32 { #[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtt_roundsh_u32(a: __m128h) -> u32 { - static_assert_sae!(SAE); - vcvttsh2usi32(a, SAE) +pub fn _mm_cvtt_roundsh_u32(a: __m128h) -> u32 { + unsafe { + static_assert_sae!(SAE); + vcvttsh2usi32(a, SAE) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and @@ -14713,7 +14973,7 @@ pub unsafe fn _mm_cvtt_roundsh_u32(a: __m128h) -> u32 { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtph_epi64(a: __m128h) -> __m128i { +pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i { _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a) } @@ -14725,8 +14985,8 @@ pub unsafe fn _mm_cvtph_epi64(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) +pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and @@ -14737,7 +14997,7 @@ pub unsafe fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a) } @@ -14749,7 +15009,7 @@ pub unsafe fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtph_epi64(a: __m128h) -> __m256i { +pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i { _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a) } @@ -14761,8 +15021,8 @@ pub unsafe fn _mm256_cvtph_epi64(a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) +pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and @@ -14773,7 +15033,7 @@ pub unsafe fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = 
"stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i { +pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i { _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a) } @@ -14785,7 +15045,7 @@ pub unsafe fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtph_epi64(a: __m128h) -> __m512i { +pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i { _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a) } @@ -14797,13 +15057,15 @@ pub unsafe fn _mm512_cvtph_epi64(a: __m128h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { - transmute(vcvtph2qq_512( - a, - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvtph2qq_512( + a, + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and @@ -14814,7 +15076,7 @@ pub unsafe fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i { +pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i { _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a) } @@ -14835,7 +15097,7 @@ pub unsafe fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i { #[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundph_epi64(a: __m128h) -> __m512i { +pub fn _mm512_cvt_roundph_epi64(a: __m128h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epi64::(_mm512_undefined_epi32(), 0xff, a) } @@ -14857,13 +15119,15 @@ pub unsafe fn _mm512_cvt_roundph_epi64(a: __m128h) -> __m51 #[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundph_epi64( +pub fn _mm512_mask_cvt_roundph_epi64( src: __m512i, k: __mmask8, a: __m128h, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and @@ -14883,10 +15147,7 @@ pub unsafe fn _mm512_mask_cvt_roundph_epi64( #[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundph_epi64( - k: __mmask8, - a: __m128h, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundph_epi64(k: __mmask8, a: __m128h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epi64::(_mm512_setzero_si512(), k, a) } @@ -14899,7 +15160,7 @@ pub unsafe fn 
_mm512_maskz_cvt_roundph_epi64( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtph_epu64(a: __m128h) -> __m128i { +pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i { _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a) } @@ -14911,8 +15172,8 @@ pub unsafe fn _mm_cvtph_epu64(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) +pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and @@ -14923,7 +15184,7 @@ pub unsafe fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a) } @@ -14935,7 +15196,7 @@ pub unsafe fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtph_epu64(a: __m128h) -> __m256i { +pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i { _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a) } @@ -14947,8 +15208,8 @@ pub unsafe fn _mm256_cvtph_epu64(a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) +pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and @@ -14959,7 +15220,7 @@ pub unsafe fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i { +pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i { _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a) } @@ -14971,7 +15232,7 @@ pub unsafe fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtph_epu64(a: __m128h) -> __m512i { +pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i { _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a) } @@ -14983,13 +15244,15 @@ pub unsafe fn _mm512_cvtph_epu64(a: __m128h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, 
assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { - transmute(vcvtph2uqq_512( - a, - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvtph2uqq_512( + a, + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and @@ -15000,7 +15263,7 @@ pub unsafe fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i { +pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i { _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a) } @@ -15021,7 +15284,7 @@ pub unsafe fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i { #[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundph_epu64(a: __m128h) -> __m512i { +pub fn _mm512_cvt_roundph_epu64(a: __m128h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epu64::(_mm512_undefined_epi32(), 0xff, a) } @@ -15043,13 +15306,15 @@ pub unsafe fn _mm512_cvt_roundph_epu64(a: __m128h) -> __m51 #[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundph_epu64( +pub fn _mm512_mask_cvt_roundph_epu64( src: __m512i, k: __mmask8, a: __m128h, ) -> __m512i { - static_assert_rounding!(ROUNDING); - transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING)) + unsafe { + static_assert_rounding!(ROUNDING); + transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and @@ -15069,10 +15334,7 @@ pub unsafe fn _mm512_mask_cvt_roundph_epu64( #[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundph_epu64( - k: __mmask8, - a: __m128h, -) -> __m512i { +pub fn _mm512_maskz_cvt_roundph_epu64(k: __mmask8, a: __m128h) -> __m512i { static_assert_rounding!(ROUNDING); _mm512_mask_cvt_roundph_epu64::(_mm512_setzero_si512(), k, a) } @@ -15085,7 +15347,7 @@ pub unsafe fn _mm512_maskz_cvt_roundph_epu64( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttph_epi64(a: __m128h) -> __m128i { +pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i { _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a) } @@ -15097,8 +15359,8 @@ pub unsafe fn _mm_cvttph_epi64(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) +pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: 
__m128h) -> __m128i { + unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and @@ -15109,7 +15371,7 @@ pub unsafe fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a) } @@ -15121,7 +15383,7 @@ pub unsafe fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvttph_epi64(a: __m128h) -> __m256i { +pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i { _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a) } @@ -15133,8 +15395,8 @@ pub unsafe fn _mm256_cvttph_epi64(a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) +pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and @@ -15145,7 +15407,7 @@ pub unsafe fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i { +pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i { _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a) } @@ -15157,7 +15419,7 @@ pub unsafe fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvttph_epi64(a: __m128h) -> __m512i { +pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i { _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a) } @@ -15169,13 +15431,15 @@ pub unsafe fn _mm512_cvttph_epi64(a: __m128h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { - transmute(vcvttph2qq_512( - a, - src.as_i64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvttph2qq_512( + a, + src.as_i64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and @@ -15186,7 +15450,7 @@ pub unsafe fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, 
assert_instr(vcvttph2qq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i { +pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i { _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a) } @@ -15201,7 +15465,7 @@ pub unsafe fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtt_roundph_epi64(a: __m128h) -> __m512i { +pub fn _mm512_cvtt_roundph_epi64(a: __m128h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epi64::(_mm512_undefined_epi32(), 0xff, a) } @@ -15217,13 +15481,15 @@ pub unsafe fn _mm512_cvtt_roundph_epi64(a: __m128h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtt_roundph_epi64( +pub fn _mm512_mask_cvtt_roundph_epi64( src: __m512i, k: __mmask8, a: __m128h, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and @@ -15237,7 +15503,7 @@ pub unsafe fn _mm512_mask_cvtt_roundph_epi64( #[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtt_roundph_epi64(k: __mmask8, a: __m128h) -> __m512i { +pub fn _mm512_maskz_cvtt_roundph_epi64(k: __mmask8, a: __m128h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epi64::(_mm512_setzero_si512(), k, a) } @@ -15250,7 +15516,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundph_epi64(k: __mmask8, a: __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttph_epu64(a: __m128h) -> __m128i { +pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i { _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a) } @@ -15262,8 +15528,8 @@ pub unsafe fn _mm_cvttph_epu64(a: __m128h) -> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { - transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) +pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i { + unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and @@ -15274,7 +15540,7 @@ pub unsafe fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i { +pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i { _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a) } @@ -15286,7 +15552,7 @@ pub unsafe fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) 
-> __m128i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvttph_epu64(a: __m128h) -> __m256i { +pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i { _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a) } @@ -15298,8 +15564,8 @@ pub unsafe fn _mm256_cvttph_epu64(a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { - transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) +pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i { + unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and @@ -15310,7 +15576,7 @@ pub unsafe fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i { +pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i { _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a) } @@ -15322,7 +15588,7 @@ pub unsafe fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvttph_epu64(a: __m128h) -> __m512i { +pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i { _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a) } @@ -15334,13 +15600,15 @@ pub unsafe fn _mm512_cvttph_epu64(a: __m128h) -> __m512i { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { - transmute(vcvttph2uqq_512( - a, - src.as_u64x8(), - k, - _MM_FROUND_CUR_DIRECTION, - )) +pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i { + unsafe { + transmute(vcvttph2uqq_512( + a, + src.as_u64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and @@ -15351,7 +15619,7 @@ pub unsafe fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttph2uqq))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i { +pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i { _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a) } @@ -15366,7 +15634,7 @@ pub unsafe fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtt_roundph_epu64(a: __m128h) -> __m512i { +pub fn _mm512_cvtt_roundph_epu64(a: __m128h) -> __m512i { static_assert_sae!(SAE); 
_mm512_mask_cvtt_roundph_epu64::(_mm512_undefined_epi32(), 0xff, a) } @@ -15382,13 +15650,15 @@ pub unsafe fn _mm512_cvtt_roundph_epu64(a: __m128h) -> __m512i { #[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtt_roundph_epu64( +pub fn _mm512_mask_cvtt_roundph_epu64( src: __m512i, k: __mmask8, a: __m128h, ) -> __m512i { - static_assert_sae!(SAE); - transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE)) + unsafe { + static_assert_sae!(SAE); + transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE)) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and @@ -15402,7 +15672,7 @@ pub unsafe fn _mm512_mask_cvtt_roundph_epu64( #[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtt_roundph_epu64(k: __mmask8, a: __m128h) -> __m512i { +pub fn _mm512_maskz_cvtt_roundph_epu64(k: __mmask8, a: __m128h) -> __m512i { static_assert_sae!(SAE); _mm512_mask_cvtt_roundph_epu64::(_mm512_setzero_si512(), k, a) } @@ -15415,7 +15685,7 @@ pub unsafe fn _mm512_maskz_cvtt_roundph_epu64(k: __mmask8, a: __ #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtxph_ps(a: __m128h) -> __m128 { +pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 { _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a) } @@ -15428,8 +15698,8 @@ pub unsafe fn _mm_cvtxph_ps(a: __m128h) -> __m128 { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 { - vcvtph2psx_128(a, src, k) +pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 { + unsafe { vcvtph2psx_128(a, src, k) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -15441,7 +15711,7 @@ pub unsafe fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 { +pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 { _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a) } @@ -15453,7 +15723,7 @@ pub unsafe fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtxph_ps(a: __m128h) -> __m256 { +pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 { _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a) } @@ -15466,8 +15736,8 @@ pub unsafe fn _mm256_cvtxph_ps(a: __m128h) -> __m256 { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 { - vcvtph2psx_256(a, src, k) +pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 { + unsafe { vcvtph2psx_256(a, src, k) } } /// 
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -15479,7 +15749,7 @@ pub unsafe fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 { +pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 { _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a) } @@ -15491,7 +15761,7 @@ pub unsafe fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtxph_ps(a: __m256h) -> __m512 { +pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 { _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a) } @@ -15504,8 +15774,8 @@ pub unsafe fn _mm512_cvtxph_ps(a: __m256h) -> __m512 { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 { - vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 { + unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -15517,7 +15787,7 @@ pub unsafe fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2psx))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 { +pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 { _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a) } @@ -15532,7 +15802,7 @@ pub unsafe fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 { #[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtx_roundph_ps(a: __m256h) -> __m512 { +pub fn _mm512_cvtx_roundph_ps(a: __m256h) -> __m512 { static_assert_sae!(SAE); _mm512_mask_cvtx_roundph_ps::(_mm512_setzero_ps(), 0xffff, a) } @@ -15549,13 +15819,15 @@ pub unsafe fn _mm512_cvtx_roundph_ps(a: __m256h) -> __m512 { #[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtx_roundph_ps( +pub fn _mm512_mask_cvtx_roundph_ps( src: __m512, k: __mmask16, a: __m256h, ) -> __m512 { - static_assert_sae!(SAE); - vcvtph2psx_512(a, src, k, SAE) + unsafe { + static_assert_sae!(SAE); + vcvtph2psx_512(a, src, k, SAE) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) @@ -15570,7 +15842,7 @@ pub unsafe fn _mm512_mask_cvtx_roundph_ps( #[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtx_roundph_ps(k: __mmask16, a: __m256h) -> __m512 { +pub fn _mm512_maskz_cvtx_roundph_ps(k: __mmask16, a: __m256h) -> __m512 { static_assert_sae!(SAE); 
_mm512_mask_cvtx_roundph_ps::(_mm512_setzero_ps(), k, a) } @@ -15584,7 +15856,7 @@ pub unsafe fn _mm512_maskz_cvtx_roundph_ps(k: __mmask16, a: __m2 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2ss))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 { +pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 { _mm_mask_cvtsh_ss(a, 0xff, a, b) } @@ -15598,8 +15870,8 @@ pub unsafe fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2ss))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 { - vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 { + unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) @@ -15612,7 +15884,7 @@ pub unsafe fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2ss))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { +pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { _mm_mask_cvtsh_ss(_mm_setzero_ps(), k, a, b) } @@ -15628,7 +15900,7 @@ pub unsafe fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { #[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundsh_ss(a: __m128, b: __m128h) -> __m128 { +pub fn _mm_cvt_roundsh_ss(a: __m128, b: __m128h) -> __m128 { static_assert_sae!(SAE); _mm_mask_cvt_roundsh_ss::(_mm_undefined_ps(), 0xff, a, b) } @@ -15646,14 +15918,16 @@ pub unsafe fn _mm_cvt_roundsh_ss(a: __m128, b: __m128h) -> __m12 #[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvt_roundsh_ss( +pub fn _mm_mask_cvt_roundsh_ss( src: __m128, k: __mmask8, a: __m128, b: __m128h, ) -> __m128 { - static_assert_sae!(SAE); - vcvtsh2ss(a, b, src, k, SAE) + unsafe { + static_assert_sae!(SAE); + vcvtsh2ss(a, b, src, k, SAE) + } } /// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit) @@ -15669,11 +15943,7 @@ pub unsafe fn _mm_mask_cvt_roundsh_ss( #[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvt_roundsh_ss( - k: __mmask8, - a: __m128, - b: __m128h, -) -> __m128 { +pub fn _mm_maskz_cvt_roundsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 { static_assert_sae!(SAE); _mm_mask_cvt_roundsh_ss::(_mm_setzero_ps(), k, a, b) } @@ -15686,7 +15956,7 @@ pub unsafe fn _mm_maskz_cvt_roundsh_ss( #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtph_pd(a: __m128h) -> __m128d { +pub fn _mm_cvtph_pd(a: __m128h) -> __m128d { _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a) } @@ -15699,8 +15969,8 @@ pub unsafe fn _mm_cvtph_pd(a: __m128h) -> 
__m128d { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d { - vcvtph2pd_128(a, src, k) +pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d { + unsafe { vcvtph2pd_128(a, src, k) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) @@ -15712,7 +15982,7 @@ pub unsafe fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128 #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d { +pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d { _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a) } @@ -15724,7 +15994,7 @@ pub unsafe fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtph_pd(a: __m128h) -> __m256d { +pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d { _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a) } @@ -15737,8 +16007,8 @@ pub unsafe fn _mm256_cvtph_pd(a: __m128h) -> __m256d { #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d { - vcvtph2pd_256(a, src, k) +pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d { + unsafe { vcvtph2pd_256(a, src, k) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) @@ -15750,7 +16020,7 @@ pub unsafe fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m #[target_feature(enable = "avx512fp16,avx512vl")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d { +pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d { _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a) } @@ -15762,7 +16032,7 @@ pub unsafe fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtph_pd(a: __m128h) -> __m512d { +pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d { _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a) } @@ -15775,8 +16045,8 @@ pub unsafe fn _mm512_cvtph_pd(a: __m128h) -> __m512d { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d { - vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d { + unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) @@ -15788,7 +16058,7 @@ pub unsafe fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m 
#[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtph2pd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d { +pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d { _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a) } @@ -15803,7 +16073,7 @@ pub unsafe fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d { #[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvt_roundph_pd(a: __m128h) -> __m512d { +pub fn _mm512_cvt_roundph_pd(a: __m128h) -> __m512d { static_assert_sae!(SAE); _mm512_mask_cvt_roundph_pd::(_mm512_setzero_pd(), 0xff, a) } @@ -15820,13 +16090,15 @@ pub unsafe fn _mm512_cvt_roundph_pd(a: __m128h) -> __m512d { #[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_mask_cvt_roundph_pd( +pub fn _mm512_mask_cvt_roundph_pd( src: __m512d, k: __mmask8, a: __m128h, ) -> __m512d { - static_assert_sae!(SAE); - vcvtph2pd_512(a, src, k, SAE) + unsafe { + static_assert_sae!(SAE); + vcvtph2pd_512(a, src, k, SAE) + } } /// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit) @@ -15841,7 +16113,7 @@ pub unsafe fn _mm512_mask_cvt_roundph_pd( #[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_maskz_cvt_roundph_pd(k: __mmask8, a: __m128h) -> __m512d { +pub fn _mm512_maskz_cvt_roundph_pd(k: __mmask8, a: __m128h) -> __m512d { static_assert_sae!(SAE); _mm512_mask_cvt_roundph_pd::(_mm512_setzero_pd(), k, a) } @@ -15855,7 +16127,7 @@ pub unsafe fn _mm512_maskz_cvt_roundph_pd(k: __mmask8, a: __m128 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2sd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d { +pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d { _mm_mask_cvtsh_sd(a, 0xff, a, b) } @@ -15869,8 +16141,8 @@ pub unsafe fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2sd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d { - vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d { + unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit) @@ -15882,7 +16154,7 @@ pub unsafe fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128 #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2sd))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d { +pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d { _mm_mask_cvtsh_sd(_mm_setzero_pd(), k, a, b) } @@ -15898,7 +16170,7 @@ pub unsafe fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d #[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))] 
#[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d { +pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d { static_assert_sae!(SAE); _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b) } @@ -15916,14 +16188,16 @@ pub unsafe fn _mm_cvt_roundsh_sd(a: __m128d, b: __m128h) -> __m1 #[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))] #[rustc_legacy_const_generics(4)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_mask_cvt_roundsh_sd<const SAE: i32>( +pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>( src: __m128d, k: __mmask8, a: __m128d, b: __m128h, ) -> __m128d { - static_assert_sae!(SAE); - vcvtsh2sd(a, b, src, k, SAE) + unsafe { + static_assert_sae!(SAE); + vcvtsh2sd(a, b, src, k, SAE) + } } /// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit) @@ -15938,11 +16212,7 @@ pub unsafe fn _mm_mask_cvt_roundsh_sd( #[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))] #[rustc_legacy_const_generics(3)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>( - k: __mmask8, - a: __m128d, - b: __m128h, -) -> __m128d { +pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d { static_assert_sae!(SAE); _mm_mask_cvt_roundsh_sd::<SAE>(_mm_setzero_pd(), k, a, b) } @@ -15953,8 +16223,8 @@ pub unsafe fn _mm_maskz_cvt_roundsh_sd( #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsh_h(a: __m128h) -> f16 { - simd_extract!(a, 0) +pub fn _mm_cvtsh_h(a: __m128h) -> f16 { + unsafe { simd_extract!(a, 0) } } /// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`. @@ -15963,8 +16233,8 @@ pub unsafe fn _mm_cvtsh_h(a: __m128h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm256_cvtsh_h(a: __m256h) -> f16 { - simd_extract!(a, 0) +pub fn _mm256_cvtsh_h(a: __m256h) -> f16 { + unsafe { simd_extract!(a, 0) } } /// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`. @@ -15973,8 +16243,8 @@ pub unsafe fn _mm256_cvtsh_h(a: __m256h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm512_cvtsh_h(a: __m512h) -> f16 { - simd_extract!(a, 0) +pub fn _mm512_cvtsh_h(a: __m512h) -> f16 { + unsafe { simd_extract!(a, 0) } } /// Copy the lower 16-bit integer in a to dst. @@ -15983,8 +16253,8 @@ pub unsafe fn _mm512_cvtsh_h(a: __m512h) -> f16 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsi128_si16(a: __m128i) -> i16 { - simd_extract!(a.as_i16x8(), 0) +pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 { + unsafe { simd_extract!(a.as_i16x8(), 0) } } /// Copy 16-bit integer a to the lower elements of dst, and zero the upper elements of dst. 
@@ -15993,8 +16263,8 @@ pub unsafe fn _mm_cvtsi128_si16(a: __m128i) -> i16 { #[inline] #[target_feature(enable = "avx512fp16")] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsi16_si128(a: i16) -> __m128i { - transmute(simd_insert!(i16x8::ZERO, 0, a)) +pub fn _mm_cvtsi16_si128(a: i16) -> __m128i { + unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86/avx512ifma.rs b/crates/core_arch/src/x86/avx512ifma.rs index 12123c2162..e4e715ae7b 100644 --- a/crates/core_arch/src/x86/avx512ifma.rs +++ b/crates/core_arch/src/x86/avx512ifma.rs @@ -15,8 +15,8 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512ifma")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - vpmadd52huq_512(a, b, c) +pub fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { vpmadd52huq_512(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -31,13 +31,8 @@ pub unsafe fn _mm512_madd52hi_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51 #[target_feature(enable = "avx512ifma")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm512_mask_madd52hi_epu64( - a: __m512i, - k: __mmask8, - b: __m512i, - c: __m512i, -) -> __m512i { - simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a) +pub fn _mm512_mask_madd52hi_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), a) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -52,13 +47,8 @@ pub unsafe fn _mm512_mask_madd52hi_epu64( #[target_feature(enable = "avx512ifma")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm512_maskz_madd52hi_epu64( - k: __mmask8, - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512()) +pub fn _mm512_maskz_madd52hi_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52huq_512(a, b, c), _mm512_setzero_si512()) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -72,8 +62,8 @@ pub unsafe fn _mm512_maskz_madd52hi_epu64( #[target_feature(enable = "avx512ifma")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - vpmadd52luq_512(a, b, c) +pub fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { vpmadd52luq_512(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -88,13 +78,8 @@ pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51 #[target_feature(enable = "avx512ifma")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm512_mask_madd52lo_epu64( - a: __m512i, - k: __mmask8, - b: __m512i, - c: __m512i, -) -> __m512i { - simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a) +pub fn _mm512_mask_madd52lo_epu64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), a) } } /// 
Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -109,13 +94,8 @@ pub unsafe fn _mm512_mask_madd52lo_epu64( #[target_feature(enable = "avx512ifma")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm512_maskz_madd52lo_epu64( - k: __mmask8, - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) +pub fn _mm512_maskz_madd52lo_epu64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -132,8 +112,8 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpmadd52huq) )] -pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - vpmadd52huq_256(a, b, c) +pub fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52huq_256(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -147,8 +127,8 @@ pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> _ #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - vpmadd52huq_256(a, b, c) +pub fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52huq_256(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -163,13 +143,8 @@ pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25 #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm256_mask_madd52hi_epu64( - a: __m256i, - k: __mmask8, - b: __m256i, - c: __m256i, -) -> __m256i { - simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a) +pub fn _mm256_mask_madd52hi_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), a) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -184,13 +159,8 @@ pub unsafe fn _mm256_mask_madd52hi_epu64( #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm256_maskz_madd52hi_epu64( - k: __mmask8, - a: __m256i, - b: __m256i, - c: __m256i, -) -> __m256i { - simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) +pub fn _mm256_maskz_madd52hi_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -207,8 +177,8 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpmadd52luq) )] -pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - vpmadd52luq_256(a, b, c) +pub fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52luq_256(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -222,8 
+192,8 @@ pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> _ #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - vpmadd52luq_256(a, b, c) +pub fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { vpmadd52luq_256(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -238,13 +208,8 @@ pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m25 #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm256_mask_madd52lo_epu64( - a: __m256i, - k: __mmask8, - b: __m256i, - c: __m256i, -) -> __m256i { - simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a) +pub fn _mm256_mask_madd52lo_epu64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), a) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -259,13 +224,8 @@ pub unsafe fn _mm256_mask_madd52lo_epu64( #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm256_maskz_madd52lo_epu64( - k: __mmask8, - a: __m256i, - b: __m256i, - c: __m256i, -) -> __m256i { - simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) +pub fn _mm256_maskz_madd52lo_epu64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -282,8 +242,8 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpmadd52huq) )] -pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - vpmadd52huq_128(a, b, c) +pub fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52huq_128(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -297,8 +257,8 @@ pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m1 #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - vpmadd52huq_128(a, b, c) +pub fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52huq_128(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -313,8 +273,8 @@ pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a) +pub fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), a) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -329,8 +289,8 
@@ pub unsafe fn _mm_mask_madd52hi_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __ #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52huq))] -pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) +pub fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -347,8 +307,8 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _ all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpmadd52luq) )] -pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - vpmadd52luq_128(a, b, c) +pub fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52luq_128(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -362,8 +322,8 @@ pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m1 #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - vpmadd52luq_128(a, b, c) +pub fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { vpmadd52luq_128(a, b, c) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -378,8 +338,8 @@ pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a) +pub fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), a) } } /// Multiply packed unsigned 52-bit integers in each 64-bit element of @@ -394,8 +354,8 @@ pub unsafe fn _mm_mask_madd52lo_epu64(a: __m128i, k: __mmask8, b: __m128i, c: __ #[target_feature(enable = "avx512ifma,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmadd52luq))] -pub unsafe fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128()) +pub fn _mm_maskz_madd52lo_epu64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { simd_select_bitmask(k, vpmadd52luq_128(a, b, c), _mm_setzero_si128()) } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86/avx512vbmi.rs b/crates/core_arch/src/x86/avx512vbmi.rs index b9bded92d6..cd3f4ca03d 100644 --- a/crates/core_arch/src/x86/avx512vbmi.rs +++ b/crates/core_arch/src/x86/avx512vbmi.rs @@ -11,8 +11,8 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub unsafe fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { - transmute(vpermi2b(a.as_i8x64(), 
idx.as_i8x64(), b.as_i8x64())) +pub fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpermi2b(a.as_i8x64(), idx.as_i8x64(), b.as_i8x64())) } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -22,14 +22,16 @@ pub unsafe fn _mm512_permutex2var_epi8(a: __m512i, idx: __m512i, b: __m512i) -> #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2b))] -pub unsafe fn _mm512_mask_permutex2var_epi8( +pub fn _mm512_mask_permutex2var_epi8( a: __m512i, k: __mmask64, idx: __m512i, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); - transmute(simd_select_bitmask(k, permute, a.as_i8x64())) + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, a.as_i8x64())) + } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -39,14 +41,16 @@ pub unsafe fn _mm512_mask_permutex2var_epi8( #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub unsafe fn _mm512_maskz_permutex2var_epi8( +pub fn _mm512_maskz_permutex2var_epi8( k: __mmask64, a: __m512i, idx: __m512i, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); - transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) + } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -56,14 +60,16 @@ pub unsafe fn _mm512_maskz_permutex2var_epi8( #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2b))] -pub unsafe fn _mm512_mask2_permutex2var_epi8( +pub fn _mm512_mask2_permutex2var_epi8( a: __m512i, idx: __m512i, k: __mmask64, b: __m512i, ) -> __m512i { - let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); - transmute(simd_select_bitmask(k, permute, idx.as_i8x64())) + unsafe { + let permute = _mm512_permutex2var_epi8(a, idx, b).as_i8x64(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x64())) + } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. 
@@ -73,8 +79,8 @@ pub unsafe fn _mm512_mask2_permutex2var_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub unsafe fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { - transmute(vpermi2b256(a.as_i8x32(), idx.as_i8x32(), b.as_i8x32())) +pub fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpermi2b256(a.as_i8x32(), idx.as_i8x32(), b.as_i8x32())) } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -84,14 +90,16 @@ pub unsafe fn _mm256_permutex2var_epi8(a: __m256i, idx: __m256i, b: __m256i) -> #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2b))] -pub unsafe fn _mm256_mask_permutex2var_epi8( +pub fn _mm256_mask_permutex2var_epi8( a: __m256i, k: __mmask32, idx: __m256i, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); - transmute(simd_select_bitmask(k, permute, a.as_i8x32())) + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, a.as_i8x32())) + } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -101,14 +109,16 @@ pub unsafe fn _mm256_mask_permutex2var_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub unsafe fn _mm256_maskz_permutex2var_epi8( +pub fn _mm256_maskz_permutex2var_epi8( k: __mmask32, a: __m256i, idx: __m256i, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); - transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) + } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -118,14 +128,16 @@ pub unsafe fn _mm256_maskz_permutex2var_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2b))] -pub unsafe fn _mm256_mask2_permutex2var_epi8( +pub fn _mm256_mask2_permutex2var_epi8( a: __m256i, idx: __m256i, k: __mmask32, b: __m256i, ) -> __m256i { - let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); - transmute(simd_select_bitmask(k, permute, idx.as_i8x32())) + unsafe { + let permute = _mm256_permutex2var_epi8(a, idx, b).as_i8x32(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x32())) + } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst. 
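The three masked forms above differ only in where an element comes from when its mask bit is clear; a rough per-element model (not part of the patch):

// Fallback sources for the masked permutex2var variants, per element i:
//   mask  : k bit clear -> a[i]    (writemask)
//   maskz : k bit clear -> 0       (zeromask)
//   mask2 : k bit clear -> idx[i]  (the index vector passes through)
fn masked_element(k: u64, i: usize, permuted: i8, a: i8, idx: i8) -> (i8, i8, i8) {
    if k & (1u64 << i) != 0 {
        (permuted, permuted, permuted)
    } else {
        (a, 0, idx)
    }
}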
@@ -135,8 +147,8 @@ pub unsafe fn _mm256_mask2_permutex2var_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub unsafe fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { - transmute(vpermi2b128(a.as_i8x16(), idx.as_i8x16(), b.as_i8x16())) +pub fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpermi2b128(a.as_i8x16(), idx.as_i8x16(), b.as_i8x16())) } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -146,14 +158,11 @@ pub unsafe fn _mm_permutex2var_epi8(a: __m128i, idx: __m128i, b: __m128i) -> __m #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermt2b))] -pub unsafe fn _mm_mask_permutex2var_epi8( - a: __m128i, - k: __mmask16, - idx: __m128i, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); - transmute(simd_select_bitmask(k, permute, a.as_i8x16())) +pub fn _mm_mask_permutex2var_epi8(a: __m128i, k: __mmask16, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, a.as_i8x16())) + } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -163,14 +172,11 @@ pub unsafe fn _mm_mask_permutex2var_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2b -pub unsafe fn _mm_maskz_permutex2var_epi8( - k: __mmask16, - a: __m128i, - idx: __m128i, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); - transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) +pub fn _mm_maskz_permutex2var_epi8(k: __mmask16, a: __m128i, idx: __m128i, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) + } } /// Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -180,14 +186,11 @@ pub unsafe fn _mm_maskz_permutex2var_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermi2b))] -pub unsafe fn _mm_mask2_permutex2var_epi8( - a: __m128i, - idx: __m128i, - k: __mmask16, - b: __m128i, -) -> __m128i { - let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); - transmute(simd_select_bitmask(k, permute, idx.as_i8x16())) +pub fn _mm_mask2_permutex2var_epi8(a: __m128i, idx: __m128i, k: __mmask16, b: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutex2var_epi8(a, idx, b).as_i8x16(); + transmute(simd_select_bitmask(k, permute, idx.as_i8x16())) + } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. 
@@ -197,8 +200,8 @@ pub unsafe fn _mm_mask2_permutex2var_epi8( #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i { - transmute(vpermb(a.as_i8x64(), idx.as_i8x64())) +pub fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i { + unsafe { transmute(vpermb(a.as_i8x64(), idx.as_i8x64())) } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -208,14 +211,16 @@ pub unsafe fn _mm512_permutexvar_epi8(idx: __m512i, a: __m512i) -> __m512i { #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm512_mask_permutexvar_epi8( +pub fn _mm512_mask_permutexvar_epi8( src: __m512i, k: __mmask64, idx: __m512i, a: __m512i, ) -> __m512i { - let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); - transmute(simd_select_bitmask(k, permute, src.as_i8x64())) + unsafe { + let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); + transmute(simd_select_bitmask(k, permute, src.as_i8x64())) + } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -225,9 +230,11 @@ pub unsafe fn _mm512_mask_permutexvar_epi8( #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i { - let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); - transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) +pub fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m512i) -> __m512i { + unsafe { + let permute = _mm512_permutexvar_epi8(idx, a).as_i8x64(); + transmute(simd_select_bitmask(k, permute, i8x64::ZERO)) + } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. @@ -237,8 +244,8 @@ pub unsafe fn _mm512_maskz_permutexvar_epi8(k: __mmask64, idx: __m512i, a: __m51 #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i { - transmute(vpermb256(a.as_i8x32(), idx.as_i8x32())) +pub fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i { + unsafe { transmute(vpermb256(a.as_i8x32(), idx.as_i8x32())) } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
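A single-table byte permute like the one above reduces to an index lookup per output byte; a scalar model (illustrative only, not part of the patch):

// Scalar model of _mm512_permutexvar_epi8: each output byte is a[idx & 63].
fn permutexvar_epi8_model(idx: [u8; 64], a: [u8; 64]) -> [u8; 64] {
    core::array::from_fn(|i| a[(idx[i] & 0x3f) as usize]) // only the low 6 index bits are used
}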
@@ -248,14 +255,16 @@ pub unsafe fn _mm256_permutexvar_epi8(idx: __m256i, a: __m256i) -> __m256i { #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm256_mask_permutexvar_epi8( +pub fn _mm256_mask_permutexvar_epi8( src: __m256i, k: __mmask32, idx: __m256i, a: __m256i, ) -> __m256i { - let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); - transmute(simd_select_bitmask(k, permute, src.as_i8x32())) + unsafe { + let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); + transmute(simd_select_bitmask(k, permute, src.as_i8x32())) + } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -265,9 +274,11 @@ pub unsafe fn _mm256_mask_permutexvar_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i { - let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); - transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) +pub fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m256i) -> __m256i { + unsafe { + let permute = _mm256_permutexvar_epi8(idx, a).as_i8x32(); + transmute(simd_select_bitmask(k, permute, i8x32::ZERO)) + } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. @@ -277,8 +288,8 @@ pub unsafe fn _mm256_maskz_permutexvar_epi8(k: __mmask32, idx: __m256i, a: __m25 #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i { - transmute(vpermb128(a.as_i8x16(), idx.as_i8x16())) +pub fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i { + unsafe { transmute(vpermb128(a.as_i8x16(), idx.as_i8x16())) } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -288,14 +299,11 @@ pub unsafe fn _mm_permutexvar_epi8(idx: __m128i, a: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm_mask_permutexvar_epi8( - src: __m128i, - k: __mmask16, - idx: __m128i, - a: __m128i, -) -> __m128i { - let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); - transmute(simd_select_bitmask(k, permute, src.as_i8x16())) +pub fn _mm_mask_permutexvar_epi8(src: __m128i, k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); + transmute(simd_select_bitmask(k, permute, src.as_i8x16())) + } } /// Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -305,9 +313,11 @@ pub unsafe fn _mm_mask_permutexvar_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpermb))] -pub unsafe fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { - let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); - transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) +pub fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) -> __m128i { + unsafe { + let permute = _mm_permutexvar_epi8(idx, a).as_i8x16(); + transmute(simd_select_bitmask(k, permute, i8x16::ZERO)) + } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. @@ -317,8 +327,8 @@ pub unsafe fn _mm_maskz_permutexvar_epi8(k: __mmask16, idx: __m128i, a: __m128i) #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i { - transmute(vpmultishiftqb(a.as_i8x64(), b.as_i8x64())) +pub fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpmultishiftqb(a.as_i8x64(), b.as_i8x64())) } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -328,14 +338,16 @@ pub unsafe fn _mm512_multishift_epi64_epi8(a: __m512i, b: __m512i) -> __m512i { #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm512_mask_multishift_epi64_epi8( +pub fn _mm512_mask_multishift_epi64_epi8( src: __m512i, k: __mmask64, a: __m512i, b: __m512i, ) -> __m512i { - let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, multishift, src.as_i8x64())) + unsafe { + let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x64())) + } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
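The multishift operation above can be modeled per 64-bit lane as eight independent bit-field extractions with wrap-around; a scalar sketch (not part of the patch):

// Scalar model of vpmultishiftqb for one 64-bit lane: each control byte in `ctrl`
// gives a bit offset (mod 64), and the result byte is the 8 bits of `src`
// starting at that offset, wrapping around the qword.
fn multishift_qword_model(ctrl: u64, src: u64) -> u64 {
    let mut dst = 0u64;
    for byte in 0..8 {
        let off = ((ctrl >> (8 * byte)) as u8 & 0x3f) as u32;
        let picked = src.rotate_right(off) as u8;
        dst |= (picked as u64) << (8 * byte);
    }
    dst
}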
@@ -345,9 +357,11 @@ pub unsafe fn _mm512_mask_multishift_epi64_epi8( #[target_feature(enable = "avx512vbmi")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { - let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); - transmute(simd_select_bitmask(k, multishift, i8x64::ZERO)) +pub fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let multishift = _mm512_multishift_epi64_epi8(a, b).as_i8x64(); + transmute(simd_select_bitmask(k, multishift, i8x64::ZERO)) + } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. @@ -357,8 +371,8 @@ pub unsafe fn _mm512_maskz_multishift_epi64_epi8(k: __mmask64, a: __m512i, b: __ #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i { - transmute(vpmultishiftqb256(a.as_i8x32(), b.as_i8x32())) +pub fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpmultishiftqb256(a.as_i8x32(), b.as_i8x32())) } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -368,14 +382,16 @@ pub unsafe fn _mm256_multishift_epi64_epi8(a: __m256i, b: __m256i) -> __m256i { #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm256_mask_multishift_epi64_epi8( +pub fn _mm256_mask_multishift_epi64_epi8( src: __m256i, k: __mmask32, a: __m256i, b: __m256i, ) -> __m256i { - let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, multishift, src.as_i8x32())) + unsafe { + let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x32())) + } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -385,9 +401,11 @@ pub unsafe fn _mm256_mask_multishift_epi64_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { - let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); - transmute(simd_select_bitmask(k, multishift, i8x32::ZERO)) +pub fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let multishift = _mm256_multishift_epi64_epi8(a, b).as_i8x32(); + transmute(simd_select_bitmask(k, multishift, i8x32::ZERO)) + } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst. @@ -397,8 +415,8 @@ pub unsafe fn _mm256_maskz_multishift_epi64_epi8(k: __mmask32, a: __m256i, b: __ #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i { - transmute(vpmultishiftqb128(a.as_i8x16(), b.as_i8x16())) +pub fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpmultishiftqb128(a.as_i8x16(), b.as_i8x16())) } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -408,14 +426,16 @@ pub unsafe fn _mm_multishift_epi64_epi8(a: __m128i, b: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm_mask_multishift_epi64_epi8( +pub fn _mm_mask_multishift_epi64_epi8( src: __m128i, k: __mmask16, a: __m128i, b: __m128i, ) -> __m128i { - let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, multishift, src.as_i8x16())) + unsafe { + let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, multishift, src.as_i8x16())) + } } /// For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
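A hypothetical call-site sketch of what this change buys (assumes a nightly toolchain with feature(stdarch_x86_avx512), since these intrinsics are still unstable): once the intrinsic is a safe fn, a caller that already enables the required target features needs no unsafe block for the call itself.

use core::arch::x86_64::{__m128i, _mm_multishift_epi64_epi8};

// Safe caller: the target features it enables match those the intrinsic requires.
#[target_feature(enable = "avx512vbmi,avx512vl")]
fn gather_byte_fields(ctrl: __m128i, data: __m128i) -> __m128i {
    _mm_multishift_epi64_epi8(ctrl, data)
}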
@@ -425,9 +445,11 @@ pub unsafe fn _mm_mask_multishift_epi64_epi8( #[target_feature(enable = "avx512vbmi,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpmultishiftqb))] -pub unsafe fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { - let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); - transmute(simd_select_bitmask(k, multishift, i8x16::ZERO)) +pub fn _mm_maskz_multishift_epi64_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let multishift = _mm_multishift_epi64_epi8(a, b).as_i8x16(); + transmute(simd_select_bitmask(k, multishift, i8x16::ZERO)) + } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86/avx512vbmi2.rs b/crates/core_arch/src/x86/avx512vbmi2.rs index f5a9cce3e6..97c7986c17 100644 --- a/crates/core_arch/src/x86/avx512vbmi2.rs +++ b/crates/core_arch/src/x86/avx512vbmi2.rs @@ -235,8 +235,8 @@ pub unsafe fn _mm_mask_compressstoreu_epi8(base_addr: *mut u8, k: __mmask16, a: #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k)) +pub fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k)) } } /// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -246,8 +246,8 @@ pub unsafe fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i { - transmute(vpcompressw(a.as_i16x32(), i16x32::ZERO, k)) +pub fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressw(a.as_i16x32(), i16x32::ZERO, k)) } } /// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -257,8 +257,8 @@ pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k)) +pub fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k)) } } /// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. 
@@ -268,8 +268,8 @@ pub unsafe fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i { - transmute(vpcompressw256(a.as_i16x16(), i16x16::ZERO, k)) +pub fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressw256(a.as_i16x16(), i16x16::ZERO, k)) } } /// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -279,8 +279,8 @@ pub unsafe fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k)) +pub fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k)) } } /// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -290,8 +290,8 @@ pub unsafe fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressw))] -pub unsafe fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpcompressw128(a.as_i16x8(), i16x8::ZERO, k)) +pub fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressw128(a.as_i16x8(), i16x8::ZERO, k)) } } /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -301,8 +301,8 @@ pub unsafe fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k)) +pub fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k)) } } /// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -312,8 +312,8 @@ pub unsafe fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i { - transmute(vpcompressb(a.as_i8x64(), i8x64::ZERO, k)) +pub fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpcompressb(a.as_i8x64(), i8x64::ZERO, k)) } } /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. 
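Compression above packs the selected elements to the front of the destination; a scalar model for the byte form (not part of the patch):

// Scalar model of _mm512_mask_compress_epi8: active bytes of `a` are packed to the
// front of the result; the remaining tail keeps `src` (use an all-zero `src` to
// model the maskz form).
fn mask_compress_epi8_model(src: [i8; 64], k: u64, a: [i8; 64]) -> [i8; 64] {
    let mut dst = src;
    let mut out = 0;
    for i in 0..64 {
        if k & (1u64 << i) != 0 {
            dst[out] = a[i];
            out += 1;
        }
    }
    dst
}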
@@ -323,8 +323,8 @@ pub unsafe fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k)) +pub fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k)) } } /// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -334,8 +334,8 @@ pub unsafe fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i { - transmute(vpcompressb256(a.as_i8x32(), i8x32::ZERO, k)) +pub fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpcompressb256(a.as_i8x32(), i8x32::ZERO, k)) } } /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src. @@ -345,8 +345,8 @@ pub unsafe fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k)) +pub fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k)) } } /// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero. @@ -356,8 +356,8 @@ pub unsafe fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpcompressb))] -pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i { - transmute(vpcompressb128(a.as_i8x16(), i8x16::ZERO, k)) +pub fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpcompressb128(a.as_i8x16(), i8x16::ZERO, k)) } } /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -367,8 +367,8 @@ pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandw))] -pub unsafe fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { - transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k)) +pub fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k)) } } /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -378,8 +378,8 @@ pub unsafe fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) - #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandw))] -pub unsafe fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i { - transmute(vpexpandw(a.as_i16x32(), i16x32::ZERO, k)) +pub fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandw(a.as_i16x32(), i16x32::ZERO, k)) } } /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -389,8 +389,8 @@ pub unsafe fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandw))] -pub unsafe fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { - transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k)) +pub fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k)) } } /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -400,8 +400,8 @@ pub unsafe fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) - #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandw))] -pub unsafe fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i { - transmute(vpexpandw256(a.as_i16x16(), i16x16::ZERO, k)) +pub fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandw256(a.as_i16x16(), i16x16::ZERO, k)) } } /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -411,8 +411,8 @@ pub unsafe fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandw))] -pub unsafe fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k)) +pub fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k)) } } /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -422,8 +422,8 @@ pub unsafe fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandw))] -pub unsafe fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i { - transmute(vpexpandw128(a.as_i16x8(), i16x8::ZERO, k)) +pub fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandw128(a.as_i16x8(), i16x8::ZERO, k)) } } /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -433,8 +433,8 @@ pub unsafe fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandb))] -pub unsafe fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { - transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k)) +pub fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k)) } } /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -444,8 +444,8 @@ pub unsafe fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandb))] -pub unsafe fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i { - transmute(vpexpandb(a.as_i8x64(), i8x64::ZERO, k)) +pub fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i { + unsafe { transmute(vpexpandb(a.as_i8x64(), i8x64::ZERO, k)) } } /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
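Expansion is the inverse of compression: the lowest bytes of the source are scattered to the positions whose mask bit is set; a scalar model for the byte form (not part of the patch):

// Scalar model of _mm512_mask_expand_epi8: consecutive bytes of `a` fill the
// positions selected by `k`; other positions keep `src` (or zero in the maskz form).
fn mask_expand_epi8_model(src: [i8; 64], k: u64, a: [i8; 64]) -> [i8; 64] {
    let mut dst = src;
    let mut next = 0;
    for i in 0..64 {
        if k & (1u64 << i) != 0 {
            dst[i] = a[next];
            next += 1;
        }
    }
    dst
}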
@@ -455,8 +455,8 @@ pub unsafe fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandb))] -pub unsafe fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { - transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k)) +pub fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k)) } } /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -466,8 +466,8 @@ pub unsafe fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandb))] -pub unsafe fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i { - transmute(vpexpandb256(a.as_i8x32(), i8x32::ZERO, k)) +pub fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i { + unsafe { transmute(vpexpandb256(a.as_i8x32(), i8x32::ZERO, k)) } } /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -477,8 +477,8 @@ pub unsafe fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandb))] -pub unsafe fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { - transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k)) +pub fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k)) } } /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -488,8 +488,8 @@ pub unsafe fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpexpandb))] -pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i { - transmute(vpexpandb128(a.as_i8x16(), i8x16::ZERO, k)) +pub fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i { + unsafe { transmute(vpexpandb128(a.as_i8x16(), i8x16::ZERO, k)) } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. 
@@ -499,8 +499,8 @@ pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) +pub fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8())) } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -510,9 +510,11 @@ pub unsafe fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { - let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); - transmute(simd_select_bitmask(k, shf, a.as_i64x8())) +pub fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, a.as_i64x8())) + } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -522,9 +524,11 @@ pub unsafe fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __ #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. @@ -534,8 +538,8 @@ pub unsafe fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: _ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) +pub fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4())) } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
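The variable funnel shift above is easiest to see per lane in widened arithmetic; a scalar model for the 64-bit form (illustrative only, not part of the patch):

// Scalar model of one lane of shldv_epi64: concatenate a:b into a 128-bit value
// (a in the upper half), shift left by c mod 64, keep the upper 64 bits.
fn shldv_epi64_lane(a: u64, b: u64, c: u64) -> u64 {
    let wide = ((a as u128) << 64) | (b as u128);
    ((wide << (c & 63)) >> 64) as u64
}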
@@ -545,9 +549,11 @@ pub unsafe fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); - transmute(simd_select_bitmask(k, shf, a.as_i64x4())) +pub fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, a.as_i64x4())) + } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -557,9 +563,11 @@ pub unsafe fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst. @@ -569,8 +577,8 @@ pub unsafe fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: _ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) +pub fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2())) } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -580,9 +588,11 @@ pub unsafe fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); - transmute(simd_select_bitmask(k, shf, a.as_i64x2())) +pub fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, a.as_i64x2())) + } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. 
Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -592,9 +602,11 @@ pub unsafe fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m12 #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvq))] -pub unsafe fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. @@ -604,8 +616,8 @@ pub unsafe fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m1 #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) +pub fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16())) } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -615,9 +627,11 @@ pub unsafe fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { - let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); - transmute(simd_select_bitmask(k, shf, a.as_i32x16())) +pub fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, a.as_i32x16())) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -627,14 +641,11 @@ pub unsafe fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: _ #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm512_maskz_shldv_epi32( - k: __mmask16, - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_shldv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. @@ -644,8 +655,8 @@ pub unsafe fn _mm512_maskz_shldv_epi32( #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) +pub fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8())) } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -655,9 +666,11 @@ pub unsafe fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); - transmute(simd_select_bitmask(k, shf, a.as_i32x8())) +pub fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, a.as_i32x8())) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -667,9 +680,11 @@ pub unsafe fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. 
Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst. @@ -679,8 +694,8 @@ pub unsafe fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: _ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) +pub fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4())) } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -690,9 +705,11 @@ pub unsafe fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); - transmute(simd_select_bitmask(k, shf, a.as_i32x4())) +pub fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, a.as_i32x4())) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -702,9 +719,11 @@ pub unsafe fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m12 #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvd))] -pub unsafe fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. @@ -714,8 +733,8 @@ pub unsafe fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m1 #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) +pub fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32())) } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. 
Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -725,9 +744,11 @@ pub unsafe fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { - let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); - transmute(simd_select_bitmask(k, shf, a.as_i16x32())) +pub fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, a.as_i16x32())) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -737,14 +758,11 @@ pub unsafe fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: _ #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm512_maskz_shldv_epi16( - k: __mmask32, - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_shldv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shldv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. @@ -754,8 +772,8 @@ pub unsafe fn _mm512_maskz_shldv_epi16( #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) +pub fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16())) } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
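The 16-bit form follows the same pattern, with the shift count taken modulo the 16-bit element width; a scalar model (not part of the patch):

// Scalar model of one lane of shldv_epi16: concatenate a:b into a 32-bit value
// (a in the upper half), shift left by c mod 16, keep the upper 16 bits.
fn shldv_epi16_lane(a: u16, b: u16, c: u16) -> u16 {
    let wide = ((a as u32) << 16) | (b as u32);
    ((wide << (c & 15)) >> 16) as u16
}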
@@ -765,9 +783,11 @@ pub unsafe fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); - transmute(simd_select_bitmask(k, shf, a.as_i16x16())) +pub fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, a.as_i16x16())) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -777,14 +797,11 @@ pub unsafe fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: _ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm256_maskz_shldv_epi16( - k: __mmask16, - a: __m256i, - b: __m256i, - c: __m256i, -) -> __m256i { - let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_shldv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shldv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst. @@ -794,8 +811,8 @@ pub unsafe fn _mm256_maskz_shldv_epi16( #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) +pub fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8())) } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -805,9 +822,11 @@ pub unsafe fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); - transmute(simd_select_bitmask(k, shf, a.as_i16x8())) +pub fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, a.as_i16x8())) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. 
Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -817,9 +836,11 @@ pub unsafe fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m12 #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldvw))] -pub unsafe fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shldv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. @@ -829,8 +850,8 @@ pub unsafe fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m1 #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - transmute(vpshrdvq(b.as_i64x8(), a.as_i64x8(), c.as_i64x8())) +pub fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvq(b.as_i64x8(), a.as_i64x8(), c.as_i64x8())) } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -840,9 +861,11 @@ pub unsafe fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { - let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); - transmute(simd_select_bitmask(k, shf, a.as_i64x8())) +pub fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, a.as_i64x8())) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -852,9 +875,11 @@ pub unsafe fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __ #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { - let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. @@ -864,8 +889,8 @@ pub unsafe fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: _ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - transmute(vpshrdvq256(b.as_i64x4(), a.as_i64x4(), c.as_i64x4())) +pub fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvq256(b.as_i64x4(), a.as_i64x4(), c.as_i64x4())) } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -875,9 +900,11 @@ pub unsafe fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); - transmute(simd_select_bitmask(k, shf, a.as_i64x4())) +pub fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, a.as_i64x4())) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -887,9 +914,11 @@ pub unsafe fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. 
Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst. @@ -899,8 +928,8 @@ pub unsafe fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: _ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - transmute(vpshrdvq128(b.as_i64x2(), a.as_i64x2(), c.as_i64x2())) +pub fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvq128(b.as_i64x2(), a.as_i64x2(), c.as_i64x2())) } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -910,9 +939,11 @@ pub unsafe fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); - transmute(simd_select_bitmask(k, shf, a.as_i64x2())) +pub fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, a.as_i64x2())) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -922,9 +953,11 @@ pub unsafe fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m12 #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvq))] -pub unsafe fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi64(a, b, c).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. @@ -934,8 +967,8 @@ pub unsafe fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m1 #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - transmute(vpshrdvd(b.as_i32x16(), a.as_i32x16(), c.as_i32x16())) +pub fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvd(b.as_i32x16(), a.as_i32x16(), c.as_i32x16())) } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. 
Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -945,9 +978,11 @@ pub unsafe fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { - let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); - transmute(simd_select_bitmask(k, shf, a.as_i32x16())) +pub fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, a.as_i32x16())) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -957,14 +992,11 @@ pub unsafe fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: _ #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm512_maskz_shrdv_epi32( - k: __mmask16, - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_shrdv_epi32(k: __mmask16, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. @@ -974,8 +1006,8 @@ pub unsafe fn _mm512_maskz_shrdv_epi32( #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - transmute(vpshrdvd256(b.as_i32x8(), a.as_i32x8(), c.as_i32x8())) +pub fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvd256(b.as_i32x8(), a.as_i32x8(), c.as_i32x8())) } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). 
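(Aside, not part of the patch: the _mm*_mask_* and _mm*_maskz_* wrappers in these hunks all reduce to the same per-lane selection via simd_select_bitmask — the computed lane where the mask bit is set, otherwise the src/a lane for writemask variants or zero for zeromask variants. A scalar sketch with hypothetical names:)

// Illustration only (not the stdarch implementation): per-lane mask selection.
fn select_bitmask(k: u8, computed: [i32; 4], fallback: [i32; 4]) -> [i32; 4] {
    let mut out = [0; 4];
    for i in 0..4 {
        // Bit i of the mask chooses the computed lane; otherwise the fallback lane.
        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { fallback[i] };
    }
    out
}

fn main() {
    assert_eq!(select_bitmask(0b0101, [9, 9, 9, 9], [0, 0, 0, 0]), [9, 0, 9, 0]);
}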
@@ -985,9 +1017,11 @@ pub unsafe fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); - transmute(simd_select_bitmask(k, shf, a.as_i32x8())) +pub fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, a.as_i32x8())) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -997,9 +1031,11 @@ pub unsafe fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst. @@ -1009,8 +1045,8 @@ pub unsafe fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: _ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - transmute(vpshrdvd128(b.as_i32x4(), a.as_i32x4(), c.as_i32x4())) +pub fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvd128(b.as_i32x4(), a.as_i32x4(), c.as_i32x4())) } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -1020,9 +1056,11 @@ pub unsafe fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); - transmute(simd_select_bitmask(k, shf, a.as_i32x4())) +pub fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, a.as_i32x4())) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. 
Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1032,9 +1070,11 @@ pub unsafe fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m12 #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvd))] -pub unsafe fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi32(a, b, c).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. @@ -1044,8 +1084,8 @@ pub unsafe fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m1 #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { - transmute(vpshrdvw(b.as_i16x32(), a.as_i16x32(), c.as_i16x32())) +pub fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { transmute(vpshrdvw(b.as_i16x32(), a.as_i16x32(), c.as_i16x32())) } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -1055,9 +1095,11 @@ pub unsafe fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { - let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); - transmute(simd_select_bitmask(k, shf, a.as_i16x32())) +pub fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, a.as_i16x32())) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
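(Aside, not part of the patch: the vpshrdv wrappers concatenate b:a and keep the low half after the right shift, which is why the hunks above pass b before a to the underlying intrinsic. A minimal scalar sketch for one 64-bit lane, assuming the count is reduced modulo the lane width as in the Intel pseudocode:)

// Illustration only: one 64-bit lane of a right funnel shift
// (the operation behind _mm*_shrdv_epi64; epi32/epi16 are the same with narrower lanes).
fn shrdv64_lane(a: u64, b: u64, c: u64) -> u64 {
    // Concatenate b:a into a 128-bit value, shift right, keep the lower 64 bits.
    let concat = ((b as u128) << 64) | a as u128;
    (concat >> (c & 63)) as u64
}

fn main() {
    // With a count of 4, each lane becomes (a >> 4) | (b << 60).
    assert_eq!(shrdv64_lane(0x10, 0x1, 4), (0x10 >> 4) | (0x1u64 << 60));
}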
@@ -1067,14 +1109,11 @@ pub unsafe fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: _ #[target_feature(enable = "avx512vbmi2")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm512_maskz_shrdv_epi16( - k: __mmask32, - a: __m512i, - b: __m512i, - c: __m512i, -) -> __m512i { - let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_shrdv_epi16(k: __mmask32, a: __m512i, b: __m512i, c: __m512i) -> __m512i { + unsafe { + let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. @@ -1084,8 +1123,8 @@ pub unsafe fn _mm512_maskz_shrdv_epi16( #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { - transmute(vpshrdvw256(b.as_i16x16(), a.as_i16x16(), c.as_i16x16())) +pub fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { transmute(vpshrdvw256(b.as_i16x16(), a.as_i16x16(), c.as_i16x16())) } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -1095,9 +1134,11 @@ pub unsafe fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { - let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); - transmute(simd_select_bitmask(k, shf, a.as_i16x16())) +pub fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, a.as_i16x16())) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1107,14 +1148,11 @@ pub unsafe fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: _ #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm256_maskz_shrdv_epi16( - k: __mmask16, - a: __m256i, - b: __m256i, - c: __m256i, -) -> __m256i { - let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_shrdv_epi16(k: __mmask16, a: __m256i, b: __m256i, c: __m256i) -> __m256i { + unsafe { + let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. 
Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst. @@ -1124,8 +1162,8 @@ pub unsafe fn _mm256_maskz_shrdv_epi16( #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { - transmute(vpshrdvw128(b.as_i16x8(), a.as_i16x8(), c.as_i16x8())) +pub fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { transmute(vpshrdvw128(b.as_i16x8(), a.as_i16x8(), c.as_i16x8())) } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). @@ -1135,9 +1173,11 @@ pub unsafe fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i { #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shrdv_epi16(a, b, c).as_i16x8(); - transmute(simd_select_bitmask(k, shf, a.as_i16x8())) +pub fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, a.as_i16x8())) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1147,9 +1187,11 @@ pub unsafe fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m12 #[target_feature(enable = "avx512vbmi2,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshrdvw))] -pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { - let shf = _mm_shrdv_epi16(a, b, c).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i { + unsafe { + let shf = _mm_shrdv_epi16(a, b, c).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst). 
@@ -1160,7 +1202,7 @@ pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m1
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm512_shldi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
+pub fn _mm512_shldi_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
     static_assert_uimm_bits!(IMM8, 8);
     _mm512_shldv_epi64(a, b, _mm512_set1_epi64(IMM8 as i64))
 }
@@ -1173,15 +1215,17 @@ pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i) -> __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
 #[rustc_legacy_const_generics(4)]
-pub unsafe fn _mm512_mask_shldi_epi64<const IMM8: i32>(
+pub fn _mm512_mask_shldi_epi64<const IMM8: i32>(
     src: __m512i,
     k: __mmask8,
     a: __m512i,
     b: __m512i,
 ) -> __m512i {
-    static_assert_uimm_bits!(IMM8, 8);
-    let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
-    transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
+    }
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
@@ -1192,14 +1236,12 @@ pub unsafe fn _mm512_mask_shldi_epi64(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
 #[rustc_legacy_const_generics(3)]
-pub unsafe fn _mm512_maskz_shldi_epi64<const IMM8: i32>(
-    k: __mmask8,
-    a: __m512i,
-    b: __m512i,
-) -> __m512i {
-    static_assert_uimm_bits!(IMM8, 8);
-    let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
-    transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
+pub fn _mm512_maskz_shldi_epi64<const IMM8: i32>(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm512_shldi_epi64::<IMM8>(a, b).as_i64x8();
+        transmute(simd_select_bitmask(k, shf, i64x8::ZERO))
+    }
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst).
@@ -1210,7 +1252,7 @@ pub unsafe fn _mm512_maskz_shldi_epi64(
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
 #[rustc_legacy_const_generics(2)]
-pub unsafe fn _mm256_shldi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
+pub fn _mm256_shldi_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
     _mm256_shldv_epi64(a, b, _mm256_set1_epi64x(IMM8 as i64))
 }
@@ -1223,15 +1265,17 @@ pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i) -> __m
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))]
 #[rustc_legacy_const_generics(4)]
-pub unsafe fn _mm256_mask_shldi_epi64<const IMM8: i32>(
+pub fn _mm256_mask_shldi_epi64<const IMM8: i32>(
     src: __m256i,
     k: __mmask8,
     a: __m256i,
     b: __m256i,
 ) -> __m256i {
-    static_assert_uimm_bits!(IMM8, 8);
-    let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
-    transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+    unsafe {
+        static_assert_uimm_bits!(IMM8, 8);
+        let shf = _mm256_shldi_epi64::<IMM8>(a, b).as_i64x4();
+        transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
+    }
 }

 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result.
Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1242,14 +1286,12 @@ pub unsafe fn _mm256_mask_shldi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shldi_epi64( - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi64::(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_shldi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shldi_epi64::(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst). @@ -1260,7 +1302,7 @@ pub unsafe fn _mm256_maskz_shldi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i) -> __m128i { +pub fn _mm_shldi_epi64(a: __m128i, b: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); _mm_shldv_epi64(a, b, _mm_set1_epi64x(IMM8 as i64)) } @@ -1273,15 +1315,17 @@ pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_shldi_epi64( +pub fn _mm_mask_shldi_epi64( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi64::(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shldi_epi64::(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1292,14 +1336,12 @@ pub unsafe fn _mm_mask_shldi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_shldi_epi64( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi64::(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shldi_epi64::(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. 
@@ -1310,7 +1352,7 @@ pub unsafe fn _mm_maskz_shldi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i) -> __m512i { +pub fn _mm512_shldi_epi32(a: __m512i, b: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); _mm512_shldv_epi32(a, b, _mm512_set1_epi32(IMM8)) } @@ -1323,15 +1365,17 @@ pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shldi_epi32( +pub fn _mm512_mask_shldi_epi32( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shldi_epi32::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shldi_epi32::(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1342,14 +1386,12 @@ pub unsafe fn _mm512_mask_shldi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shldi_epi32( - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shldi_epi32::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shldi_epi32::(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. @@ -1360,7 +1402,7 @@ pub unsafe fn _mm512_maskz_shldi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i) -> __m256i { +pub fn _mm256_shldi_epi32(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); _mm256_shldv_epi32(a, b, _mm256_set1_epi32(IMM8)) } @@ -1373,15 +1415,17 @@ pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shldi_epi32( +pub fn _mm256_mask_shldi_epi32( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi32::(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shldi_epi32::(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. 
Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1392,14 +1436,12 @@ pub unsafe fn _mm256_mask_shldi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shldi_epi32( - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi32::(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shldi_epi32::(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst. @@ -1410,7 +1452,7 @@ pub unsafe fn _mm256_maskz_shldi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i) -> __m128i { +pub fn _mm_shldi_epi32(a: __m128i, b: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); _mm_shldv_epi32(a, b, _mm_set1_epi32(IMM8)) } @@ -1423,15 +1465,17 @@ pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_shldi_epi32( +pub fn _mm_mask_shldi_epi32( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi32::(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shldi_epi32::(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } } /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1442,14 +1486,12 @@ pub unsafe fn _mm_mask_shldi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_shldi_epi32( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi32::(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shldi_epi32::(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). 
@@ -1460,7 +1502,7 @@ pub unsafe fn _mm_maskz_shldi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i) -> __m512i { +pub fn _mm512_shldi_epi16(a: __m512i, b: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); _mm512_shldv_epi16(a, b, _mm512_set1_epi16(IMM8 as i16)) } @@ -1473,15 +1515,17 @@ pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shldi_epi16( +pub fn _mm512_mask_shldi_epi16( src: __m512i, k: __mmask32, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shldi_epi16::(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shldi_epi16::(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1492,14 +1536,12 @@ pub unsafe fn _mm512_mask_shldi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shldi_epi16( - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shldi_epi16::(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shldi_epi16::(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). @@ -1510,7 +1552,7 @@ pub unsafe fn _mm512_maskz_shldi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i) -> __m256i { +pub fn _mm256_shldi_epi16(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); _mm256_shldv_epi16(a, b, _mm256_set1_epi16(IMM8 as i16)) } @@ -1523,15 +1565,17 @@ pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shldi_epi16( +pub fn _mm256_mask_shldi_epi16( src: __m256i, k: __mmask16, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi16::(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shldi_epi16::(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. 
Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1542,14 +1586,12 @@ pub unsafe fn _mm256_mask_shldi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shldi_epi16( - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shldi_epi16::(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shldi_epi16::(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst). @@ -1560,7 +1602,7 @@ pub unsafe fn _mm256_maskz_shldi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i) -> __m128i { +pub fn _mm_shldi_epi16(a: __m128i, b: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); _mm_shldv_epi16(a, b, _mm_set1_epi16(IMM8 as i16)) } @@ -1573,15 +1615,17 @@ pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_shldi_epi16( +pub fn _mm_mask_shldi_epi16( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi16::(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shldi_epi16::(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } } /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1592,14 +1636,12 @@ pub unsafe fn _mm_mask_shldi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_shldi_epi16( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shldi_epi16::(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shldi_epi16::(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. 
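(Aside, not part of the patch: as the hunks above show, each shldi/shrdi immediate form simply splats IMM8 and defers to the corresponding shldv/shrdv variable-count form, so per lane the result of shldi with IMM8 is the upper half of (a:b) << IMM8. A worked scalar check, illustration only:)

// Illustration only: scalar model of one 32-bit lane of the immediate left funnel shift,
// equivalent to the variable form with a splatted IMM8.
fn shldi32_lane<const IMM8: u32>(a: u32, b: u32) -> u32 {
    let concat = ((a as u64) << 32) | b as u64;
    ((concat << (IMM8 & 31)) >> 32) as u32
}

fn main() {
    // IMM8 = 5: each lane becomes (a << 5) | (b >> 27).
    assert_eq!(shldi32_lane::<5>(1, 0x8000_0000), (1 << 5) | (0x8000_0000u32 >> 27));
}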
@@ -1610,7 +1652,7 @@ pub unsafe fn _mm_maskz_shldi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i) -> __m512i { +pub fn _mm512_shrdi_epi64(a: __m512i, b: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); _mm512_shrdv_epi64(a, b, _mm512_set1_epi64(IMM8 as i64)) } @@ -1623,15 +1665,17 @@ pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shrdi_epi64( +pub fn _mm512_mask_shrdi_epi64( src: __m512i, k: __mmask8, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi64::(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shrdi_epi64::(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, shf, src.as_i64x8())) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1642,14 +1686,12 @@ pub unsafe fn _mm512_mask_shrdi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 255))] //should be vpshrdq #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shrdi_epi64( - k: __mmask8, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi64::(a, b).as_i64x8(); - transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) +pub fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shrdi_epi64::(a, b).as_i64x8(); + transmute(simd_select_bitmask(k, shf, i64x8::ZERO)) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. @@ -1660,7 +1702,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i) -> __m256i { +pub fn _mm256_shrdi_epi64(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); _mm256_shrdv_epi64(a, b, _mm256_set1_epi64x(IMM8 as i64)) } @@ -1673,15 +1715,17 @@ pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shrdi_epi64( +pub fn _mm256_mask_shrdi_epi64( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi64::(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shrdi_epi64::(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, shf, src.as_i64x4())) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. 
Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1692,14 +1736,12 @@ pub unsafe fn _mm256_mask_shrdi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shrdi_epi64( - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi64::(a, b).as_i64x4(); - transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) +pub fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shrdi_epi64::(a, b).as_i64x4(); + transmute(simd_select_bitmask(k, shf, i64x4::ZERO)) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst. @@ -1710,7 +1752,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i) -> __m128i { +pub fn _mm_shrdi_epi64(a: __m128i, b: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); _mm_shrdv_epi64(a, b, _mm_set1_epi64x(IMM8 as i64)) } @@ -1723,15 +1765,17 @@ pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_shrdi_epi64( +pub fn _mm_mask_shrdi_epi64( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi64::(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shrdi_epi64::(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, shf, src.as_i64x2())) + } } /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1742,14 +1786,12 @@ pub unsafe fn _mm_mask_shrdi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldq, IMM8 = 5))] //should be vpshrdq #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_shrdi_epi64( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi64::(a, b).as_i64x2(); - transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) +pub fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shrdi_epi64::(a, b).as_i64x2(); + transmute(simd_select_bitmask(k, shf, i64x2::ZERO)) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst. 
@@ -1760,7 +1802,7 @@ pub unsafe fn _mm_maskz_shrdi_epi64( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i) -> __m512i { +pub fn _mm512_shrdi_epi32(a: __m512i, b: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); _mm512_shrdv_epi32(a, b, _mm512_set1_epi32(IMM8)) } @@ -1773,15 +1815,17 @@ pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shrdi_epi32( +pub fn _mm512_mask_shrdi_epi32( src: __m512i, k: __mmask16, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi32::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shrdi_epi32::(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, shf, src.as_i32x16())) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1792,14 +1836,12 @@ pub unsafe fn _mm512_mask_shrdi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shrdi_epi32( - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi32::(a, b).as_i32x16(); - transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) +pub fn _mm512_maskz_shrdi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shrdi_epi32::(a, b).as_i32x16(); + transmute(simd_select_bitmask(k, shf, i32x16::ZERO)) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst. @@ -1810,7 +1852,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i) -> __m256i { +pub fn _mm256_shrdi_epi32(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); _mm256_shrdv_epi32(a, b, _mm256_set1_epi32(IMM8)) } @@ -1823,15 +1865,17 @@ pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shrdi_epi32( +pub fn _mm256_mask_shrdi_epi32( src: __m256i, k: __mmask8, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi32::(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shrdi_epi32::(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, shf, src.as_i32x8())) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. 
Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1842,14 +1886,12 @@ pub unsafe fn _mm256_mask_shrdi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shrdi_epi32( - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi32::(a, b).as_i32x8(); - transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) +pub fn _mm256_maskz_shrdi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shrdi_epi32::(a, b).as_i32x8(); + transmute(simd_select_bitmask(k, shf, i32x8::ZERO)) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst. @@ -1860,7 +1902,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i) -> __m128i { +pub fn _mm_shrdi_epi32(a: __m128i, b: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); _mm_shrdv_epi32(a, b, _mm_set1_epi32(IMM8)) } @@ -1873,15 +1915,17 @@ pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_shrdi_epi32( +pub fn _mm_mask_shrdi_epi32( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi32::(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shrdi_epi32::(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, shf, src.as_i32x4())) + } } /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1892,14 +1936,12 @@ pub unsafe fn _mm_mask_shrdi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldd, IMM8 = 5))] //should be vpshldd #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_shrdi_epi32( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi32::(a, b).as_i32x4(); - transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) +pub fn _mm_maskz_shrdi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shrdi_epi32::(a, b).as_i32x4(); + transmute(simd_select_bitmask(k, shf, i32x4::ZERO)) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst. 
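The immediate forms are thin wrappers over the variable-count forms, as the bodies above show: the 8-bit immediate is splatted and handed to the vpshrdv intrinsic. A hedged usage sketch, assuming a nightly toolchain with `#![feature(stdarch_x86_avx512)]` and the usual const-generic signature these wrappers use:

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512vbmi2,avx512vl")]
unsafe fn shift_both_ways(a: __m128i, b: __m128i) -> (__m128i, __m128i) {
    // The const-generic immediate form ...
    let by_imm = _mm_shrdi_epi32::<5>(a, b);
    // ... matches the variable form with the count splatted, per the body above.
    let by_var = _mm_shrdv_epi32(a, b, _mm_set1_epi32(5));
    (by_imm, by_var)
}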
@@ -1910,7 +1952,7 @@ pub unsafe fn _mm_maskz_shrdi_epi32( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i) -> __m512i { +pub fn _mm512_shrdi_epi16(a: __m512i, b: __m512i) -> __m512i { static_assert_uimm_bits!(IMM8, 8); _mm512_shrdv_epi16(a, b, _mm512_set1_epi16(IMM8 as i16)) } @@ -1923,15 +1965,17 @@ pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm512_mask_shrdi_epi16( +pub fn _mm512_mask_shrdi_epi16( src: __m512i, k: __mmask32, a: __m512i, b: __m512i, ) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi16::(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shrdi_epi16::(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, shf, src.as_i16x32())) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1942,14 +1986,12 @@ pub unsafe fn _mm512_mask_shrdi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm512_maskz_shrdi_epi16( - k: __mmask32, - a: __m512i, - b: __m512i, -) -> __m512i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm512_shrdi_epi16::(a, b).as_i16x32(); - transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) +pub fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i) -> __m512i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm512_shrdi_epi16::(a, b).as_i16x32(); + transmute(simd_select_bitmask(k, shf, i16x32::ZERO)) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst. @@ -1960,7 +2002,7 @@ pub unsafe fn _mm512_maskz_shrdi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i) -> __m256i { +pub fn _mm256_shrdi_epi16(a: __m256i, b: __m256i) -> __m256i { static_assert_uimm_bits!(IMM8, 8); _mm256_shrdv_epi16(a, b, _mm256_set1_epi16(IMM8 as i16)) } @@ -1973,15 +2015,17 @@ pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm256_mask_shrdi_epi16( +pub fn _mm256_mask_shrdi_epi16( src: __m256i, k: __mmask16, a: __m256i, b: __m256i, ) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi16::(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shrdi_epi16::(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, shf, src.as_i16x16())) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. 
Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -1992,14 +2036,12 @@ pub unsafe fn _mm256_mask_shrdi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm256_maskz_shrdi_epi16( - k: __mmask16, - a: __m256i, - b: __m256i, -) -> __m256i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm256_shrdi_epi16::(a, b).as_i16x16(); - transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) +pub fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i) -> __m256i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm256_shrdi_epi16::(a, b).as_i16x16(); + transmute(simd_select_bitmask(k, shf, i16x16::ZERO)) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst. @@ -2010,7 +2052,7 @@ pub unsafe fn _mm256_maskz_shrdi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i) -> __m128i { +pub fn _mm_shrdi_epi16(a: __m128i, b: __m128i) -> __m128i { static_assert_uimm_bits!(IMM8, 8); _mm_shrdv_epi16(a, b, _mm_set1_epi16(IMM8 as i16)) } @@ -2023,15 +2065,17 @@ pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(4)] -pub unsafe fn _mm_mask_shrdi_epi16( +pub fn _mm_mask_shrdi_epi16( src: __m128i, k: __mmask8, a: __m128i, b: __m128i, ) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi16::(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shrdi_epi16::(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, shf, src.as_i16x8())) + } } /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
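For reference (not from the patch), the writemask and zeromask conventions these doc comments repeat are just a per-lane select, which is what simd_select_bitmask applies vector-wide; a scalar sketch for one 16-bit lane:

fn write_mask_lane(k: u8, lane: u32, computed: i16, src: i16) -> i16 {
    // writemask: lanes with a clear mask bit keep the value from src
    if (k >> lane) & 1 != 0 { computed } else { src }
}

fn zero_mask_lane(k: u8, lane: u32, computed: i16) -> i16 {
    // zeromask: lanes with a clear mask bit are zeroed
    if (k >> lane) & 1 != 0 { computed } else { 0 }
}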
@@ -2042,14 +2086,12 @@ pub unsafe fn _mm_mask_shrdi_epi16( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpshldw, IMM8 = 5))] //should be vpshrdw #[rustc_legacy_const_generics(3)] -pub unsafe fn _mm_maskz_shrdi_epi16( - k: __mmask8, - a: __m128i, - b: __m128i, -) -> __m128i { - static_assert_uimm_bits!(IMM8, 8); - let shf = _mm_shrdi_epi16::(a, b).as_i16x8(); - transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) +pub fn _mm_maskz_shrdi_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + static_assert_uimm_bits!(IMM8, 8); + let shf = _mm_shrdi_epi16::(a, b).as_i16x8(); + transmute(simd_select_bitmask(k, shf, i16x8::ZERO)) + } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86/avx512vnni.rs b/crates/core_arch/src/x86/avx512vnni.rs index 1e1639b700..d7cd0838c2 100644 --- a/crates/core_arch/src/x86/avx512vnni.rs +++ b/crates/core_arch/src/x86/avx512vnni.rs @@ -11,8 +11,8 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpwssd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -22,14 +22,11 @@ pub unsafe fn _mm512_dpwssd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m51 #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm512_mask_dpwssd_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) +pub fn _mm512_mask_dpwssd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
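A hedged usage sketch of what this change buys (not part of the patch): once the intrinsic is a safe fn, feature-enabled code no longer needs an `unsafe` block for the computation itself. This assumes a nightly toolchain with `#![feature(stdarch_x86_avx512)]` and safe `#[target_feature]` functions; callers still have to prove the feature is available, for example via `is_x86_feature_detected!("avx512vnni")`.

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512vnni")]
fn dpwssd_accumulate(src: __m512i, a: __m512i, b: __m512i) -> __m512i {
    // No `unsafe` block needed here once _mm512_dpwssd_epi32 is safe; the
    // target-feature obligation moves to whoever calls this function.
    _mm512_dpwssd_epi32(src, a, b)
}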
@@ -39,14 +36,11 @@ pub unsafe fn _mm512_mask_dpwssd_epi32( #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm512_maskz_dpwssd_epi32( - k: __mmask16, - src: __m512i, - a: __m512i, - b: __m512i, -) -> __m512i { - let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) +pub fn _mm512_maskz_dpwssd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -59,8 +53,8 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpdpwssd) )] -pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -70,8 +64,8 @@ pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -81,14 +75,11 @@ pub unsafe fn _mm256_dpwssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25 #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm256_mask_dpwssd_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) +pub fn _mm256_mask_dpwssd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. 
Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -98,14 +89,11 @@ pub unsafe fn _mm256_mask_dpwssd_epi32( #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm256_maskz_dpwssd_epi32( - k: __mmask8, - src: __m256i, - a: __m256i, - b: __m256i, -) -> __m256i { - let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) +pub fn _mm256_maskz_dpwssd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -118,8 +106,8 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpdpwssd) )] -pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -129,8 +117,8 @@ pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m1 #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
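A scalar reference (a sketch, not from the patch) for one 32-bit lane of vpdpwssd as described above; wrapping arithmetic is assumed for the non-saturating form.

fn dpwssd_lane(src: i32, a: [i16; 2], b: [i16; 2]) -> i32 {
    // Two sign-extended 16x16 products per lane, accumulated into src.
    let p0 = (a[0] as i32) * (b[0] as i32);
    let p1 = (a[1] as i32) * (b[1] as i32);
    src.wrapping_add(p0).wrapping_add(p1)
}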
@@ -140,9 +128,11 @@ pub unsafe fn _mm_dpwssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) +pub fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -152,9 +142,11 @@ pub unsafe fn _mm_mask_dpwssd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __ #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssd))] -pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) +pub fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -164,8 +156,8 @@ pub unsafe fn _mm_maskz_dpwssd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: _ #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpwssds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
@@ -175,14 +167,11 @@ pub unsafe fn _mm512_dpwssds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m5 #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm512_mask_dpwssds_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) +pub fn _mm512_mask_dpwssds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -192,14 +181,11 @@ pub unsafe fn _mm512_mask_dpwssds_epi32( #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm512_maskz_dpwssds_epi32( - k: __mmask16, - src: __m512i, - a: __m512i, - b: __m512i, -) -> __m512i { - let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) +pub fn _mm512_maskz_dpwssds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpwssds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -212,8 +198,8 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpdpwssds) )] -pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -223,8 +209,8 @@ pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. 
Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -234,14 +220,11 @@ pub unsafe fn _mm256_dpwssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2 #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm256_mask_dpwssds_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) +pub fn _mm256_mask_dpwssds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -251,14 +234,11 @@ pub unsafe fn _mm256_mask_dpwssds_epi32( #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm256_maskz_dpwssds_epi32( - k: __mmask8, - src: __m256i, - a: __m256i, - b: __m256i, -) -> __m256i { - let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) +pub fn _mm256_maskz_dpwssds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpwssds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -271,8 +251,8 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpdpwssds) )] -pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
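The saturating variant differs only in the final accumulation; a scalar sketch for one lane, with the whole sum clamped to the i32 range (an assumption consistent with the doc text above).

fn dpwssds_lane(src: i32, a: [i16; 2], b: [i16; 2]) -> i32 {
    // Same two products, but the accumulation saturates instead of wrapping.
    let sum = src as i64 + (a[0] as i64) * (b[0] as i64) + (a[1] as i64) * (b[1] as i64);
    sum.clamp(i32::MIN as i64, i32::MAX as i64) as i32
}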
@@ -282,8 +262,8 @@ pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -293,9 +273,11 @@ pub unsafe fn _mm_dpwssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) +pub fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -305,14 +287,11 @@ pub unsafe fn _mm_mask_dpwssds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: _ #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpwssds))] -pub unsafe fn _mm_maskz_dpwssds_epi32( - k: __mmask8, - src: __m128i, - a: __m128i, - b: __m128i, -) -> __m128i { - let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) +pub fn _mm_maskz_dpwssds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpwssds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
@@ -322,8 +301,8 @@ pub unsafe fn _mm_maskz_dpwssds_epi32( #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpbusd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -333,14 +312,11 @@ pub unsafe fn _mm512_dpbusd_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m51 #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm512_mask_dpbusd_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) +pub fn _mm512_mask_dpbusd_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -350,14 +326,11 @@ pub unsafe fn _mm512_mask_dpbusd_epi32( #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm512_maskz_dpbusd_epi32( - k: __mmask16, - src: __m512i, - a: __m512i, - b: __m512i, -) -> __m512i { - let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) +pub fn _mm512_maskz_dpbusd_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusd_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -370,8 +343,8 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpdpbusd) )] -pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -381,8 +354,8 @@ pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> _ #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -392,14 +365,11 @@ pub unsafe fn _mm256_dpbusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25 #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm256_mask_dpbusd_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) +pub fn _mm256_mask_dpbusd_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -409,14 +379,11 @@ pub unsafe fn _mm256_mask_dpbusd_epi32( #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm256_maskz_dpbusd_epi32( - k: __mmask8, - src: __m256i, - a: __m256i, - b: __m256i, -) -> __m256i { - let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) +pub fn _mm256_maskz_dpbusd_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusd_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
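A scalar sketch (not from the patch) of one 32-bit lane of vpdpbusd as described above: four unsigned-by-signed byte products, each fitting in 16 bits, accumulated into src; wrapping arithmetic is assumed for the non-saturating form.

fn dpbusd_lane(src: i32, a: [u8; 4], b: [i8; 4]) -> i32 {
    let mut acc = src;
    for i in 0..4 {
        // zero-extend a, sign-extend b, then accumulate the product
        acc = acc.wrapping_add((a[i] as i32) * (b[i] as i32));
    }
    acc
}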
@@ -429,8 +396,8 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpdpbusd) )] -pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. @@ -440,8 +407,8 @@ pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m1 #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -451,9 +418,11 @@ pub unsafe fn _mm_dpbusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) +pub fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -463,9 +432,11 @@ pub unsafe fn _mm_mask_dpbusd_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __ #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusd))] -pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) +pub fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusd_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -475,8 +446,8 @@ pub unsafe fn _mm_maskz_dpbusd_epi32(k: __mmask8, src: __m128i, a: __m128i, b: _ #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { - transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) +pub fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { transmute(vpdpbusds(src.as_i32x16(), a.as_i32x16(), b.as_i32x16())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -486,14 +457,11 @@ pub unsafe fn _mm512_dpbusds_epi32(src: __m512i, a: __m512i, b: __m512i) -> __m5 #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm512_mask_dpbusds_epi32( - src: __m512i, - k: __mmask16, - a: __m512i, - b: __m512i, -) -> __m512i { - let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, src.as_i32x16())) +pub fn _mm512_mask_dpbusds_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -503,14 +471,11 @@ pub unsafe fn _mm512_mask_dpbusds_epi32( #[target_feature(enable = "avx512vnni")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm512_maskz_dpbusds_epi32( - k: __mmask16, - src: __m512i, - a: __m512i, - b: __m512i, -) -> __m512i { - let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); - transmute(simd_select_bitmask(k, r, i32x16::ZERO)) +pub fn _mm512_maskz_dpbusds_epi32(k: __mmask16, src: __m512i, a: __m512i, b: __m512i) -> __m512i { + unsafe { + let r = _mm512_dpbusds_epi32(src, a, b).as_i32x16(); + transmute(simd_select_bitmask(k, r, i32x16::ZERO)) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. 
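And the saturating counterpart, again clamping the whole sum to the i32 range; with src near i32::MAX and positive products, the plain form would wrap negative while this one sticks at i32::MAX (a sketch under the same assumptions as above).

fn dpbusds_lane(src: i32, a: [u8; 4], b: [i8; 4]) -> i32 {
    let mut sum = src as i64;
    for i in 0..4 {
        sum += (a[i] as i64) * (b[i] as i64);
    }
    sum.clamp(i32::MIN as i64, i32::MAX as i64) as i32
}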
@@ -523,8 +488,8 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpdpbusds) )] -pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -534,8 +499,8 @@ pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -545,14 +510,11 @@ pub unsafe fn _mm256_dpbusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2 #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm256_mask_dpbusds_epi32( - src: __m256i, - k: __mmask8, - a: __m256i, - b: __m256i, -) -> __m256i { - let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, src.as_i32x8())) +pub fn _mm256_mask_dpbusds_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, src.as_i32x8())) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
@@ -562,14 +524,11 @@ pub unsafe fn _mm256_mask_dpbusds_epi32( #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm256_maskz_dpbusds_epi32( - k: __mmask8, - src: __m256i, - a: __m256i, - b: __m256i, -) -> __m256i { - let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); - transmute(simd_select_bitmask(k, r, i32x8::ZERO)) +pub fn _mm256_maskz_dpbusds_epi32(k: __mmask8, src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { + let r = _mm256_dpbusds_epi32(src, a, b).as_i32x8(); + transmute(simd_select_bitmask(k, r, i32x8::ZERO)) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -582,8 +541,8 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32( all(test, any(target_os = "linux", target_env = "msvc")), assert_instr(vpdpbusds) )] -pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. @@ -593,8 +552,8 @@ pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). @@ -604,9 +563,11 @@ pub unsafe fn _mm_dpbusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { - let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, src.as_i32x4())) +pub fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, src.as_i32x4())) + } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. 
Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). @@ -616,14 +577,11 @@ pub unsafe fn _mm_mask_dpbusds_epi32(src: __m128i, k: __mmask8, a: __m128i, b: _ #[target_feature(enable = "avx512vnni,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpdpbusds))] -pub unsafe fn _mm_maskz_dpbusds_epi32( - k: __mmask8, - src: __m128i, - a: __m128i, - b: __m128i, -) -> __m128i { - let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); - transmute(simd_select_bitmask(k, r, i32x4::ZERO)) +pub fn _mm_maskz_dpbusds_epi32(k: __mmask8, src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { + let r = _mm_dpbusds_epi32(src, a, b).as_i32x4(); + transmute(simd_select_bitmask(k, r, i32x4::ZERO)) + } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit @@ -638,8 +596,8 @@ pub unsafe fn _mm_maskz_dpbusds_epi32( assert_instr(vpdpbssd) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit @@ -654,8 +612,8 @@ pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpbssd) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit @@ -670,8 +628,8 @@ pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25 assert_instr(vpdpbssds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit @@ -686,8 +644,8 @@ pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpbssds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit @@ -702,8 +660,8 @@ pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2 assert_instr(vpdpbsud) )] #[unstable(feature = 
"stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit @@ -718,8 +676,8 @@ pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpbsud) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit @@ -734,8 +692,8 @@ pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25 assert_instr(vpdpbsuds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit @@ -750,8 +708,8 @@ pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpbsuds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit @@ -766,8 +724,8 @@ pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2 assert_instr(vpdpbuud) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit @@ -782,8 +740,8 @@ pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpbuud) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit @@ -798,8 +756,8 @@ pub unsafe fn 
_mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25 assert_instr(vpdpbuuds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit @@ -814,8 +772,8 @@ pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpbuuds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit @@ -830,8 +788,8 @@ pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2 assert_instr(vpdpwsud) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit @@ -846,8 +804,8 @@ pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpwsud) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit @@ -862,8 +820,8 @@ pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25 assert_instr(vpdpwsuds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit @@ -878,8 +836,8 @@ pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpwsuds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply 
groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit @@ -894,8 +852,8 @@ pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2 assert_instr(vpdpwusd) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit @@ -910,8 +868,8 @@ pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpwusd) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit @@ -926,8 +884,8 @@ pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25 assert_instr(vpdpwusds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit @@ -942,8 +900,8 @@ pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpwusds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit @@ -958,8 +916,8 @@ pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m2 assert_instr(vpdpwuud) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit @@ -974,8 +932,8 @@ pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpwuud) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: 
__m256i) -> __m256i { + unsafe { transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit @@ -990,8 +948,8 @@ pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m25 assert_instr(vpdpwuuds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { - transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +pub fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + unsafe { transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) } } /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit @@ -1006,8 +964,8 @@ pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i assert_instr(vpdpwuuds) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { - transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +pub fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + unsafe { transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86/avx512vpopcntdq.rs b/crates/core_arch/src/x86/avx512vpopcntdq.rs index 0bc343acae..7a06f09b19 100644 --- a/crates/core_arch/src/x86/avx512vpopcntdq.rs +++ b/crates/core_arch/src/x86/avx512vpopcntdq.rs @@ -26,8 +26,8 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512vpopcntdq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm512_popcnt_epi32(a: __m512i) -> __m512i { - transmute(simd_ctpop(a.as_i32x16())) +pub fn _mm512_popcnt_epi32(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i32x16())) } } /// For each packed 32-bit integer maps the value to the number of logical 1 bits. @@ -40,12 +40,14 @@ pub unsafe fn _mm512_popcnt_epi32(a: __m512i) -> __m512i { #[target_feature(enable = "avx512vpopcntdq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x16()), - i32x16::ZERO, - )) +pub fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x16()), + i32x16::ZERO, + )) + } } /// For each packed 32-bit integer maps the value to the number of logical 1 bits. @@ -58,12 +60,14 @@ pub unsafe fn _mm512_maskz_popcnt_epi32(k: __mmask16, a: __m512i) -> __m512i { #[target_feature(enable = "avx512vpopcntdq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x16()), - src.as_i32x16(), - )) +pub fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x16()), + src.as_i32x16(), + )) + } } /// For each packed 32-bit integer maps the value to the number of logical 1 bits. 
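The doc comments above describe the dot-product accumulate in prose; as a rough reference, the snippet below is a minimal scalar sketch of what one 32-bit lane of the non-saturating signed/signed form (the vpdpbssd family) computes. The `dpbssd_lane` helper is invented purely for illustration and is not part of the patch; the saturating `*_dpbssds_*` variants clamp the final sum to the i32 range instead of wrapping.

// Illustrative scalar model of one 32-bit lane of a signed 8-bit
// dot-product accumulate: four i8 x i8 products, widened and summed
// into the accumulator lane taken from `src`.
fn dpbssd_lane(src: i32, a: [i8; 4], b: [i8; 4]) -> i32 {
    let mut acc = src;
    for i in 0..4 {
        // Each 8x8 product fits in 16 bits; widening to i32 before
        // accumulating mirrors the hardware's intermediate results.
        acc = acc.wrapping_add(i32::from(a[i]) * i32::from(b[i]));
    }
    acc
}

fn main() {
    // 1*5 + 2*6 + 3*7 + 4*8 = 70, accumulated onto 100.
    assert_eq!(dpbssd_lane(100, [1, 2, 3, 4], [5, 6, 7, 8]), 170);
}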
@@ -73,8 +77,8 @@ pub unsafe fn _mm512_mask_popcnt_epi32(src: __m512i, k: __mmask16, a: __m512i) - #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm256_popcnt_epi32(a: __m256i) -> __m256i { - transmute(simd_ctpop(a.as_i32x8())) +pub fn _mm256_popcnt_epi32(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i32x8())) } } /// For each packed 32-bit integer maps the value to the number of logical 1 bits. @@ -87,12 +91,14 @@ pub unsafe fn _mm256_popcnt_epi32(a: __m256i) -> __m256i { #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x8()), - i32x8::ZERO, - )) +pub fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x8()), + i32x8::ZERO, + )) + } } /// For each packed 32-bit integer maps the value to the number of logical 1 bits. @@ -105,12 +111,14 @@ pub unsafe fn _mm256_maskz_popcnt_epi32(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x8()), - src.as_i32x8(), - )) +pub fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x8()), + src.as_i32x8(), + )) + } } /// For each packed 32-bit integer maps the value to the number of logical 1 bits. @@ -120,8 +128,8 @@ pub unsafe fn _mm256_mask_popcnt_epi32(src: __m256i, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm_popcnt_epi32(a: __m128i) -> __m128i { - transmute(simd_ctpop(a.as_i32x4())) +pub fn _mm_popcnt_epi32(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i32x4())) } } /// For each packed 32-bit integer maps the value to the number of logical 1 bits. @@ -134,12 +142,14 @@ pub unsafe fn _mm_popcnt_epi32(a: __m128i) -> __m128i { #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x4()), - i32x4::ZERO, - )) +pub fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x4()), + i32x4::ZERO, + )) + } } /// For each packed 32-bit integer maps the value to the number of logical 1 bits. 
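Every masked intrinsic touched by this patch follows the same `simd_select_bitmask` pattern, so a tiny scalar sketch of the two write-mask rules may help when reading the converted bodies. The `mask_merge` and `mask_zero` helpers below are hypothetical, written only for this commentary.

// Illustrative per-lane behaviour of AVX-512 write masks: bit i of `k`
// decides whether lane i receives the freshly computed result.
fn mask_merge(k: u8, computed: [i32; 4], src: [i32; 4]) -> [i32; 4] {
    let mut out = [0i32; 4];
    for i in 0..4 {
        // Merge-masking (`_mask_` intrinsics): unselected lanes keep `src`.
        out[i] = if (k >> i) & 1 == 1 { computed[i] } else { src[i] };
    }
    out
}

fn mask_zero(k: u8, computed: [i32; 4]) -> [i32; 4] {
    // Zero-masking (`_maskz_` intrinsics): unselected lanes become zero.
    mask_merge(k, computed, [0; 4])
}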
@@ -152,12 +162,14 @@ pub unsafe fn _mm_maskz_popcnt_epi32(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntd))] -pub unsafe fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i32x4()), - src.as_i32x4(), - )) +pub fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i32x4()), + src.as_i32x4(), + )) + } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. @@ -167,8 +179,8 @@ pub unsafe fn _mm_mask_popcnt_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __ #[target_feature(enable = "avx512vpopcntdq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm512_popcnt_epi64(a: __m512i) -> __m512i { - transmute(simd_ctpop(a.as_i64x8())) +pub fn _mm512_popcnt_epi64(a: __m512i) -> __m512i { + unsafe { transmute(simd_ctpop(a.as_i64x8())) } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. @@ -181,12 +193,14 @@ pub unsafe fn _mm512_popcnt_epi64(a: __m512i) -> __m512i { #[target_feature(enable = "avx512vpopcntdq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x8()), - i64x8::ZERO, - )) +pub fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x8()), + i64x8::ZERO, + )) + } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. @@ -199,12 +213,14 @@ pub unsafe fn _mm512_maskz_popcnt_epi64(k: __mmask8, a: __m512i) -> __m512i { #[target_feature(enable = "avx512vpopcntdq")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x8()), - src.as_i64x8(), - )) +pub fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x8()), + src.as_i64x8(), + )) + } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. @@ -214,8 +230,8 @@ pub unsafe fn _mm512_mask_popcnt_epi64(src: __m512i, k: __mmask8, a: __m512i) -> #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm256_popcnt_epi64(a: __m256i) -> __m256i { - transmute(simd_ctpop(a.as_i64x4())) +pub fn _mm256_popcnt_epi64(a: __m256i) -> __m256i { + unsafe { transmute(simd_ctpop(a.as_i64x4())) } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. 
@@ -228,12 +244,14 @@ pub unsafe fn _mm256_popcnt_epi64(a: __m256i) -> __m256i { #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x4()), - i64x4::ZERO, - )) +pub fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x4()), + i64x4::ZERO, + )) + } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. @@ -246,12 +264,14 @@ pub unsafe fn _mm256_maskz_popcnt_epi64(k: __mmask8, a: __m256i) -> __m256i { #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x4()), - src.as_i64x4(), - )) +pub fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x4()), + src.as_i64x4(), + )) + } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. @@ -261,8 +281,8 @@ pub unsafe fn _mm256_mask_popcnt_epi64(src: __m256i, k: __mmask8, a: __m256i) -> #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm_popcnt_epi64(a: __m128i) -> __m128i { - transmute(simd_ctpop(a.as_i64x2())) +pub fn _mm_popcnt_epi64(a: __m128i) -> __m128i { + unsafe { transmute(simd_ctpop(a.as_i64x2())) } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. @@ -275,12 +295,14 @@ pub unsafe fn _mm_popcnt_epi64(a: __m128i) -> __m128i { #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x2()), - i64x2::ZERO, - )) +pub fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x2()), + i64x2::ZERO, + )) + } } /// For each packed 64-bit integer maps the value to the number of logical 1 bits. 
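A hedged usage sketch of how the now-safe popcount intrinsics are expected to be called from ordinary code. It assumes a nightly toolchain with the unstable `stdarch_x86_avx512` feature and the safe-`#[target_feature]` calling rules this patch relies on, so treat it as an illustration rather than a guaranteed-to-compile example; `popcount_lanes` is an invented wrapper.

#![feature(stdarch_x86_avx512)] // the popcnt intrinsics are still unstable

#[cfg(target_arch = "x86_64")]
fn popcount_lanes(x: std::arch::x86_64::__m256i) -> Option<std::arch::x86_64::__m256i> {
    use std::arch::x86_64::_mm256_popcnt_epi64;
    if is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512vl") {
        // The intrinsic is a safe fn after this patch, but it is annotated
        // with #[target_feature]; calling it from code compiled without
        // those features still takes an `unsafe` block vouching for the
        // runtime detection performed just above.
        Some(unsafe { _mm256_popcnt_epi64(x) })
    } else {
        None
    }
}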
@@ -293,12 +315,14 @@ pub unsafe fn _mm_maskz_popcnt_epi64(k: __mmask8, a: __m128i) -> __m128i { #[target_feature(enable = "avx512vpopcntdq,avx512vl")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vpopcntq))] -pub unsafe fn _mm_mask_popcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { - transmute(simd_select_bitmask( - k, - simd_ctpop(a.as_i64x2()), - src.as_i64x2(), - )) +pub fn _mm_mask_popcnt_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i { + unsafe { + transmute(simd_select_bitmask( + k, + simd_ctpop(a.as_i64x2()), + src.as_i64x2(), + )) + } } #[cfg(test)] diff --git a/crates/core_arch/src/x86/avxneconvert.rs b/crates/core_arch/src/x86/avxneconvert.rs index 4520529934..cae48509ea 100644 --- a/crates/core_arch/src/x86/avxneconvert.rs +++ b/crates/core_arch/src/x86/avxneconvert.rs @@ -199,15 +199,17 @@ pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 { assert_instr(vcvtneps2bf16) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { - let mut dst: __m128bh; - asm!( - "{{vex}}vcvtneps2bf16 {dst},{src}", - dst = lateout(xmm_reg) dst, - src = in(xmm_reg) a, - options(pure, nomem, nostack, preserves_flags) - ); - dst +pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "{{vex}}vcvtneps2bf16 {dst},{src}", + dst = lateout(xmm_reg) dst, + src = in(xmm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } } /// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point @@ -221,15 +223,17 @@ pub unsafe fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh { assert_instr(vcvtneps2bf16) )] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { - let mut dst: __m128bh; - asm!( - "{{vex}}vcvtneps2bf16 {dst},{src}", - dst = lateout(xmm_reg) dst, - src = in(ymm_reg) a, - options(pure, nomem, nostack, preserves_flags) - ); - dst +pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh { + unsafe { + let mut dst: __m128bh; + asm!( + "{{vex}}vcvtneps2bf16 {dst},{src}", + dst = lateout(xmm_reg) dst, + src = in(ymm_reg) a, + options(pure, nomem, nostack, preserves_flags) + ); + dst + } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86_64/avx512bw.rs b/crates/core_arch/src/x86_64/avx512bw.rs index 798fc4adf6..43999b2a50 100644 --- a/crates/core_arch/src/x86_64/avx512bw.rs +++ b/crates/core_arch/src/x86_64/avx512bw.rs @@ -6,7 +6,7 @@ use crate::core_arch::x86::*; #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _cvtmask64_u64(a: __mmask64) -> u64 { +pub fn _cvtmask64_u64(a: __mmask64) -> u64 { a } @@ -16,7 +16,7 @@ pub unsafe fn _cvtmask64_u64(a: __mmask64) -> u64 { #[inline] #[target_feature(enable = "avx512bw")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] -pub unsafe fn _cvtu64_mask64(a: u64) -> __mmask64 { +pub fn _cvtu64_mask64(a: u64) -> __mmask64 { a } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index c1c79585b0..946b900a2b 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -13,7 +13,7 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2si))] -pub unsafe fn 
_mm_cvtsd_i64(a: __m128d) -> i64 { +pub fn _mm_cvtsd_i64(a: __m128d) -> i64 { _mm_cvtsd_si64(a) } @@ -24,7 +24,7 @@ pub unsafe fn _mm_cvtsd_i64(a: __m128d) -> i64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2si))] -pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 { +pub fn _mm_cvtss_i64(a: __m128) -> i64 { _mm_cvtss_si64(a) } @@ -35,8 +35,8 @@ pub unsafe fn _mm_cvtss_i64(a: __m128) -> i64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2usi))] -pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 { - vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtss_u64(a: __m128) -> u64 { + unsafe { vcvtss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst. @@ -46,8 +46,8 @@ pub unsafe fn _mm_cvtss_u64(a: __m128) -> u64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2usi))] -pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { - vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtsd_u64(a: __m128d) -> u64 { + unsafe { vcvtsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -57,9 +57,11 @@ pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2ss))] -pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { - let b = b as f32; - simd_insert!(a, 0, b) +pub fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } } /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -69,9 +71,11 @@ pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2sd))] -pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { - let b = b as f64; - simd_insert!(a, 0, b) +pub fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } } /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
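The scalar integer-to-float conversions above all share the same shape; as a sketch, the lane-level behaviour of `_mm_cvti64_sd`/`_mm_cvtu64_sd` amounts to the following. The model function is invented for illustration only.

// Illustrative model of the insert-into-lane-0 pattern: convert the
// integer, replace element 0, and keep the remaining element(s) of `a`.
fn cvti64_sd_model(a: [f64; 2], b: i64) -> [f64; 2] {
    [b as f64, a[1]]
}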
@@ -81,9 +85,11 @@ pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtusi2ss))] -pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { - let b = b as f32; - simd_insert!(a, 0, b) +pub fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { + unsafe { + let b = b as f32; + simd_insert!(a, 0, b) + } } /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -93,9 +99,11 @@ pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtusi2sd))] -pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { - let b = b as f64; - simd_insert!(a, 0, b) +pub fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { + unsafe { + let b = b as f64; + simd_insert!(a, 0, b) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. @@ -105,8 +113,8 @@ pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2si))] -pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 { - vcvttsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttsd_i64(a: __m128d) -> i64 { + unsafe { vcvttsd2si64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. @@ -116,8 +124,8 @@ pub unsafe fn _mm_cvttsd_i64(a: __m128d) -> i64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2usi))] -pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 { - vcvttsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttsd_u64(a: __m128d) -> u64 { + unsafe { vcvttsd2usi64(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. @@ -127,8 +135,8 @@ pub unsafe fn _mm_cvttsd_u64(a: __m128d) -> u64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2si))] -pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 { - vcvttss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttss_i64(a: __m128) -> i64 { + unsafe { vcvttss2si64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst. 
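For the truncating variants, the following one-liner (illustrative only, not from the patch) captures the difference from the current-rounding-mode conversions; note the caveat about out-of-range inputs in the comment.

// The `_mm_cvtt*` forms truncate toward zero, unlike the `_mm_cvt*` forms,
// which honour the current rounding mode. This model ignores out-of-range
// inputs: a Rust `as` cast saturates, whereas the hardware instructions
// return the "integer indefinite" value instead.
fn cvttsd_i64_model(a: f64) -> i64 {
    a as i64 // truncates toward zero: 2.9 -> 2, -2.9 -> -2
}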
@@ -138,8 +146,8 @@ pub unsafe fn _mm_cvttss_i64(a: __m128) -> i64 { #[target_feature(enable = "avx512f")] #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2usi))] -pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { - vcvttss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttss_u64(a: __m128) -> u64 { + unsafe { vcvttss2usi64(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION) } } /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -156,11 +164,13 @@ pub unsafe fn _mm_cvttss_u64(a: __m128) -> u64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let r = vcvtsi2sd64(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let r = vcvtsi2sd64(a, b, ROUNDING); + transmute(r) + } } /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -177,11 +187,13 @@ pub unsafe fn _mm_cvt_roundi64_sd(a: __m128d, b: i64) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let r = vcvtsi2sd64(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let r = vcvtsi2sd64(a, b, ROUNDING); + transmute(r) + } } /// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
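A call-site sketch for the rounding-controlled conversions, showing how the `ROUNDING` const generic is supplied: except for `_MM_FROUND_CUR_DIRECTION`, the value combines a rounding direction with `_MM_FROUND_NO_EXC`. This assumes a nightly toolchain with the unstable `stdarch_x86_avx512` feature on an x86_64 target, and the wrapper function is invented for illustration.

#![feature(stdarch_x86_avx512)]

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn convert_rounding_up(a: std::arch::x86_64::__m128d, b: i64) -> std::arch::x86_64::__m128d {
    use std::arch::x86_64::*;
    // Round the converted value toward +infinity, with exceptions suppressed.
    _mm_cvt_roundi64_sd::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a, b)
}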
@@ -198,11 +210,13 @@ pub unsafe fn _mm_cvt_roundsi64_sd(a: __m128d, b: i64) -> _ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtsi2ss64(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss64(a, b, ROUNDING); + transmute(r) + } } /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -219,11 +233,13 @@ pub unsafe fn _mm_cvt_roundi64_ss(a: __m128, b: i64) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtusi2sd, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __m128d { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - let r = vcvtusi2sd64(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __m128d { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + let r = vcvtusi2sd64(a, b, ROUNDING); + transmute(r) + } } /// Convert the signed 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -240,11 +256,13 @@ pub unsafe fn _mm_cvt_roundu64_sd(a: __m128d, b: u64) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtsi2ss64(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtsi2ss64(a, b, ROUNDING); + transmute(r) + } } /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -261,11 +279,13 @@ pub unsafe fn _mm_cvt_roundsi64_ss(a: __m128, b: i64) -> __ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] -pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m128 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - let r = vcvtusi2ss64(a, b, ROUNDING); - transmute(r) +pub fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m128 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + let r = vcvtusi2ss64(a, b, ROUNDING); + transmute(r) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -282,10 +302,12 @@ pub unsafe fn _mm_cvt_roundu64_ss(a: __m128, b: u64) -> __m #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { - static_assert_rounding!(ROUNDING); - let a = 
a.as_f64x2(); - vcvtsd2si64(a, ROUNDING) +pub fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si64(a, ROUNDING) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -302,10 +324,12 @@ pub unsafe fn _mm_cvt_roundsd_si64(a: __m128d) -> i64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - vcvtsd2si64(a, ROUNDING) +pub fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2si64(a, ROUNDING) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ @@ -322,10 +346,12 @@ pub unsafe fn _mm_cvt_roundsd_i64(a: __m128d) -> i64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { - static_assert_rounding!(ROUNDING); - let a = a.as_f64x2(); - vcvtsd2usi64(a, ROUNDING) +pub fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f64x2(); + vcvtsd2usi64(a, ROUNDING) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -342,10 +368,12 @@ pub unsafe fn _mm_cvt_roundsd_u64(a: __m128d) -> u64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2si64(a, ROUNDING) +pub fn _mm_cvt_roundss_si64(a: __m128) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si64(a, ROUNDING) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.\ @@ -362,10 +390,12 @@ pub unsafe fn _mm_cvt_roundss_si64(a: __m128) -> i64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2si64(a, ROUNDING) +pub fn _mm_cvt_roundss_i64(a: __m128) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2si64(a, ROUNDING) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.\ @@ -382,10 +412,12 @@ pub unsafe fn _mm_cvt_roundss_i64(a: __m128) -> i64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { - static_assert_rounding!(ROUNDING); - let a = a.as_f32x4(); - vcvtss2usi64(a, ROUNDING) +pub fn _mm_cvt_roundss_u64(a: __m128) -> u64 { + unsafe { + static_assert_rounding!(ROUNDING); + let a = a.as_f32x4(); + vcvtss2usi64(a, ROUNDING) + } } /// Convert the lower double-precision (64-bit) 
floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -397,10 +429,12 @@ pub unsafe fn _mm_cvt_roundss_u64(a: __m128) -> u64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - vcvttsd2si64(a, SAE) +pub fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si64(a, SAE) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -412,10 +446,12 @@ pub unsafe fn _mm_cvtt_roundsd_si64(a: __m128d) -> i64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2si, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - vcvttsd2si64(a, SAE) +pub fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2si64(a, SAE) + } } /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ @@ -427,10 +463,12 @@ pub unsafe fn _mm_cvtt_roundsd_i64(a: __m128d) -> i64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttsd2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { - static_assert_sae!(SAE); - let a = a.as_f64x2(); - vcvttsd2usi64(a, SAE) +pub fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f64x2(); + vcvttsd2usi64(a, SAE) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -442,10 +480,12 @@ pub unsafe fn _mm_cvtt_roundsd_u64(a: __m128d) -> u64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - vcvttss2si64(a, SAE) +pub fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si64(a, SAE) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.\ @@ -457,10 +497,12 @@ pub unsafe fn _mm_cvtt_roundss_i64(a: __m128) -> i64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2si, SAE = 8))] #[rustc_legacy_const_generics(1)] -pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - vcvttss2si64(a, SAE) +pub fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2si64(a, SAE) + } } /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.\ @@ -472,10 +514,12 @@ pub unsafe fn _mm_cvtt_roundss_si64(a: __m128) -> i64 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vcvttss2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] 
-pub unsafe fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { - static_assert_sae!(SAE); - let a = a.as_f32x4(); - vcvttss2usi64(a, SAE) +pub fn _mm_cvtt_roundss_u64(a: __m128) -> u64 { + unsafe { + static_assert_sae!(SAE); + let a = a.as_f32x4(); + vcvttss2usi64(a, SAE) + } } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86_64/avx512fp16.rs b/crates/core_arch/src/x86_64/avx512fp16.rs index dbf88ab57f..69f1dcb5c7 100644 --- a/crates/core_arch/src/x86_64/avx512fp16.rs +++ b/crates/core_arch/src/x86_64/avx512fp16.rs @@ -11,8 +11,8 @@ use stdarch_test::assert_instr; #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsi2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvti64_sh(a: __m128h, b: i64) -> __m128h { - vcvtsi642sh(a, b, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvti64_sh(a: __m128h, b: i64) -> __m128h { + unsafe { vcvtsi642sh(a, b, _MM_FROUND_CUR_DIRECTION) } } /// Convert the signed 64-bit integer b to a half-precision (16-bit) floating-point element, store the @@ -33,9 +33,11 @@ pub unsafe fn _mm_cvti64_sh(a: __m128h, b: i64) -> __m128h { #[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundi64_sh(a: __m128h, b: i64) -> __m128h { - static_assert_rounding!(ROUNDING); - vcvtsi642sh(a, b, ROUNDING) +pub fn _mm_cvt_roundi64_sh(a: __m128h, b: i64) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsi642sh(a, b, ROUNDING) + } } /// Convert the unsigned 64-bit integer b to a half-precision (16-bit) floating-point element, store the @@ -47,8 +49,8 @@ pub unsafe fn _mm_cvt_roundi64_sh(a: __m128h, b: i64) -> __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtusi2sh))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtu64_sh(a: __m128h, b: u64) -> __m128h { - vcvtusi642sh(a, b, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtu64_sh(a: __m128h, b: u64) -> __m128h { + unsafe { vcvtusi642sh(a, b, _MM_FROUND_CUR_DIRECTION) } } /// Convert the unsigned 64-bit integer b to a half-precision (16-bit) floating-point element, store the @@ -69,9 +71,11 @@ pub unsafe fn _mm_cvtu64_sh(a: __m128h, b: u64) -> __m128h { #[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))] #[rustc_legacy_const_generics(2)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundu64_sh(a: __m128h, b: u64) -> __m128h { - static_assert_rounding!(ROUNDING); - vcvtusi642sh(a, b, ROUNDING) +pub fn _mm_cvt_roundu64_sh(a: __m128h, b: u64) -> __m128h { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtusi642sh(a, b, ROUNDING) + } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer, and store @@ -82,8 +86,8 @@ pub unsafe fn _mm_cvt_roundu64_sh(a: __m128h, b: u64) -> __ #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2si))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsh_i64(a: __m128h) -> i64 { - vcvtsh2si64(a, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtsh_i64(a: __m128h) -> i64 { + unsafe { vcvtsh2si64(a, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer, and store @@ -103,9 +107,11 @@ pub unsafe fn _mm_cvtsh_i64(a: __m128h) -> i64 { #[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] 
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundsh_i64(a: __m128h) -> i64 { - static_assert_rounding!(ROUNDING); - vcvtsh2si64(a, ROUNDING) +pub fn _mm_cvt_roundsh_i64(a: __m128h) -> i64 { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsh2si64(a, ROUNDING) + } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer, and store @@ -116,8 +122,8 @@ pub unsafe fn _mm_cvt_roundsh_i64(a: __m128h) -> i64 { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvtsh2usi))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtsh_u64(a: __m128h) -> u64 { - vcvtsh2usi64(a, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvtsh_u64(a: __m128h) -> u64 { + unsafe { vcvtsh2usi64(a, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer, and store @@ -137,9 +143,11 @@ pub unsafe fn _mm_cvtsh_u64(a: __m128h) -> u64 { #[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvt_roundsh_u64(a: __m128h) -> u64 { - static_assert_rounding!(ROUNDING); - vcvtsh2usi64(a, ROUNDING) +pub fn _mm_cvt_roundsh_u64(a: __m128h) -> u64 { + unsafe { + static_assert_rounding!(ROUNDING); + vcvtsh2usi64(a, ROUNDING) + } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer with truncation, @@ -150,8 +158,8 @@ pub unsafe fn _mm_cvt_roundsh_u64(a: __m128h) -> u64 { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttsh2si))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttsh_i64(a: __m128h) -> i64 { - vcvttsh2si64(a, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttsh_i64(a: __m128h) -> i64 { + unsafe { vcvttsh2si64(a, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit integer with truncation, @@ -165,9 +173,11 @@ pub unsafe fn _mm_cvttsh_i64(a: __m128h) -> i64 { #[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtt_roundsh_i64(a: __m128h) -> i64 { - static_assert_sae!(SAE); - vcvttsh2si64(a, SAE) +pub fn _mm_cvtt_roundsh_i64(a: __m128h) -> i64 { + unsafe { + static_assert_sae!(SAE); + vcvttsh2si64(a, SAE) + } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer with truncation, @@ -178,8 +188,8 @@ pub unsafe fn _mm_cvtt_roundsh_i64(a: __m128h) -> i64 { #[target_feature(enable = "avx512fp16")] #[cfg_attr(test, assert_instr(vcvttsh2usi))] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvttsh_u64(a: __m128h) -> u64 { - vcvttsh2usi64(a, _MM_FROUND_CUR_DIRECTION) +pub fn _mm_cvttsh_u64(a: __m128h) -> u64 { + unsafe { vcvttsh2usi64(a, _MM_FROUND_CUR_DIRECTION) } } /// Convert the lower half-precision (16-bit) floating-point element in a to a 64-bit unsigned integer with truncation, @@ -193,9 +203,11 @@ pub unsafe fn _mm_cvttsh_u64(a: __m128h) -> u64 { #[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))] #[rustc_legacy_const_generics(1)] #[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")] -pub unsafe fn _mm_cvtt_roundsh_u64(a: __m128h) -> u64 { - static_assert_sae!(SAE); - vcvttsh2usi64(a, SAE) +pub fn 
_mm_cvtt_roundsh_u64(a: __m128h) -> u64 { + unsafe { + static_assert_sae!(SAE); + vcvttsh2usi64(a, SAE) + } } #[allow(improper_ctypes)]
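Finally, a hedged sketch of how the scalar FP16 conversions read after this change. The wrapper is hypothetical, a nightly toolchain with the unstable `stdarch_x86_avx512_f16` feature is assumed, and intrinsics that go through raw pointers (not shown here) deliberately stay `unsafe`, as the patch description states.

#![feature(stdarch_x86_avx512_f16)] // the FP16 intrinsics remain unstable

// With this patch, value-based conversions such as `_mm_cvtsh_i64` are
// plain safe calls from a context that already enables `avx512fp16`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512fp16")]
unsafe fn lowest_half_to_i64(a: std::arch::x86_64::__m128h) -> i64 {
    std::arch::x86_64::_mm_cvtsh_i64(a)
}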