1use crate::simd::Simd;
2use core::mem;
3
4impl<const N: usize> Simd<u8, N> {
5 #[inline]
14 pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
15 #![allow(unused_imports, unused_unsafe)]
16 #[cfg(all(
17 target_arch = "arm",
18 target_feature = "v7",
19 target_feature = "neon",
20 target_endian = "little"
21 ))]
22 use core::arch::arm::{uint8x8_t, vtbl1_u8};
23 #[cfg(target_arch = "wasm32")]
24 use core::arch::wasm32 as wasm;
25 #[cfg(target_arch = "wasm64")]
26 use core::arch::wasm64 as wasm;
27 #[cfg(target_arch = "x86")]
28 use core::arch::x86;
29 #[cfg(target_arch = "x86_64")]
30 use core::arch::x86_64 as x86;
31 unsafe {
33 match N {
34 #[cfg(all(
35 any(target_arch = "aarch64", target_arch = "arm64ec"),
36 target_feature = "neon",
37 target_endian = "little"
38 ))]
39 8 | 16 | 24 | 32 | 48 | 64 => aarch64_swizzle(self, idxs),
40 #[cfg(target_feature = "ssse3")]
41 16 => transize(x86::_mm_shuffle_epi8, self, zeroing_idxs(idxs)),
42 #[cfg(target_feature = "simd128")]
43 16 => transize(wasm::i8x16_swizzle, self, idxs),
44 #[cfg(all(
45 target_arch = "arm",
46 target_feature = "v7",
47 target_feature = "neon",
48 target_endian = "little"
49 ))]
50 16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
51 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
52 32 => transize(avx2_pshufb, self, idxs),
53 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
54 32 => {
55 let swizzler = |bytes, idxs| {
57 let mask = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
58 idxs,
59 Simd::<u8, 32>::splat(N as u8).into(),
60 );
61 x86::_mm256_maskz_permutexvar_epi8(mask, idxs, bytes)
62 };
63 transize(swizzler, self, idxs)
64 }
65 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
67 64 => {
68 let swizzler = |bytes, idxs| {
70 let mask = x86::_mm512_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
71 idxs,
72 Simd::<u8, 64>::splat(N as u8).into(),
73 );
74 x86::_mm512_maskz_permutexvar_epi8(mask, idxs, bytes)
75 };
76 transize(swizzler, self, idxs)
77 }
78 _ => {
79 let mut array = [0; N];
80 for (i, k) in idxs.to_array().into_iter().enumerate() {
81 if (k as usize) < N {
82 array[i] = self[k as usize];
83 };
84 }
85 array.into()
86 }
87 }
88 }
89 }
90}
91
/// Swizzles sixteen bytes on armv7 NEON using two 8-lane lookups into a
/// two-register (16-byte) table built from `bytes`.
///
/// The low and high halves of `idxs` are looked up separately and the two
/// 8-byte results are recombined into one 16-byte vector.
///
/// # Safety
/// The caller must ensure the armv7 NEON intrinsics used here are available
/// on the executing CPU (the function is only compiled when the matching
/// target features are enabled).
#[cfg(all(
    target_arch = "arm",
    target_feature = "v7",
    target_feature = "neon",
    target_endian = "little"
))]
unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
    // SAFETY: the caller guarantees NEON support, see `# Safety` above.
    unsafe {
        // Split the source into the two 8-byte registers of the lookup table.
        let table_lo = vget_low_u8(bytes.into());
        let table_hi = vget_high_u8(bytes.into());
        let table = uint8x8x2_t(table_lo, table_hi);

        // Look up each half of the index vector against the full table.
        let picks_lo = vget_low_u8(idxs.into());
        let picks_hi = vget_high_u8(idxs.into());
        let out_lo = vtbl2_u8(table, picks_lo);
        let out_hi = vtbl2_u8(table, picks_hi);

        Simd::from(vcombine_u8(out_lo, out_hi))
    }
}
113
/// Swizzles `bytes` by `idxs` with the AArch64 NEON table-lookup intrinsics.
///
/// The vector is reinterpreted as one to four NEON registers (8 or 16 bytes
/// each); every register of `idxs` is looked up against the whole table and
/// the partial results are reassembled into the output vector.
///
/// # Panics
/// Hits `unreachable!()` when `N` is not one of 8, 16, 24, 32, 48 or 64.
///
/// # Safety
/// The caller must ensure NEON is available. The `transmute_copy`
/// reinterpretations also assume `Simd<u8, N>` has the same size as the NEON
/// type chosen for that `N` — NOTE(review): confirm against the `Simd` layout.
#[cfg(all(
    any(target_arch = "aarch64", target_arch = "arm64ec"),
    target_feature = "neon",
    target_endian = "little"
))]
unsafe fn aarch64_swizzle<const N: usize>(bytes: Simd<u8, N>, idxs: Simd<u8, N>) -> Simd<u8, N> {
    use core::arch::aarch64::*;
    use core::mem::transmute_copy;

    // SAFETY: same obligations as `mem::transmute_copy`; each arm picks the
    // NEON type that corresponds to the matched `N`.
    unsafe {
        match N {
            8 => {
                let table: uint8x8_t = transmute_copy(&bytes);
                let picks: uint8x8_t = transmute_copy(&idxs);
                transmute_copy(&vtbl1_u8(table, picks))
            }
            16 => {
                let table: uint8x16_t = transmute_copy(&bytes);
                let picks: uint8x16_t = transmute_copy(&idxs);
                transmute_copy(&vqtbl1q_u8(table, picks))
            }
            24 => {
                let table: uint8x8x3_t = transmute_copy(&bytes);
                let picks: uint8x8x3_t = transmute_copy(&idxs);

                let out = uint8x8x3_t(
                    vtbl3_u8(table, picks.0),
                    vtbl3_u8(table, picks.1),
                    vtbl3_u8(table, picks.2),
                );
                transmute_copy(&out)
            }
            32 => {
                let table: uint8x16x2_t = transmute_copy(&bytes);
                let picks: uint8x16x2_t = transmute_copy(&idxs);

                let out = uint8x16x2_t(
                    vqtbl2q_u8(table, picks.0),
                    vqtbl2q_u8(table, picks.1),
                );
                transmute_copy(&out)
            }
            48 => {
                let table: uint8x16x3_t = transmute_copy(&bytes);
                let picks: uint8x16x3_t = transmute_copy(&idxs);

                let out = uint8x16x3_t(
                    vqtbl3q_u8(table, picks.0),
                    vqtbl3q_u8(table, picks.1),
                    vqtbl3q_u8(table, picks.2),
                );
                transmute_copy(&out)
            }
            64 => {
                let table: uint8x16x4_t = transmute_copy(&bytes);
                let picks: uint8x16x4_t = transmute_copy(&idxs);

                let out = uint8x16x4_t(
                    vqtbl4q_u8(table, picks.0),
                    vqtbl4q_u8(table, picks.1),
                    vqtbl4q_u8(table, picks.2),
                    vqtbl4q_u8(table, picks.3),
                );
                transmute_copy(&out)
            }
            // Callers are expected to dispatch only the widths listed above.
            _ => unreachable!(),
        }
    }
}
180
181#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
186#[target_feature(enable = "avx2")]
187#[allow(unused)]
188#[inline]
189#[allow(clippy::let_and_return)]
190unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
191 use crate::simd::{Select, cmp::SimdPartialOrd};
192 #[cfg(target_arch = "x86")]
193 use core::arch::x86;
194 #[cfg(target_arch = "x86_64")]
195 use core::arch::x86_64 as x86;
196 use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
197 use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
198 let mid = Simd::splat(16u8);
199 let high = mid + mid;
200 unsafe {
202 let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
209 let hi_shuf = Simd::from(avx2_half_pshufb(
210 hihi, idxs.into(), ));
213 let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
215 let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
216 let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
217 let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
219 compose
220 }
221}
222
223#[allow(dead_code)]
231#[inline(always)]
232unsafe fn transize<T, const N: usize>(
233 f: unsafe fn(T, T) -> T,
234 a: Simd<u8, N>,
235 b: Simd<u8, N>,
236) -> Simd<u8, N> {
237 unsafe { mem::transmute_copy(&f(mem::transmute_copy(&a), mem::transmute_copy(&b))) }
239}
240
241#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
243#[allow(unused)]
244#[inline(always)]
245fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N> {
246 use crate::simd::{Select, cmp::SimdPartialOrd};
247 idxs.simd_lt(Simd::splat(N as u8))
248 .select(idxs, Simd::splat(u8::MAX))
249}