use crate::simd::{LaneCount, Simd, SupportedLaneCount};
use core::mem;
impl<const N: usize> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    /// Swizzle a vector of bytes according to the index vector.
    /// Indices within range select the appropriate byte.
    /// Indices "out of bounds" instead select 0.
    ///
    /// Note that the current implementation is selected at build time
    /// of the standard library, so `cargo build -Zbuild-std` may be necessary
    /// to unlock better performance, especially for larger vectors.
    /// A planned compiler improvement will enable using `#[target_feature]` instead.
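    ///
    /// # Example
    ///
    /// An illustrative sketch of the out-of-bounds-is-zero behavior (this doctest
    /// assumes the nightly `portable_simd` feature; the values are arbitrary):
    ///
    /// ```
    /// # #![feature(portable_simd)]
    /// # use core::simd::Simd;
    /// let bytes = Simd::from_array([10u8, 20, 30, 40, 50, 60, 70, 80]);
    /// let idxs = Simd::from_array([7u8, 0, 3, 3, 100, 1, 2, 255]);
    /// // In-range indices pick the corresponding byte; 100 and 255 are out of
    /// // bounds for 8 lanes, so those lanes become 0.
    /// assert_eq!(bytes.swizzle_dyn(idxs).to_array(), [80, 10, 40, 40, 0, 20, 30, 0]);
    /// ```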
    #[inline]
    pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
        #![allow(unused_imports, unused_unsafe)]
        #[cfg(all(
            any(target_arch = "aarch64", target_arch = "arm64ec"),
            target_endian = "little"
        ))]
        use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
        #[cfg(all(
            target_arch = "arm",
            target_feature = "v7",
            target_feature = "neon",
            target_endian = "little"
        ))]
        use core::arch::arm::{uint8x8_t, vtbl1_u8};
        #[cfg(target_arch = "wasm32")]
        use core::arch::wasm32 as wasm;
        #[cfg(target_arch = "x86")]
        use core::arch::x86;
        #[cfg(target_arch = "x86_64")]
        use core::arch::x86_64 as x86;
        // SAFETY: Intrinsics covered by cfg
        unsafe {
            match N {
                #[cfg(all(
                    any(
                        target_arch = "aarch64",
                        target_arch = "arm64ec",
                        all(target_arch = "arm", target_feature = "v7")
                    ),
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                8 => transize(vtbl1_u8, self, idxs),
                #[cfg(target_feature = "ssse3")]
                16 => transize(x86::_mm_shuffle_epi8, self, zeroing_idxs(idxs)),
                #[cfg(target_feature = "simd128")]
                16 => transize(wasm::i8x16_swizzle, self, idxs),
                #[cfg(all(
                    any(target_arch = "aarch64", target_arch = "arm64ec"),
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                16 => transize(vqtbl1q_u8, self, idxs),
                #[cfg(all(
                    target_arch = "arm",
                    target_feature = "v7",
                    target_feature = "neon",
                    target_endian = "little"
                ))]
                16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
                #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                32 => transize(avx2_pshufb, self, idxs),
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                32 => {
                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
                    let swizzler = |bytes, idxs| {
                        let mask = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
                            idxs,
                            Simd::<u8, 32>::splat(N as u8).into(),
                        );
                        x86::_mm256_maskz_permutexvar_epi8(mask, idxs, bytes)
                    };
                    transize(swizzler, self, idxs)
                }
                // Notable absence: avx512bw pshufb shuffle
                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                64 => {
                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
                    let swizzler = |bytes, idxs| {
                        let mask = x86::_mm512_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
                            idxs,
                            Simd::<u8, 64>::splat(N as u8).into(),
                        );
                        x86::_mm512_maskz_permutexvar_epi8(mask, idxs, bytes)
                    };
                    transize(swizzler, self, idxs)
                }
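                // Fallback for lane counts / targets without a dedicated intrinsic
                // above: a scalar gather that preserves the same OOB-is-0 contract.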
                _ => {
                    let mut array = [0; N];
                    for (i, k) in idxs.to_array().into_iter().enumerate() {
                        if (k as usize) < N {
                            array[i] = self[k as usize];
                        };
                    }
                    array.into()
                }
            }
        }
    }
}
/// armv7 neon supports swizzling `u8x16` by swizzling two u8x8 blocks
/// with a u8x8x2 lookup table.
///
/// # Safety
/// This requires armv7 neon to work
#[cfg(all(
    target_arch = "arm",
    target_feature = "v7",
    target_feature = "neon",
    target_endian = "little"
))]
unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
    // SAFETY: Caller promised arm neon support
    unsafe {
        let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
        let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into()));
        let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into()));
        vcombine_u8(lo, hi).into()
    }
}
/// "vpshufb like it was meant to be" on AVX2
///
/// # Safety
/// This requires AVX2 to work
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(unused)]
#[inline]
#[allow(clippy::let_and_return)]
unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
    use crate::simd::cmp::SimdPartialOrd;
    #[cfg(target_arch = "x86")]
    use core::arch::x86;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64 as x86;
    use x86::_mm256_permute2x128_si256 as avx2_cross_shuffle;
    use x86::_mm256_shuffle_epi8 as avx2_half_pshufb;
    let mid = Simd::splat(16u8);
    let high = mid + mid;
    // SAFETY: Caller promised AVX2
    unsafe {
        // This is ordering sensitive, and LLVM will order these how you put them.
        // Most AVX2 impls use ~5 "ports", and only 1 or 2 are capable of permutes.
        // But the "compose" step will lower to ops that can also use at least 1 other port.
        // So this tries to break up permutes so composition flows through "open" ports.
        // Comparative benches should be done on multiple AVX2 CPUs before reordering this
        let hihi = avx2_cross_shuffle::<0x11>(bytes.into(), bytes.into());
        let hi_shuf = Simd::from(avx2_half_pshufb(
            hihi,        // duplicate the vector's top half
            idxs.into(), // so that using only 4 bits of an index still picks bytes 16-31
        ));
        // A zero-fill during the compose step gives the "all-Neon-like" OOB-is-0 semantics
        let compose = idxs.simd_lt(high).select(hi_shuf, Simd::splat(0));
        let lolo = avx2_cross_shuffle::<0x00>(bytes.into(), bytes.into());
        let lo_shuf = Simd::from(avx2_half_pshufb(lolo, idxs.into()));
        // Repeat, then pick indices < 16, overwriting indices 0-15 from previous compose step
        let compose = idxs.simd_lt(mid).select(lo_shuf, compose);
        compose
    }
}
/// This sets up a call to an architecture-specific function, and in doing so
/// it persuades rustc that everything is the correct size. Which it is.
/// This would not be needed if one could convince Rust that, by matching on N,
/// N is that value, and thus it would be valid to substitute e.g. 16.
///
/// # Safety
/// The correctness of this function hinges on the sizes agreeing in actuality.
#[allow(dead_code)]
#[inline(always)]
unsafe fn transize<T, const N: usize>(
    f: unsafe fn(T, T) -> T,
    a: Simd<u8, N>,
    b: Simd<u8, N>,
) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&a), mem::transmute_copy(&b))) }
}
/// Make indices that yield 0 for x86's `pshufb`.
///
/// `pshufb` zeroes a lane when its index byte has the high bit set, so mapping
/// out-of-range indices to `u8::MAX` produces the OOB-is-0 behavior documented
/// on `swizzle_dyn`.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(unused)]
#[inline(always)]
fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
where
    LaneCount<N>: SupportedLaneCount,
{
    use crate::simd::cmp::SimdPartialOrd;
    idxs.simd_lt(Simd::splat(N as u8))
        .select(idxs, Simd::splat(u8::MAX))
}