/// A trait for describing vector operations used by vectorized searchers.
///
/// The trait is highly constrained to the low level vector operations
/// needed. In general, it was invented mostly to be generic over x86's
/// __m128i and __m256i types. At time of writing, it also supports wasm
/// and aarch64 128-bit vector types.
///
/// # Safety
///
/// All methods are not safe since they are intended to be implemented using
/// vendor intrinsics, which are also not safe. Callers must ensure that the
/// appropriate target features are enabled in the calling function, and that
/// the current CPU supports them. All implementations should avoid marking
/// the routines with #[target_feature] and instead mark them as
/// #[inline(always)] to ensure they get appropriately inlined.
/// (inline(always) cannot be used with target_feature.)
pub(crate) trait Vector: Copy + core::fmt::Debug {
    /// The number of bytes in the vector. That is, this is the size of the
    /// vector in memory.
    const BYTES: usize;
    /// The bits that must be zero in order for a `*const u8` pointer to be
    /// correctly aligned to read vector values.
    const ALIGN: usize;

    /// The type of the value returned by `Vector::movemask`.
    ///
    /// This supports abstracting over the specific representation used in
    /// order to accommodate different representations in different ISAs.
    type Mask: MoveMask;

    /// Create a vector with 8-bit lanes with the given byte repeated into
    /// each lane.
    unsafe fn splat(byte: u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// must be aligned to the size of the vector.
    ///
    /// # Safety
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data` and that `data` is aligned to a `BYTES` boundary.
    unsafe fn load_aligned(data: *const u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// does not need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data`.
    unsafe fn load_unaligned(data: *const u8) -> Self;

    /// _mm_movemask_epi8 or _mm256_movemask_epi8
    unsafe fn movemask(self) -> Self::Mask;
    /// _mm_cmpeq_epi8 or _mm256_cmpeq_epi8
    unsafe fn cmpeq(self, vector2: Self) -> Self;
    /// _mm_and_si128 or _mm256_and_si256
    unsafe fn and(self, vector2: Self) -> Self;
    /// _mm_or_si128 or _mm256_or_si256
    unsafe fn or(self, vector2: Self) -> Self;
    /// Returns true if and only if `Self::movemask` would return a mask that
    /// contains at least one non-zero bit.
    unsafe fn movemask_will_have_non_zero(self) -> bool {
        self.movemask().has_non_zero()
    }
}

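// As an illustration only: a minimal sketch of how a vectorized searcher
// might combine the operations above to look for the first occurrence of a
// byte within a single vector-sized chunk. The function is hypothetical (the
// real searchers live elsewhere in this crate); it exists to show the
// intended call pattern: splat, load, cmpeq, movemask, then extract an
// offset from the mask. Callers must ensure that the target features
// required by `V` are enabled and supported, and that at least `V::BYTES`
// bytes are readable from `start`.
#[allow(dead_code)]
#[inline(always)]
unsafe fn example_first_match<V: Vector>(
    start: *const u8,
    needle: u8,
) -> Option<usize> {
    let query = V::splat(needle);
    let chunk = V::load_unaligned(start);
    let mask = query.cmpeq(chunk).movemask();
    if mask.has_non_zero() {
        Some(mask.first_offset())
    } else {
        None
    }
}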

/// A trait that abstracts over a vector-to-scalar operation called
/// "move mask."
///
/// On x86-64, this is `_mm_movemask_epi8` for SSE2 and `_mm256_movemask_epi8`
/// for AVX2. It takes a vector of `u8` lanes and returns a scalar where the
/// `i`th bit is set if and only if the most significant bit in the `i`th lane
/// of the vector is set. The simd128 ISA for wasm32 also supports this
/// exact same operation natively.
///
/// ... But aarch64 doesn't. So we have to fake it with more instructions and
/// a slightly different representation. We could do extra work to unify the
/// representations, but that would require additional costs in the hot path
/// for `memchr` and `packedpair`. So instead, we abstract over the specific
/// representation with this trait and define the operations we actually need.
pub(crate) trait MoveMask: Copy + core::fmt::Debug {
    /// Return a mask with the least significant `n` lanes cleared (set to
    /// zero) and the bits for every other lane set. `and`ing this with a
    /// movemask thus discards any matches in the first `n` lanes.
    fn all_zeros_except_least_significant(n: usize) -> Self;

    /// Returns true if and only if this mask has a non-zero bit anywhere.
    fn has_non_zero(self) -> bool;

    /// Returns the number of bits set to 1 in this mask.
    fn count_ones(self) -> usize;

    /// Does a bitwise `and` operation between `self` and `other`.
    fn and(self, other: Self) -> Self;

    /// Does a bitwise `or` operation between `self` and `other`.
    fn or(self, other: Self) -> Self;

    /// Returns a mask that is equivalent to `self` but with the least
    /// significant 1-bit set to 0.
    fn clear_least_significant_bit(self) -> Self;

    /// Returns the offset of the first non-zero lane this mask represents.
    fn first_offset(self) -> usize;

    /// Returns the offset of the last non-zero lane this mask represents.
    fn last_offset(self) -> usize;
}

/// This is a "sensible" movemask implementation where each bit represents
/// whether the most significant bit is set in each corresponding lane of a
/// vector. This is used on x86-64 and wasm, but such a mask is more expensive
/// to get on aarch64 so we use something a little different.
///
/// We call this "sensible" because this is what we get using native sse/avx
/// movemask instructions. But neon has no such native equivalent.
#[derive(Clone, Copy, Debug)]
pub(crate) struct SensibleMoveMask(u32);

impl SensibleMoveMask {
    /// Get the mask in a form suitable for computing offsets.
    ///
    /// Basically, this normalizes to little endian. On big endian, this swaps
    /// the bytes.
    #[inline(always)]
    fn get_for_offset(self) -> u32 {
        #[cfg(target_endian = "big")]
        {
            self.0.swap_bytes()
        }
        #[cfg(target_endian = "little")]
        {
            self.0
        }
    }
}

impl MoveMask for SensibleMoveMask {
    #[inline(always)]
    fn all_zeros_except_least_significant(n: usize) -> SensibleMoveMask {
        debug_assert!(n < 32);
        SensibleMoveMask(!((1 << n) - 1))
    }

    #[inline(always)]
    fn has_non_zero(self) -> bool {
        self.0 != 0
    }

    #[inline(always)]
    fn count_ones(self) -> usize {
        self.0.count_ones() as usize
    }

    #[inline(always)]
    fn and(self, other: SensibleMoveMask) -> SensibleMoveMask {
        SensibleMoveMask(self.0 & other.0)
    }

    #[inline(always)]
    fn or(self, other: SensibleMoveMask) -> SensibleMoveMask {
        SensibleMoveMask(self.0 | other.0)
    }

    #[inline(always)]
    fn clear_least_significant_bit(self) -> SensibleMoveMask {
        SensibleMoveMask(self.0 & (self.0 - 1))
    }

    #[inline(always)]
    fn first_offset(self) -> usize {
        // We are dealing with little endian here (and if we aren't, we swap
        // the bytes so we are in practice), where the most significant byte
        // is at a higher address. That means the least significant bit that
        // is set corresponds to the position of our first matching byte.
        // That position corresponds to the number of zeros after the least
        // significant bit.
        self.get_for_offset().trailing_zeros() as usize
    }

    #[inline(always)]
    fn last_offset(self) -> usize {
        // We are dealing with little endian here (and if we aren't, we swap
        // the bytes so we are in practice), where the most significant byte is
        // at a higher address. That means the most significant bit that is set
        // corresponds to the position of our last matching byte. The position
        // from the end of the mask is therefore the number of leading zeros
        // in a 32 bit integer, and the position from the start of the mask is
        // therefore 32 - (leading zeros) - 1.
        32 - self.get_for_offset().leading_zeros() as usize - 1
    }
}

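// An illustrative test sketch (the module name here is ours, not part of
// the crate's existing test suite) that pins down the offset arithmetic
// documented above using the mask 0b10010, i.e. matches in lanes 1 and 4
// of a hypothetical vector. It constructs the mask by hand, which sidesteps
// the big endian normalization in `get_for_offset`, so it is restricted to
// little endian targets.
#[cfg(all(test, target_endian = "little"))]
mod sensible_move_mask_example {
    use super::{MoveMask, SensibleMoveMask};

    #[test]
    fn offsets_and_masking() {
        let mask = SensibleMoveMask(0b10010);
        assert!(mask.has_non_zero());
        assert_eq!(2, mask.count_ones());
        // The least significant set bit is bit 1, so the first match is in
        // lane 1. The most significant set bit is bit 4, so the last match
        // is in lane 4.
        assert_eq!(1, mask.first_offset());
        assert_eq!(4, mask.last_offset());
        // Clearing the least significant set bit leaves only the match in
        // lane 4.
        assert_eq!(4, mask.clear_least_significant_bit().first_offset());
        // `all_zeros_except_least_significant(2)` zeroes lanes 0 and 1, so
        // and'ing it with our mask discards the match in lane 1.
        let masked =
            mask.and(SensibleMoveMask::all_zeros_except_least_significant(2));
        assert_eq!(4, masked.first_offset());
    }
}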

#[cfg(target_arch = "x86_64")]
mod x86sse2 {
    use core::arch::x86_64::*;

    use super::{SensibleMoveMask, Vector};

    impl Vector for __m128i {
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = SensibleMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> __m128i {
            _mm_set1_epi8(byte as i8)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> __m128i {
            _mm_load_si128(data as *const __m128i)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> __m128i {
            _mm_loadu_si128(data as *const __m128i)
        }

        #[inline(always)]
        unsafe fn movemask(self) -> SensibleMoveMask {
            SensibleMoveMask(_mm_movemask_epi8(self) as u32)
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> __m128i {
            _mm_cmpeq_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> __m128i {
            _mm_and_si128(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> __m128i {
            _mm_or_si128(self, vector2)
        }
    }
}

#[cfg(target_arch = "x86_64")]
mod x86avx2 {
    use core::arch::x86_64::*;

    use super::{SensibleMoveMask, Vector};

    impl Vector for __m256i {
        const BYTES: usize = 32;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = SensibleMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> __m256i {
            _mm256_set1_epi8(byte as i8)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> __m256i {
            _mm256_load_si256(data as *const __m256i)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> __m256i {
            _mm256_loadu_si256(data as *const __m256i)
        }

        #[inline(always)]
        unsafe fn movemask(self) -> SensibleMoveMask {
            SensibleMoveMask(_mm256_movemask_epi8(self) as u32)
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> __m256i {
            _mm256_cmpeq_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> __m256i {
            _mm256_and_si256(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> __m256i {
            _mm256_or_si256(self, vector2)
        }
    }
}

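// An illustrative end-to-end test sketch (ours, not part of the crate's
// existing test suite) for the SSE2 implementation above. SSE2 is part of
// the x86-64 baseline, so no runtime feature detection is needed before
// calling these intrinsics.
#[cfg(all(test, target_arch = "x86_64"))]
mod x86sse2_example {
    use core::arch::x86_64::__m128i;

    use super::{MoveMask, Vector};

    #[test]
    fn splat_cmpeq_movemask() {
        let haystack = *b"xxxzxxxxxxxxzxxx";
        // SAFETY: SSE2 is always available on x86-64, and we read exactly
        // 16 bytes from a 16 byte buffer.
        unsafe {
            let needle = <__m128i as Vector>::splat(b'z');
            let chunk = <__m128i as Vector>::load_unaligned(haystack.as_ptr());
            let mask = needle.cmpeq(chunk).movemask();
            assert!(mask.has_non_zero());
            assert_eq!(2, mask.count_ones());
            // The `z` bytes are at offsets 3 and 12.
            assert_eq!(3, mask.first_offset());
            assert_eq!(12, mask.last_offset());
        }
    }
}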

#[cfg(target_arch = "aarch64")]
mod aarch64neon {
    use core::arch::aarch64::*;

    use super::{MoveMask, Vector};

    impl Vector for uint8x16_t {
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = NeonMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> uint8x16_t {
            vdupq_n_u8(byte)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> uint8x16_t {
            // I've tried `data.cast::<uint8x16_t>().read()` instead, but
            // couldn't observe any benchmark differences.
            Self::load_unaligned(data)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
            vld1q_u8(data)
        }

        #[inline(always)]
        unsafe fn movemask(self) -> NeonMoveMask {
            let asu16s = vreinterpretq_u16_u8(self);
            let mask = vshrn_n_u16(asu16s, 4);
            let asu64 = vreinterpret_u64_u8(mask);
            let scalar64 = vget_lane_u64(asu64, 0);
            NeonMoveMask(scalar64 & 0x8888888888888888)
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
            vceqq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> uint8x16_t {
            vandq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> uint8x16_t {
            vorrq_u8(self, vector2)
        }

        /// This is the only interesting implementation of this routine.
        /// Basically, instead of doing the "shift right narrow" dance, we use
        /// adjacent folding max to determine whether there are any non-zero
        /// bytes in our mask. If there are, *then* we'll do the "shift right
        /// narrow" dance. In benchmarks, this does lead to slightly better
        /// throughput, but the win doesn't appear huge.
        #[inline(always)]
        unsafe fn movemask_will_have_non_zero(self) -> bool {
            let low = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
            vgetq_lane_u64(low, 0) != 0
        }
    }

    /// Neon doesn't have a `movemask` that works like the one in x86-64, so
    /// we wind up using a different method[1]. The different method also
    /// produces a mask, but 4 bits are set in the neon case instead of a
    /// single bit set in the x86-64 case. We do an extra step to zero out 3
    /// of the 4 bits, but we still wind up with at least 3 zeroes between
    /// each set bit. This generally means that we need to do some division
    /// by 4 before extracting offsets.
    ///
    /// In fact, the existence of this type is the entire reason that we have
    /// the `MoveMask` trait in the first place. This basically lets us keep
    /// the different representations of masks without being forced to unify
    /// them into a single representation, which could result in extra and
    /// unnecessary work.
    ///
    /// [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
    #[derive(Clone, Copy, Debug)]
    pub(crate) struct NeonMoveMask(u64);

    impl NeonMoveMask {
        /// Get the mask in a form suitable for computing offsets.
        ///
        /// Basically, this normalizes to little endian. On big endian, this
        /// swaps the bytes.
        #[inline(always)]
        fn get_for_offset(self) -> u64 {
            #[cfg(target_endian = "big")]
            {
                self.0.swap_bytes()
            }
            #[cfg(target_endian = "little")]
            {
                self.0
            }
        }
    }

    impl MoveMask for NeonMoveMask {
        #[inline(always)]
        fn all_zeros_except_least_significant(n: usize) -> NeonMoveMask {
            debug_assert!(n < 16);
            NeonMoveMask(!(((1 << n) << 2) - 1))
        }

        #[inline(always)]
        fn has_non_zero(self) -> bool {
            self.0 != 0
        }

        #[inline(always)]
        fn count_ones(self) -> usize {
            self.0.count_ones() as usize
        }

        #[inline(always)]
        fn and(self, other: NeonMoveMask) -> NeonMoveMask {
            NeonMoveMask(self.0 & other.0)
        }

        #[inline(always)]
        fn or(self, other: NeonMoveMask) -> NeonMoveMask {
            NeonMoveMask(self.0 | other.0)
        }

        #[inline(always)]
        fn clear_least_significant_bit(self) -> NeonMoveMask {
            NeonMoveMask(self.0 & (self.0 - 1))
        }

        #[inline(always)]
        fn first_offset(self) -> usize {
            // We are dealing with little endian here (and if we aren't,
            // we swap the bytes so we are in practice), where the most
            // significant byte is at a higher address. That means the least
            // significant bit that is set corresponds to the position of our
            // first matching byte. That position corresponds to the number of
            // zeros after the least significant bit.
            //
            // Note that unlike `SensibleMoveMask`, this mask has its bits
            // spread out over 64 bits instead of 16 bits (for a 128 bit
            // vector). Namely, whereas x86-64 will turn
            //
            // 0x00 0xFF 0x00 0x00 0xFF
            //
            // into 10010, our neon approach will turn it into
            //
            // 10000000000010000000
            //
            // And this happens because neon doesn't have a native `movemask`
            // instruction, so we kind of fake it[1]. Thus, we divide the
            // number of trailing zeros by 4 to get the "real" offset.
            //
            // [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
            (self.get_for_offset().trailing_zeros() >> 2) as usize
        }

        #[inline(always)]
        fn last_offset(self) -> usize {
            // See comment in `first_offset` above. This is basically the
            // same, but coming from the other direction.
            16 - (self.get_for_offset().leading_zeros() >> 2) as usize - 1
        }
    }

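    // An illustrative test sketch (ours, not part of the crate's existing
    // test suite) that pins down the "4 bits per lane" representation
    // described above, and exercises the neon `Vector` implementation end to
    // end. It constructs a raw mask by hand, which sidesteps the big endian
    // normalization in `get_for_offset`, so it is restricted to little
    // endian targets.
    #[cfg(all(test, target_endian = "little"))]
    mod example_tests {
        use core::arch::aarch64::uint8x16_t;

        use super::super::{MoveMask, Vector};
        use super::NeonMoveMask;

        #[test]
        fn four_bits_per_lane() {
            // Matches in lanes 1 and 4 set the high bit of nibbles 1 and 4,
            // i.e. bits 7 and 19.
            let mask = NeonMoveMask(0b1000_0000_0000_1000_0000);
            assert!(mask.has_non_zero());
            assert_eq!(2, mask.count_ones());
            // trailing_zeros is 7, and 7 >> 2 gives lane 1.
            assert_eq!(1, mask.first_offset());
            // leading_zeros is 44, and 16 - (44 >> 2) - 1 gives lane 4.
            assert_eq!(4, mask.last_offset());
        }

        #[test]
        fn splat_cmpeq_movemask() {
            let haystack = *b"xxxzxxxxxxxxzxxx";
            // SAFETY: NEON is always available on aarch64, and we read
            // exactly 16 bytes from a 16 byte buffer.
            unsafe {
                let needle = <uint8x16_t as Vector>::splat(b'z');
                let chunk =
                    <uint8x16_t as Vector>::load_unaligned(haystack.as_ptr());
                let cmp = needle.cmpeq(chunk);
                assert!(cmp.movemask_will_have_non_zero());
                let mask = cmp.movemask();
                // The `z` bytes are at offsets 3 and 12.
                assert_eq!(3, mask.first_offset());
                assert_eq!(12, mask.last_offset());
            }
        }
    }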
}

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
mod wasm_simd128 {
    use core::arch::wasm32::*;

    use super::{SensibleMoveMask, Vector};

    impl Vector for v128 {
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;

        type Mask = SensibleMoveMask;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> v128 {
            u8x16_splat(byte)
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> v128 {
            *data.cast()
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> v128 {
            v128_load(data.cast())
        }

        #[inline(always)]
        unsafe fn movemask(self) -> SensibleMoveMask {
            SensibleMoveMask(u8x16_bitmask(self).into())
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> v128 {
            u8x16_eq(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> v128 {
            v128_and(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> v128 {
            v128_or(self, vector2)
        }
    }
}