use crate::arch::all::{
    packedpair::{HeuristicFrequencyRank, Pair},
    rabinkarp, twoway,
};

#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::neon::packedpair as neon;
#[cfg(target_arch = "wasm32")]
use crate::arch::wasm32::simd128::packedpair as simd128;
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
use crate::arch::x86_64::{
    avx2::packedpair as avx2, sse2::packedpair as sse2,
};

/// A "meta" substring searcher.
///
/// To a first approximation, this chooses what it believes to be the "best"
/// substring search implementation based on the needle at construction time.
/// Then, every call to `find` will execute that particular implementation. To
/// a second approximation, multiple substring search algorithms may be used,
/// depending on the haystack. For example, for supremely short haystacks,
/// Rabin-Karp is typically used.
///
/// See the documentation on `Prefilter` for an explanation of the dispatching
/// mechanism. The quick summary is that an enum has too much overhead and
/// we can't use dynamic dispatch via traits because we need to work in a
/// core-only environment. (Dynamic dispatch works in core-only, but you
/// need `&dyn Trait` and we really need a `Box<dyn Trait>` here. The latter
/// requires `alloc`.) So instead, we use a union and an appropriately paired
/// free function to read from the correct field on the union and execute the
/// chosen substring search implementation.
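///
/// In miniature, the pattern looks like the sketch below. This is purely
/// illustrative: the names are hypothetical and not part of this crate.
///
/// ```ignore
/// union Kind { a: u8, b: u16 }
/// type CallFn = unsafe fn(&Dispatcher) -> u32;
///
/// struct Dispatcher { call: CallFn, kind: Kind }
///
/// // SAFETY: only sound when `kind.a` is the field that was set.
/// unsafe fn call_a(d: &Dispatcher) -> u32 { u32::from(d.kind.a) }
///
/// impl Dispatcher {
///     fn new_a(a: u8) -> Dispatcher {
///         // Pair the function pointer with the union field it reads.
///         Dispatcher { call: call_a, kind: Kind { a } }
///     }
///     fn run(&self) -> u32 {
///         // SAFETY: construction paired `call` with the set union field.
///         unsafe { (self.call)(self) }
///     }
/// }
/// ```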
#[derive(Clone)]
pub(crate) struct Searcher {
    call: SearcherKindFn,
    kind: SearcherKind,
    rabinkarp: rabinkarp::Finder,
}

impl Searcher {
    /// Creates a new "meta" substring searcher that attempts to choose the
    /// best algorithm based on the needle, heuristics and what the current
    /// target supports.
    #[inline]
    pub(crate) fn new<R: HeuristicFrequencyRank>(
        prefilter: PrefilterConfig,
        ranker: R,
        needle: &[u8],
    ) -> Searcher {
        let rabinkarp = rabinkarp::Finder::new(needle);
        if needle.len() <= 1 {
            return if needle.is_empty() {
                trace!("building empty substring searcher");
                Searcher {
                    call: searcher_kind_empty,
                    kind: SearcherKind { empty: () },
                    rabinkarp,
                }
            } else {
                trace!("building one-byte substring searcher");
                debug_assert_eq!(1, needle.len());
                Searcher {
                    call: searcher_kind_one_byte,
                    kind: SearcherKind { one_byte: needle[0] },
                    rabinkarp,
                }
            };
        }
        let pair = match Pair::with_ranker(needle, &ranker) {
            Some(pair) => pair,
            None => return Searcher::twoway(needle, rabinkarp, None),
        };
        debug_assert_ne!(
            pair.index1(),
            pair.index2(),
            "pair offsets should not be equivalent"
        );
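        // Exactly one of the cfg blocks below survives compilation for any
        // given target, so each block serves as the tail expression of this
        // function. (The final block covers targets with no vector support.)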
        #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
        {
            if let Some(pp) = avx2::Finder::with_pair(needle, pair) {
                if do_packed_search(needle) {
                    trace!("building x86_64 AVX2 substring searcher");
                    let kind = SearcherKind { avx2: pp };
                    Searcher { call: searcher_kind_avx2, kind, rabinkarp }
                } else if prefilter.is_none() {
                    Searcher::twoway(needle, rabinkarp, None)
                } else {
                    let prestrat = Prefilter::avx2(pp, needle);
                    Searcher::twoway(needle, rabinkarp, Some(prestrat))
                }
            } else if let Some(pp) = sse2::Finder::with_pair(needle, pair) {
                if do_packed_search(needle) {
                    trace!("building x86_64 SSE2 substring searcher");
                    let kind = SearcherKind { sse2: pp };
                    Searcher { call: searcher_kind_sse2, kind, rabinkarp }
                } else if prefilter.is_none() {
                    Searcher::twoway(needle, rabinkarp, None)
                } else {
                    let prestrat = Prefilter::sse2(pp, needle);
                    Searcher::twoway(needle, rabinkarp, Some(prestrat))
                }
            } else if prefilter.is_none() {
                Searcher::twoway(needle, rabinkarp, None)
            } else {
                // We're pretty unlikely to get to this point, but it is
                // possible to be running on x86_64 without SSE2. Namely, it's
                // really up to the OS whether it wants to support vector
                // registers or not.
                let prestrat = Prefilter::fallback(ranker, pair, needle);
                Searcher::twoway(needle, rabinkarp, prestrat)
            }
        }
        #[cfg(target_arch = "wasm32")]
        {
            if let Some(pp) = simd128::Finder::with_pair(needle, pair) {
                if do_packed_search(needle) {
                    trace!("building wasm32 simd128 substring searcher");
                    let kind = SearcherKind { simd128: pp };
                    Searcher { call: searcher_kind_simd128, kind, rabinkarp }
                } else if prefilter.is_none() {
                    Searcher::twoway(needle, rabinkarp, None)
                } else {
                    let prestrat = Prefilter::simd128(pp, needle);
                    Searcher::twoway(needle, rabinkarp, Some(prestrat))
                }
            } else if prefilter.is_none() {
                Searcher::twoway(needle, rabinkarp, None)
            } else {
                let prestrat = Prefilter::fallback(ranker, pair, needle);
                Searcher::twoway(needle, rabinkarp, prestrat)
            }
        }
        #[cfg(target_arch = "aarch64")]
        {
            if let Some(pp) = neon::Finder::with_pair(needle, pair) {
                if do_packed_search(needle) {
                    trace!("building aarch64 neon substring searcher");
                    let kind = SearcherKind { neon: pp };
                    Searcher { call: searcher_kind_neon, kind, rabinkarp }
                } else if prefilter.is_none() {
                    Searcher::twoway(needle, rabinkarp, None)
                } else {
                    let prestrat = Prefilter::neon(pp, needle);
                    Searcher::twoway(needle, rabinkarp, Some(prestrat))
                }
            } else if prefilter.is_none() {
                Searcher::twoway(needle, rabinkarp, None)
            } else {
                let prestrat = Prefilter::fallback(ranker, pair, needle);
                Searcher::twoway(needle, rabinkarp, prestrat)
            }
        }
        #[cfg(not(any(
            all(target_arch = "x86_64", target_feature = "sse2"),
            target_arch = "wasm32",
            target_arch = "aarch64"
        )))]
        {
            if prefilter.is_none() {
                Searcher::twoway(needle, rabinkarp, None)
            } else {
                let prestrat = Prefilter::fallback(ranker, pair, needle);
                Searcher::twoway(needle, rabinkarp, prestrat)
            }
        }
    }

    /// Creates a new searcher that always uses the Two-Way algorithm. This is
    /// typically used when vector algorithms are unavailable or inappropriate.
    /// (For example, when the needle is "too long.")
    ///
    /// If a prefilter is given, then the searcher returned will be accelerated
    /// by the prefilter.
    #[inline]
    fn twoway(
        needle: &[u8],
        rabinkarp: rabinkarp::Finder,
        prestrat: Option<Prefilter>,
    ) -> Searcher {
        let finder = twoway::Finder::new(needle);
        match prestrat {
            None => {
                trace!("building scalar two-way substring searcher");
                let kind = SearcherKind { two_way: finder };
                Searcher { call: searcher_kind_two_way, kind, rabinkarp }
            }
            Some(prestrat) => {
                trace!(
                    "building scalar two-way \
                     substring searcher with a prefilter"
                );
                let two_way_with_prefilter =
                    TwoWayWithPrefilter { finder, prestrat };
                let kind = SearcherKind { two_way_with_prefilter };
                Searcher {
                    call: searcher_kind_two_way_with_prefilter,
                    kind,
                    rabinkarp,
                }
            }
        }
    }

    /// Searches the given haystack for the given needle. The needle given
    /// should be the same as the needle that this finder was initialized
    /// with.
    ///
    /// Inlining this can lead to big wins for latency, and #[inline] doesn't
    /// seem to be enough in some cases.
    #[inline(always)]
    pub(crate) fn find(
        &self,
        prestate: &mut PrefilterState,
        haystack: &[u8],
        needle: &[u8],
    ) -> Option<usize> {
        if haystack.len() < needle.len() {
            None
        } else {
            // SAFETY: By construction, we've ensured that the function
            // in `self.call` is properly paired with the union used in
            // `self.kind`.
            unsafe { (self.call)(self, prestate, haystack, needle) }
        }
    }
}

impl core::fmt::Debug for Searcher {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_struct("Searcher")
            .field("call", &"<searcher function>")
            .field("kind", &"<searcher kind union>")
            .field("rabinkarp", &self.rabinkarp)
            .finish()
    }
}

/// A union indicating one of several possible substring search implementations
/// that are in active use.
///
/// This union should only be read by one of the functions prefixed with
/// `searcher_kind_`. Namely, the correct function is meant to be paired with
/// the union by the caller, such that the function always reads from the
/// designated union field.
#[derive(Clone, Copy)]
union SearcherKind {
    empty: (),
    one_byte: u8,
    two_way: twoway::Finder,
    two_way_with_prefilter: TwoWayWithPrefilter,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    sse2: crate::arch::x86_64::sse2::packedpair::Finder,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    avx2: crate::arch::x86_64::avx2::packedpair::Finder,
    #[cfg(target_arch = "wasm32")]
    simd128: crate::arch::wasm32::simd128::packedpair::Finder,
    #[cfg(target_arch = "aarch64")]
    neon: crate::arch::aarch64::neon::packedpair::Finder,
}

/// A two-way substring searcher with a prefilter.
#[derive(Copy, Clone, Debug)]
struct TwoWayWithPrefilter {
    finder: twoway::Finder,
    prestrat: Prefilter,
}

/// The type of a substring search function.
///
/// # Safety
///
/// When using a function of this type, callers must ensure that the correct
/// function is paired with the value populated in the `SearcherKind` union.
type SearcherKindFn = unsafe fn(
    searcher: &Searcher,
    prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize>;

/// Reads from the `empty` field of `SearcherKind` to handle the case of
/// searching for the empty needle. Works on all platforms.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.empty` union field is set.
unsafe fn searcher_kind_empty(
    _searcher: &Searcher,
    _prestate: &mut PrefilterState,
    _haystack: &[u8],
    _needle: &[u8],
) -> Option<usize> {
    Some(0)
}

/// Reads from the `one_byte` field of `SearcherKind` to handle the case of
/// searching for a single byte needle. Works on all platforms.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.one_byte` union field is set.
unsafe fn searcher_kind_one_byte(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    _needle: &[u8],
) -> Option<usize> {
    let needle = searcher.kind.one_byte;
    crate::memchr(needle, haystack)
}

/// Reads from the `two_way` field of `SearcherKind` to handle the case of
/// searching for an arbitrary needle without prefilter acceleration. Works on
/// all platforms.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.two_way` union field is set.
unsafe fn searcher_kind_two_way(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
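    // For very short haystacks, a simple Rabin-Karp scan tends to beat
    // Two-Way's startup overhead, which is why the type-level docs say
    // Rabin-Karp is typically used for "supremely short haystacks."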
    if rabinkarp::is_fast(haystack, needle) {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        searcher.kind.two_way.find(haystack, needle)
    }
}

/// Reads from the `two_way_with_prefilter` field of `SearcherKind` to handle
/// the case of searching for an arbitrary needle with prefilter acceleration.
/// Works on all platforms.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.two_way_with_prefilter` union
/// field is set.
unsafe fn searcher_kind_two_way_with_prefilter(
    searcher: &Searcher,
    prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    if rabinkarp::is_fast(haystack, needle) {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        let TwoWayWithPrefilter { ref finder, ref prestrat } =
            searcher.kind.two_way_with_prefilter;
        let pre = Pre { prestate, prestrat };
        finder.find_with_prefilter(Some(pre), haystack, needle)
    }
}

/// Reads from the `sse2` field of `SearcherKind` to execute the x86_64 SSE2
/// vectorized substring search implementation.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.sse2` union field is set.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn searcher_kind_sse2(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.sse2;
    if haystack.len() < finder.min_haystack_len() {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        finder.find(haystack, needle)
    }
}

/// Reads from the `avx2` field of `SearcherKind` to execute the x86_64 AVX2
/// vectorized substring search implementation.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.avx2` union field is set.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn searcher_kind_avx2(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.avx2;
    if haystack.len() < finder.min_haystack_len() {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        finder.find(haystack, needle)
    }
}

/// Reads from the `simd128` field of `SearcherKind` to execute the wasm32
/// simd128 vectorized substring search implementation.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.simd128` union field is set.
#[cfg(target_arch = "wasm32")]
unsafe fn searcher_kind_simd128(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.simd128;
    if haystack.len() < finder.min_haystack_len() {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        finder.find(haystack, needle)
    }
}

/// Reads from the `neon` field of `SearcherKind` to execute the aarch64 neon
/// vectorized substring search implementation.
///
/// # Safety
///
/// Callers must ensure that the `searcher.kind.neon` union field is set.
#[cfg(target_arch = "aarch64")]
unsafe fn searcher_kind_neon(
    searcher: &Searcher,
    _prestate: &mut PrefilterState,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    let finder = &searcher.kind.neon;
    if haystack.len() < finder.min_haystack_len() {
        searcher.rabinkarp.find(haystack, needle)
    } else {
        finder.find(haystack, needle)
    }
}

/// A reverse substring searcher.
#[derive(Clone, Debug)]
pub(crate) struct SearcherRev {
    kind: SearcherRevKind,
    rabinkarp: rabinkarp::FinderRev,
}

/// The kind of the reverse searcher.
///
/// For the reverse case, we don't do any SIMD acceleration or prefilters.
/// There is no specific technical reason for this; it's just not clear that
/// the extra code would be worth it. If you have a use case for it, please
/// file an issue.
///
/// We also don't use the union trick here as we do for the forward case and
/// prefilters, for basically the same reason we don't have prefilters or
/// vector algorithms for reverse searching: it's not clear it's worth doing.
/// Please file an issue if you have a compelling use case for fast reverse
/// substring search.
#[derive(Clone, Debug)]
enum SearcherRevKind {
    Empty,
    OneByte { needle: u8 },
    TwoWay { finder: twoway::FinderRev },
}

impl SearcherRev {
    /// Creates a new searcher for finding occurrences of the given needle in
    /// reverse. That is, it reports the last (instead of the first) occurrence
    /// of a needle in a haystack.
    #[inline]
    pub(crate) fn new(needle: &[u8]) -> SearcherRev {
        let kind = if needle.len() <= 1 {
            if needle.is_empty() {
                trace!("building empty reverse substring searcher");
                SearcherRevKind::Empty
            } else {
                trace!("building one-byte reverse substring searcher");
                debug_assert_eq!(1, needle.len());
                SearcherRevKind::OneByte { needle: needle[0] }
            }
        } else {
            trace!("building scalar two-way reverse substring searcher");
            let finder = twoway::FinderRev::new(needle);
            SearcherRevKind::TwoWay { finder }
        };
        let rabinkarp = rabinkarp::FinderRev::new(needle);
        SearcherRev { kind, rabinkarp }
    }

    /// Searches the given haystack for the last occurrence of the given
    /// needle. The needle given should be the same as the needle that this
    /// finder was initialized with.
    #[inline]
    pub(crate) fn rfind(
        &self,
        haystack: &[u8],
        needle: &[u8],
    ) -> Option<usize> {
        if haystack.len() < needle.len() {
            return None;
        }
        match self.kind {
            SearcherRevKind::Empty => Some(haystack.len()),
            SearcherRevKind::OneByte { needle } => {
                crate::memrchr(needle, haystack)
            }
            SearcherRevKind::TwoWay { ref finder } => {
                if rabinkarp::is_fast(haystack, needle) {
                    self.rabinkarp.rfind(haystack, needle)
                } else {
                    finder.rfind(haystack, needle)
                }
            }
        }
    }
}

/// Prefilter controls whether heuristics are used to accelerate searching.
///
/// A prefilter refers to the idea of detecting candidate matches very quickly,
/// and then confirming whether those candidates are full matches. This
/// idea can be quite effective since it's often the case that looking for
/// candidates can be a lot faster than running a complete substring search
/// over the entire input. Namely, looking for candidates can be done with
/// extremely fast vectorized code.
///
/// The downside of a prefilter is that it assumes false positives (which are
/// candidates generated by a prefilter that aren't matches) are somewhat rare
/// relative to the frequency of full matches. That is, if a lot of false
/// positives are generated, then it's possible for search time to be worse
/// than if the prefilter wasn't enabled in the first place.
///
/// Another downside of a prefilter is that it can result in highly variable
/// performance, where some cases are extraordinarily fast and others aren't.
/// Typically, variable performance isn't a problem, but it may be for your use
/// case.
///
/// This implementation does use a heuristic to detect when a prefilter might
/// not be carrying its weight, and will dynamically disable its use.
/// Nevertheless, this configuration option gives callers the ability to
/// disable prefilters entirely if they know prefilters won't be useful.
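///
/// As a sketch of typical usage, assuming this type is re-exported publicly
/// as `memchr::memmem::Prefilter` alongside `memmem::FinderBuilder` (as in
/// the memchr crate's public API):
///
/// ```ignore
/// use memchr::memmem::{FinderBuilder, Prefilter};
///
/// // Build a forward finder with prefilters disabled entirely.
/// let finder = FinderBuilder::new()
///     .prefilter(Prefilter::None)
///     .build_forward(b"needle");
/// assert_eq!(Some(20), finder.find(b"a haystack with the needle in it"));
/// ```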
#[derive(Clone, Copy, Debug)]
#[non_exhaustive]
pub enum PrefilterConfig {
    /// Never use a prefilter in substring search.
    None,
    /// Automatically detect whether a heuristic prefilter should be used. If
    /// it is used, then heuristics will be used to dynamically disable the
    /// prefilter if it is believed to not be carrying its weight.
    Auto,
}

impl Default for PrefilterConfig {
    fn default() -> PrefilterConfig {
        PrefilterConfig::Auto
    }
}

impl PrefilterConfig {
    /// Returns true when this prefilter is set to the `None` variant.
    fn is_none(&self) -> bool {
        matches!(*self, PrefilterConfig::None)
    }
}

/// The implementation of a prefilter.
///
/// This type encapsulates dispatch to one of several possible choices for a
/// prefilter. Generally speaking, all prefilters have the same approximate
/// algorithm: they choose a couple of bytes from the needle that are believed
/// to be rare, use a fast vector algorithm to look for those bytes and return
/// positions as candidates for some substring search algorithm (currently only
/// Two-Way) to confirm as a match or not.
///
/// The differences between the algorithms are actually at the vector
/// implementation level. Namely, we need different routines based on both
/// which target architecture we're on and what CPU features are supported.
///
/// The straightforwardly obvious approach here is to use an enum, and make
/// `Prefilter::find` do case analysis to determine which algorithm was
/// selected and invoke it. However, I've observed that this leads to poor
/// codegen in some cases, especially in latency sensitive benchmarks. That is,
/// this approach comes with overhead that I wasn't able to eliminate.
///
/// The second obvious approach is to use dynamic dispatch with traits. Doing
/// that in this context where `Prefilter` owns the selection generally
/// requires heap allocation, and this code is designed to run in core-only
/// environments.
///
/// So we settle on using a union (that's `PrefilterKind`) and a function
/// pointer (that's `PrefilterKindFn`). We select the right function pointer
/// based on which field in the union we set, and that function in turn
/// knows which field of the union to access. The downside of this approach
/// is that it forces us to think about safety, but the upside is that
/// there are some nice latency improvements to benchmarks. (Especially the
/// `memmem/sliceslice/short` benchmark.)
///
/// In cases where we've selected a vector algorithm and the haystack given
/// is too short, we fall back to the scalar version of `memchr` on the
/// `rarest_byte`. (The scalar version of `memchr` is still better than a naive
/// byte-at-a-time loop because it will read in `usize`-sized chunks at a
/// time.)
#[derive(Clone, Copy)]
struct Prefilter {
    call: PrefilterKindFn,
    kind: PrefilterKind,
    rarest_byte: u8,
    rarest_offset: u8,
}

impl Prefilter {
    /// Return a "fallback" prefilter, but only if it is believed to be
    /// effective.
    #[inline]
    fn fallback<R: HeuristicFrequencyRank>(
        ranker: R,
        pair: Pair,
        needle: &[u8],
    ) -> Option<Prefilter> {
        /// The maximum frequency rank permitted for the fallback prefilter.
        /// If the rarest byte in the needle has a frequency rank above this
        /// value, then no prefilter is used if the fallback prefilter would
        /// otherwise be selected.
        const MAX_FALLBACK_RANK: u8 = 250;
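        // (As used here, a frequency rank runs from 0 for the rarest bytes
        // up to 255 for the most common. A rank above 250 means even the
        // needle's "rarest" byte is so common that the prefilter would
        // likely drown in false candidates.)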

        trace!("building fallback prefilter");
        let rarest_offset = pair.index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        let rarest_rank = ranker.rank(rarest_byte);
        if rarest_rank > MAX_FALLBACK_RANK {
            None
        } else {
            let finder = crate::arch::all::packedpair::Finder::with_pair(
                needle,
                pair.clone(),
            )?;
            let call = prefilter_kind_fallback;
            let kind = PrefilterKind { fallback: finder };
            Some(Prefilter { call, kind, rarest_byte, rarest_offset })
        }
    }

    /// Return a prefilter using an x86_64 SSE2 vector algorithm.
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    #[inline]
    fn sse2(finder: sse2::Finder, needle: &[u8]) -> Prefilter {
        trace!("building x86_64 SSE2 prefilter");
        let rarest_offset = finder.pair().index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        Prefilter {
            call: prefilter_kind_sse2,
            kind: PrefilterKind { sse2: finder },
            rarest_byte,
            rarest_offset,
        }
    }

    /// Return a prefilter using an x86_64 AVX2 vector algorithm.
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    #[inline]
    fn avx2(finder: avx2::Finder, needle: &[u8]) -> Prefilter {
        trace!("building x86_64 AVX2 prefilter");
        let rarest_offset = finder.pair().index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        Prefilter {
            call: prefilter_kind_avx2,
            kind: PrefilterKind { avx2: finder },
            rarest_byte,
            rarest_offset,
        }
    }

    /// Return a prefilter using a wasm32 simd128 vector algorithm.
    #[cfg(target_arch = "wasm32")]
    #[inline]
    fn simd128(finder: simd128::Finder, needle: &[u8]) -> Prefilter {
        trace!("building wasm32 simd128 prefilter");
        let rarest_offset = finder.pair().index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        Prefilter {
            call: prefilter_kind_simd128,
            kind: PrefilterKind { simd128: finder },
            rarest_byte,
            rarest_offset,
        }
    }

    /// Return a prefilter using an aarch64 neon vector algorithm.
    #[cfg(target_arch = "aarch64")]
    #[inline]
    fn neon(finder: neon::Finder, needle: &[u8]) -> Prefilter {
        trace!("building aarch64 neon prefilter");
        let rarest_offset = finder.pair().index1();
        let rarest_byte = needle[usize::from(rarest_offset)];
        Prefilter {
            call: prefilter_kind_neon,
            kind: PrefilterKind { neon: finder },
            rarest_byte,
            rarest_offset,
        }
    }

    /// Return a *candidate* position for a match.
    ///
    /// When this returns an offset, it implies that a match could begin at
    /// that offset, but it may not. That is, it is possible for a false
    /// positive to be returned.
    ///
    /// When `None` is returned, then it is guaranteed that there are no
    /// matches for the needle in the given haystack. That is, it is impossible
    /// for a false negative to be returned.
    ///
    /// The purpose of this routine is to look for candidate matching positions
    /// as quickly as possible before running a (likely) slower confirmation
    /// step.
    #[inline]
    fn find(&self, haystack: &[u8]) -> Option<usize> {
        // SAFETY: By construction, we've ensured that the function in
        // `self.call` is properly paired with the union used in `self.kind`.
        unsafe { (self.call)(self, haystack) }
    }

    /// A "simple" prefilter that just looks for the occurrence of the rarest
    /// byte from the needle. This is generally only used for very small
    /// haystacks.
    #[inline]
    fn find_simple(&self, haystack: &[u8]) -> Option<usize> {
        // We don't use crate::memchr here because the haystack should be small
        // enough that memchr won't be able to use vector routines anyway. So
        // we just skip straight to the fallback implementation which is likely
        // faster. (A byte-at-a-time loop is only used when the haystack is
        // smaller than `size_of::<usize>()`.)
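        //
        // Offset adjustment, with hypothetical numbers: if the rarest needle
        // byte sits at needle offset 3 and is found at haystack offset 10,
        // then the earliest position a full match could start is 10 - 3 = 7.
        // `saturating_sub` clamps to 0 when the byte is found within the
        // first `rarest_offset` bytes of the haystack.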
        crate::arch::all::memchr::One::new(self.rarest_byte)
            .find(haystack)
            .map(|i| i.saturating_sub(usize::from(self.rarest_offset)))
    }
}

impl core::fmt::Debug for Prefilter {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_struct("Prefilter")
            .field("call", &"<prefilter function>")
            .field("kind", &"<prefilter kind union>")
            .field("rarest_byte", &self.rarest_byte)
            .field("rarest_offset", &self.rarest_offset)
            .finish()
    }
}

/// A union indicating one of several possible prefilters that are in active
/// use.
///
/// This union should only be read by one of the functions prefixed with
/// `prefilter_kind_`. Namely, the correct function is meant to be paired with
/// the union by the caller, such that the function always reads from the
/// designated union field.
#[derive(Clone, Copy)]
union PrefilterKind {
    fallback: crate::arch::all::packedpair::Finder,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    sse2: crate::arch::x86_64::sse2::packedpair::Finder,
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    avx2: crate::arch::x86_64::avx2::packedpair::Finder,
    #[cfg(target_arch = "wasm32")]
    simd128: crate::arch::wasm32::simd128::packedpair::Finder,
    #[cfg(target_arch = "aarch64")]
    neon: crate::arch::aarch64::neon::packedpair::Finder,
}

/// The type of a prefilter function.
///
/// # Safety
///
/// When using a function of this type, callers must ensure that the correct
/// function is paired with the value populated in the `PrefilterKind` union.
type PrefilterKindFn =
    unsafe fn(strat: &Prefilter, haystack: &[u8]) -> Option<usize>;

/// Reads from the `fallback` field of `PrefilterKind` to execute the fallback
/// prefilter. Works on all platforms.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.fallback` union field is set.
unsafe fn prefilter_kind_fallback(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    strat.kind.fallback.find_prefilter(haystack)
}

/// Reads from the `sse2` field of `PrefilterKind` to execute the x86_64 SSE2
/// prefilter.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.sse2` union field is set.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn prefilter_kind_sse2(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.sse2;
    if haystack.len() < finder.min_haystack_len() {
        strat.find_simple(haystack)
    } else {
        finder.find_prefilter(haystack)
    }
}

/// Reads from the `avx2` field of `PrefilterKind` to execute the x86_64 AVX2
/// prefilter.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.avx2` union field is set.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn prefilter_kind_avx2(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.avx2;
    if haystack.len() < finder.min_haystack_len() {
        strat.find_simple(haystack)
    } else {
        finder.find_prefilter(haystack)
    }
}

/// Reads from the `simd128` field of `PrefilterKind` to execute the wasm32
/// simd128 prefilter.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.simd128` union field is set.
#[cfg(target_arch = "wasm32")]
unsafe fn prefilter_kind_simd128(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.simd128;
    if haystack.len() < finder.min_haystack_len() {
        strat.find_simple(haystack)
    } else {
        finder.find_prefilter(haystack)
    }
}

/// Reads from the `neon` field of `PrefilterKind` to execute the aarch64 neon
/// prefilter.
///
/// # Safety
///
/// Callers must ensure that the `strat.kind.neon` union field is set.
#[cfg(target_arch = "aarch64")]
unsafe fn prefilter_kind_neon(
    strat: &Prefilter,
    haystack: &[u8],
) -> Option<usize> {
    let finder = &strat.kind.neon;
    if haystack.len() < finder.min_haystack_len() {
        strat.find_simple(haystack)
    } else {
        finder.find_prefilter(haystack)
    }
}

/// PrefilterState tracks state associated with the effectiveness of a
/// prefilter. It is used to track how many bytes, on average, are skipped by
/// the prefilter. If this average dips below a certain threshold over time,
/// then the state renders the prefilter inert and stops using it.
///
/// A prefilter state should be created for each search. (Where creating an
/// iterator is treated as a single search.) Once a state becomes inert, it
/// never reports its prefilter as effective again.
#[derive(Clone, Copy, Debug)]
pub(crate) struct PrefilterState {
    /// The number of skips that have been executed. This is always 1 greater
    /// than the actual number of skips. The special sentinel value of 0
    /// indicates that the prefilter is inert. This is useful to avoid
    /// additional checks to determine whether the prefilter is still
    /// "effective." Once a prefilter becomes inert, it should no longer be
    /// used (according to our heuristics).
    skips: u32,
    /// The total number of bytes that have been skipped.
    skipped: u32,
}

impl PrefilterState {
    /// The minimum number of skip attempts to try before considering whether
    /// a prefilter is effective or not.
    const MIN_SKIPS: u32 = 50;

    /// The minimum number of bytes that skipping must average.
    ///
    /// This value was chosen based on varying it and checking
    /// the microbenchmarks. In particular, this can impact the
    /// pathological/repeated-{huge,small} benchmarks quite a bit if it's set
    /// too low.
    const MIN_SKIP_BYTES: u32 = 8;

    /// Create a fresh prefilter state.
    #[inline]
    pub(crate) fn new() -> PrefilterState {
        PrefilterState { skips: 1, skipped: 0 }
    }

    /// Update this state with the number of bytes skipped on the last
    /// invocation of the prefilter.
    #[inline]
    fn update(&mut self, skipped: usize) {
        self.skips = self.skips.saturating_add(1);
        // We need to do this dance since it's technically possible for
        // `skipped` to overflow a `u32`. (And we use a `u32` to reduce the
        // size of a prefilter state.)
        self.skipped = match u32::try_from(skipped) {
            Err(_) => u32::MAX,
            Ok(skipped) => self.skipped.saturating_add(skipped),
        };
    }

    /// Return true if and only if this state indicates that a prefilter is
    /// still effective.
    #[inline]
    fn is_effective(&mut self) -> bool {
        if self.is_inert() {
            return false;
        }
        if self.skips() < PrefilterState::MIN_SKIPS {
            return true;
        }
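        // Worked example of the check below: once at least MIN_SKIPS skips
        // have been recorded, the prefilter must have skipped an average of
        // MIN_SKIP_BYTES per use to stay active. So after 50 skips, fewer
        // than 50 * 8 = 400 total bytes skipped renders it inert.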
        if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips() {
            return true;
        }

        // We're inert.
        self.skips = 0;
        false
    }

    /// Returns true if the prefilter this state represents should no longer
    /// be used.
    #[inline]
    fn is_inert(&self) -> bool {
        self.skips == 0
    }

    /// Returns the total number of times the prefilter has been used.
    #[inline]
    fn skips(&self) -> u32 {
        // Remember, `0` is a sentinel value indicating inertness, so we
        // always need to subtract `1` to get our actual number of skips.
        self.skips.saturating_sub(1)
    }
}

/// A combination of prefilter effectiveness state and the prefilter itself.
#[derive(Debug)]
pub(crate) struct Pre<'a> {
    /// State that tracks the effectiveness of a prefilter.
    prestate: &'a mut PrefilterState,
    /// The actual prefilter.
    prestrat: &'a Prefilter,
}

impl<'a> Pre<'a> {
    /// Run this prefilter on the given haystack and update the prefilter
    /// effectiveness state with the result.
    #[inline]
    pub(crate) fn find(&mut self, haystack: &[u8]) -> Option<usize> {
        let result = self.prestrat.find(haystack);
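        // When there is no candidate, the prefilter effectively skipped the
        // entire haystack, so credit it with `haystack.len()` bytes.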
        self.prestate.update(result.unwrap_or(haystack.len()));
        result
    }

    /// Return true if and only if this prefilter should be used.
    #[inline]
    pub(crate) fn is_effective(&mut self) -> bool {
        self.prestate.is_effective()
    }
}

/// Returns true if the needle has the right characteristics for a vector
/// algorithm to handle the entirety of substring search.
///
/// Vector algorithms can be used for prefilters for other substring search
/// algorithms (like Two-Way), but they can also be used for substring search
/// on their own. When used for substring search, vector algorithms will
/// quickly identify candidate match positions (just like in the prefilter
/// case), but instead of returning the candidate position they will try to
/// confirm the match themselves. Confirmation happens via `memcmp`. This
/// works well for short needles, but can break down when many false candidate
/// positions are generated for large needles. Thus, we only permit vector
/// algorithms to own substring search when the needle is of a certain length.
#[inline]
fn do_packed_search(needle: &[u8]) -> bool {
    /// The minimum length of a needle required for this algorithm. The minimum
    /// is 2 since a length of 1 should just use memchr and a length of 0 isn't
    /// a case handled by this searcher.
    const MIN_LEN: usize = 2;

    /// The maximum length of a needle permitted for this algorithm.
    ///
    /// In reality, there is no hard max here. The code below can handle any
    /// length needle. (Perhaps that suggests there are missing optimizations.)
    /// Instead, this is a heuristic and a bound guaranteeing our linear time
    /// complexity.
    ///
    /// It is a heuristic because when a candidate match is found, memcmp is
    /// run. For very large needles with lots of false positives, memcmp can
    /// make the code run quite slow.
    ///
    /// It is a bound because the worst case behavior with memcmp is
    /// multiplicative in the size of the needle and haystack, and we want
    /// to keep that additive. This bound ensures we still meet that bound
    /// theoretically, since it's just a constant. We aren't acting in bad
    /// faith here; memcmp on tiny needles is so fast that even in pathological
    /// cases (see pathological vector benchmarks), this is still just as fast
    /// or faster in practice.
    ///
    /// This specific number was chosen by tweaking a bit and running
    /// benchmarks. The rare-medium-needle benchmark, for example, gets about
    /// 5% faster by using this algorithm instead of a prefilter-accelerated
    /// Two-Way. There's also a theoretical desire to keep this number
    /// reasonably low, to mitigate the impact of pathological cases. I did
    /// try 64, and some benchmarks got a little better, and others
    /// (particularly the pathological ones) got a lot worse. So... 32 it is?
    const MAX_LEN: usize = 32;
    MIN_LEN <= needle.len() && needle.len() <= MAX_LEN
}
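
// A minimal sanity check of the needle-length heuristic above. This is only
// a sketch exercising the MIN_LEN/MAX_LEN bounds that `do_packed_search`
// documents; it is not an exhaustive test of the searcher.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn packed_search_needle_length_bounds() {
        // Too short: empty and single-byte needles are handled elsewhere.
        assert!(!do_packed_search(b""));
        assert!(!do_packed_search(b"a"));
        // In range: lengths 2 through 32, inclusive.
        assert!(do_packed_search(b"ab"));
        assert!(do_packed_search(&[b'a'; 32]));
        // Too long: vector algorithms may still prefilter, but they don't
        // own the whole search.
        assert!(!do_packed_search(&[b'a'; 33]));
    }
}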