// These routines are meant to be optimized specifically for low latency as
// compared to the equivalent routines offered by std. (Which may invoke the
// dynamic linker and call out to libc, which introduces a bit more latency
// than we'd like.)

/// Returns true if and only if needle is a prefix of haystack.
#[inline(always)]
pub(crate) fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len() && memcmp(&haystack[..needle.len()], needle)
}

/// Returns true if and only if needle is a suffix of haystack.
#[inline(always)]
pub(crate) fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len()
        && memcmp(&haystack[haystack.len() - needle.len()..], needle)
}
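
// The checks below are a minimal illustrative sketch (not part of the
// original module) showing the intended semantics of `is_prefix` and
// `is_suffix`, including the empty-needle and needle-longer-than-haystack
// edge cases. The module name `prefix_suffix_tests` is made up here.
#[cfg(test)]
mod prefix_suffix_tests {
    use super::{is_prefix, is_suffix};

    #[test]
    fn prefix_and_suffix_basics() {
        assert!(is_prefix(b"haystack", b"hay"));
        assert!(!is_prefix(b"haystack", b"stack"));
        assert!(is_suffix(b"haystack", b"stack"));
        assert!(!is_suffix(b"haystack", b"hay"));
        // An empty needle is a prefix and a suffix of anything.
        assert!(is_prefix(b"haystack", b""));
        assert!(is_suffix(b"", b""));
        // A needle longer than the haystack can never match.
        assert!(!is_prefix(b"hay", b"haystack"));
        assert!(!is_suffix(b"hay", b"haystack"));
    }
}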

/// Return true if and only if x.len() == y.len() && x[i] == y[i] for all
/// 0 <= i < x.len().
///
/// Why not just use actual memcmp for this? Well, memcmp requires calling out
/// to libc, and this routine is called in fairly hot code paths. Other than
/// just calling out to libc, it also seems to result in worse codegen. By
/// rolling our own memcmp in pure Rust, it appears more friendly to the
/// optimizer.
///
/// We mark this as inline always, although some callers may not want it
/// inlined for better codegen (like Rabin-Karp). In that case, callers are
/// advised to create a non-inlineable wrapper routine that calls memcmp.
/// (A sketch of such a wrapper follows this function.)
#[inline(always)]
pub(crate) fn memcmp(x: &[u8], y: &[u8]) -> bool {
    if x.len() != y.len() {
        return false;
    }
    // If we don't have enough bytes to do 4-byte at a time loads, then
    // fall back to the naive slow version.
    //
    // TODO: We could do a copy_nonoverlapping combined with a mask instead
    // of a loop. Benchmark it. (A possible shape is sketched after this
    // function.)
    if x.len() < 4 {
        for (&b1, &b2) in x.iter().zip(y) {
            if b1 != b2 {
                return false;
            }
        }
        return true;
    }
    // When we have 4 or more bytes to compare, then proceed in chunks of 4 at
    // a time using unaligned loads.
    //
    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
    // that this particular version of memcmp is likely to be called with tiny
    // needles. That means that if we do 8 byte loads, then a higher proportion
    // of memcmp calls will use the slower variant above. With that said, this
    // is a hypothesis and is only loosely supported by benchmarks. There's
    // likely some improvement that could be made here. The main thing here
    // though is to optimize for latency, not throughput.

    // SAFETY: Via the conditional above, we know that both `x` and `y`
    // have the same length, so `px < pxend` implies that `py < pyend`.
    // Thus, dereferencing both `px` and `py` in the loop below is safe.
    //
    // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual
    // end of `px` and `py`. Thus, the final dereference outside of the
    // loop is guaranteed to be valid. (The final comparison will overlap with
    // the last comparison done in the loop for lengths that aren't multiples
    // of four.)
    //
    // Finally, we needn't worry about alignment here, since we do unaligned
    // loads.
    unsafe {
        let (mut px, mut py) = (x.as_ptr(), y.as_ptr());
        let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4));
        while px < pxend {
            let vx = (px as *const u32).read_unaligned();
            let vy = (py as *const u32).read_unaligned();
            if vx != vy {
                return false;
            }
            px = px.add(4);
            py = py.add(4);
        }
        let vx = (pxend as *const u32).read_unaligned();
        let vy = (pyend as *const u32).read_unaligned();
        vx == vy
    }
}
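
// As suggested in the docs for `memcmp` above, a caller that prefers not to
// inline the comparison (e.g. a Rabin-Karp search) could route through a
// non-inlineable wrapper. This is only a minimal sketch; the name
// `memcmp_noinline` is hypothetical and not part of the original module.
#[allow(dead_code)]
#[inline(never)]
pub(crate) fn memcmp_noinline(x: &[u8], y: &[u8]) -> bool {
    memcmp(x, y)
}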
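
// A possible shape for the TODO in `memcmp` above: for inputs shorter than
// four bytes, copy both slices into zero-padded 4-byte buffers and compare
// them as `u32`s, avoiding the per-byte branchy loop. This is an untested,
// hypothetical sketch (the name `memcmp_small` is made up); it uses the safe
// `copy_from_slice` plus zero padding rather than a raw `copy_nonoverlapping`
// plus mask, and would need benchmarking before replacing the loop.
#[allow(dead_code)]
#[inline(always)]
fn memcmp_small(x: &[u8], y: &[u8]) -> bool {
    debug_assert!(x.len() == y.len() && x.len() < 4);
    let (mut bx, mut by) = ([0u8; 4], [0u8; 4]);
    bx[..x.len()].copy_from_slice(x);
    by[..y.len()].copy_from_slice(y);
    // Since the lengths are equal, the zero padding is identical on both
    // sides, so this is equivalent to a byte-wise comparison.
    u32::from_ne_bytes(bx) == u32::from_ne_bytes(by)
}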
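
// A small, illustrative test module (not in the original file) exercising
// `memcmp`, in particular lengths that are not multiples of four, where the
// final unaligned load overlaps with the last load done in the loop. The
// module name `memcmp_tests` is made up here.
#[cfg(test)]
mod memcmp_tests {
    use super::memcmp;

    #[test]
    fn equal_and_unequal() {
        // Short inputs take the byte-at-a-time path.
        assert!(memcmp(b"abc", b"abc"));
        assert!(!memcmp(b"abc", b"abd"));
        // Different lengths are never equal.
        assert!(!memcmp(b"abc", b"abcd"));
        // Lengths that aren't a multiple of four exercise the overlapping
        // final 4-byte load.
        assert!(memcmp(b"abcdefg", b"abcdefg"));
        assert!(!memcmp(b"abcdefg", b"abcdefz"));
        // A difference only in the overlapping tail must still be caught.
        assert!(!memcmp(b"abcdef", b"abcdeX"));
    }
}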