1 use std::iter::repeat;
2
3 /// Create a sequence of tests that should be run by memchr implementations.
memchr_tests() -> Vec<MemchrTest>4 pub fn memchr_tests() -> Vec<MemchrTest> {
5 let mut tests = Vec::new();
6 for statict in MEMCHR_TESTS {
7 assert!(!statict.corpus.contains("%"), "% is not allowed in corpora");
8 assert!(!statict.corpus.contains("#"), "# is not allowed in corpora");
9 assert!(!statict.needles.contains(&b'%'), "% is an invalid needle");
10 assert!(!statict.needles.contains(&b'#'), "# is an invalid needle");
11
12 let t = MemchrTest {
13 corpus: statict.corpus.to_string(),
14 needles: statict.needles.to_vec(),
15 positions: statict.positions.to_vec(),
16 };
17 tests.push(t.clone());
18 tests.extend(t.expand());
19 }
20 tests
21 }
22
23 /// A set of tests for memchr-like functions.
24 ///
25 /// These tests mostly try to cover the short string cases. We cover the longer
26 /// string cases via the benchmarks (which are tests themselves), via
27 /// quickcheck tests and via automatic expansion of each test case (by
28 /// increasing the corpus size). Finally, we cover different alignment cases
29 /// in the tests by varying the starting point of the slice.
30 const MEMCHR_TESTS: &[MemchrTestStatic] = &[
31 // one needle (applied to memchr + memchr2 + memchr3)
32 MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0] },
33 MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1] },
34 MemchrTestStatic {
35 corpus: "aaa",
36 needles: &[b'a'],
37 positions: &[0, 1, 2],
38 },
39 MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[] },
40 MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[] },
41 MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[] },
42 MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2] },
43 MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3] },
44 MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3] },
45 MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1] },
46 MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0] },
47 MemchrTestStatic {
48 corpus: "\x00\x00",
49 needles: &[b'\x00'],
50 positions: &[0, 1],
51 },
52 MemchrTestStatic {
53 corpus: "\x00a\x00",
54 needles: &[b'\x00'],
55 positions: &[0, 2],
56 },
57 MemchrTestStatic {
58 corpus: "zzzzzzzzzzzzzzzza",
59 needles: &[b'a'],
60 positions: &[16],
61 },
62 MemchrTestStatic {
63 corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza",
64 needles: &[b'a'],
65 positions: &[32],
66 },
67 // two needles (applied to memchr2 + memchr3)
68 MemchrTestStatic {
69 corpus: "az",
70 needles: &[b'a', b'z'],
71 positions: &[0, 1],
72 },
73 MemchrTestStatic {
74 corpus: "az",
75 needles: &[b'a', b'z'],
76 positions: &[0, 1],
77 },
78 MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[] },
79 MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0] },
80 MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1] },
81 MemchrTestStatic {
82 corpus: "yyyyaz",
83 needles: &[b'a', b'z'],
84 positions: &[4, 5],
85 },
86 MemchrTestStatic {
87 corpus: "yyyyaz",
88 needles: &[b'z', b'a'],
89 positions: &[4, 5],
90 },
91 // three needles (applied to memchr3)
92 MemchrTestStatic {
93 corpus: "xyz",
94 needles: &[b'x', b'y', b'z'],
95 positions: &[0, 1, 2],
96 },
97 MemchrTestStatic {
98 corpus: "zxy",
99 needles: &[b'x', b'y', b'z'],
100 positions: &[0, 1, 2],
101 },
102 MemchrTestStatic {
103 corpus: "zxy",
104 needles: &[b'x', b'a', b'z'],
105 positions: &[0, 1],
106 },
107 MemchrTestStatic {
108 corpus: "zxy",
109 needles: &[b't', b'a', b'z'],
110 positions: &[0],
111 },
112 MemchrTestStatic {
113 corpus: "yxz",
114 needles: &[b't', b'a', b'z'],
115 positions: &[2],
116 },
117 ];
118
119 /// A description of a test on a memchr like function.
120 #[derive(Clone, Debug)]
121 pub struct MemchrTest {
122 /// The thing to search. We use `&str` instead of `&[u8]` because they
123 /// are nicer to write in tests, and we don't miss much since memchr
124 /// doesn't care about UTF-8.
125 ///
126 /// Corpora cannot contain either '%' or '#'. We use these bytes when
127 /// expanding test cases into many test cases, and we assume they are not
128 /// used. If they are used, `memchr_tests` will panic.
129 corpus: String,
130 /// The needles to search for. This is intended to be an "alternation" of
131 /// needles. The number of needles may cause this test to be skipped for
132 /// some memchr variants. For example, a test with 2 needles cannot be used
133 /// to test `memchr`, but can be used to test `memchr2` and `memchr3`.
134 /// However, a test with only 1 needle can be used to test all of `memchr`,
135 /// `memchr2` and `memchr3`. We achieve this by filling in the needles with
136 /// bytes that we never used in the corpus (such as '#').
137 needles: Vec<u8>,
138 /// The positions expected to match for all of the needles.
139 positions: Vec<usize>,
140 }
141
142 /// Like MemchrTest, but easier to define as a constant.
143 #[derive(Clone, Debug)]
144 pub struct MemchrTestStatic {
145 corpus: &'static str,
146 needles: &'static [u8],
147 positions: &'static [usize],
148 }
149
150 impl MemchrTest {
one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F)151 pub fn one<F: Fn(u8, &[u8]) -> Option<usize>>(&self, reverse: bool, f: F) {
152 let needles = match self.needles(1) {
153 None => return,
154 Some(needles) => needles,
155 };
156 // We test different alignments here. Since some implementations use
157 // AVX2, which can read 32 bytes at a time, we test at least that.
158 // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128
159 // (avx) bytes at a time, so we include that in our offsets as well.
160 //
161 // You might think this would cause most needles to not be found, but
162 // we actually expand our tests to include corpus sizes all the way up
163 // to >500 bytes, so we should exercise most branches.
164 for align in 0..130 {
165 let corpus = self.corpus(align);
166 assert_eq!(
167 self.positions(align, reverse).get(0).cloned(),
168 f(needles[0], corpus.as_bytes()),
169 "search for {:?} failed in: {:?} (len: {}, alignment: {})",
170 needles[0] as char,
171 corpus,
172 corpus.len(),
173 align
174 );
175 }
176 }
177
two<F: Fn(u8, u8, &[u8]) -> Option<usize>>( &self, reverse: bool, f: F, )178 pub fn two<F: Fn(u8, u8, &[u8]) -> Option<usize>>(
179 &self,
180 reverse: bool,
181 f: F,
182 ) {
183 let needles = match self.needles(2) {
184 None => return,
185 Some(needles) => needles,
186 };
187 for align in 0..130 {
188 let corpus = self.corpus(align);
189 assert_eq!(
190 self.positions(align, reverse).get(0).cloned(),
191 f(needles[0], needles[1], corpus.as_bytes()),
192 "search for {:?}|{:?} failed in: {:?} \
193 (len: {}, alignment: {})",
194 needles[0] as char,
195 needles[1] as char,
196 corpus,
197 corpus.len(),
198 align
199 );
200 }
201 }
202
three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>( &self, reverse: bool, f: F, )203 pub fn three<F: Fn(u8, u8, u8, &[u8]) -> Option<usize>>(
204 &self,
205 reverse: bool,
206 f: F,
207 ) {
208 let needles = match self.needles(3) {
209 None => return,
210 Some(needles) => needles,
211 };
212 for align in 0..130 {
213 let corpus = self.corpus(align);
214 assert_eq!(
215 self.positions(align, reverse).get(0).cloned(),
216 f(needles[0], needles[1], needles[2], corpus.as_bytes()),
217 "search for {:?}|{:?}|{:?} failed in: {:?} \
218 (len: {}, alignment: {})",
219 needles[0] as char,
220 needles[1] as char,
221 needles[2] as char,
222 corpus,
223 corpus.len(),
224 align
225 );
226 }
227 }
228
iter_one<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, &'a [u8]) -> I, I: Iterator<Item = usize>,229 pub fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F)
230 where
231 F: FnOnce(u8, &'a [u8]) -> I,
232 I: Iterator<Item = usize>,
233 {
234 if let Some(ns) = self.needles(1) {
235 self.iter(reverse, f(ns[0], self.corpus.as_bytes()));
236 }
237 }
238
iter_two<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, &'a [u8]) -> I, I: Iterator<Item = usize>,239 pub fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F)
240 where
241 F: FnOnce(u8, u8, &'a [u8]) -> I,
242 I: Iterator<Item = usize>,
243 {
244 if let Some(ns) = self.needles(2) {
245 self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes()));
246 }
247 }
248
iter_three<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, u8, &'a [u8]) -> I, I: Iterator<Item = usize>,249 pub fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F)
250 where
251 F: FnOnce(u8, u8, u8, &'a [u8]) -> I,
252 I: Iterator<Item = usize>,
253 {
254 if let Some(ns) = self.needles(3) {
255 self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes()));
256 }
257 }
258
259 /// Test that the positions yielded by the given iterator match the
260 /// positions in this test. If reverse is true, then reverse the positions
261 /// before comparing them.
iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I)262 fn iter<I: Iterator<Item = usize>>(&self, reverse: bool, it: I) {
263 assert_eq!(
264 self.positions(0, reverse),
265 it.collect::<Vec<usize>>(),
266 r"search for {:?} failed in: {:?}",
267 self.needles.iter().map(|&b| b as char).collect::<Vec<char>>(),
268 self.corpus
269 );
270 }
271
272 /// Expand this test into many variations of the same test.
273 ///
274 /// In particular, this will generate more tests with larger corpus sizes.
275 /// The expected positions are updated to maintain the integrity of the
276 /// test.
277 ///
278 /// This is important in testing a memchr implementation, because there are
279 /// often different cases depending on the length of the corpus.
280 ///
281 /// Note that we extend the corpus by adding `%` bytes, which we
282 /// don't otherwise use as a needle.
expand(&self) -> Vec<MemchrTest>283 fn expand(&self) -> Vec<MemchrTest> {
284 let mut more = Vec::new();
285
286 // Add bytes to the start of the corpus.
287 for i in 1..515 {
288 let mut t = self.clone();
289 let mut new_corpus: String = repeat('%').take(i).collect();
290 new_corpus.push_str(&t.corpus);
291 t.corpus = new_corpus;
292 t.positions = t.positions.into_iter().map(|p| p + i).collect();
293 more.push(t);
294 }
295 // Add bytes to the end of the corpus.
296 for i in 1..515 {
297 let mut t = self.clone();
298 let padding: String = repeat('%').take(i).collect();
299 t.corpus.push_str(&padding);
300 more.push(t);
301 }
302
303 more
304 }
305
306 /// Return the corpus at the given alignment.
307 ///
308 /// If the alignment exceeds the length of the corpus, then this returns
309 /// an empty slice.
corpus(&self, align: usize) -> &str310 fn corpus(&self, align: usize) -> &str {
311 self.corpus.get(align..).unwrap_or("")
312 }
313
314 /// Return exactly `count` needles from this test. If this test has less
315 /// than `count` needles, then add `#` until the number of needles
316 /// matches `count`. If this test has more than `count` needles, then
317 /// return `None` (because there is no way to use this test data for a
318 /// search using fewer needles).
needles(&self, count: usize) -> Option<Vec<u8>>319 fn needles(&self, count: usize) -> Option<Vec<u8>> {
320 if self.needles.len() > count {
321 return None;
322 }
323
324 let mut needles = self.needles.to_vec();
325 for _ in needles.len()..count {
326 // we assume # is never used in tests.
327 needles.push(b'#');
328 }
329 Some(needles)
330 }
331
332 /// Return the positions in this test, reversed if `reverse` is true.
333 ///
334 /// If alignment is given, then all positions greater than or equal to that
335 /// alignment are offset by the alignment. Positions less than the
336 /// alignment are dropped.
positions(&self, align: usize, reverse: bool) -> Vec<usize>337 fn positions(&self, align: usize, reverse: bool) -> Vec<usize> {
338 let positions = if reverse {
339 let mut positions = self.positions.to_vec();
340 positions.reverse();
341 positions
342 } else {
343 self.positions.to_vec()
344 };
345 positions
346 .into_iter()
347 .filter(|&p| p >= align)
348 .map(|p| p - align)
349 .collect()
350 }
351 }
352