1 /*
2 This module defines benchmarks for the memmem family of functions.
3 Benchmarking a substring algorithm is particularly difficult, especially
4 when implementations (like this one, and others) use heuristics to speed up
5 common cases, typically at the expense of less common cases. The job of this
6 benchmark suite is to not only highlight the fast common cases, but to also put
7 a spotlight on the less common or pathological cases. While some things are
8 generally expected to be slower because of these heuristics, the benchmarks
9 help us make sure they we don't let things get too slow.
10
11 The naming scheme is as follows:
12
13 memr?mem/{impl}/{config}/{corpus}/{needle}
14
15 Where {...} is a variable. Variables should never contain slashes. They are as
16 follows:
17
18 impl
19 A brief name describing the implementation under test. Possible values:
20
21 krate
22 The implementation provided by this crate.
23 krate-nopre
24 The implementation provided by this crate without prefilters enabled.
25 bstr
26 The implementation provided by the bstr crate.
27 N.B. This is only applicable at time of writing, since bstr will
28 eventually just use this crate.
29 regex
30 The implementation of substring search provided by the regex crate.
31 N.B. This is only applicable at time of writing, since regex will
32 eventually just use this crate.
33 stud
34 The implementation of substring search provided by the standard
35 library. This implementation only works on valid UTF-8 by virtue of
36 how its API is exposed.
37 twoway
38 The implementation of substring search provided by the twoway crate.
39 sliceslice
40 The implementation of substring search provided by the sliceslice crate.
41 libc
42 The implementation of memmem in your friendly neighborhood libc.
43
44 Note that there is also a 'memmem' crate, but it is unmaintained and
45 appears to just be a snapshot of std's implementation at a particular
46 point in time (but exposed in a way to permit it to search arbitrary
47 bytes).
48
49 config
50 This should be a brief description of the configuration of the search. Not
51 all implementations can be benchmarked in all configurations. It depends on
52 the API they expose. Possible values:
53
54 oneshot
55 Executes a single search without pre-building a searcher. That
56 this measurement includes the time it takes to initialize a
57 searcher.
58 prebuilt
59 Executes a single search without measuring the time it takes to
60 build a searcher.
61 iter-oneshot
62 Counts the total number of matches. This measures the time it takes to
63 build the searcher.
64 iter-prebuilt
65 Counts the total number of matches. This does not measure the time it
66 takes to build the searcher.
67
68 corpus
69 A brief name describing the corpus or haystack used in the benchmark. In
70 general, we vary this with regard to size and language. Possible values:
71
72 subtitles-{en,ru,zh}
73 Text from the OpenSubtitles project, in one of English, Russian or
74 Chinese. This is the primary input meant to represent most kinds of
75 haystacks.
76 pathological-{...}
77 A haystack that has been specifically constructed to exploit a
78 pathological case in or more substring search implementations.
79 sliceslice-words
80 The haystack is varied across words in an English dictionary. Using
81 this corpus means the benchmark is measuring performance on very small
82 haystacks. This was taken from the sliceslice crate benchmarks.
83 sliceslice-i386
84 The haystack is an Intel 80386 reference manual.
85 This was also taken from the sliceslice crate benchmarks.
86
87 needle
88 A brief name describing the needle used. Unlike other variables, there
89 isn't a strong controlled vocabularly for this parameter. The needle
90 variable is meant to be largely self explanatory. For example, a needle
91 named "rare" probably means that the number of occurrences of the needle
92 is expected to be particularly low.
93 */
94
95 use criterion::Criterion;
96
97 use crate::{define, memmem::inputs::INPUTS};
98
99 mod imp;
100 mod inputs;
101 mod sliceslice;
102
all(c: &mut Criterion)103 pub fn all(c: &mut Criterion) {
104 oneshot(c);
105 prebuilt(c);
106 oneshot_iter(c);
107 prebuilt_iter(c);
108 sliceslice::all(c);
109 }
110
oneshot(c: &mut Criterion)111 fn oneshot(c: &mut Criterion) {
112 macro_rules! def_impl {
113 ($inp:expr, $q:expr, $freq:expr, $impl:ident) => {
114 let config = "oneshot";
115 let available = imp::$impl::available($q.needle);
116 // We only define non-iter benchmarks when the count is <=1. Such
117 // queries are usually constructed to only appear at the end.
118 // Otherwise, for more common queries, the benchmark would be
119 // approximately duplicative with benchmarks on shorter haystacks
120 // for the implementations we benchmark.
121 if $q.count <= 1 && available.contains(&config) {
122 let expected = $q.count > 0;
123 macro_rules! define {
124 ($dir:expr, $find:expr) => {
125 let name = format!(
126 "{dir}/{imp}/{config}/{inp}/{freq}-{q}",
127 dir = $dir,
128 imp = stringify!($impl),
129 config = config,
130 inp = $inp.name,
131 freq = $freq,
132 q = $q.name,
133 );
134 define(
135 c,
136 &name,
137 $inp.corpus.as_bytes(),
138 Box::new(move |b| {
139 b.iter(|| {
140 assert_eq!(
141 expected,
142 $find($inp.corpus, $q.needle)
143 );
144 });
145 }),
146 );
147 };
148 }
149 define!("memmem", imp::$impl::fwd::oneshot);
150 if available.contains(&"reverse") {
151 define!("memrmem", imp::$impl::rev::oneshot);
152 }
153 }
154 };
155 }
156 macro_rules! def_all_impls {
157 ($inp:expr, $q:expr, $freq:expr) => {
158 def_impl!($inp, $q, $freq, krate);
159 def_impl!($inp, $q, $freq, krate_nopre);
160 def_impl!($inp, $q, $freq, bstr);
161 def_impl!($inp, $q, $freq, regex);
162 def_impl!($inp, $q, $freq, stud);
163 def_impl!($inp, $q, $freq, twoway);
164 def_impl!($inp, $q, $freq, sliceslice);
165 def_impl!($inp, $q, $freq, libc);
166 };
167 }
168 for inp in INPUTS {
169 for q in inp.never {
170 def_all_impls!(inp, q, "never");
171 }
172 for q in inp.rare {
173 def_all_impls!(inp, q, "rare");
174 }
175 for q in inp.common {
176 def_all_impls!(inp, q, "common");
177 }
178 }
179 }
180
prebuilt(c: &mut Criterion)181 fn prebuilt(c: &mut Criterion) {
182 macro_rules! def_impl {
183 ($inp:expr, $q:expr, $freq:expr, $impl:ident) => {
184 let config = "prebuilt";
185 let available = imp::$impl::available($q.needle);
186 // We only define non-iter benchmarks when the count is <=1. Such
187 // queries are usually constructed to only appear at the end.
188 // Otherwise, for more common queries, the benchmark would be
189 // approximately duplicative with benchmarks on shorter haystacks
190 // for the implementations we benchmark.
191 if $q.count <= 1 && available.contains(&config) {
192 let expected = $q.count > 0;
193 macro_rules! define {
194 ($dir:expr, $new_finder:expr) => {
195 let name = format!(
196 "{dir}/{imp}/{config}/{inp}/{freq}-{q}",
197 dir = $dir,
198 imp = stringify!($impl),
199 config = config,
200 inp = $inp.name,
201 freq = $freq,
202 q = $q.name,
203 );
204 define(
205 c,
206 &name,
207 $inp.corpus.as_bytes(),
208 Box::new(move |b| {
209 let find = $new_finder($q.needle);
210 b.iter(|| {
211 assert_eq!(expected, find($inp.corpus));
212 });
213 }),
214 );
215 };
216 }
217 define!("memmem", imp::$impl::fwd::prebuilt);
218 if available.contains(&"reverse") {
219 define!("memrmem", imp::$impl::rev::prebuilt);
220 }
221 }
222 };
223 }
224 macro_rules! def_all_impls {
225 ($inp:expr, $q:expr, $freq:expr) => {
226 def_impl!($inp, $q, $freq, krate);
227 def_impl!($inp, $q, $freq, krate_nopre);
228 def_impl!($inp, $q, $freq, bstr);
229 def_impl!($inp, $q, $freq, regex);
230 def_impl!($inp, $q, $freq, stud);
231 def_impl!($inp, $q, $freq, twoway);
232 def_impl!($inp, $q, $freq, sliceslice);
233 def_impl!($inp, $q, $freq, libc);
234 };
235 }
236 for inp in INPUTS {
237 for q in inp.never {
238 def_all_impls!(inp, q, "never");
239 }
240 for q in inp.rare {
241 def_all_impls!(inp, q, "rare");
242 }
243 for q in inp.common {
244 def_all_impls!(inp, q, "common");
245 }
246 }
247 }
248
oneshot_iter(c: &mut Criterion)249 fn oneshot_iter(c: &mut Criterion) {
250 macro_rules! def_impl {
251 ($inp:expr, $q:expr, $freq:expr, $impl:ident) => {
252 let config = "oneshotiter";
253 let available = imp::$impl::available($q.needle);
254 // We only define iter benchmarks when the count is >1. Since
255 // queries with count<=1 are usually constructed such that the
256 // match appears at the end of the haystack, it doesn't make much
257 // sense to also benchmark iteration for that case. Instead, we only
258 // benchmark iteration for queries that match more frequently.
259 if $q.count > 1 && available.contains(&config) {
260 macro_rules! define {
261 ($dir:expr, $find_iter:expr) => {
262 let name = format!(
263 "{dir}/{imp}/{config}/{inp}/{freq}-{q}",
264 dir = $dir,
265 imp = stringify!($impl),
266 config = config,
267 inp = $inp.name,
268 freq = $freq,
269 q = $q.name,
270 );
271 define(
272 c,
273 &name,
274 $inp.corpus.as_bytes(),
275 Box::new(move |b| {
276 b.iter(|| {
277 let it =
278 $find_iter($inp.corpus, $q.needle);
279 assert_eq!($q.count, it.count());
280 });
281 }),
282 );
283 };
284 }
285 define!("memmem", imp::$impl::fwd::oneshotiter);
286 if available.contains(&"reverse") {
287 define!("memrmem", imp::$impl::rev::oneshotiter);
288 }
289 }
290 };
291 }
292 macro_rules! def_all_impls {
293 ($inp:expr, $q:expr, $freq:expr) => {
294 def_impl!($inp, $q, $freq, krate);
295 def_impl!($inp, $q, $freq, krate_nopre);
296 def_impl!($inp, $q, $freq, bstr);
297 def_impl!($inp, $q, $freq, regex);
298 def_impl!($inp, $q, $freq, stud);
299 def_impl!($inp, $q, $freq, twoway);
300 def_impl!($inp, $q, $freq, sliceslice);
301 def_impl!($inp, $q, $freq, libc);
302 };
303 }
304 for inp in INPUTS {
305 for q in inp.never {
306 def_all_impls!(inp, q, "never");
307 }
308 for q in inp.rare {
309 def_all_impls!(inp, q, "rare");
310 }
311 for q in inp.common {
312 def_all_impls!(inp, q, "common");
313 }
314 }
315 }
316
prebuilt_iter(c: &mut Criterion)317 fn prebuilt_iter(c: &mut Criterion) {
318 macro_rules! def_impl {
319 ($inp:expr, $q:expr, $freq:expr, $impl:ident) => {
320 let config = "prebuiltiter";
321 let available = imp::$impl::available($q.needle);
322 // We only define iter benchmarks when the count is >1. Since
323 // queries with count<=1 are usually constructed such that the
324 // match appears at the end of the haystack, it doesn't make much
325 // sense to also benchmark iteration for that case. Instead, we only
326 // benchmark iteration for queries that match more frequently.
327 if $q.count > 1 && available.contains(&config) {
328 macro_rules! define {
329 ($dir:expr, $new_finder:expr) => {
330 let name = format!(
331 "{dir}/{imp}/{config}/{inp}/{freq}-{q}",
332 dir = $dir,
333 imp = stringify!($impl),
334 config = config,
335 inp = $inp.name,
336 freq = $freq,
337 q = $q.name,
338 );
339 define(
340 c,
341 &name,
342 $inp.corpus.as_bytes(),
343 Box::new(move |b| {
344 let finder = $new_finder($q.needle);
345 b.iter(|| {
346 let it = finder.iter($inp.corpus);
347 assert_eq!($q.count, it.count());
348 });
349 }),
350 );
351 };
352 }
353 define!("memmem", imp::$impl::fwd::prebuiltiter);
354 if available.contains(&"reverse") {
355 define!("memrmem", imp::$impl::rev::prebuiltiter);
356 }
357 }
358 };
359 }
360 macro_rules! def_all_impls {
361 ($inp:expr, $q:expr, $freq:expr) => {
362 def_impl!($inp, $q, $freq, krate);
363 def_impl!($inp, $q, $freq, krate_nopre);
364 def_impl!($inp, $q, $freq, bstr);
365 def_impl!($inp, $q, $freq, regex);
366 def_impl!($inp, $q, $freq, stud);
367 def_impl!($inp, $q, $freq, twoway);
368 def_impl!($inp, $q, $freq, sliceslice);
369 def_impl!($inp, $q, $freq, libc);
370 };
371 }
372 for inp in INPUTS {
373 for q in inp.never {
374 def_all_impls!(inp, q, "never");
375 }
376 for q in inp.rare {
377 def_all_impls!(inp, q, "rare");
378 }
379 for q in inp.common {
380 def_all_impls!(inp, q, "common");
381 }
382 }
383 }
384