• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 use test::Bencher;
2 
3 use crate::{Regex, Text};
4 
5 // USAGE: sherlock!(name, pattern, count)
6 //
7 // This is same as bench_find, except it always uses the Sherlock haystack.
8 macro_rules! sherlock {
9     ($name:ident, $pattern:expr, $count:expr) => {
10         bench_find!(
11             $name,
12             $pattern,
13             $count,
14             include_str!("data/sherlock.txt").to_owned()
15         );
16     };
17 }
18 
19 // These patterns are all single string literals that compile down to a variant
20 // of Boyer-Moore w/ memchr. This also demonstrates the impact that the
21 // frequency of a match has on performance.
22 sherlock!(name_sherlock, r"Sherlock", 97);
23 sherlock!(name_holmes, r"Holmes", 461);
24 sherlock!(name_sherlock_holmes, r"Sherlock Holmes", 91);
25 
26 // Like the above, except case insensitively. The prefix detector will extract
27 // multiple *cut* prefix literals for each of the following before hitting its
28 // limit. All of these should be able to use either memchr2 or memchr3.
29 // std C++ does not support inline modifier syntax
30 sherlock!(name_sherlock_nocase, r"(?i)Sherlock", 102);
31 sherlock!(name_holmes_nocase, r"(?i)Holmes", 467);
32 sherlock!(name_sherlock_holmes_nocase, r"(?i)Sherlock Holmes", 96);
33 
34 // Will quickly find instances of 'Sherlock', but then needs to fall back to
35 // the lazy DFA to process the Unicode aware `\s`.
36 sherlock!(name_whitespace, r"Sherlock\s+Holmes", 97);
37 
38 // Now try more variations on name matching.
39 // This one has two alternates that both start with 'S'. This should compile
40 // to an Aho-Corasick automaton that uses memchr. Never enters lazy DFA.
41 sherlock!(name_alt1, r"Sherlock|Street", 158);
42 // This one doesn't have a common byte, but should still use Aho-Corasick and
43 // memchr2.
44 // Never enters lazy DFA.
45 sherlock!(name_alt2, r"Sherlock|Holmes", 558);
46 // Still using Aho-Corasick, but more patterns. Never enters lazy DFA but
47 // also can't use any memchr variant.
48 sherlock!(name_alt3, r"Sherlock|Holmes|Watson|Irene|Adler|John|Baker", 740);
49 // Still using Aho-Corasick, but needs the lazy DFA.
50 sherlock!(
51     name_alt3_nocase,
52     r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker",
53     753
54 );
55 // Should still use Aho-Corasick for the prefixes in each alternate, but
56 // we need to use the lazy DFA to complete it.
57 sherlock!(name_alt4, r"Sher[a-z]+|Hol[a-z]+", 582);
58 sherlock!(name_alt4_nocase, r"(?i)Sher[a-z]+|Hol[a-z]+", 697);
59 // Uses Aho-Corasick, but can use memchr3 (unlike name_alt3).
60 sherlock!(name_alt5, r"Sherlock|Holmes|Watson", 639);
61 sherlock!(name_alt5_nocase, r"(?i)Sherlock|Holmes|Watson", 650);
62 
63 // How long does it take to discover that there's no match? In the first two
64 // cases, we detect the rarest byte in the literal to run memchr on. In the
65 // first, it's 'z' and in the second it's 'j'. The third case only has common
66 // letters, and is therefore slower.
67 sherlock!(no_match_uncommon, r"zqj", 0);
68 sherlock!(no_match_common, r"aqj", 0);
69 sherlock!(no_match_really_common, r"aei", 0);
70 
71 // Various twiddling on very common words. This tends to stress the constant
72 // overhead of actually reporting a match. (None of these actually enter any
73 // matching engines.)
74 sherlock!(the_lower, r"the", 7218);
75 sherlock!(the_upper, r"The", 741);
76 sherlock!(the_nocase, r"(?i)the", 7987);
77 
78 // Process whitespace after a very common word.
79 // Uses Boyer-Moore to find `the` and the lazy DFA for the rest.
80 sherlock!(the_whitespace, r"the\s+\w+", 5410);
81 
82 // How fast can we match everything? This essentially defeats any clever prefix
83 // tricks and just executes the DFA across the entire input.
84 #[cfg(not(feature = "re-pcre1"))]
85 #[cfg(not(feature = "re-pcre2"))]
86 #[cfg(not(feature = "re-tcl"))]
87 sherlock!(everything_greedy, r".*", 13053);
88 #[cfg(not(feature = "re-onig"))]
89 #[cfg(not(feature = "re-pcre1"))]
90 #[cfg(not(feature = "re-pcre2"))]
91 #[cfg(not(feature = "re-tcl"))]
92 sherlock!(everything_greedy_nl, r"(?s).*", 1);
93 
94 // How fast can we match every letter? This also defeats any clever prefix
95 // tricks.
96 #[cfg(not(feature = "re-tcl"))]
97 sherlock!(letters, r"\p{L}", 447160);
98 
99 #[cfg(not(feature = "re-tcl"))]
100 sherlock!(letters_upper, r"\p{Lu}", 14180);
101 
102 #[cfg(not(feature = "re-tcl"))]
103 sherlock!(letters_lower, r"\p{Ll}", 432980);
104 
105 // Similarly, for words.
106 #[cfg(not(feature = "re-re2"))]
107 sherlock!(words, r"\w+", 109214);
108 #[cfg(feature = "re-re2")]
109 sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
110 
111 // Find complete words before Holmes. The `\w` defeats any prefix
112 // optimizations.
113 sherlock!(before_holmes, r"\w+\s+Holmes", 319);
114 
115 // Find complete words before Holmes. Both of the `\w`s defeat any prefix
116 // and suffix optimizations.
117 sherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137);
118 
119 // Find Holmes co-occurring with Watson in a particular window of characters.
120 // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
121 // the rest.
122 sherlock!(holmes_cochar_watson, r"Holmes.{0,25}Watson|Watson.{0,25}Holmes", 7);
123 
124 // Find Holmes co-occurring with Watson in a particular window of words.
125 // This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
126 // the rest.
127 #[cfg(not(feature = "re-onig"))]
128 #[cfg(not(feature = "re-pcre1"))]
129 #[cfg(not(feature = "re-pcre2"))]
130 #[cfg(not(feature = "re-tcl"))]
131 sherlock!(
132     holmes_coword_watson,
133     r"Holmes(?:\s*.+\s*){0,10}Watson|Watson(?:\s*.+\s*){0,10}Holmes",
134     51
135 );
136 
137 // Find some subset of quotes in the text.
138 // This does detect the `"` or `'` prefix literal and does a quick scan for
139 // either byte before starting the lazy DFA.
140 sherlock!(quotes, r#"["'][^"']{0,30}[?!.]["']"#, 767);
141 
142 // Finds all occurrences of Sherlock Holmes at the beginning or end of a line.
143 // The empty assertions defeat any detection of prefix literals, so it's the
144 // lazy DFA the entire way.
145 sherlock!(
146     line_boundary_sherlock_holmes,
147     r"(?m)^Sherlock Holmes|Sherlock Holmes$",
148     34
149 );
150 
151 // All words ending in `n`. This uses Unicode word boundaries, which the DFA
152 // can speculatively handle. Since this benchmark is on mostly ASCII text, it
153 // performs well here. A different benchmark with non-Western text would be
154 // more revealing since the search would be forced to fall back to an NFA
155 // simulation.
156 #[cfg(not(feature = "re-tcl"))]
157 sherlock!(word_ending_n, r"\b\w+n\b", 8366);
158 
159 // This is a real bad one for Rust's engine. This particular expression
160 // fills the state cache quite frequently, which results in a lot of churn.
161 // This can be made to go roughly the speed of PCRE by increasing the DFA cache
162 // size.
163 //
164 // Its only salvation is that the DFA realizes it's executing slowly, gives up
165 // quickly and falls back to the NFA algorithm.
166 //
167 // RE2 seems to do a worse job at this than Rust. So much so that it's slow
168 // enough to be annoying, so we disable it.
169 #[cfg(not(feature = "re-re2"))]
170 sherlock!(repeated_class_negation, r"[a-q][^u-z]{13}x", 142);
171 
172 // This defeats any prefix optimizations but triggers the reverse suffix
173 // optimization.
174 sherlock!(ing_suffix, r"[a-zA-Z]+ing", 2824);
175 
176 // Similar to ing_suffix, but a little more complex by limiting the length
177 // of the word and making sure it's surrounded by whitespace. The trailing
178 // `\s` defeats the reverse suffix optimization.
179 //
180 // Onig does surprisingly well on this benchmark and yet does quite poorly on
181 // the ing_suffix benchmark. That one has me stumped.
182 sherlock!(ing_suffix_limited_space, r"\s[a-zA-Z]{0,12}ing\s", 2081);
183