1 use crate::data; 2 3 #[derive(Clone, Copy, Debug)] 4 pub struct Input { 5 /// A name describing the corpus, used to identify it in benchmarks. 6 pub name: &'static str, 7 /// The haystack to search. 8 pub corpus: &'static str, 9 /// Queries that are expected to never occur. 10 pub never: &'static [Query], 11 /// Queries that are expected to occur rarely. 12 pub rare: &'static [Query], 13 /// Queries that are expected to fairly common. 14 pub common: &'static [Query], 15 } 16 17 /// A substring search query for a particular haystack. 18 #[derive(Clone, Copy, Debug)] 19 pub struct Query { 20 /// A name for this query, used to identify it in benchmarks. 21 pub name: &'static str, 22 /// The needle to search for. 23 pub needle: &'static str, 24 /// The expected number of occurrences. 25 pub count: usize, 26 } 27 28 pub const INPUTS: &'static [Input] = &[ 29 Input { 30 name: "code-rust-library", 31 corpus: data::CODE_RUST_LIBRARY, 32 never: &[ 33 Query { name: "fn-strength", needle: "fn strength", count: 0 }, 34 Query { 35 name: "fn-strength-paren", 36 needle: "fn strength(", 37 count: 0, 38 }, 39 Query { name: "fn-quux", needle: "fn quux(", count: 0 }, 40 ], 41 rare: &[ 42 Query { 43 name: "fn-from-str", 44 needle: "pub fn from_str(", 45 count: 1, 46 }, 47 ], 48 common: &[ 49 Query { name: "fn-is-empty", needle: "fn is_empty(", count: 17 }, 50 Query { name: "fn", needle: "fn", count: 2985 }, 51 Query { name: "paren", needle: "(", count: 30193 }, 52 Query { name: "let", needle: "let", count: 4737 }, 53 ], 54 }, 55 Input { 56 name: "huge-en", 57 corpus: data::SUBTITLE_EN_HUGE, 58 never: &[ 59 Query { name: "john-watson", needle: "John Watson", count: 0 }, 60 Query { name: "all-common-bytes", needle: "sternness", count: 0 }, 61 Query { name: "some-rare-bytes", needle: "quartz", count: 0 }, 62 Query { name: "two-space", needle: " ", count: 0 }, 63 ], 64 rare: &[ 65 Query { 66 name: "sherlock-holmes", 67 needle: "Sherlock Holmes", 68 count: 1, 69 }, 70 Query { name: "sherlock", needle: "Sherlock", count: 1 }, 71 Query { 72 name: "medium-needle", 73 needle: "homer, marge, bart, lisa, maggie", 74 count: 1, 75 }, 76 Query { 77 name: "long-needle", 78 needle: "I feel afraid of Mostafa\nHe is stronger and older than I am, and more experienced\nShould I turn back?\nDoc you're beginning to sound like Sherlock Holmes.", 79 count: 1, 80 }, 81 Query { 82 name: "huge-needle", 83 needle: "Since we will meet anyway, then the sooner, the better\nTomorrow at 4:30 in front of the Horse-Riding Club\nNo, 4:30\nI am confused, almost lost\nAs if an invisible hand pushed me towards an unknown fate\nI needed someone by my side\nI needed someone to guide me to the path of security\nBut I had no one\nI couldn't ask my father's opinion, nor his wife's\nI felt just as lonely as I had before\nI feel afraid of Mostafa\nHe is stronger and older than I am, and more experienced\nShould I turn back?\nDoc you're beginning to sound like Sherlock Holmes.", 84 count: 1, 85 }, 86 ], 87 common: &[ 88 Query { name: "that", needle: "that", count: 865 }, 89 Query { name: "one-space", needle: " ", count: 96606 }, 90 Query { name: "you", needle: "you", count: 5009 }, 91 // It would be nice to benchmark this case, although it's not 92 // terribly important. The problem is that std's substring 93 // implementation (correctly) never returns match offsets that 94 // split an encoded codepoint, where as memmem on bytes will. So 95 // the counts differ. We could modify our harness to skip this on 96 // std, but it seems like much ado about nothing. 97 // Query { name: "empty", needle: "", count: 613655 }, 98 ], 99 }, 100 Input { 101 name: "huge-ru", 102 corpus: data::SUBTITLE_RU_HUGE, 103 never: &[Query { 104 name: "john-watson", 105 needle: "Джон Уотсон", 106 count: 0, 107 }], 108 rare: &[ 109 Query { 110 name: "sherlock-holmes", 111 needle: "Шерлок Холмс", 112 count: 1, 113 }, 114 Query { name: "sherlock", needle: "Шерлок", count: 1 }, 115 ], 116 common: &[ 117 Query { name: "that", needle: "что", count: 998 }, 118 Query { name: "not", needle: "не", count: 3092 }, 119 Query { name: "one-space", needle: " ", count: 46941 }, 120 ], 121 }, 122 Input { 123 name: "huge-zh", 124 corpus: data::SUBTITLE_ZH_HUGE, 125 never: &[Query { 126 name: "john-watson", needle: "约翰·沃森", count: 0 127 }], 128 rare: &[ 129 Query { 130 name: "sherlock-holmes", 131 needle: "夏洛克·福尔摩斯", 132 count: 1, 133 }, 134 Query { name: "sherlock", needle: "夏洛克", count: 1 }, 135 ], 136 common: &[ 137 Query { name: "that", needle: "那", count: 1056 }, 138 Query { name: "do-not", needle: "不", count: 2751 }, 139 Query { name: "one-space", needle: " ", count: 17232 }, 140 ], 141 }, 142 Input { 143 name: "teeny-en", 144 corpus: data::SUBTITLE_EN_TEENY, 145 never: &[ 146 Query { name: "john-watson", needle: "John Watson", count: 0 }, 147 Query { name: "all-common-bytes", needle: "sternness", count: 0 }, 148 Query { name: "some-rare-bytes", needle: "quartz", count: 0 }, 149 Query { name: "two-space", needle: " ", count: 0 }, 150 ], 151 rare: &[ 152 Query { 153 name: "sherlock-holmes", 154 needle: "Sherlock Holmes", 155 count: 1, 156 }, 157 Query { name: "sherlock", needle: "Sherlock", count: 1 }, 158 ], 159 common: &[], 160 }, 161 Input { 162 name: "teeny-ru", 163 corpus: data::SUBTITLE_RU_TEENY, 164 never: &[Query { 165 name: "john-watson", 166 needle: "Джон Уотсон", 167 count: 0, 168 }], 169 rare: &[ 170 Query { 171 name: "sherlock-holmes", 172 needle: "Шерлок Холмс", 173 count: 1, 174 }, 175 Query { name: "sherlock", needle: "Шерлок", count: 1 }, 176 ], 177 common: &[], 178 }, 179 Input { 180 name: "teeny-zh", 181 corpus: data::SUBTITLE_ZH_TEENY, 182 never: &[Query { 183 name: "john-watson", needle: "约翰·沃森", count: 0 184 }], 185 rare: &[ 186 Query { 187 name: "sherlock-holmes", 188 needle: "夏洛克·福尔摩斯", 189 count: 1, 190 }, 191 Query { name: "sherlock", needle: "夏洛克", count: 1 }, 192 ], 193 common: &[], 194 }, 195 Input { 196 name: "pathological-md5-huge", 197 corpus: data::PATHOLOGICAL_MD5_HUGE, 198 never: &[Query { 199 name: "no-hash", 200 needle: "61a1a40effcf97de24505f154a306597", 201 count: 0, 202 }], 203 rare: &[Query { 204 name: "last-hash", 205 needle: "831df319d8597f5bc793d690f08b159b", 206 count: 1, 207 }], 208 common: &[Query { name: "two-bytes", needle: "fe", count: 520 }], 209 }, 210 Input { 211 name: "pathological-repeated-rare-huge", 212 corpus: data::PATHOLOGICAL_REPEATED_RARE_HUGE, 213 never: &[Query { name: "tricky", needle: "abczdef", count: 0 }], 214 rare: &[], 215 common: &[Query { name: "match", needle: "zzzzzzzzzz", count: 50010 }], 216 }, 217 Input { 218 name: "pathological-repeated-rare-small", 219 corpus: data::PATHOLOGICAL_REPEATED_RARE_SMALL, 220 never: &[Query { name: "tricky", needle: "abczdef", count: 0 }], 221 rare: &[], 222 common: &[Query { name: "match", needle: "zzzzzzzzzz", count: 100 }], 223 }, 224 Input { 225 name: "pathological-defeat-simple-vector", 226 corpus: data::PATHOLOGICAL_DEFEAT_SIMPLE_VECTOR, 227 never: &[], 228 rare: &[Query { 229 name: "alphabet", 230 needle: "qbz", 231 count: 1, 232 }], 233 common: &[], 234 }, 235 Input { 236 name: "pathological-defeat-simple-vector-freq", 237 corpus: data::PATHOLOGICAL_DEFEAT_SIMPLE_VECTOR_FREQ, 238 never: &[], 239 rare: &[Query { 240 name: "alphabet", 241 needle: "qjaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaz", 242 count: 1, 243 }], 244 common: &[], 245 }, 246 Input { 247 name: "pathological-defeat-simple-vector-repeated", 248 corpus: data::PATHOLOGICAL_DEFEAT_SIMPLE_VECTOR_REPEATED, 249 never: &[], 250 rare: &[Query { 251 name: "alphabet", 252 needle: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzaz", 253 count: 1, 254 }], 255 common: &[], 256 }, 257 ]; 258