1 // Copyright 2007 The RE2 Authors. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 #ifndef RE2_PROG_H_ 6 #define RE2_PROG_H_ 7 8 // Compiled representation of regular expressions. 9 // See regexp.h for the Regexp class, which represents a regular 10 // expression symbolically. 11 12 #include <stdint.h> 13 #include <functional> 14 #include <mutex> 15 #include <string> 16 #include <vector> 17 #include <type_traits> 18 19 #include "util/util.h" 20 #include "util/logging.h" 21 #include "re2/pod_array.h" 22 #include "re2/re2.h" 23 #include "re2/sparse_array.h" 24 #include "re2/sparse_set.h" 25 26 namespace re2 { 27 28 // Opcodes for Inst 29 enum InstOp { 30 kInstAlt = 0, // choose between out_ and out1_ 31 kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. 32 kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] 33 kInstCapture, // capturing parenthesis number cap_ 34 kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ 35 kInstMatch, // found a match! 36 kInstNop, // no-op; occasionally unavoidable 37 kInstFail, // never match; occasionally unavoidable 38 kNumInst, 39 }; 40 41 // Bit flags for empty-width specials 42 enum EmptyOp { 43 kEmptyBeginLine = 1<<0, // ^ - beginning of line 44 kEmptyEndLine = 1<<1, // $ - end of line 45 kEmptyBeginText = 1<<2, // \A - beginning of text 46 kEmptyEndText = 1<<3, // \z - end of text 47 kEmptyWordBoundary = 1<<4, // \b - word boundary 48 kEmptyNonWordBoundary = 1<<5, // \B - not \b 49 kEmptyAllFlags = (1<<6)-1, 50 }; 51 52 class DFA; 53 class Regexp; 54 55 // Compiled form of regexp program. 56 class Prog { 57 public: 58 Prog(); 59 ~Prog(); 60 61 // Single instruction in regexp program. 62 class Inst { 63 public: 64 // See the assertion below for why this is so. 65 Inst() = default; 66 67 // Copyable. 68 Inst(const Inst&) = default; 69 Inst& operator=(const Inst&) = default; 70 71 // Constructors per opcode 72 void InitAlt(uint32_t out, uint32_t out1); 73 void InitByteRange(int lo, int hi, int foldcase, uint32_t out); 74 void InitCapture(int cap, uint32_t out); 75 void InitEmptyWidth(EmptyOp empty, uint32_t out); 76 void InitMatch(int id); 77 void InitNop(uint32_t out); 78 void InitFail(); 79 80 // Getters id(Prog * p)81 int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); } opcode()82 InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } last()83 int last() { return (out_opcode_>>3)&1; } out()84 int out() { return out_opcode_>>4; } out1()85 int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } cap()86 int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } lo()87 int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } hi()88 int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } foldcase()89 int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; } hint()90 int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; } match_id()91 int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } empty()92 EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } 93 greedy(Prog * p)94 bool greedy(Prog* p) { 95 DCHECK_EQ(opcode(), kInstAltMatch); 96 return p->inst(out())->opcode() == kInstByteRange || 97 (p->inst(out())->opcode() == kInstNop && 98 p->inst(p->inst(out())->out())->opcode() == kInstByteRange); 99 } 100 101 // Does this inst (an kInstByteRange) match c? Matches(int c)102 inline bool Matches(int c) { 103 DCHECK_EQ(opcode(), kInstByteRange); 104 if (foldcase() && 'A' <= c && c <= 'Z') 105 c += 'a' - 'A'; 106 return lo_ <= c && c <= hi_; 107 } 108 109 // Returns string representation for debugging. 110 std::string Dump(); 111 112 // Maximum instruction id. 113 // (Must fit in out_opcode_. PatchList/last steal another bit.) 114 static const int kMaxInst = (1<<28) - 1; 115 116 private: set_opcode(InstOp opcode)117 void set_opcode(InstOp opcode) { 118 out_opcode_ = (out()<<4) | (last()<<3) | opcode; 119 } 120 set_last()121 void set_last() { 122 out_opcode_ = (out()<<4) | (1<<3) | opcode(); 123 } 124 set_out(int out)125 void set_out(int out) { 126 out_opcode_ = (out<<4) | (last()<<3) | opcode(); 127 } 128 set_out_opcode(int out,InstOp opcode)129 void set_out_opcode(int out, InstOp opcode) { 130 out_opcode_ = (out<<4) | (last()<<3) | opcode; 131 } 132 133 uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode 134 union { // additional instruction arguments: 135 uint32_t out1_; // opcode == kInstAlt 136 // alternate next instruction 137 138 int32_t cap_; // opcode == kInstCapture 139 // Index of capture register (holds text 140 // position recorded by capturing parentheses). 141 // For \n (the submatch for the nth parentheses), 142 // the left parenthesis captures into register 2*n 143 // and the right one captures into register 2*n+1. 144 145 int32_t match_id_; // opcode == kInstMatch 146 // Match ID to identify this match (for re2::Set). 147 148 struct { // opcode == kInstByteRange 149 uint8_t lo_; // byte range is lo_-hi_ inclusive 150 uint8_t hi_; // 151 uint16_t hint_foldcase_; // 15 bits: hint, 1 (low) bit: foldcase 152 // hint to execution engines: the delta to the 153 // next instruction (in the current list) worth 154 // exploring iff this instruction matched; 0 155 // means there are no remaining possibilities, 156 // which is most likely for character classes. 157 // foldcase: A-Z -> a-z before checking range. 158 }; 159 160 EmptyOp empty_; // opcode == kInstEmptyWidth 161 // empty_ is bitwise OR of kEmpty* flags above. 162 }; 163 164 friend class Compiler; 165 friend struct PatchList; 166 friend class Prog; 167 }; 168 169 // Inst must be trivial so that we can freely clear it with memset(3). 170 // Arrays of Inst are initialised by copying the initial elements with 171 // memmove(3) and then clearing any remaining elements with memset(3). 172 static_assert(std::is_trivial<Inst>::value, "Inst must be trivial"); 173 174 // Whether to anchor the search. 175 enum Anchor { 176 kUnanchored, // match anywhere 177 kAnchored, // match only starting at beginning of text 178 }; 179 180 // Kind of match to look for (for anchor != kFullMatch) 181 // 182 // kLongestMatch mode finds the overall longest 183 // match but still makes its submatch choices the way 184 // Perl would, not in the way prescribed by POSIX. 185 // The POSIX rules are much more expensive to implement, 186 // and no one has needed them. 187 // 188 // kFullMatch is not strictly necessary -- we could use 189 // kLongestMatch and then check the length of the match -- but 190 // the matching code can run faster if it knows to consider only 191 // full matches. 192 enum MatchKind { 193 kFirstMatch, // like Perl, PCRE 194 kLongestMatch, // like egrep or POSIX 195 kFullMatch, // match only entire text; implies anchor==kAnchored 196 kManyMatch // for SearchDFA, records set of matches 197 }; 198 inst(int id)199 Inst *inst(int id) { return &inst_[id]; } start()200 int start() { return start_; } set_start(int start)201 void set_start(int start) { start_ = start; } start_unanchored()202 int start_unanchored() { return start_unanchored_; } set_start_unanchored(int start)203 void set_start_unanchored(int start) { start_unanchored_ = start; } size()204 int size() { return size_; } reversed()205 bool reversed() { return reversed_; } set_reversed(bool reversed)206 void set_reversed(bool reversed) { reversed_ = reversed; } list_count()207 int list_count() { return list_count_; } inst_count(InstOp op)208 int inst_count(InstOp op) { return inst_count_[op]; } list_heads()209 uint16_t* list_heads() { return list_heads_.data(); } dfa_mem()210 int64_t dfa_mem() { return dfa_mem_; } set_dfa_mem(int64_t dfa_mem)211 void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } anchor_start()212 bool anchor_start() { return anchor_start_; } set_anchor_start(bool b)213 void set_anchor_start(bool b) { anchor_start_ = b; } anchor_end()214 bool anchor_end() { return anchor_end_; } set_anchor_end(bool b)215 void set_anchor_end(bool b) { anchor_end_ = b; } bytemap_range()216 int bytemap_range() { return bytemap_range_; } bytemap()217 const uint8_t* bytemap() { return bytemap_; } can_prefix_accel()218 bool can_prefix_accel() { return prefix_size_ != 0; } 219 220 // Accelerates to the first likely occurrence of the prefix. 221 // Returns a pointer to the first byte or NULL if not found. PrefixAccel(const void * data,size_t size)222 const void* PrefixAccel(const void* data, size_t size) { 223 DCHECK_GE(prefix_size_, 1); 224 return prefix_size_ == 1 ? memchr(data, prefix_front_, size) 225 : PrefixAccel_FrontAndBack(data, size); 226 } 227 228 // An implementation of prefix accel that looks for prefix_front_ and 229 // prefix_back_ to return fewer false positives than memchr(3) alone. 230 const void* PrefixAccel_FrontAndBack(const void* data, size_t size); 231 232 // Returns string representation of program for debugging. 233 std::string Dump(); 234 std::string DumpUnanchored(); 235 std::string DumpByteMap(); 236 237 // Returns the set of kEmpty flags that are in effect at 238 // position p within context. 239 static uint32_t EmptyFlags(const StringPiece& context, const char* p); 240 241 // Returns whether byte c is a word character: ASCII only. 242 // Used by the implementation of \b and \B. 243 // This is not right for Unicode, but: 244 // - it's hard to get right in a byte-at-a-time matching world 245 // (the DFA has only one-byte lookahead). 246 // - even if the lookahead were possible, the Progs would be huge. 247 // This crude approximation is the same one PCRE uses. IsWordChar(uint8_t c)248 static bool IsWordChar(uint8_t c) { 249 return ('A' <= c && c <= 'Z') || 250 ('a' <= c && c <= 'z') || 251 ('0' <= c && c <= '9') || 252 c == '_'; 253 } 254 255 // Execution engines. They all search for the regexp (run the prog) 256 // in text, which is in the larger context (used for ^ $ \b etc). 257 // Anchor and kind control the kind of search. 258 // Returns true if match found, false if not. 259 // If match found, fills match[0..nmatch-1] with submatch info. 260 // match[0] is overall match, match[1] is first set of parens, etc. 261 // If a particular submatch is not matched during the regexp match, 262 // it is set to NULL. 263 // 264 // Matching text == StringPiece(NULL, 0) is treated as any other empty 265 // string, but note that on return, it will not be possible to distinguish 266 // submatches that matched that empty string from submatches that didn't 267 // match anything. Either way, match[i] == NULL. 268 269 // Search using NFA: can find submatches but kind of slow. 270 bool SearchNFA(const StringPiece& text, const StringPiece& context, 271 Anchor anchor, MatchKind kind, 272 StringPiece* match, int nmatch); 273 274 // Search using DFA: much faster than NFA but only finds 275 // end of match and can use a lot more memory. 276 // Returns whether a match was found. 277 // If the DFA runs out of memory, sets *failed to true and returns false. 278 // If matches != NULL and kind == kManyMatch and there is a match, 279 // SearchDFA fills matches with the match IDs of the final matching state. 280 bool SearchDFA(const StringPiece& text, const StringPiece& context, 281 Anchor anchor, MatchKind kind, StringPiece* match0, 282 bool* failed, SparseSet* matches); 283 284 // The callback issued after building each DFA state with BuildEntireDFA(). 285 // If next is null, then the memory budget has been exhausted and building 286 // will halt. Otherwise, the state has been built and next points to an array 287 // of bytemap_range()+1 slots holding the next states as per the bytemap and 288 // kByteEndText. The number of the state is implied by the callback sequence: 289 // the first callback is for state 0, the second callback is for state 1, ... 290 // match indicates whether the state is a matching state. 291 using DFAStateCallback = std::function<void(const int* next, bool match)>; 292 293 // Build the entire DFA for the given match kind. 294 // Usually the DFA is built out incrementally, as needed, which 295 // avoids lots of unnecessary work. 296 // If cb is not empty, it receives one callback per state built. 297 // Returns the number of states built. 298 // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. 299 int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); 300 301 // Controls whether the DFA should bail out early if the NFA would be faster. 302 // FOR TESTING ONLY. 303 static void TEST_dfa_should_bail_when_slow(bool b); 304 305 // Compute bytemap. 306 void ComputeByteMap(); 307 308 // Run peep-hole optimizer on program. 309 void Optimize(); 310 311 // One-pass NFA: only correct if IsOnePass() is true, 312 // but much faster than NFA (competitive with PCRE) 313 // for those expressions. 314 bool IsOnePass(); 315 bool SearchOnePass(const StringPiece& text, const StringPiece& context, 316 Anchor anchor, MatchKind kind, 317 StringPiece* match, int nmatch); 318 319 // Bit-state backtracking. Fast on small cases but uses memory 320 // proportional to the product of the list count and the text size. CanBitState()321 bool CanBitState() { return list_heads_.data() != NULL; } 322 bool SearchBitState(const StringPiece& text, const StringPiece& context, 323 Anchor anchor, MatchKind kind, 324 StringPiece* match, int nmatch); 325 326 static const int kMaxOnePassCapture = 5; // $0 through $4 327 328 // Backtracking search: the gold standard against which the other 329 // implementations are checked. FOR TESTING ONLY. 330 // It allocates a ton of memory to avoid running forever. 331 // It is also recursive, so can't use in production (will overflow stacks). 332 // The name "Unsafe" here is supposed to be a flag that 333 // you should not be using this function. 334 bool UnsafeSearchBacktrack(const StringPiece& text, 335 const StringPiece& context, 336 Anchor anchor, MatchKind kind, 337 StringPiece* match, int nmatch); 338 339 // Computes range for any strings matching regexp. The min and max can in 340 // some cases be arbitrarily precise, so the caller gets to specify the 341 // maximum desired length of string returned. 342 // 343 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any 344 // string s that is an anchored match for this regexp satisfies 345 // min <= s && s <= max. 346 // 347 // Note that PossibleMatchRange() will only consider the first copy of an 348 // infinitely repeated element (i.e., any regexp element followed by a '*' or 349 // '+' operator). Regexps with "{N}" constructions are not affected, as those 350 // do not compile down to infinite repetitions. 351 // 352 // Returns true on success, false on error. 353 bool PossibleMatchRange(std::string* min, std::string* max, int maxlen); 354 355 // EXPERIMENTAL! SUBJECT TO CHANGE! 356 // Outputs the program fanout into the given sparse array. 357 void Fanout(SparseArray<int>* fanout); 358 359 // Compiles a collection of regexps to Prog. Each regexp will have 360 // its own Match instruction recording the index in the output vector. 361 static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); 362 363 // Flattens the Prog from "tree" form to "list" form. This is an in-place 364 // operation in the sense that the old instructions are lost. 365 void Flatten(); 366 367 // Walks the Prog; the "successor roots" or predecessors of the reachable 368 // instructions are marked in rootmap or predmap/predvec, respectively. 369 // reachable and stk are preallocated scratch structures. 370 void MarkSuccessors(SparseArray<int>* rootmap, 371 SparseArray<int>* predmap, 372 std::vector<std::vector<int>>* predvec, 373 SparseSet* reachable, std::vector<int>* stk); 374 375 // Walks the Prog from the given "root" instruction; the "dominator root" 376 // of the reachable instructions (if such exists) is marked in rootmap. 377 // reachable and stk are preallocated scratch structures. 378 void MarkDominator(int root, SparseArray<int>* rootmap, 379 SparseArray<int>* predmap, 380 std::vector<std::vector<int>>* predvec, 381 SparseSet* reachable, std::vector<int>* stk); 382 383 // Walks the Prog from the given "root" instruction; the reachable 384 // instructions are emitted in "list" form and appended to flat. 385 // reachable and stk are preallocated scratch structures. 386 void EmitList(int root, SparseArray<int>* rootmap, 387 std::vector<Inst>* flat, 388 SparseSet* reachable, std::vector<int>* stk); 389 390 // Computes hints for ByteRange instructions in [begin, end). 391 void ComputeHints(std::vector<Inst>* flat, int begin, int end); 392 393 private: 394 friend class Compiler; 395 396 DFA* GetDFA(MatchKind kind); 397 void DeleteDFA(DFA* dfa); 398 399 bool anchor_start_; // regexp has explicit start anchor 400 bool anchor_end_; // regexp has explicit end anchor 401 bool reversed_; // whether program runs backward over input 402 bool did_flatten_; // has Flatten been called? 403 bool did_onepass_; // has IsOnePass been called? 404 405 int start_; // entry point for program 406 int start_unanchored_; // unanchored entry point for program 407 int size_; // number of instructions 408 int bytemap_range_; // bytemap_[x] < bytemap_range_ 409 size_t prefix_size_; // size of prefix (0 if no prefix) 410 int prefix_front_; // first byte of prefix (-1 if no prefix) 411 int prefix_back_; // last byte of prefix (-1 if no prefix) 412 413 int list_count_; // count of lists (see above) 414 int inst_count_[kNumInst]; // count of instructions by opcode 415 PODArray<uint16_t> list_heads_; // sparse array enumerating list heads 416 // not populated if size_ is overly large 417 418 PODArray<Inst> inst_; // pointer to instruction array 419 PODArray<uint8_t> onepass_nodes_; // data for OnePass nodes 420 421 int64_t dfa_mem_; // Maximum memory for DFAs. 422 DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch 423 DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch 424 425 uint8_t bytemap_[256]; // map from input bytes to byte classes 426 427 std::once_flag dfa_first_once_; 428 std::once_flag dfa_longest_once_; 429 430 Prog(const Prog&) = delete; 431 Prog& operator=(const Prog&) = delete; 432 }; 433 434 } // namespace re2 435 436 #endif // RE2_PROG_H_ 437