1This is a dump from Google's source control system of the change 2that removed UCS-2 support from RE2. As the explanation below 3says, UCS-2 mode is fundamentally at odds with things like ^ and $, 4so it never really worked very well. But if you are interested in using 5it without those operators, it did work for that. It assumed that the 6UCS-2 data was in the native host byte order. 7 8If you are interested in adding UCS-2 mode back, this patch might 9be a good starting point. 10 11 12Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15 13 14 Retire UCS-2 mode. 15 16 I added it as an experiment for V8, but it 17 requires 2-byte lookahead to do completely, 18 and RE2 has 1-byte lookahead (enough for UTF-8) 19 as a fairly deep fundamental assumption, 20 so it did not support ^ or $. 21 22==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ==== 23re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319 24 cap_[0] = p; 25 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. 26 return true; 27- if (prog_->flags() & Regexp::UCS2) 28- p++; 29 } 30 return false; 31 } 32==== re2/compile.cc#17 - re2/compile.cc#18 ==== 33re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100 34 // Input encodings. 35 enum Encoding { 36 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) 37- kEncodingUCS2, // UCS-2 (0-FFFF), native byte order 38 kEncodingLatin1, // Latin1 (0-FF) 39 }; 40 41re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172 42 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); 43 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); 44 void Add_80_10ffff(); 45- void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase); 46- void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, 47- uint8 lo2, uint8 hi2, bool fold2); 48 49 // New suffix that matches the byte range lo-hi, then goes to next. 50 Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next); 51re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477 52 53 // Converts rune range lo-hi into a fragment that recognizes 54 // the bytes that would make up those runes in the current 55- // encoding (Latin 1, UTF-8, or UCS-2). 56+ // encoding (Latin 1 or UTF-8). 57 // This lets the machine work byte-by-byte even when 58 // using multibyte encodings. 59 60re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489 61 case kEncodingLatin1: 62 AddRuneRangeLatin1(lo, hi, foldcase); 63 break; 64- case kEncodingUCS2: 65- AddRuneRangeUCS2(lo, hi, foldcase); 66- break; 67 } 68 } 69 70re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501 71 AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL)); 72 } 73 74- // Test whether 16-bit values are big or little endian. 75- static bool BigEndian() { 76- union { 77- char byte[2]; 78- int16 endian; 79- } u; 80- 81- u.byte[0] = 1; 82- u.byte[1] = 2; 83- return u.endian == 0x0102; 84- } 85- 86- void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1, 87- uint8 lo2, uint8 hi2, bool fold2) { 88- Inst* ip; 89- if (reversed_) { 90- ip = RuneByteSuffix(lo1, hi1, fold1, NULL); 91- ip = RuneByteSuffix(lo2, hi2, fold2, ip); 92- } else { 93- ip = RuneByteSuffix(lo2, hi2, fold2, NULL); 94- ip = RuneByteSuffix(lo1, hi1, fold1, ip); 95- } 96- AddSuffix(ip); 97- } 98- 99- void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) { 100- if (lo > hi || lo > 0xFFFF) 101- return; 102- if (hi > 0xFFFF) 103- hi = 0xFFFF; 104- 105- // We'll assemble a pattern assuming big endian. 106- // If the machine isn't, tell Cat to reverse its arguments. 107- bool oldreversed = reversed_; 108- if (!BigEndian()) { 109- reversed_ = !oldreversed; 110- } 111- 112- // Split into bytes. 113- int lo1 = lo >> 8; 114- int lo2 = lo & 0xFF; 115- int hi1 = hi >> 8; 116- int hi2 = hi & 0xFF; 117- 118- if (lo1 == hi1) { 119- // Easy case: high bits are same in both. 120- // Only do ASCII case folding on the second byte if the top byte is 00. 121- AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase); 122- } else { 123- // Harder case: different second byte ranges depending on first byte. 124- 125- // Initial fragment. 126- if (lo2 > 0) { 127- AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase); 128- lo1++; 129- } 130- 131- // Trailing fragment. 132- if (hi2 < 0xFF) { 133- AddUCS2Pair(hi1, hi1, false, 0, hi2, false); 134- hi1--; 135- } 136- 137- // Inner ranges. 138- if (lo1 <= hi1) { 139- AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false); 140- } 141- } 142- 143- // Restore reverse setting. 144- reversed_ = oldreversed; 145- } 146- 147 // Table describing how to make a UTF-8 matching machine 148 // for the rune range 80-10FFFF (Runeself-Runemax). 149 // This range happens frequently enough (for example /./ and /[^a-z]/) 150re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634 151 152 Frag Compiler::Literal(Rune r, bool foldcase) { 153 switch (encoding_) { 154- default: // UCS-2 or something new 155- BeginRange(); 156- AddRuneRange(r, r, foldcase); 157- return EndRange(); 158+ default: 159+ return kNullFrag; 160 161 case kEncodingLatin1: 162 return ByteRange(r, r, foldcase); 163re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850 164 165 if (re->parse_flags() & Regexp::Latin1) 166 c.encoding_ = kEncodingLatin1; 167- else if (re->parse_flags() & Regexp::UCS2) 168- c.encoding_ = kEncodingUCS2; 169 c.reversed_ = reversed; 170 if (max_mem <= 0) { 171 c.max_inst_ = 100000; // more than enough 172re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905 173 c.prog_->set_start_unanchored(c.prog_->start()); 174 } else { 175 Frag dot; 176- if (c.encoding_ == kEncodingUCS2) { 177- dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false)); 178- } else { 179- dot = c.ByteRange(0x00, 0xFF, false); 180- } 181+ dot = c.ByteRange(0x00, 0xFF, false); 182 Frag dotloop = c.Star(dot, true); 183 Frag unanchored = c.Cat(dotloop, all); 184 c.prog_->set_start_unanchored(unanchored.begin); 185==== re2/nfa.cc#8 - re2/nfa.cc#9 ==== 186re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431 187 const char* bp = context.begin(); 188 int c = -1; 189 int wasword = 0; 190- bool ucs2 = prog_->flags() & Regexp::UCS2; 191 192 if (text.begin() > context.begin()) { 193 c = text.begin()[-1] & 0xFF; 194re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497 195 // If there's a required first byte for an unanchored search 196 // and we're not in the middle of any possible matches, 197 // use memchr to search for the byte quickly. 198- if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 && 199+ if (!anchored && first_byte_ >= 0 && runq->size() == 0 && 200 p < text.end() && (p[0] & 0xFF) != first_byte_) { 201 p = reinterpret_cast<const char*>(memchr(p, first_byte_, 202 text.end() - p)); 203re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514 204 flag = Prog::EmptyFlags(context, p); 205 } 206 207- // In UCS-2 mode, if we need to start a new thread, 208- // make sure to do it on an even boundary. 209- if(ucs2 && runq->size() == 0 && 210- (p - context.begin()) % 2 && p < text.end()) { 211- p++; 212- flag = Prog::EmptyFlags(context, p); 213- } 214- 215 // Steal match storage (cleared but unused as of yet) 216 // temporarily to hold match boundaries for new thread. 217- // In UCS-2 mode, only start the thread on a 2-byte boundary. 218- if(!ucs2 || (p - context.begin()) % 2 == 0) { 219- match_[0] = p; 220- AddToThreadq(runq, start_, flag, p, match_); 221- match_[0] = NULL; 222- } 223+ match_[0] = p; 224+ AddToThreadq(runq, start_, flag, p, match_); 225+ match_[0] = NULL; 226 } 227 228 // If all the threads have died, stop early. 229==== re2/parse.cc#22 - re2/parse.cc#23 ==== 230re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165 231 status_(status), stacktop_(NULL), ncap_(0) { 232 if (flags_ & Latin1) 233 rune_max_ = 0xFF; 234- else if (flags & UCS2) 235- rune_max_ = 0xFFFF; 236 else 237 rune_max_ = Runemax; 238 } 239re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374 240 bool Regexp::ParseState::PushCarat() { 241 if (flags_ & OneLine) { 242 return PushSimpleOp(kRegexpBeginText); 243- } else { 244- if (flags_ & UCS2) { 245- status_->set_code(kRegexpUnsupported); 246- status_->set_error_arg("multiline ^ in UCS-2 mode"); 247- return false; 248- } 249- return PushSimpleOp(kRegexpBeginLine); 250 } 251+ return PushSimpleOp(kRegexpBeginLine); 252 } 253 254 // Pushes a \b or \B onto the stack. 255 bool Regexp::ParseState::PushWordBoundary(bool word) { 256- if (flags_ & UCS2) { 257- status_->set_code(kRegexpUnsupported); 258- status_->set_error_arg("\\b or \\B in UCS-2 mode"); 259- return false; 260- } 261 if (word) 262 return PushSimpleOp(kRegexpWordBoundary); 263 return PushSimpleOp(kRegexpNoWordBoundary); 264re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389 265 bool ret = PushSimpleOp(kRegexpEndText); 266 flags_ = oflags; 267 return ret; 268- } 269- if (flags_ & UCS2) { 270- status_->set_code(kRegexpUnsupported); 271- status_->set_error_arg("multiline $ in UCS-2 mode"); 272- return false; 273 } 274 return PushSimpleOp(kRegexpEndLine); 275 } 276==== re2/re2.cc#34 - re2/re2.cc#35 ==== 277re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84 278 return RE2::ErrorBadUTF8; 279 case re2::kRegexpBadNamedCapture: 280 return RE2::ErrorBadNamedCapture; 281- case re2::kRegexpUnsupported: 282- return RE2::ErrorUnsupported; 283 } 284 return RE2::ErrorInternal; 285 } 286re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125 287 break; 288 case RE2::Options::EncodingLatin1: 289 flags |= Regexp::Latin1; 290- break; 291- case RE2::Options::EncodingUCS2: 292- flags |= Regexp::UCS2; 293 break; 294 } 295 296==== re2/re2.h#36 - re2/re2.h#37 ==== 297re2/re2.h#36:246,252 - re2/re2.h#37:246,251 298 ErrorBadUTF8, // invalid UTF-8 in regexp 299 ErrorBadNamedCapture, // bad named capture group 300 ErrorPatternTooLarge, // pattern too large (compile failed) 301- ErrorUnsupported, // unsupported feature (in UCS-2 mode) 302 }; 303 304 // Predefined common options. 305re2/re2.h#36:570,576 - re2/re2.h#37:569,574 306 307 enum Encoding { 308 EncodingUTF8 = 1, 309- EncodingUCS2, // 16-bit Unicode 0-FFFF only 310 EncodingLatin1 311 }; 312 313==== re2/regexp.cc#15 - re2/regexp.cc#16 ==== 314re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329 315 // the regexp that remains after the prefix. The prefix might 316 // be ASCII case-insensitive. 317 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { 318- // Don't even bother for UCS-2; it's time to throw that code away. 319- if (parse_flags_ & UCS2) 320- return false; 321- 322 // No need for a walker: the regexp must be of the form 323 // 1. some number of ^ anchors 324 // 2. a literal char or string 325==== re2/regexp.h#20 - re2/regexp.h#21 ==== 326re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192 327 kRegexpBadPerlOp, // bad perl operator 328 kRegexpBadUTF8, // invalid UTF-8 in regexp 329 kRegexpBadNamedCapture, // bad named capture 330- kRegexpUnsupported, // unsupported operator 331 }; 332 333 // Error status for certain operations. 334re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314 335 // \Q and \E to disable/enable metacharacters 336 // (?P<name>expr) for named captures 337 // \C to match any single byte 338- UCS2 = 1<<10, // Text is in UCS-2, regexp is in UTF-8. 339- UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group 340+ UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group 341 // and \P{Han} for its negation. 342- NeverNL = 1<<12, // Never match NL, even if the regexp mentions 343+ NeverNL = 1<<11, // Never match NL, even if the regexp mentions 344 // it explicitly. 345 346 // As close to Perl as we can get. 347==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ==== 348re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139 349 cap_[0] = p; 350 if (Visit(prog_->start(), p)) // Match must be leftmost; done. 351 return true; 352- if (prog_->flags() & Regexp::UCS2) 353- p++; 354 } 355 return false; 356 } 357==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ==== 358re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152 359 static ParseMode parse_modes[] = { 360 { single_line, "single-line" }, 361 { single_line|Regexp::Latin1, "single-line, latin1" }, 362- { single_line|Regexp::UCS2, "single-line, ucs2" }, 363 { multi_line, "multiline" }, 364 { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, 365 { multi_line|Regexp::Latin1, "multiline, latin1" }, 366- { multi_line|Regexp::UCS2, "multiline, ucs2" }, 367 }; 368 369 static string FormatMode(Regexp::ParseFlags flags) { 370re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185 371 RegexpStatus status; 372 regexp_ = Regexp::Parse(regexp_str, flags, &status); 373 if (regexp_ == NULL) { 374- if (status.code() != kRegexpUnsupported) { 375- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) 376- << " mode: " << FormatMode(flags); 377- error_ = true; 378- } 379+ LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) 380+ << " mode: " << FormatMode(flags); 381+ error_ = true; 382 return; 383 } 384 prog_ = regexp_->CompileToProg(0); 385re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231 386 RE2::Options options; 387 if (flags & Regexp::Latin1) 388 options.set_encoding(RE2::Options::EncodingLatin1); 389- else if (flags & Regexp::UCS2) 390- options.set_encoding(RE2::Options::EncodingUCS2); 391 if (kind_ == Prog::kLongestMatch) 392 options.set_longest_match(true); 393 re2_ = new RE2(re, options); 394re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280 395 delete re2_; 396 } 397 398- // Converts UTF-8 string in text into UCS-2 string in new_text. 399- static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) { 400- const char* p = text.begin(); 401- const char* ep = text.end(); 402- uint16* q = new uint16[ep - p]; 403- uint16* q0 = q; 404- 405- int n; 406- Rune r; 407- for (; p < ep; p += n) { 408- if (!fullrune(p, ep - p)) { 409- delete[] q0; 410- return false; 411- } 412- n = chartorune(&r, p); 413- if (r > 0xFFFF) { 414- delete[] q0; 415- return false; 416- } 417- *q++ = r; 418- } 419- *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0)); 420- return true; 421- } 422- 423- // Rewrites *sp from being a pointer into text8 (UTF-8) 424- // to being a pointer into text16 (equivalent text but in UCS-2). 425- static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16, 426- StringPiece *sp) { 427- if (sp->begin() == NULL && text8.begin() != NULL) 428- return; 429- 430- int nrune = 0; 431- int n; 432- Rune r; 433- const char* p = text8.begin(); 434- const char* ep = text8.end(); 435- const char* spbegin = NULL; 436- const char* spend = NULL; 437- for (;;) { 438- if (p == sp->begin()) 439- spbegin = text16.begin() + sizeof(uint16)*nrune; 440- if (p == sp->end()) 441- spend = text16.begin() + sizeof(uint16)*nrune; 442- if (p >= ep) 443- break; 444- n = chartorune(&r, p); 445- p += n; 446- nrune++; 447- } 448- if (spbegin == NULL || spend == NULL) { 449- LOG(FATAL) << "Error in AdjustUTF8ToUCS2 " 450- << CEscape(text8) << " " 451- << (int)(sp->begin() - text8.begin()) << " " 452- << (int)(sp->end() - text8.begin()); 453- } 454- *sp = StringPiece(spbegin, spend - spbegin); 455- } 456- 457- // Rewrites *sp from begin a pointer into text16 (UCS-2) 458- // to being a pointer into text8 (equivalent text but in UTF-8). 459- static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8, 460- StringPiece* sp) { 461- if (sp->begin() == NULL) 462- return; 463- 464- int nrune = 0; 465- int n; 466- Rune r; 467- const char* p = text8.begin(); 468- const char* ep = text8.end(); 469- const char* spbegin = NULL; 470- const char* spend = NULL; 471- for (;;) { 472- if (nrune == (sp->begin() - text16.begin())/2) 473- spbegin = p; 474- if (nrune == (sp->end() - text16.begin())/2) 475- spend = p; 476- if (p >= ep) 477- break; 478- n = chartorune(&r, p); 479- p += n; 480- nrune++; 481- } 482- if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) { 483- LOG(FATAL) << "Error in AdjustUCS2ToUTF8 " 484- << CEscape(text16) << " " 485- << (int)(sp->begin() - text16.begin()) << " " 486- << (int)(sp->end() - text16.begin()); 487- } 488- *sp = StringPiece(spbegin, spend - spbegin); 489- } 490- 491 // Runs a single search using the named engine type. 492 // This interface hides all the irregularities of the various 493 // engine interfaces from the rest of this file. 494re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300 495 496 StringPiece text = orig_text; 497 StringPiece context = orig_context; 498- bool ucs2 = false; 499 500- if ((flags() & Regexp::UCS2) && type != kEnginePCRE) { 501- if (!ConvertUTF8ToUCS2(orig_context, &context)) { 502- result->skipped = true; 503- return; 504- } 505- 506- // Rewrite context to refer to new text. 507- AdjustUTF8ToUCS2(orig_context, context, &text); 508- ucs2 = true; 509- } 510- 511 switch (type) { 512 default: 513 LOG(FATAL) << "Bad RunSearch type: " << (int)type; 514re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451 515 } 516 } 517 518- // If we did UCS-2 matching, rewrite the matches to refer 519- // to the original UTF-8 text. 520- if (ucs2) { 521- if (result->matched) { 522- if (result->have_submatch0) { 523- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]); 524- } else if (result->have_submatch) { 525- for (int i = 0; i < nsubmatch; i++) { 526- AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]); 527- } 528- } 529- } 530- delete[] context.begin(); 531- } 532- 533 if (!result->matched) 534 memset(result->submatch, 0, sizeof result->submatch); 535 } 536re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475 537 return true; 538 } 539 540- // Check whether text uses only Unicode points <= 0xFFFF 541- // (in the BMP). 542- static bool IsBMP(const StringPiece& text) { 543- const char* p = text.begin(); 544- const char* ep = text.end(); 545- while (p < ep) { 546- if (!fullrune(p, ep - p)) 547- return false; 548- Rune r; 549- p += chartorune(&r, p); 550- if (r > 0xFFFF) 551- return false; 552- } 553- return true; 554- } 555- 556 // Runs a single test. 557 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, 558 Prog::Anchor anchor) { 559re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483 560 Result correct; 561 RunSearch(kEngineBacktrack, text, context, anchor, &correct); 562 if (correct.skipped) { 563- if (regexp_ == NULL || !IsBMP(context)) // okay to skip in UCS-2 mode 564+ if (regexp_ == NULL) 565 return true; 566 LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) 567 << " " << FormatMode(flags_); 568