// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_REGEXP_SPECIAL_CASE_H_
#define V8_REGEXP_SPECIAL_CASE_H_

#ifdef V8_INTL_SUPPORT
#include "src/base/logging.h"
#include "src/common/globals.h"

#include "unicode/locid.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"

namespace v8 {
namespace internal {

// Sets of Unicode characters that need special handling under "i" mode

// For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
// defines slightly different case-folding rules than Unicode. An
// input character should match a pattern character if the result of
// the Canonicalize algorithm is the same for both characters.
//
// Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
// c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
// string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
// https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
// the precise definition.
//
// While compiling such regular expressions, we need to compute the
// set of characters that should match a given input character. (See
// GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
// For almost all characters, this can be efficiently computed using
// UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
// the remaining special cases.
//
// For a character c, the rules are as follows:
//
// 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
//    UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
//    containing c will produce the set of characters that should
//    match /c/i (or /[c]/i), and only those characters.
//
// 2. If c is in IgnoreSet, then the only character it should match is
//    itself.
//    However, closeOver will add additional incorrect
//    matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
//    (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
//    "SS". Step 3.e therefore requires that 'ß' canonicalizes to
//    itself, and should not match 'ẞ'. In these cases, we can skip
//    the closeOver entirely, because it will never add an equivalent
//    character.
//
// 3. If c is in SpecialAddSet, then it should match at least one
//    character other than itself. However, closeOver will add at
//    least one additional incorrect match. For example, consider the
//    letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
//    K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
//    SIGN should not match either of the other two characters. As a
//    result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
//    IgnoreSet). To find the correct matches for characters in
//    SpecialAddSet, we closeOver the original character, but filter
//    out the results that do not have the same canonical value.
//
// The contents of these sets are calculated at build time by
// src/regexp/gen-regexp-special-case.cc, which generates
// gen/src/regexp/special-case.cc. This is done by iterating over the
// result of closeOver for each BMP character, and finding sets for
// which at least one character has a different canonical value than
// another character. Characters that match no other characters in
// their equivalence class are added to IgnoreSet. Characters that
// match at least one other character are added to SpecialAddSet.
74 75 class RegExpCaseFolding final : public AllStatic { 76 public: 77 static const icu::UnicodeSet& IgnoreSet(); 78 static const icu::UnicodeSet& SpecialAddSet(); 79 80 // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics: 81 // Canonicalize) step 3, which is used to determine whether 82 // characters match when ignoreCase is true and unicode is false. Canonicalize(UChar32 ch)83 static UChar32 Canonicalize(UChar32 ch) { 84 // a. Assert: ch is a UTF-16 code unit. 85 CHECK_LE(ch, 0xffff); 86 87 // b. Let s be the String value consisting of the single code unit ch. 88 icu::UnicodeString s(ch); 89 90 // c. Let u be the same result produced as if by performing the algorithm 91 // for String.prototype.toUpperCase using s as the this value. 92 // d. Assert: Type(u) is String. 93 icu::UnicodeString& u = s.toUpper(); 94 95 // e. If u does not consist of a single code unit, return ch. 96 if (u.length() != 1) { 97 return ch; 98 } 99 100 // f. Let cu be u's single code unit element. 101 UChar32 cu = u.char32At(0); 102 103 // g. If the value of ch >= 128 and the value of cu < 128, return ch. 104 if (ch >= 128 && cu < 128) { 105 return ch; 106 } 107 108 // h. Return cu. 109 return cu; 110 } 111 }; 112 113 } // namespace internal 114 } // namespace v8 115 116 #endif // V8_INTL_SUPPORT 117 118 #endif // V8_REGEXP_SPECIAL_CASE_H_ 119