• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>

#include "src/base/strings.h"
#include "src/regexp/special-case.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 static const base::uc32 kSurrogateStart = 0xd800;
17 static const base::uc32 kSurrogateEnd = 0xdfff;
18 static const base::uc32 kNonBmpStart = 0x10000;
19 
20 // The following code generates "src/regexp/special-case.cc".
PrintSet(std::ofstream & out,const char * name,const icu::UnicodeSet & set)21 void PrintSet(std::ofstream& out, const char* name,
22               const icu::UnicodeSet& set) {
23   out << "icu::UnicodeSet Build" << name << "() {\n"
24       << "  icu::UnicodeSet set;\n";
25   for (int32_t i = 0; i < set.getRangeCount(); i++) {
26     if (set.getRangeStart(i) == set.getRangeEnd(i)) {
27       out << "  set.add(0x" << set.getRangeStart(i) << ");\n";
28     } else {
29       out << "  set.add(0x" << set.getRangeStart(i) << ", 0x"
30           << set.getRangeEnd(i) << ");\n";
31     }
32   }
33   out << "  set.freeze();\n"
34       << "  return set;\n"
35       << "}\n\n";
36 
37   out << "struct " << name << "Data {\n"
38       << "  " << name << "Data() : set(Build" << name << "()) {}\n"
39       << "  const icu::UnicodeSet set;\n"
40       << "};\n\n";
41 
42   out << "//static\n"
43       << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
44       << "  static base::LazyInstance<" << name << "Data>::type set =\n"
45       << "      LAZY_INSTANCE_INITIALIZER;\n"
46       << "  return set.Pointer()->set;\n"
47       << "}\n\n";
48 }
49 
PrintSpecial(std::ofstream & out)50 void PrintSpecial(std::ofstream& out) {
51   icu::UnicodeSet current;
52   icu::UnicodeSet special_add;
53   icu::UnicodeSet ignore;
54   UErrorCode status = U_ZERO_ERROR;
55   icu::UnicodeSet upper("[\\p{Lu}]", status);
56   CHECK(U_SUCCESS(status));
57 
58   // Iterate through all chars in BMP except surrogates.
59   for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
60     if (i >= static_cast<UChar32>(kSurrogateStart) &&
61         i <= static_cast<UChar32>(kSurrogateEnd)) {
62       continue;  // Ignore surrogate range
63     }
64     current.set(i, i);
65     current.closeOver(USET_CASE_INSENSITIVE);
66 
67     // Check to see if all characters in the case-folding equivalence
68     // class as defined by UnicodeSet::closeOver all map to the same
69     // canonical value.
70     UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
71     bool class_has_matching_canonical_char = false;
72     bool class_has_non_matching_canonical_char = false;
73     for (int32_t j = 0; j < current.getRangeCount(); j++) {
74       for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
75            c++) {
76         if (c == i) {
77           continue;
78         }
79         UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
80         if (canonical == other_canonical) {
81           class_has_matching_canonical_char = true;
82         } else {
83           class_has_non_matching_canonical_char = true;
84         }
85       }
86     }
87     // If any other character in i's equivalence class has a
88     // different canonical value, then i needs special handling.  If
89     // no other character shares a canonical value with i, we can
90     // ignore i when adding alternatives for case-independent
91     // comparison.  If at least one other character shares a
92     // canonical value, then i needs special handling.
93     if (class_has_non_matching_canonical_char) {
94       if (class_has_matching_canonical_char) {
95         special_add.add(i);
96       } else {
97         ignore.add(i);
98       }
99     }
100   }
101 
102   // Verify that no Unicode equivalence class contains two non-trivial
103   // JS equivalence classes. Every character in SpecialAddSet has the
104   // same canonical value as every other non-IgnoreSet character in
105   // its Unicode equivalence class. Therefore, if we call closeOver on
106   // a set containing no IgnoreSet characters, the only characters
107   // that must be removed from the result are in IgnoreSet. This fact
108   // is used in CharacterRange::AddCaseEquivalents.
109   for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
110     for (UChar32 c = special_add.getRangeStart(i);
111          c <= special_add.getRangeEnd(i); c++) {
112       UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
113       current.set(c, c);
114       current.closeOver(USET_CASE_INSENSITIVE);
115       current.removeAll(ignore);
116       for (int32_t j = 0; j < current.getRangeCount(); j++) {
117         for (UChar32 c2 = current.getRangeStart(j);
118              c2 <= current.getRangeEnd(j); c2++) {
119           CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
120         }
121       }
122     }
123   }
124 
125   PrintSet(out, "IgnoreSet", ignore);
126   PrintSet(out, "SpecialAddSet", special_add);
127 }
128 
WriteHeader(const char * header_filename)129 void WriteHeader(const char* header_filename) {
130   std::ofstream out(header_filename);
131   out << std::hex << std::setfill('0') << std::setw(4);
132   out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
133       << "// Use of this source code is governed by a BSD-style license that\n"
134       << "// can be found in the LICENSE file.\n\n"
135       << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
136       << "// The following functions are used to build UnicodeSets\n"
137       << "// for special cases where the case-folding algorithm used by\n"
138       << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
139       << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
140       << "// Semantics: Canonicalize) step 3.\n\n"
141       << "#ifdef V8_INTL_SUPPORT\n"
142       << "#include \"src/base/lazy-instance.h\"\n\n"
143       << "#include \"src/regexp/special-case.h\"\n\n"
144       << "#include \"unicode/uniset.h\"\n"
145       << "namespace v8 {\n"
146       << "namespace internal {\n\n";
147 
148   PrintSpecial(out);
149 
150   out << "\n"
151       << "}  // namespace internal\n"
152       << "}  // namespace v8\n"
153       << "#endif  // V8_INTL_SUPPORT\n";
154 }
155 
156 }  // namespace internal
157 }  // namespace v8
158 
main(int argc,const char ** argv)159 int main(int argc, const char** argv) {
160   if (argc != 2) {
161     std::cerr << "Usage: " << argv[0] << " <output filename>\n";
162     std::exit(1);
163   }
164   v8::internal::WriteHeader(argv[1]);
165 
166   return 0;
167 }
168