1 // Copyright 2020 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
9
10 #include "src/base/strings.h"
11 #include "src/regexp/special-case.h"
12
13 namespace v8 {
14 namespace internal {
15
16 static const base::uc32 kSurrogateStart = 0xd800;
17 static const base::uc32 kSurrogateEnd = 0xdfff;
18 static const base::uc32 kNonBmpStart = 0x10000;
19
20 // The following code generates "src/regexp/special-case.cc".
PrintSet(std::ofstream & out,const char * name,const icu::UnicodeSet & set)21 void PrintSet(std::ofstream& out, const char* name,
22 const icu::UnicodeSet& set) {
23 out << "icu::UnicodeSet Build" << name << "() {\n"
24 << " icu::UnicodeSet set;\n";
25 for (int32_t i = 0; i < set.getRangeCount(); i++) {
26 if (set.getRangeStart(i) == set.getRangeEnd(i)) {
27 out << " set.add(0x" << set.getRangeStart(i) << ");\n";
28 } else {
29 out << " set.add(0x" << set.getRangeStart(i) << ", 0x"
30 << set.getRangeEnd(i) << ");\n";
31 }
32 }
33 out << " set.freeze();\n"
34 << " return set;\n"
35 << "}\n\n";
36
37 out << "struct " << name << "Data {\n"
38 << " " << name << "Data() : set(Build" << name << "()) {}\n"
39 << " const icu::UnicodeSet set;\n"
40 << "};\n\n";
41
42 out << "//static\n"
43 << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
44 << " static base::LazyInstance<" << name << "Data>::type set =\n"
45 << " LAZY_INSTANCE_INITIALIZER;\n"
46 << " return set.Pointer()->set;\n"
47 << "}\n\n";
48 }
49
PrintSpecial(std::ofstream & out)50 void PrintSpecial(std::ofstream& out) {
51 icu::UnicodeSet current;
52 icu::UnicodeSet special_add;
53 icu::UnicodeSet ignore;
54 UErrorCode status = U_ZERO_ERROR;
55 icu::UnicodeSet upper("[\\p{Lu}]", status);
56 CHECK(U_SUCCESS(status));
57
58 // Iterate through all chars in BMP except surrogates.
59 for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
60 if (i >= static_cast<UChar32>(kSurrogateStart) &&
61 i <= static_cast<UChar32>(kSurrogateEnd)) {
62 continue; // Ignore surrogate range
63 }
64 current.set(i, i);
65 current.closeOver(USET_CASE_INSENSITIVE);
66
67 // Check to see if all characters in the case-folding equivalence
68 // class as defined by UnicodeSet::closeOver all map to the same
69 // canonical value.
70 UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
71 bool class_has_matching_canonical_char = false;
72 bool class_has_non_matching_canonical_char = false;
73 for (int32_t j = 0; j < current.getRangeCount(); j++) {
74 for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
75 c++) {
76 if (c == i) {
77 continue;
78 }
79 UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
80 if (canonical == other_canonical) {
81 class_has_matching_canonical_char = true;
82 } else {
83 class_has_non_matching_canonical_char = true;
84 }
85 }
86 }
87 // If any other character in i's equivalence class has a
88 // different canonical value, then i needs special handling. If
89 // no other character shares a canonical value with i, we can
90 // ignore i when adding alternatives for case-independent
91 // comparison. If at least one other character shares a
92 // canonical value, then i needs special handling.
93 if (class_has_non_matching_canonical_char) {
94 if (class_has_matching_canonical_char) {
95 special_add.add(i);
96 } else {
97 ignore.add(i);
98 }
99 }
100 }
101
102 // Verify that no Unicode equivalence class contains two non-trivial
103 // JS equivalence classes. Every character in SpecialAddSet has the
104 // same canonical value as every other non-IgnoreSet character in
105 // its Unicode equivalence class. Therefore, if we call closeOver on
106 // a set containing no IgnoreSet characters, the only characters
107 // that must be removed from the result are in IgnoreSet. This fact
108 // is used in CharacterRange::AddCaseEquivalents.
109 for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
110 for (UChar32 c = special_add.getRangeStart(i);
111 c <= special_add.getRangeEnd(i); c++) {
112 UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
113 current.set(c, c);
114 current.closeOver(USET_CASE_INSENSITIVE);
115 current.removeAll(ignore);
116 for (int32_t j = 0; j < current.getRangeCount(); j++) {
117 for (UChar32 c2 = current.getRangeStart(j);
118 c2 <= current.getRangeEnd(j); c2++) {
119 CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
120 }
121 }
122 }
123 }
124
125 PrintSet(out, "IgnoreSet", ignore);
126 PrintSet(out, "SpecialAddSet", special_add);
127 }
128
WriteHeader(const char * header_filename)129 void WriteHeader(const char* header_filename) {
130 std::ofstream out(header_filename);
131 out << std::hex << std::setfill('0') << std::setw(4);
132 out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
133 << "// Use of this source code is governed by a BSD-style license that\n"
134 << "// can be found in the LICENSE file.\n\n"
135 << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
136 << "// The following functions are used to build UnicodeSets\n"
137 << "// for special cases where the case-folding algorithm used by\n"
138 << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
139 << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
140 << "// Semantics: Canonicalize) step 3.\n\n"
141 << "#ifdef V8_INTL_SUPPORT\n"
142 << "#include \"src/base/lazy-instance.h\"\n\n"
143 << "#include \"src/regexp/special-case.h\"\n\n"
144 << "#include \"unicode/uniset.h\"\n"
145 << "namespace v8 {\n"
146 << "namespace internal {\n\n";
147
148 PrintSpecial(out);
149
150 out << "\n"
151 << "} // namespace internal\n"
152 << "} // namespace v8\n"
153 << "#endif // V8_INTL_SUPPORT\n";
154 }
155
156 } // namespace internal
157 } // namespace v8
158
main(int argc,const char ** argv)159 int main(int argc, const char** argv) {
160 if (argc != 2) {
161 std::cerr << "Usage: " << argv[0] << " <output filename>\n";
162 std::exit(1);
163 }
164 v8::internal::WriteHeader(argv[1]);
165
166 return 0;
167 }
168