• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 */
11 
12 // file name: genregexcasing.cpp
13 //
14 // Program to generate the casing data for use by ICU regular expressions.
15 // The data declarations output when running this program are to be copied
16 // into the file i18n/regexcmp.cpp
17 //
18 // See the function RegexCompile::findCaseInsensitiveStarters() for more explanation.
19 
20 #include "unicode/uniset.h"
21 #include "unicode/usetiter.h"
22 #include "iostream"
23 #include <map>
24 #include <set>
25 #include <string>
26 #include <vector>
27 
28 using namespace std;
29 
sstring(const UnicodeString & us)30 std::string sstring(const UnicodeString &us) {
31     string retString;
32     us.toUTF8String(retString);
33     return retString;
34 }
35 
main()36 int main() {
37 
38     std::map<UChar32, std::set<UChar32>> cmap;
39 
40     for (UChar32 cp = 0; cp<=0x10ffff; cp++) {
41         UnicodeSet s(cp, cp);
42         s.closeOver(USET_CASE_INSENSITIVE);
43 
44         UnicodeSetIterator setIter(s);
45         while (setIter.next()) {
46             if (!setIter.isString()) {
47                 continue;
48             }
49             const UnicodeString &str = setIter.getString();
50 
51             cout << "Got a string for \"" << sstring(UnicodeString(cp)) << "\" [\\u" << hex << cp << "]\n";
52             cout << "    \"" << sstring(str) << "\"    [";
53             for (int32_t j=0; j<str.length(); j=str.moveIndex32(j, 1)) {
54                 cout << hex << "\\u" << str.char32At(j) << " ";
55             }
56             cout << "]" << endl;
57             UChar32 c32 = str.char32At(0);
58             if (s.contains(c32)) {
59                 cout << "    Set contains first char.\n";
60             }
61             cmap[c32].insert(cp);
62         }
63     }
64 
65 
66     std::cout << "Iterating the map.\n";
67     for (const auto &mapPair: cmap) {
68         UChar32 cp = mapPair.first;
69         std::cout << "key: \"" << sstring(UnicodeString(cp)) << "\"  \\u" << cp << " :  [";
70         for (UChar32 valCP: mapPair.second) {
71            std::cout << "\"" << sstring(UnicodeString(valCP)) << "\" \\u" << valCP << " ";
72         }
73         std::cout << "]\n";
74     }
75 
76     //
77     // Create the data arrays to be pasted into regexcmp.cpp
78     //
79 
80     std::cout << "\n\nCopy the lines below into the file i18n/regexcmp.cpp.\n\n";
81     std::cout << "// Machine Generated Data. Do not hand edit.\n";
82 
83     UnicodeString outString;
84     struct Item {
85         UChar32  fCP = 0;
86         int16_t  fStrIndex = 0;
87         int16_t  fCount = 0;
88     };
89 
90     std::vector<Item> data;
91     for (const auto &mapPair: cmap) {
92         Item   dataForCP;
93         dataForCP.fCP = mapPair.first;
94         dataForCP.fStrIndex = outString.length();
95         for (UChar32 valCP: mapPair.second) {
96             outString.append(valCP);
97             dataForCP.fCount++;
98         }
99         data.push_back(dataForCP);
100     }
101 
102     std::cout << "    static const UChar32 RECaseFixCodePoints[] = {" ;
103     int items=0;
104     for (const Item &d: data) {
105         if (items++ % 10 == 0) {
106             std::cout << "\n        ";
107         }
108         std::cout << "0x" << d.fCP << ", ";
109     }
110     std::cout << "0x110000};\n\n";
111 
112     std::cout << "    static const int16_t RECaseFixStringOffsets[] = {";
113     items = 0;
114     for (const Item &d: data) {
115         if (items++ % 10 == 0) {
116             std::cout << "\n        ";
117         }
118         std::cout << "0x" << d.fStrIndex << ", ";
119     }
120     std::cout << "0};\n\n";
121 
122     std::cout << "    static const int16_t RECaseFixCounts[] = {";
123     items = 0;
124     for (const Item &d: data) {
125         if (items++ % 10 == 0) {
126             std::cout << "\n        ";
127         }
128         std::cout << "0x" << d.fCount << ", ";
129     }
130     std::cout << "0};\n\n";
131 
132     std::cout << "    static const UChar RECaseFixData[] = {";
133     for (int i=0; i<outString.length(); i++) {
134         if (i % 10 == 0) {
135             std::cout << "\n        ";
136         }
137         std::cout << "0x" << outString.charAt(i) << ", ";
138     }
139     std::cout << "0};\n\n";
140     return 0;
141 }
142 
143