1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 */ 11 12 // file name: genregexcasing.cpp 13 // 14 // Program to generate the casing data for use by ICU regular expressions. 15 // The data declarations output when running this program are to be copied 16 // into the file i18n/regexcmp.h 17 // 18 // See the function RegexCompile::findCaseInsensitiveStarters() for more explanation. 19 20 #include "unicode/uniset.h" 21 #include "unicode/usetiter.h" 22 #include "iostream" 23 #include <map> 24 #include <set> 25 #include <string> 26 #include <vector> 27 28 using namespace std; 29 sstring(const UnicodeString & us)30std::string sstring(const UnicodeString &us) { 31 string retString; 32 us.toUTF8String(retString); 33 return retString; 34 } 35 main()36int main() { 37 38 std::map<UChar32, std::set<UChar32>> cmap; 39 40 for (UChar32 cp = 0; cp<=0x10ffff; cp++) { 41 UnicodeSet s(cp, cp); 42 s.closeOver(USET_CASE_INSENSITIVE); 43 44 UnicodeSetIterator setIter(s); 45 while (setIter.next()) { 46 if (!setIter.isString()) { 47 continue; 48 } 49 const UnicodeString &str = setIter.getString(); 50 51 cout << "Got a string for \"" << sstring(UnicodeString(cp)) << "\" [\\u" << hex << cp << "]\n"; 52 cout << " \"" << sstring(str) << "\" ["; 53 for (int32_t j=0; j<str.length(); j=str.moveIndex32(j, 1)) { 54 cout << hex << "\\u" << str.char32At(j) << " "; 55 } 56 cout << "]" << endl; 57 UChar32 c32 = str.char32At(0); 58 if (s.contains(c32)) { 59 cout << " Set contains first char.\n"; 60 } 61 cmap[c32].insert(cp); 62 } 63 } 64 65 66 std::cout << "Iterating the map.\n"; 67 for (const auto &mapPair: cmap) { 68 UChar32 cp = mapPair.first; 69 std::cout << "key: \"" << sstring(UnicodeString(cp)) << "\" \\u" << cp << " : ["; 70 for (UChar32 valCP: mapPair.second) { 71 std::cout << "\"" << sstring(UnicodeString(valCP)) << "\" \\u" << valCP << " "; 72 } 73 std::cout << "]\n"; 74 } 75 76 // 77 // Create the data arrays to be pasted into regexcmp.cpp 78 // 79 80 std::cout << "\n\nCopy the lines below into the file i18n/regexcmp.cpp.\n\n"; 81 std::cout << "// Machine Generated Data. Do not hand edit.\n"; 82 83 UnicodeString outString; 84 struct Item { 85 UChar32 fCP = 0; 86 int16_t fStrIndex = 0; 87 int16_t fCount = 0; 88 }; 89 90 std::vector<Item> data; 91 for (const auto &mapPair: cmap) { 92 Item dataForCP; 93 dataForCP.fCP = mapPair.first; 94 dataForCP.fStrIndex = outString.length(); 95 for (UChar32 valCP: mapPair.second) { 96 outString.append(valCP); 97 dataForCP.fCount++; 98 } 99 data.push_back(dataForCP); 100 } 101 102 std::cout << " static const UChar32 RECaseFixCodePoints[] = {" ; 103 int items=0; 104 for (const Item &d: data) { 105 if (items++ % 10 == 0) { 106 std::cout << "\n "; 107 } 108 std::cout << "0x" << d.fCP << ", "; 109 } 110 std::cout << "0x110000};\n\n"; 111 112 std::cout << " static const int16_t RECaseFixStringOffsets[] = {"; 113 items = 0; 114 for (const Item &d: data) { 115 if (items++ % 10 == 0) { 116 std::cout << "\n "; 117 } 118 std::cout << "0x" << d.fStrIndex << ", "; 119 } 120 std::cout << "0};\n\n"; 121 122 std::cout << " static const int16_t RECaseFixCounts[] = {"; 123 items = 0; 124 for (const Item &d: data) { 125 if (items++ % 10 == 0) { 126 std::cout << "\n "; 127 } 128 std::cout << "0x" << d.fCount << ", "; 129 } 130 std::cout << "0};\n\n"; 131 132 std::cout << " static const UChar RECaseFixData[] = {"; 133 for (int i=0; i<outString.length(); i++) { 134 if (i % 10 == 0) { 135 std::cout << "\n "; 136 } 137 std::cout << "0x" << outString.charAt(i) << ", "; 138 } 139 std::cout << "0};\n\n"; 140 return 0; 141 } 142 143