/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
package ohos.global.icu.impl;

import static ohos.global.icu.impl.number.parse.ParsingUtils.safeContains;

import java.util.EnumMap;
import java.util.Map;

import ohos.global.icu.impl.UResource.Value;
import ohos.global.icu.text.UnicodeSet;
import ohos.global.icu.util.ULocale;
import ohos.global.icu.util.UResourceBundle;

/**
 * This class statically initializes UnicodeSets, originally built for number parsing. Microbenchmarks
 * show this to bring a very sizeable performance boost.
 *
 * IMPORTANT ASSUMPTION FOR NUMBER PARSING: All of the sets contain code points (no strings) and they are
 * all case-folded. If this assumption were ever broken, logic in classes such as SymbolMatcher would
 * need to be updated in order to return well-formed sets upon calls to getLeadCodePoints().
 *
 * @author sffc
 * @hide exposed on OHOS
 */
public class StaticUnicodeSets {
    /**
     * Identifiers for the sets held by this class; pass one to {@link #get(Key)}.
     *
     * @hide exposed on OHOS
     */
    public enum Key {
        EMPTY,
        // Ignorables
        DEFAULT_IGNORABLES,
        STRICT_IGNORABLES,

        // Separators
        // Notes:
        // - COMMA is a superset of STRICT_COMMA
        // - PERIOD is a superset of STRICT_PERIOD
        // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
        // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and
        //   OTHER_GROUPING_SEPARATORS
        COMMA,
        PERIOD,
        STRICT_COMMA,
        STRICT_PERIOD,
        APOSTROPHE_SIGN,
        OTHER_GROUPING_SEPARATORS,
        ALL_SEPARATORS,
        STRICT_ALL_SEPARATORS,

        // Symbols
        // TODO: NaN?
        MINUS_SIGN,
        PLUS_SIGN,
        PERCENT_SIGN,
        PERMILLE_SIGN,
        INFINITY_SIGN,

        // Currency Symbols
        DOLLAR_SIGN,
        POUND_SIGN,
        RUPEE_SIGN,
        YEN_SIGN,
        WON_SIGN,

        // Other
        DIGITS,

        // Combined Separators with Digits (for lead code points)
        DIGITS_OR_ALL_SEPARATORS,
        DIGITS_OR_STRICT_ALL_SEPARATORS,
    }

    /** Currency keys in the order in which {@link #chooseCurrency(String)} probes them. */
    private static final Key[] CURRENCY_KEYS = {
        Key.DOLLAR_SIGN, Key.POUND_SIGN, Key.RUPEE_SIGN, Key.YEN_SIGN, Key.WON_SIGN,
    };

    /** Backing store for all sets; written only by the static initializer and the data sink. */
    private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<>(Key.class);

    /**
     * Gets the static-allocated UnicodeSet according to the provided key.
     *
     * @param key
     *            The desired UnicodeSet according to the enum in this file.
     * @return The requested UnicodeSet. Guaranteed to be frozen and non-null, but may be empty if an
     *         error occurred during data loading.
     */
    public static UnicodeSet get(Key key) {
        UnicodeSet candidate = unicodeSets.get(key);
        // A key that was never populated maps to the shared empty set rather than null.
        return candidate == null ? UnicodeSet.EMPTY : candidate;
    }

    /**
     * Checks if the UnicodeSet given by key1 contains the given string.
     *
     * @param str
     *            The string to check.
     * @param key1
     *            The set to check.
     * @return key1 if the set contains str, or null if not.
     */
    public static Key chooseFrom(String str, Key key1) {
        return safeContains(get(key1), str) ? key1 : null;
    }

    /**
     * Checks if the UnicodeSet given by either key1 or key2 contains the string. key1 is tested
     * first, so it wins when both sets contain str.
     *
     * @param str
     *            The string to check.
     * @param key1
     *            The first set to check.
     * @param key2
     *            The second set to check.
     * @return key1 if that set contains str; key2 if that set contains str; or null if neither set
     *         contains str.
     */
    public static Key chooseFrom(String str, Key key1, Key key2) {
        return safeContains(get(key1), str) ? key1 : chooseFrom(str, key2);
    }

    /**
     * Looks through all Currency-related sets for the given string, returning the first match or null
     * if no match was found. The sets are probed in the order DOLLAR_SIGN, POUND_SIGN, RUPEE_SIGN,
     * YEN_SIGN, WON_SIGN.
     *
     * @param str
     *            The string to check.
     * @return The key of the first currency set containing str, or null if none does.
     */
    public static Key chooseCurrency(String str) {
        for (Key currencyKey : CURRENCY_KEYS) {
            // Goes through safeContains, like chooseFrom, so that every lookup in this class
            // treats its input the same way.
            // NOTE(review): safeContains is expected to reject empty strings before delegating
            // to UnicodeSet.contains -- confirm against ParsingUtils.
            if (safeContains(get(currencyKey), str)) {
                return currencyKey;
            }
        }
        return null;
    }

    /** Returns a new frozen set holding the union of the sets registered under k1 and k2. */
    private static UnicodeSet computeUnion(Key k1, Key k2) {
        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
    }

    /** Returns a new frozen set holding the union of the sets registered under k1, k2 and k3. */
    private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) {
        return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
    }

    /**
     * Parses the pattern into a frozen UnicodeSet and registers it under key. Each key is expected
     * to be saved at most once; this is checked only when assertions are enabled.
     */
    private static void saveSet(Key key, String unicodeSetPattern) {
        assert unicodeSets.get(key) == null;
        unicodeSets.put(key, new UnicodeSet(unicodeSetPattern).freeze());
    }

    /*
    parse{
        date{
            lenient{
                "[\\--/]",
                "[\\:∶]",
            }
        }
        general{
            lenient{
                "[.․。︒﹒.。]",
                "[\$﹩$$]",
                "[£₤]",
                "[₨₹{Rp}{Rs}]",
            }
        }
        number{
            lenient{
                "[\\-‒⁻₋−➖﹣-]",
                "[,،٫、︐︑﹐﹑,、]",
                "[+⁺₊➕﬩﹢+]",
            }
            stricter{
                "[,٫︐﹐,]",
                "[.․﹒.。]",
            }
        }
    }
    */
    /**
     * Resource sink that walks the "parse" table (layout sketched in the comment above) and
     * registers every pattern it finds under the matching {@link Key}. The "date" context is
     * skipped.
     */
    static class ParseDataSink extends UResource.Sink {
        @Override
        public void put(ohos.global.icu.impl.UResource.Key key, Value value, boolean noFallback) {
            UResource.Table contextsTable = value.getTable();
            for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
                if (key.contentEquals("date")) {
                    // Date lenients are not needed by this class.
                    continue;
                }
                assert key.contentEquals("general") || key.contentEquals("number");
                UResource.Table strictnessTable = value.getTable();
                for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
                    // Any strictness name other than "lenient" is treated as the strict variant.
                    boolean isLenient = key.contentEquals("lenient");
                    UResource.Array array = value.getArray();
                    for (int k = 0; k < array.getSize(); k++) {
                        array.getValue(k, value);
                        String str = value.toString();
                        saveSet(keyForPattern(str, isLenient), str);
                    }
                }
            }
        }

        /**
         * Decides which key a pattern belongs to by looking for a representative character in the
         * pattern text. The tests are order-sensitive: the first representative found wins.
         *
         * @throws AssertionError if the pattern contains none of the known representatives.
         */
        private static Key keyForPattern(String str, boolean isLenient) {
            // There is both lenient and strict data for comma/period,
            // but not for any of the other symbols.
            if (str.indexOf('.') != -1) {
                return isLenient ? Key.PERIOD : Key.STRICT_PERIOD;
            } else if (str.indexOf(',') != -1) {
                return isLenient ? Key.COMMA : Key.STRICT_COMMA;
            } else if (str.indexOf('+') != -1) {
                return Key.PLUS_SIGN;
            } else if (str.indexOf('-') != -1) {
                return Key.MINUS_SIGN;
            } else if (str.indexOf('$') != -1) {
                return Key.DOLLAR_SIGN;
            } else if (str.indexOf('£') != -1) {
                return Key.POUND_SIGN;
            } else if (str.indexOf('₹') != -1) {
                return Key.RUPEE_SIGN;
            } else if (str.indexOf('¥') != -1) {
                return Key.YEN_SIGN;
            } else if (str.indexOf('₩') != -1) {
                return Key.WON_SIGN;
            } else if (str.indexOf('%') != -1) {
                return Key.PERCENT_SIGN;
            } else if (str.indexOf('‰') != -1) {
                return Key.PERMILLE_SIGN;
            } else if (str.indexOf('’') != -1) {
                return Key.APOSTROPHE_SIGN;
            } else {
                // TODO(ICU-20428): Make ICU automatically accept new classes?
                throw new AssertionError("Unknown class of parse lenients: " + str);
            }
        }
    }

    static {
        unicodeSets.put(Key.EMPTY, new UnicodeSet("[]").freeze());
        // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
        // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
        unicodeSets.put(Key.DEFAULT_IGNORABLES,
                new UnicodeSet("[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze());
        unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze());

        // CLDR provides data for comma, period, minus sign, and plus sign.
        ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
                .getBundleInstance(ICUData.ICU_BASE_NAME, ULocale.ROOT);
        rb.getAllItemsWithFallback("parse", new ParseDataSink());

        // NOTE: It is OK for these assertions to fail if there was a no-data build.
        assert unicodeSets.containsKey(Key.COMMA);
        assert unicodeSets.containsKey(Key.STRICT_COMMA);
        assert unicodeSets.containsKey(Key.PERIOD);
        assert unicodeSets.containsKey(Key.STRICT_PERIOD);
        assert unicodeSets.containsKey(Key.APOSTROPHE_SIGN);

        UnicodeSet otherGrouping = new UnicodeSet(
                "[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]");
        // Read through get() so that a missing apostrophe set contributes nothing instead of
        // handing null to addAll; computeUnion below resolves its inputs the same way.
        otherGrouping.addAll(get(Key.APOSTROPHE_SIGN));
        unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, otherGrouping.freeze());
        unicodeSets.put(Key.ALL_SEPARATORS,
                computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
        unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
                computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));

        assert unicodeSets.containsKey(Key.MINUS_SIGN);
        assert unicodeSets.containsKey(Key.PLUS_SIGN);
        assert unicodeSets.containsKey(Key.PERCENT_SIGN);
        assert unicodeSets.containsKey(Key.PERMILLE_SIGN);

        unicodeSets.put(Key.INFINITY_SIGN, new UnicodeSet("[∞]").freeze());

        assert unicodeSets.containsKey(Key.DOLLAR_SIGN);
        assert unicodeSets.containsKey(Key.POUND_SIGN);
        assert unicodeSets.containsKey(Key.RUPEE_SIGN);
        assert unicodeSets.containsKey(Key.YEN_SIGN);
        assert unicodeSets.containsKey(Key.WON_SIGN);

        unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());

        // The combined sets must be built last: they read ALL_SEPARATORS / STRICT_ALL_SEPARATORS.
        unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
        unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS,
                computeUnion(Key.DIGITS, Key.STRICT_ALL_SEPARATORS));
    }
}