• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2017 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 package ohos.global.icu.impl;
5 
6 import static ohos.global.icu.impl.number.parse.ParsingUtils.safeContains;
7 
8 import java.util.EnumMap;
9 import java.util.Map;
10 
11 import ohos.global.icu.impl.UResource.Value;
12 import ohos.global.icu.text.UnicodeSet;
13 import ohos.global.icu.util.ULocale;
14 import ohos.global.icu.util.UResourceBundle;
15 
16 /**
17  * This class statically initializes UnicodeSets, originally built for number parsing. Microbenchmarks
18  * show this to bring a very sizeable performance boost.
19  *
20  * IMPORTANT ASSUMPTION FOR NUMBER PARSING: All of the sets contain code points (no strings) and they are
21  * all case-folded. If this assumption were ever broken, logic in classes such as SymbolMatcher would
22  * need to be updated in order to return well-formed sets upon calls to getLeadCodePoints().
23  *
24  * @author sffc
25  * @hide exposed on OHOS
26  */
27 public class StaticUnicodeSets {
28     /**
29      * @hide exposed on OHOS
30      */
31     public static enum Key {
32         EMPTY,
33         // Ignorables
34         DEFAULT_IGNORABLES,
35         STRICT_IGNORABLES,
36 
37         // Separators
38         // Notes:
39         // - COMMA is a superset of STRICT_COMMA
40         // - PERIOD is a superset of SCRICT_PERIOD
41         // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
42         // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS
43         COMMA,
44         PERIOD,
45         STRICT_COMMA,
46         STRICT_PERIOD,
47         APOSTROPHE_SIGN,
48         OTHER_GROUPING_SEPARATORS,
49         ALL_SEPARATORS,
50         STRICT_ALL_SEPARATORS,
51 
52         // Symbols
53         // TODO: NaN?
54         MINUS_SIGN,
55         PLUS_SIGN,
56         PERCENT_SIGN,
57         PERMILLE_SIGN,
58         INFINITY_SIGN,
59 
60         // Currency Symbols
61         DOLLAR_SIGN,
62         POUND_SIGN,
63         RUPEE_SIGN,
64         YEN_SIGN,
65         WON_SIGN,
66 
67         // Other
68         DIGITS,
69 
70         // Combined Separators with Digits (for lead code points)
71         DIGITS_OR_ALL_SEPARATORS,
72         DIGITS_OR_STRICT_ALL_SEPARATORS,
73     };
74 
75     private static final Map<Key, UnicodeSet> unicodeSets = new EnumMap<>(Key.class);
76 
77     /**
78      * Gets the static-allocated UnicodeSet according to the provided key.
79      *
80      * @param key
81      *            The desired UnicodeSet according to the enum in this file.
82      * @return The requested UnicodeSet. Guaranteed to be frozen and non-null, but may be empty if an
83      *         error occurred during data loading.
84      */
get(Key key)85     public static UnicodeSet get(Key key) {
86         UnicodeSet candidate = unicodeSets.get(key);
87         if (candidate == null) {
88             return UnicodeSet.EMPTY;
89         }
90         return candidate;
91     }
92 
93     /**
94      * Checks if the UnicodeSet given by key1 contains the given string.
95      *
96      * @param str
97      *            The string to check.
98      * @param key1
99      *            The set to check.
100      * @return key1 if the set contains str, or COUNT if not.
101      */
chooseFrom(String str, Key key1)102     public static Key chooseFrom(String str, Key key1) {
103         return safeContains(get(key1), str) ? key1 : null;
104     }
105 
106     /**
107      * Checks if the UnicodeSet given by either key1 or key2 contains the string.
108      *
109      * Exported as U_COMMON_API for numparse_decimal.cpp
110      *
111      * @param str
112      *            The string to check.
113      * @param key1
114      *            The first set to check.
115      * @param key2
116      *            The second set to check.
117      * @return key1 if that set contains str; key2 if that set contains str; or COUNT if neither set
118      *         contains str.
119      */
chooseFrom(String str, Key key1, Key key2)120     public static Key chooseFrom(String str, Key key1, Key key2) {
121         return safeContains(get(key1), str) ? key1 : chooseFrom(str, key2);
122     }
123 
124     /**
125      * Looks through all Currency-related sets for the given string, returning the first match or null if
126      * no match was round.
127      */
chooseCurrency(String str)128     public static Key chooseCurrency(String str) {
129         if (get(Key.DOLLAR_SIGN).contains(str)) {
130             return Key.DOLLAR_SIGN;
131         } else if (get(Key.POUND_SIGN).contains(str)) {
132             return Key.POUND_SIGN;
133         } else if (get(Key.RUPEE_SIGN).contains(str)) {
134             return Key.RUPEE_SIGN;
135         } else if (get(Key.YEN_SIGN).contains(str)) {
136             return Key.YEN_SIGN;
137         } else if (get(Key.WON_SIGN).contains(str)) {
138             return Key.WON_SIGN;
139         } else {
140             return null;
141         }
142     }
143 
computeUnion(Key k1, Key k2)144     private static UnicodeSet computeUnion(Key k1, Key k2) {
145         return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
146     }
147 
computeUnion(Key k1, Key k2, Key k3)148     private static UnicodeSet computeUnion(Key k1, Key k2, Key k3) {
149         return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
150     }
151 
saveSet(Key key, String unicodeSetPattern)152     private static void saveSet(Key key, String unicodeSetPattern) {
153         assert unicodeSets.get(key) == null;
154         unicodeSets.put(key, new UnicodeSet(unicodeSetPattern).freeze());
155     }
156 
157     /*
158     parse{
159         date{
160             lenient{
161                 "[\\--/]",
162                 "[\\:∶]",
163             }
164         }
165         general{
166             lenient{
167                 "[.․。︒﹒.。]",
168                 "[\$﹩$$]",
169                 "[£₤]",
170                 "[₨₹{Rp}{Rs}]",
171             }
172         }
173         number{
174             lenient{
175                 "[\\-‒⁻₋−➖﹣-]",
176                 "[,،٫、︐︑﹐﹑,、]",
177                 "[+⁺₊➕﬩﹢+]",
178             }
179             stricter{
180                 "[,٫︐﹐,]",
181                 "[.․﹒.。]",
182             }
183         }
184     }
185      */
186     static class ParseDataSink extends UResource.Sink {
187         @Override
put(ohos.global.icu.impl.UResource.Key key, Value value, boolean noFallback)188         public void put(ohos.global.icu.impl.UResource.Key key, Value value, boolean noFallback) {
189             UResource.Table contextsTable = value.getTable();
190             for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
191                 if (key.contentEquals("date")) {
192                     // ignore
193                 } else {
194                     assert key.contentEquals("general") || key.contentEquals("number");
195                     UResource.Table strictnessTable = value.getTable();
196                     for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
197                         boolean isLenient = key.contentEquals("lenient");
198                         UResource.Array array = value.getArray();
199                         for (int k = 0; k < array.getSize(); k++) {
200                             array.getValue(k, value);
201                             String str = value.toString();
202                             // There is both lenient and strict data for comma/period,
203                             // but not for any of the other symbols.
204                             if (str.indexOf('.') != -1) {
205                                 saveSet(isLenient ? Key.PERIOD : Key.STRICT_PERIOD, str);
206                             } else if (str.indexOf(',') != -1) {
207                                 saveSet(isLenient ? Key.COMMA : Key.STRICT_COMMA, str);
208                             } else if (str.indexOf('+') != -1) {
209                                 saveSet(Key.PLUS_SIGN, str);
210                             } else if (str.indexOf('-') != -1) {
211                                 saveSet(Key.MINUS_SIGN, str);
212                             } else if (str.indexOf('$') != -1) {
213                                 saveSet(Key.DOLLAR_SIGN, str);
214                             } else if (str.indexOf('£') != -1) {
215                                 saveSet(Key.POUND_SIGN, str);
216                             } else if (str.indexOf('₹') != -1) {
217                                 saveSet(Key.RUPEE_SIGN, str);
218                             } else if (str.indexOf('¥') != -1) {
219                                 saveSet(Key.YEN_SIGN, str);
220                             } else if (str.indexOf('₩') != -1) {
221                                 saveSet(Key.WON_SIGN, str);
222                             } else if (str.indexOf('%') != -1) {
223                                 saveSet(Key.PERCENT_SIGN, str);
224                             } else if (str.indexOf('‰') != -1) {
225                                 saveSet(Key.PERMILLE_SIGN, str);
226                             } else if (str.indexOf('’') != -1) {
227                                 saveSet(Key.APOSTROPHE_SIGN, str);
228                             } else {
229                                 // TODO(ICU-20428): Make ICU automatically accept new classes?
230                                 throw new AssertionError("Unknown class of parse lenients: " + str);
231                             }
232                         }
233                     }
234                 }
235             }
236         }
237     }
238 
239     static {
unicodeSets.put(Key.EMPTY, new UnicodeSet("[]").freeze())240         unicodeSets.put(Key.EMPTY, new UnicodeSet("[]").freeze());
241         // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
242         // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
unicodeSets.put(Key.DEFAULT_IGNORABLES, new UnicodeSet("[[:Zs:][\\\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze())243         unicodeSets.put(Key.DEFAULT_IGNORABLES,
244                 new UnicodeSet("[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze());
unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze())245         unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze());
246 
247         // CLDR provides data for comma, period, minus sign, and plus sign.
248         ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
249                 .getBundleInstance(ICUData.ICU_BASE_NAME, ULocale.ROOT);
250         rb.getAllItemsWithFallback("parse", new ParseDataSink());
251 
252         // NOTE: It is OK for these assertions to fail if there was a no-data build.
unicodeSets.containsKey(Key.COMMA)253         assert unicodeSets.containsKey(Key.COMMA);
unicodeSets.containsKey(Key.STRICT_COMMA)254         assert unicodeSets.containsKey(Key.STRICT_COMMA);
unicodeSets.containsKey(Key.PERIOD)255         assert unicodeSets.containsKey(Key.PERIOD);
unicodeSets.containsKey(Key.STRICT_PERIOD)256         assert unicodeSets.containsKey(Key.STRICT_PERIOD);
unicodeSets.containsKey(Key.APOSTROPHE_SIGN)257         assert unicodeSets.containsKey(Key.APOSTROPHE_SIGN);
258 
259         UnicodeSet otherGrouping = new UnicodeSet(
260                 "[٬‘\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]");
unicodeSets.get(Key.APOSTROPHE_SIGN)261         otherGrouping.addAll(unicodeSets.get(Key.APOSTROPHE_SIGN));
unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, otherGrouping.freeze())262         unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, otherGrouping.freeze());
unicodeSets.put(Key.ALL_SEPARATORS, computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS))263         unicodeSets.put(Key.ALL_SEPARATORS,
264                 computeUnion(Key.COMMA, Key.PERIOD, Key.OTHER_GROUPING_SEPARATORS));
unicodeSets.put(Key.STRICT_ALL_SEPARATORS, computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS))265         unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
266                 computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
267 
unicodeSets.containsKey(Key.MINUS_SIGN)268         assert unicodeSets.containsKey(Key.MINUS_SIGN);
unicodeSets.containsKey(Key.PLUS_SIGN)269         assert unicodeSets.containsKey(Key.PLUS_SIGN);
unicodeSets.containsKey(Key.PERCENT_SIGN)270         assert unicodeSets.containsKey(Key.PERCENT_SIGN);
unicodeSets.containsKey(Key.PERMILLE_SIGN)271         assert unicodeSets.containsKey(Key.PERMILLE_SIGN);
272 
unicodeSets.put(Key.INFINITY_SIGN, new UnicodeSet("[∞]").freeze())273         unicodeSets.put(Key.INFINITY_SIGN, new UnicodeSet("[∞]").freeze());
274 
unicodeSets.containsKey(Key.DOLLAR_SIGN)275         assert unicodeSets.containsKey(Key.DOLLAR_SIGN);
unicodeSets.containsKey(Key.POUND_SIGN)276         assert unicodeSets.containsKey(Key.POUND_SIGN);
unicodeSets.containsKey(Key.RUPEE_SIGN)277         assert unicodeSets.containsKey(Key.RUPEE_SIGN);
unicodeSets.containsKey(Key.YEN_SIGN)278         assert unicodeSets.containsKey(Key.YEN_SIGN);
unicodeSets.containsKey(Key.WON_SIGN)279         assert unicodeSets.containsKey(Key.WON_SIGN);
280 
unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze())281         unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
282 
unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS))283         unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.STRICT_ALL_SEPARATORS))284         unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS,
285                 computeUnion(Key.DIGITS, Key.STRICT_ALL_SEPARATORS));
286     }
287 }
288