• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  ***************************************************************************
6  * Copyright (C) 2008-2016 International Business Machines Corporation
7  * and others. All Rights Reserved.
8  ***************************************************************************
9  *
10  * Unicode Spoof Detection
11  */
12 
13 package ohos.global.icu.text;
14 
15 import java.io.IOException;
16 import java.io.LineNumberReader;
17 import java.io.Reader;
18 import java.nio.ByteBuffer;
19 import java.text.ParseException;
20 import java.util.ArrayList;
21 import java.util.Arrays;
22 import java.util.BitSet;
23 import java.util.Collections;
24 import java.util.Comparator;
25 import java.util.HashSet;
26 import java.util.Hashtable;
27 import java.util.LinkedHashSet;
28 import java.util.Locale;
29 import java.util.MissingResourceException;
30 import java.util.Set;
31 import java.util.Vector;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34 
35 import ohos.global.icu.impl.ICUBinary;
36 import ohos.global.icu.impl.ICUBinary.Authenticate;
37 import ohos.global.icu.impl.Utility;
38 import ohos.global.icu.lang.UCharacter;
39 import ohos.global.icu.lang.UCharacterCategory;
40 import ohos.global.icu.lang.UProperty;
41 import ohos.global.icu.lang.UScript;
42 import ohos.global.icu.util.ULocale;
43 
44 /**
45  * <p>
46  * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
47  * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
48  *
49  * <ol>
50  * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desparejado" and
51  * "ԁеѕрагејаԁо".</li>
52  * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
53  * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
54  * </ol>
55  *
56  * <p>
57  * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
58  * <code>SpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
59  * content filters.
60  *
61  * <h2>Confusables</h2>
62  *
63  * <p>
64  * The following example shows how to use <code>SpoofChecker</code> to check for confusability between two strings:
65  *
66  * <pre>
67  * <code>
68  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
69  * int result = sc.areConfusable("desparejado", "ԁеѕрагејаԁо");
70  * System.out.println(result != 0);  // true
71  * </code>
72  * </pre>
73  *
74  * <p>
75  * <code>SpoofChecker</code> uses a builder paradigm: options are specified within the context of a lightweight
76  * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading
77  * operations are performed, and an immutable <code>SpoofChecker</code> is returned.
78  *
79  * <p>
80  * The first line of the example creates a <code>SpoofChecker</code> object with confusable-checking enabled; the second
81  * line performs the confusability test. For best performance, the instance should be created once (e.g., upon
82  * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime.
83  *
84  * <p>
85  * UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a
86  * sequence of families of confusable characters, where each family has a single exemplar character.
87  * {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is
88  * equivalent to the example above:
89  *
90  * <pre>
91  * <code>
92  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
93  * boolean result = sc.getSkeleton("desparejado").equals(sc.getSkeleton("ԁеѕрагејаԁо"));
94  * System.out.println(result);  // true
95  * </code>
96  * </pre>
97  *
98  * <p>
99  * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
100  * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as
101  * shown below:
102  *
103  * <pre>
104  * // Setup:
105  * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example
106  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
107  * HashSet&lt;String&gt; skeletons = new HashSet&lt;String&gt;();
108  * for (String word : DICTIONARY) {
109  *   skeletons.add(sc.getSkeleton(word));
110  * }
111  *
112  * // Live Check:
113  * boolean result = skeletons.contains(sc.getSkeleton("1orern"));
114  * System.out.println(result);  // true
115  * </pre>
116  *
117  * <p>
118  * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
119  * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
120  * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
121  *
122  * <h2>Spoof Detection</h2>
123  *
124  * <p>
125  * The following snippet shows a minimal example of using <code>SpoofChecker</code> to perform spoof detection on a
126  * string:
127  *
128  * <pre>
129  * SpoofChecker sc = new SpoofChecker.Builder()
130  *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
131  *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
132  *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
133  *     .build();
134  * boolean result = sc.failsChecks("pаypаl");  // with Cyrillic 'а' characters
135  * System.out.println(result);  // true
136  * </pre>
137  *
138  * <p>
139  * As in the case for confusability checking, it is good practice to create one <code>SpoofChecker</code> instance at
140  * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of
141  * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the
142  * third line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the
143  * instance to perform confusability checking.
144  *
145  * <p>
146  * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}:
147  *
148  * <pre>
149  * <code>
150  * SpoofChecker sc = new SpoofChecker.Builder()
151  *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
152  *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
153  *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
154  *     .build();
155  * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult();
156  * boolean result = sc.failsChecks("pаypаl", checkResult);
157  * System.out.println(checkResult.checks);  // 16
158  * </code>
159  * </pre>
160  *
161  * <p>
162  * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
163  * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
164  *
165  * <ul>
166  * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
167  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
168  * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
169  * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
170  * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
171  * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
172  * characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li>
173  * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
174  * </ul>
175  *
176  * <p>
177  * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
178  * INVISIBLE and MIXED_NUMBERS conditions, you could do:
179  *
180  * <pre>
181  * <code>
182  * SpoofChecker sc = new SpoofChecker.Builder()
183  *     .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS)
184  *     .build();
185  * boolean result = sc.failsChecks("৪8");
186  * System.out.println(result);  // true
187  * </code>
188  * </pre>
189  *
190  * <p>
191  * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
192  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
193  * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
194  * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
195  * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
196  * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
197  * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of
198  * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code
199  * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
200  * scripts.
201  *
202  * <h2>Additional Information</h2>
203  *
204  * <p>
205  * A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
206  *
207  * <p>
208  * <b>Thread Safety:</b> The methods on <code>SpoofChecker</code> objects are thread safe. The test functions for
 * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called
210  * concurrently from multiple threads using the same <code>SpoofChecker</code> instance.
211  *
212  * @hide exposed on OHOS
213  */
214 public class SpoofChecker {
215 
216     /**
217      * Constants from UTS 39 for use in setRestrictionLevel.
218      *
219      * @hide exposed on OHOS
220      */
221     public enum RestrictionLevel {
222         /**
223          * All characters in the string are in the identifier profile and all characters in the string are in the ASCII
224          * range.
225          */
226         ASCII,
227         /**
228          * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the
229          * string is single-script, according to the definition in UTS 39 section 5.1.
230          */
231         SINGLE_SCRIPT_RESTRICTIVE,
232         /**
233          * The string classifies as Single Script, or all characters in the string are in the identifier profile and the
234          * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1:
235          * <ul>
236          * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
237          * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
238          * <li>Latin + Han + Hangul (or equivalently: Latn +Kore)</li>
239          * </ul>
240          */
241         HIGHLY_RESTRICTIVE,
242         /**
243          * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
244          * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
245          * Greek, and Cherokee.
246          */
247         MODERATELY_RESTRICTIVE,
248         /**
249          * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as
250          * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us.
251          */
252         MINIMALLY_RESTRICTIVE,
253         /**
254          * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
255          */
256         UNRESTRICTIVE,
257     }
258 
259     /**
260      * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
261      */
262     public static final UnicodeSet INCLUSION = new UnicodeSet(
263             "['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C"
264             + "\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]"
265             ).freeze();
266     // Note: data from IdentifierStatus.txt & IdentifierType.txt
267     // There is tooling to generate this constant in the unicodetools project:
268     //      org.unicode.text.tools.RecommendedSetGenerator
269     // It will print the Java and C++ code to the console for easy copy-paste into this file.
270 
271     /**
272      * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
273      */
274     public static final UnicodeSet RECOMMENDED = new UnicodeSet(
275             "[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E"
276             + "\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-"
277             + "\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E"
278             + "\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\\u0306-"
279             + "\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u0328\\u032D\\u032E"
280             + "\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386"
281             + "\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-"
282             + "\\u04FF\\u0510-\\u0529\\u052E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586"
283             + "\\u05B4\\u05D0-\\u05EA\\u05EF-\\u05F2\\u0620-\\u063F\\u0641-\\u0655\\u0660-"
284             + "\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06A0\\u06A2-\\u06D3"
285             + "\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC"
286             + "\\u08B2\\u08B6-\\u08C7\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-"
287             + "\\u0963\\u0966-\\u096F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-"
288             + "\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9"
289             + "\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-"
290             + "\\u09F1\\u09FE\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-\\u0A28"
291             + "\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47"
292             + "\\u0A48\\u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D"
293             + "\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9"
294             + "\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-"
295             + "\\u0AEF\\u0AFA-\\u0AFF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13-"
296             + "\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47"
297             + "\\u0B48\\u0B4B-\\u0B4D\\u0B55-\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71"
298             + "\\u0B82\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A"
299             + "\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-"
300             + "\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-"
301             + "\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-"
302             + "\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56\\u0C60\\u0C61\\u0C66-"
303             + "\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8"
304             + "\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD"
305             + "\\u0CD5\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D00\\u0D02"
306             + "\\u0D03\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-"
307             + "\\u0D48\\u0D4A-\\u0D4E\\u0D54-\\u0D57\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-"
308             + "\\u0D7F\\u0D82\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5\\u0DA7-"
309             + "\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6"
310             + "\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-"
311             + "\\u0E59\\u0E81\\u0E82\\u0E84\\u0E86-\\u0E8A\\u0E8C-\\u0EA3\\u0EA5\\u0EA7-"
312             + "\\u0EB2\\u0EB4-\\u0EBD\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9"
313             + "\\u0EDE\\u0EDF\\u0F00\\u0F20-\\u0F29\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-"
314             + "\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F56\\u0F58-\\u0F5B\\u0F5D-"
315             + "\\u0F68\\u0F6A-\\u0F6C\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0F82-\\u0F84"
316             + "\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
317             + "\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-"
318             + "\\u109D\\u10C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-"
319             + "\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288"
320             + "\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-"
321             + "\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u135D-"
322             + "\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-"
323             + "\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1C90-\\u1CBA\\u1CBD-\\u1CBF"
324             + "\\u1E00-\\u1E99\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-"
325             + "\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70"
326             + "\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA"
327             + "\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-"
328             + "\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8"
329             + "\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE"
330             + "\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6"
331             + "\\u2DD8-\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E"
332             + "\\u30A1-\\u30FA\\u30FC-\\u30FE\\u3105-\\u312D\\u312F\\u31A0-\\u31BF\\u3400-"
333             + "\\u4DBF\\u4E00-\\u9FFC\\uA67F\\uA717-\\uA71F\\uA788\\uA78D\\uA792\\uA793"
334             + "\\uA7AA\\uA7AE\\uA7B8\\uA7B9\\uA7C2-\\uA7CA\\uA9E7-\\uA9FE\\uAA60-\\uAA76"
335             + "\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26"
336             + "\\uAB28-\\uAB2E\\uAB66\\uAB67\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13"
337             + "\\uFA14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00011301\\U00011303"
338             + "\\U0001133B\\U0001133C\\U00016FF0\\U00016FF1\\U0001B150-\\U0001B152"
339             + "\\U0001B164-\\U0001B167\\U00020000-\\U0002A6DD\\U0002A700-\\U0002B734"
340             + "\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-\\U0002EBE0"
341             + "\\U00030000-\\U0003134A]"
342             ).freeze();
343     // Note: data from IdentifierStatus.txt & IdentifierType.txt
344     // There is tooling to generate this constant in the unicodetools project:
345     //      org.unicode.text.tools.RecommendedSetGenerator
346     // It will print the Java and C++ code to the console for easy copy-paste into this file.
347 
348     /**
349      * Constants for the kinds of checks that USpoofChecker can perform. These values are used both to select the set of
350      * checks that will be performed, and to report results from the check function.
351      *
352      */
353 
354     /**
355      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
356      * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
357      * 4.
358      */
359     public static final int SINGLE_SCRIPT_CONFUSABLE = 1;
360 
361     /**
362      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
363      * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
364      * 39 section 4.
365      */
366     public static final int MIXED_SCRIPT_CONFUSABLE = 2;
367 
368     /**
369      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
370      * that the two strings are visually confusable and that they are not from the same script but both of them are
371      * single-script strings, according to UTS 39 section 4.
372      */
373     public static final int WHOLE_SCRIPT_CONFUSABLE = 4;
374 
375     /**
376      * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the
377      * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make
378      * {@link SpoofChecker#areConfusable} return only those types of confusables.
379      */
380     public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;
381 
382     /**
383      * This flag is deprecated and no longer affects the behavior of SpoofChecker.
384      *
385      * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
386      * deprecated.
387      */
388     @Deprecated
389     public static final int ANY_CASE = 8;
390 
391     /**
392      * Check that an identifier satisfies the requirements for the restriction level specified in
393      * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is
394      * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
395      */
396     public static final int RESTRICTION_LEVEL = 16;
397 
398     /**
399      * Check that an identifier contains only characters from a single script (plus chars from the common and inherited
400      * scripts.) Applies to checks of a single identifier check only.
401      *
402      * @deprecated ICU 51 Use RESTRICTION_LEVEL
403      */
404     @Deprecated
405     public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;
406 
407     /**
408      * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
409      * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
410      * test the input string as a whole for conformance to any particular syntax for identifiers.
411      */
412     public static final int INVISIBLE = 32;
413 
414     /**
415      * Check that an identifier contains only characters from a specified set of acceptable characters. See
416      * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check
417      * will also fail the {@link #RESTRICTION_LEVEL} check.
418      */
419     public static final int CHAR_LIMIT = 64;
420 
421     /**
422      * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39
423      * section 5.3.
424      */
425     public static final int MIXED_NUMBERS = 128;
426 
427     /**
428      * Check that an identifier does not have a combining character following a character in which that
429      * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
430      * <p>
431      * More specifically, the following characters are forbidden from preceding a U+0307:
432      * <ul>
433      * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
434      * <li>Latin lowercase letter 'l'</li>
435      * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
436      * <li>Any character whose confusable prototype ends with such a character
437      * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
438      * </ul>
439      * In addition, combining characters are allowed between the above characters and U+0307 except those
440      * with combining class 0 or combining class "Above" (230, same class as U+0307).
441      * <p>
     * This list and the number of combining characters considered by this check may grow over time.
443      */
444     public static final int HIDDEN_OVERLAY = 256;
445 
446     // Update CheckResult.toString() when a new check is added.
447 
448     /**
449      * Enable all spoof checks.
450      */
451     public static final int ALL_CHECKS = 0xFFFFFFFF;
452 
453     // Used for checking for ASCII-Only restriction level
454     static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
455 
456     /**
457      * private constructor: a SpoofChecker has to be built by the builder
458      */
SpoofChecker()459     private SpoofChecker() {
460     }
461 
462     /**
463      * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired
464      * checking options on the builder, then call the build() function to create a SpoofChecker instance.
465      *
466      * @hide exposed on OHOS
467      */
468     public static class Builder {
469         int fChecks; // Bit vector of checks to perform.
470         SpoofData fSpoofData;
471         final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters.
472         // for this Spoof Checker. Defaults to all chars.
473         final Set<ULocale> fAllowedLocales = new LinkedHashSet<>(); // The list of allowed locales.
474         private RestrictionLevel fRestrictionLevel;
475 
476         /**
         * Constructor: Create a default Unicode Spoof Checker Builder, configured with every check bit enabled
         * ({@link SpoofChecker#ALL_CHECKS}) and a restriction level of {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
         * Note that additional checks may be added in the future, resulting in changes to the default checking
         * behavior.
480          */
Builder()481         public Builder() {
482             fChecks = ALL_CHECKS;
483             fSpoofData = null;
484             fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE;
485         }
486 
487         /**
488          * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker.
489          *
490          * @param src
491          *            The existing checker.
492          */
Builder(SpoofChecker src)493         public Builder(SpoofChecker src) {
494             fChecks = src.fChecks;
495             fSpoofData = src.fSpoofData; // For the data, we will either use the source data
496                                          // as-is, or drop the builder's reference to it
497                                          // and generate new data, depending on what our
498                                          // caller does with the builder.
499             fAllowedCharsSet.set(src.fAllowedCharsSet);
500             fAllowedLocales.addAll(src.fAllowedLocales);
501             fRestrictionLevel = src.fRestrictionLevel;
502         }
503 
504         /**
505          * Create a SpoofChecker with current configuration.
506          *
507          * @return SpoofChecker
508          */
build()509         public SpoofChecker build() {
510             // TODO: Make this data loading be lazy (see #12696).
511             if (fSpoofData == null) {
512                 // read binary file
513                 fSpoofData = SpoofData.getDefault();
514             }
515 
516             // Copy all state from the builder to the new SpoofChecker.
517             // Make sure that everything is either cloned or copied, so
518             // that subsequent re-use of the builder won't modify the built
519             // SpoofChecker.
520             //
521             // One exception to this: the SpoofData is just assigned.
522             // If the builder subsequently needs to modify fSpoofData
523             // it will create a new SpoofData object first.
524 
525             SpoofChecker result = new SpoofChecker();
526             result.fChecks = this.fChecks;
527             result.fSpoofData = this.fSpoofData;
528             result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
529             result.fAllowedCharsSet.freeze();
530             result.fAllowedLocales = new HashSet<>(this.fAllowedLocales);
531             result.fRestrictionLevel = this.fRestrictionLevel;
532             return result;
533         }
534 
535         /**
         * Specify the source form of the spoof data for this Spoof Checker. The input corresponds to the Unicode data
         * file confusables.txt as described in Unicode UTS 39. The syntax of the source data is as described in UTS 39
         * for that file, and the content of that file is acceptable input.
539          *
540          * @param confusables
541          *            the Reader of confusable characters definitions, as found in file confusables.txt from
542          *            unicode.org.
543          * @throws ParseException
544          *             To report syntax errors in the input.
545          */
setData(Reader confusables)546         public Builder setData(Reader confusables) throws ParseException, IOException {
547 
548             // Compile the binary data from the source (text) format.
549             // Drop the builder's reference to any pre-existing data, which may
550             // be in use in an already-built checker.
551 
552             fSpoofData = new SpoofData();
553             ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData);
554             return this;
555         }
556 
557         /**
558          * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead.
559          *
560          * @param confusables
561          *            the Reader of confusable characters definitions, as found in file confusables.txt from
562          *            unicode.org.
563          * @param confusablesWholeScript
564          *            No longer supported.
565          * @throws ParseException
566          *             To report syntax errors in the input.
567          *
568          * @deprecated ICU 58
569          */
570         @Deprecated
setData(Reader confusables, Reader confusablesWholeScript)571         public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException {
572             setData(confusables);
573             return this;
574         }
575 
576         /**
577          * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method
578          * overwrites any checks that may have already been enabled. By default, all checks are enabled.
579          *
580          * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. For
581          * example, to fail strings containing characters outside of the set specified by {@link #setAllowedChars} and
582          * also strings that contain digits from mixed numbering systems:
583          *
584          * <pre>
585          * {@code
586          * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS);
587          * }
588          * </pre>
589          *
590          * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from
591          * ALL_CHECKS. For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality,
592          * it is good practice to disable the CONFUSABLE check:
593          *
594          * <pre>
595          * {@code
596          * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE);
597          * }
598          * </pre>
599          *
600          * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and
601          * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
602          * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
603          * methods.
604          *
605          * @param checks
606          *            The set of checks that this spoof checker will perform. The value is an 'or' of the desired
607          *            checks.
608          * @return self
609          */
setChecks(int checks)610         public Builder setChecks(int checks) {
611             // Verify that the requested checks are all ones (bits) that
612             // are acceptable, known values.
613             if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) {
614                 throw new IllegalArgumentException("Bad Spoof Checks value.");
615             }
616             this.fChecks = (checks & SpoofChecker.ALL_CHECKS);
617             return this;
618         }
619 
620         /**
621          * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
622          * associated with the specified locales. Any previously specified list of locales is replaced by the new
623          * settings.
624          *
625          * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is
626          * determined. Characters from this set of scripts, along with characters from the "common" and "inherited"
627          * Unicode Script categories will be permitted.
628          *
629          * Supplying an empty string removes all restrictions; characters from any script will be allowed.
630          *
631          * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a
632          * non-empty list of locales.
633          *
634          * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function.
635          * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters.
636          *
637          * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of
638          * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with
639          * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}.
640          *
641          * @param locales
642          *            A Set of ULocales, from which the language and associated script are extracted. If the locales Set
643          *            is null, no restrictions will be placed on the allowed characters.
644          *
645          * @return self
646          */
setAllowedLocales(Set<ULocale> locales)647         public Builder setAllowedLocales(Set<ULocale> locales) {
648             fAllowedCharsSet.clear();
649 
650             for (ULocale locale : locales) {
651                 // Add the script chars for this locale to the accumulating set
652                 // of allowed chars.
653                 addScriptChars(locale, fAllowedCharsSet);
654             }
655 
656             // If our caller provided an empty list of locales, we disable the
657             // allowed characters checking
658             fAllowedLocales.clear();
659             if (locales.size() == 0) {
660                 fAllowedCharsSet.add(0, 0x10ffff);
661                 fChecks &= ~CHAR_LIMIT;
662                 return this;
663             }
664 
665             // Add all common and inherited characters to the set of allowed
666             // chars.
667             UnicodeSet tempSet = new UnicodeSet();
668             tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON);
669             fAllowedCharsSet.addAll(tempSet);
670             tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED);
671             fAllowedCharsSet.addAll(tempSet);
672 
673             // Store the updated spoof checker state.
674             fAllowedLocales.clear();
675             fAllowedLocales.addAll(locales);
676             fChecks |= CHAR_LIMIT;
677             return this;
678         }
679 
680         /**
681          * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
682          * associated with the specified locales. Any previously specified list of locales is replaced by the new
683          * settings.
684          *
685          * @param locales
686          *            A Set of Locales, from which the language and associated script are extracted. If the locales Set
687          *            is null, no restrictions will be placed on the allowed characters.
688          *
689          * @return self
690          */
setAllowedJavaLocales(Set<Locale> locales)691         public Builder setAllowedJavaLocales(Set<Locale> locales) {
692             HashSet<ULocale> ulocales = new HashSet<>(locales.size());
693             for (Locale locale : locales) {
694                 ulocales.add(ULocale.forLocale(locale));
695             }
696             return setAllowedLocales(ulocales);
697         }
698 
699         // Add (union) to the UnicodeSet all of the characters for the scripts
700         // used for the specified locale. Part of the implementation of
701         // setAllowedLocales.
addScriptChars(ULocale locale, UnicodeSet allowedChars)702         private void addScriptChars(ULocale locale, UnicodeSet allowedChars) {
703             int scripts[] = UScript.getCode(locale);
704             if (scripts != null) {
705                 UnicodeSet tmpSet = new UnicodeSet();
706                 for (int i = 0; i < scripts.length; i++) {
707                     tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]);
708                     allowedChars.addAll(tmpSet);
709                 }
710             }
711             // else it's an unknown script.
712             // Maybe they asked for the script of "zxx", which refers to no linguistic content.
713             // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU.
714         }
715 
716         /**
717          * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit
         * is replaced by the new settings. This includes limits on characters that were set with the
719          * setAllowedLocales() function. Note that the RESTRICTED set is useful.
720          *
721          * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function.
722          *
723          * @param chars
724          *            A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by
725          *            this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling
726          *            this function. Note that this clears the allowedLocales set.
727          * @return self
728          */
setAllowedChars(UnicodeSet chars)729         public Builder setAllowedChars(UnicodeSet chars) {
730             fAllowedCharsSet.set(chars);
731             fAllowedLocales.clear();
732             fChecks |= CHAR_LIMIT;
733             return this;
734         }
735 
736         /**
737          * Set the loosest restriction level allowed for strings. The default if this is not called is
738          * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and
739          * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
740          * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}.
741          *
742          * @param restrictionLevel
743          *            The loosest restriction level allowed.
744          * @return self
745          * @hide draft / provisional / internal are hidden on OHOS
746          */
setRestrictionLevel(RestrictionLevel restrictionLevel)747         public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
748             fRestrictionLevel = restrictionLevel;
749             fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS;
750             return this;
751         }
752 
753         /*
754          * *****************************************************************************
         * Internal classes for compiling confusable data into its binary (runtime) form.
756          * *****************************************************************************
757          */
758         // ---------------------------------------------------------------------
759         //
760         // buildConfusableData Compile the source confusable data, as defined by
761         // the Unicode data file confusables.txt, into the binary
762         // structures used by the confusable detector.
763         //
764         // The binary structures are described in uspoof_impl.h
765         //
766         // 1. parse the data, making a hash table mapping from a codepoint to a String.
767         //
768         // 2. Sort all of the strings encountered by length, since they will need to
769         // be stored in that order in the final string table.
770         // TODO: Sorting these strings by length is no longer needed since the removal of
771         // the string lengths table.  This logic can be removed to save processing time
772         // when building confusables data.
773         //
774         // 3. Build a list of keys (UChar32s) from the mapping table. Sort the
775         // list because that will be the ordering of our runtime table.
776         //
777         // 4. Generate the run time string table. This is generated before the key & value
778         // table because we need the string indexes when building those tables.
779         //
780         // 5. Build the run-time key and value table. These are parallel tables, and
781         // are built at the same time
782 
783         // class ConfusabledataBuilder
784         // An instance of this class exists while the confusable data is being built from source.
785         // It encapsulates the intermediate data structures that are used for building.
786         // It exports one static function, to do a confusable data build.
787         private static class ConfusabledataBuilder {
788 
789             private Hashtable<Integer, SPUString> fTable;
790             private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the
791                                         // four mapping tables.
792 
793             // The compiled data is first assembled into the following four collections,
794             // then output to the builder's SpoofData object.
795             private StringBuffer fStringTable;
796             private ArrayList<Integer> fKeyVec;
797             private ArrayList<Integer> fValueVec;
798             private SPUStringPool stringPool;
799             private Pattern fParseLine;
800             private Pattern fParseHexNum;
801             private int fLineNum;
802 
ConfusabledataBuilder()803             ConfusabledataBuilder() {
804                 fTable = new Hashtable<>();
805                 fKeySet = new UnicodeSet();
806                 fKeyVec = new ArrayList<>();
807                 fValueVec = new ArrayList<>();
808                 stringPool = new SPUStringPool();
809             }
810 
build(Reader confusables, SpoofData dest)811             void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException {
812                 StringBuffer fInput = new StringBuffer();
813 
814                 // Convert the user input data from UTF-8 to char (UTF-16)
815                 LineNumberReader lnr = new LineNumberReader(confusables);
816                 do {
817                     String line = lnr.readLine();
818                     if (line == null) {
819                         break;
820                     }
821                     fInput.append(line);
822                     fInput.append('\n');
823                 } while (true);
824 
825                 // Regular Expression to parse a line from Confusables.txt. The expression will match
826                 // any line. What was matched is determined by examining which capture groups have a match.
827                 // Capture Group 1: the source char
828                 // Capture Group 2: the replacement chars
829                 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
830                 // Capture Group 7: A blank or comment only line.
831                 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
832                 // Example Line from the confusables.txt source file:
833                 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
834                 fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char
835                         "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s)
836                         "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued)
837                         "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type
838                         "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment
839                         "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment
840                         "|^(.*?)$"); // OR match any line, which catches illegal lines.
841 
842                 // Regular expression for parsing a hex number out of a space-separated list of them.
843                 // Capture group 1 gets the number, with spaces removed.
844                 fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)");
845 
846                 // Zap any Byte Order Mark at the start of input. Changing it to a space
847                 // is benign given the syntax of the input.
848                 if (fInput.charAt(0) == 0xfeff) {
849                     fInput.setCharAt(0, (char) 0x20);
850                 }
851 
852                 // Parse the input, one line per iteration of this loop.
853                 Matcher matcher = fParseLine.matcher(fInput);
854                 while (matcher.find()) {
855                     fLineNum++;
856                     if (matcher.start(7) >= 0) {
857                         // this was a blank or comment line.
858                         continue;
859                     }
860                     if (matcher.start(8) >= 0) {
861                         // input file syntax error.
862                         // status = U_PARSE_ERROR;
863                         throw new ParseException(
864                                 "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8),
865                                 matcher.start(8));
866                     }
867 
868                     // We have a good input line. Extract the key character and mapping
869                     // string, and
870                     // put them into the appropriate mapping table.
871                     int keyChar = Integer.parseInt(matcher.group(1), 16);
872                     if (keyChar > 0x10ffff) {
873                         throw new ParseException(
874                                 "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1),
875                                 matcher.start(1));
876                     }
877                     Matcher m = fParseHexNum.matcher(matcher.group(2));
878 
879                     StringBuilder mapString = new StringBuilder();
880                     while (m.find()) {
881                         int c = Integer.parseInt(m.group(1), 16);
882                         if (c > 0x10ffff) {
883                             throw new ParseException(
884                                     "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16),
885                                     matcher.start(2));
886                         }
887                         mapString.appendCodePoint(c);
888                     }
889                     assert (mapString.length() >= 1);
890 
891                     // Put the map (value) string into the string pool
892                     // This a little like a Java intern() - any duplicates will be
893                     // eliminated.
894                     SPUString smapString = stringPool.addString(mapString.toString());
895 
896                     // Add the char . string mapping to the table.
897                     // For Unicode 8, the SL, SA and ML tables have been discontinued.
898                     // All input data from confusables.txt is tagged MA.
899                     fTable.put(keyChar, smapString);
900 
901                     fKeySet.add(keyChar);
902                 }
903 
904                 // Input data is now all parsed and collected.
905                 // Now create the run-time binary form of the data.
906                 //
907                 // This is done in two steps. First the data is assembled into vectors and strings,
908                 // for ease of construction, then the contents of these collections are copied
909                 // into the actual SpoofData object.
910 
911                 // Build up the string array, and record the index of each string therein
912                 // in the (build time only) string pool.
913                 // Strings of length one are not entered into the strings array.
914                 // (Strings in the table are sorted by length)
915 
916                 stringPool.sort();
917                 fStringTable = new StringBuffer();
918                 int poolSize = stringPool.size();
919                 int i;
920                 for (i = 0; i < poolSize; i++) {
921                     SPUString s = stringPool.getByIndex(i);
922                     int strLen = s.fStr.length();
923                     int strIndex = fStringTable.length();
924                     if (strLen == 1) {
925                         // strings of length one do not get an entry in the string table.
926                         // Keep the single string character itself here, which is the same
927                         // convention that is used in the final run-time string table index.
928                         s.fCharOrStrTableIndex = s.fStr.charAt(0);
929                     } else {
930                         s.fCharOrStrTableIndex = strIndex;
931                         fStringTable.append(s.fStr);
932                     }
933                 }
934 
935                 // Construct the compile-time Key and Value table.
936                 //
937                 // The keys in the Key table follow the format described in uspoof.h for the
938                 // Cfu confusables data structure.
939                 //
940                 // Starting in ICU 58, each code point has exactly one entry in the data
941                 // structure.
942 
943                 for (String keyCharStr : fKeySet) {
944                     int keyChar = keyCharStr.codePointAt(0);
945                     SPUString targetMapping = fTable.get(keyChar);
946                     assert targetMapping != null;
947 
948                     // Throw a sane exception if trying to consume a long string.  Otherwise,
949                     // codePointAndLengthToKey will throw an assertion error.
950                     if (targetMapping.fStr.length() > 256) {
951                         throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries.");
952                     }
953 
954                     int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length());
955                     int value = targetMapping.fCharOrStrTableIndex;
956 
957                     fKeyVec.add(key);
958                     fValueVec.add(value);
959                 }
960 
961                 // Put the assembled data into the destination SpoofData object.
962 
963                 // The Key Table
964                 // While copying the keys to the output array,
965                 // also sanity check that the keys are sorted.
966                 int numKeys = fKeyVec.size();
967                 dest.fCFUKeys = new int[numKeys];
968                 int previousCodePoint = 0;
969                 for (i = 0; i < numKeys; i++) {
970                     int key = fKeyVec.get(i);
971                     int codePoint = ConfusableDataUtils.keyToCodePoint(key);
972                     // strictly greater because there can be only one entry per code point
973                     assert codePoint > previousCodePoint;
974                     dest.fCFUKeys[i] = key;
975                     previousCodePoint = codePoint;
976                 }
977 
978                 // The Value Table, parallels the key table
979                 int numValues = fValueVec.size();
980                 assert (numKeys == numValues);
981                 dest.fCFUValues = new short[numValues];
982                 i = 0;
983                 for (int value : fValueVec) {
984                     assert (value < 0xffff);
985                     dest.fCFUValues[i++] = (short) value;
986                 }
987 
988                 // The Strings Table.
989                 dest.fCFUStrings = fStringTable.toString();
990             }
991 
992             public static void buildConfusableData(Reader confusables, SpoofData dest)
993                     throws java.io.IOException, ParseException {
994                 ConfusabledataBuilder builder = new ConfusabledataBuilder();
995                 builder.build(confusables, dest);
996             }
997 
998             /*
999              * *****************************************************************************
1000              * Internal classes for compiling confusable data into its binary (runtime) form.
1001              * *****************************************************************************
1002              */
1003             // SPUString
1004             // Holds a string that is the result of one of the mappings defined
1005             // by the confusable mapping data (confusables.txt from Unicode.org)
1006             // Instances of SPUString exist during the compilation process only.
1007 
1008             private static class SPUString {
1009                 String fStr; // The actual string.
1010                 int fCharOrStrTableIndex; // Index into the final runtime data for this string.
1011                 // (or, for length 1, the single string char itself,
1012                 // there being no string table entry for it.)
1013 
1014                 SPUString(String s) {
1015                     fStr = s;
1016                     fCharOrStrTableIndex = 0;
1017                 }
1018             }
1019 
1020             // Comparison function for ordering strings in the string pool.
1021             // Compare by length first, then, within a group of the same length,
1022             // by code point order.
1023 
1024             private static class SPUStringComparator implements Comparator<SPUString> {
1025                 @Override
1026                 public int compare(SPUString sL, SPUString sR) {
1027                     int lenL = sL.fStr.length();
1028                     int lenR = sR.fStr.length();
1029                     if (lenL < lenR) {
1030                         return -1;
1031                     } else if (lenL > lenR) {
1032                         return 1;
1033                     } else {
1034                         return sL.fStr.compareTo(sR.fStr);
1035                     }
1036                 }
1037 
1038                 final static SPUStringComparator INSTANCE = new SPUStringComparator();
1039             }
1040 
1041             // String Pool A utility class for holding the strings that are the result of
1042             // the spoof mappings. These strings will utimately end up in the
1043             // run-time String Table.
1044             // This is sort of like a sorted set of strings, except that ICU's anemic
1045             // built-in collections don't support those, so it is implemented with a
1046             // combination of a uhash and a Vector.
1047             private static class SPUStringPool {
1048                 public SPUStringPool() {
1049                     fVec = new Vector<>();
1050                     fHash = new Hashtable<>();
1051                 }
1052 
1053                 public int size() {
1054                     return fVec.size();
1055                 }
1056 
1057                 // Get the n-th string in the collection.
1058                 public SPUString getByIndex(int index) {
1059                     SPUString retString = fVec.elementAt(index);
1060                     return retString;
1061                 }
1062 
1063                 // Add a string. Return the string from the table.
1064                 // If the input parameter string is already in the table, delete the
1065                 // input parameter and return the existing string.
1066                 public SPUString addString(String src) {
1067                     SPUString hashedString = fHash.get(src);
1068                     if (hashedString == null) {
1069                         hashedString = new SPUString(src);
1070                         fHash.put(src, hashedString);
1071                         fVec.addElement(hashedString);
1072                     }
1073                     return hashedString;
1074                 }
1075 
1076                 // Sort the contents; affects the ordering of getByIndex().
1077                 public void sort() {
1078                     Collections.sort(fVec, SPUStringComparator.INSTANCE);
1079                 }
1080 
1081                 private Vector<SPUString> fVec; // Elements are SPUString *
1082                 private Hashtable<String, SPUString> fHash; // Key: Value:
1083             }
1084 
1085         }
1086     }
1087 
1088     /**
1089      * Get the Restriction Level that is being tested.
1090      *
1091      * @return The restriction level
1092      * @deprecated This API is ICU internal only.
1093      * @hide draft / provisional / internal are hidden on OHOS
1094      */
1095     @Deprecated
1096     public RestrictionLevel getRestrictionLevel() {
1097         return fRestrictionLevel;
1098     }
1099 
1100     /**
1101      * Get the set of checks that this Spoof Checker has been configured to perform.
1102      *
1103      * @return The set of checks that this spoof checker will perform.
1104      */
1105     public int getChecks() {
1106         return fChecks;
1107     }
1108 
1109     /**
1110      * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on
1111      * scripts have been specified, an empty set will be returned.
1112      *
1113      * setAllowedChars() will reset the list of allowed locales to be empty.
1114      *
1115      * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales();
1116      * the information other than languages from the originally specified locales may be omitted.
1117      *
1118      * @return A set of locales corresponding to the acceptable scripts.
1119      */
1120     public Set<ULocale> getAllowedLocales() {
1121         return Collections.unmodifiableSet(fAllowedLocales);
1122     }
1123 
1124     /**
1125      * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If
1126      * no limitations on scripts have been specified, an empty set will be returned.
1127      *
1128      * @return A set of locales corresponding to the acceptable scripts.
1129      */
1130     public Set<Locale> getAllowedJavaLocales() {
1131         HashSet<Locale> locales = new HashSet<>(fAllowedLocales.size());
1132         for (ULocale uloc : fAllowedLocales) {
1133             locales.add(uloc.toLocale());
1134         }
1135         return locales;
1136     }
1137 
1138     /**
1139      * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set
1140      * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by
1141      * this function.
1142      *
1143      * The returned set will be frozen, meaning that it cannot be modified by the caller.
1144      *
1145      * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test.
1146      */
1147     public UnicodeSet getAllowedChars() {
1148         return fAllowedCharsSet;
1149     }
1150 
1151     /**
1152      * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed.
1153      *
1154      * @hide exposed on OHOS
1155      */
1156     public static class CheckResult {
1157         /**
1158          * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
1159          * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on.
1160          *
1161          * @see Builder#setChecks
1162          */
1163         public int checks;
1164 
1165         /**
1166          * The index of the first string position that failed a check.
1167          *
1168          * @deprecated ICU 51. No longer supported. Always set to zero.
1169          */
1170         @Deprecated
1171         public int position;
1172 
1173         /**
1174          * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null.  The set will contain the zero
1175          * digit from each decimal number system found in the input string.
1176          */
1177         public UnicodeSet numerics;
1178 
1179         /**
1180          * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null.
1181          */
1182         public RestrictionLevel restrictionLevel;
1183 
1184         /**
1185          * Default constructor
1186          */
1187         public CheckResult() {
1188             checks = 0;
1189             position = 0;
1190         }
1191 
1192         /**
1193          * {@inheritDoc}
1194          */
1195         @Override
1196         public String toString() {
1197             StringBuilder sb = new StringBuilder();
1198             sb.append("checks:");
1199             if (checks == 0) {
1200                 sb.append(" none");
1201             } else if (checks == ALL_CHECKS) {
1202                 sb.append(" all");
1203             } else {
1204                 if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) {
1205                     sb.append(" SINGLE_SCRIPT_CONFUSABLE");
1206                 }
1207                 if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) {
1208                     sb.append(" MIXED_SCRIPT_CONFUSABLE");
1209                 }
1210                 if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) {
1211                     sb.append(" WHOLE_SCRIPT_CONFUSABLE");
1212                 }
1213                 if ((checks & ANY_CASE) != 0) {
1214                     sb.append(" ANY_CASE");
1215                 }
1216                 if ((checks & RESTRICTION_LEVEL) != 0) {
1217                     sb.append(" RESTRICTION_LEVEL");
1218                 }
1219                 if ((checks & INVISIBLE) != 0) {
1220                     sb.append(" INVISIBLE");
1221                 }
1222                 if ((checks & CHAR_LIMIT) != 0) {
1223                     sb.append(" CHAR_LIMIT");
1224                 }
1225                 if ((checks & MIXED_NUMBERS) != 0) {
1226                     sb.append(" MIXED_NUMBERS");
1227                 }
1228             }
1229             sb.append(", numerics: ").append(numerics.toPattern(false));
1230             sb.append(", position: ").append(position);
1231             sb.append(", restrictionLevel: ").append(restrictionLevel);
1232             return sb.toString();
1233         }
1234     }
1235 
1236     /**
1237      * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1238      * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1239      *
1240      * @param text
1241      *            A String to be checked for possible security issues.
1242      * @param checkResult
1243      *            Output parameter, indicates which specific tests failed. May be null if the information is not wanted.
1244      * @return True there any issue is found with the input string.
1245      */
1246     public boolean failsChecks(String text, CheckResult checkResult) {
1247         int length = text.length();
1248 
1249         int result = 0;
1250         if (checkResult != null) {
1251             checkResult.position = 0;
1252             checkResult.numerics = null;
1253             checkResult.restrictionLevel = null;
1254         }
1255 
1256         if (0 != (this.fChecks & RESTRICTION_LEVEL)) {
1257             RestrictionLevel textRestrictionLevel = getRestrictionLevel(text);
1258             if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) {
1259                 result |= RESTRICTION_LEVEL;
1260             }
1261             if (checkResult != null) {
1262                 checkResult.restrictionLevel = textRestrictionLevel;
1263             }
1264         }
1265 
1266         if (0 != (this.fChecks & MIXED_NUMBERS)) {
1267             UnicodeSet numerics = new UnicodeSet();
1268             getNumerics(text, numerics);
1269             if (numerics.size() > 1) {
1270                 result |= MIXED_NUMBERS;
1271             }
1272             if (checkResult != null) {
1273                 checkResult.numerics = numerics;
1274             }
1275         }
1276 
1277         if (0 != (this.fChecks & HIDDEN_OVERLAY)) {
1278             int index = findHiddenOverlay(text);
1279             if (index != -1) {
1280                 result |= HIDDEN_OVERLAY;
1281             }
1282         }
1283 
1284         if (0 != (this.fChecks & CHAR_LIMIT)) {
1285             int i;
1286             int c;
1287             for (i = 0; i < length;) {
1288                 // U16_NEXT(text, i, length, c);
1289                 c = Character.codePointAt(text, i);
1290                 i = Character.offsetByCodePoints(text, i, 1);
1291                 if (!this.fAllowedCharsSet.contains(c)) {
1292                     result |= CHAR_LIMIT;
1293                     break;
1294                 }
1295             }
1296         }
1297 
1298         if (0 != (this.fChecks & INVISIBLE)) {
1299             // This check needs to be done on NFD input
1300             String nfdText = nfdNormalizer.normalize(text);
1301 
1302             // scan for more than one occurrence of the same non-spacing mark
1303             // in a sequence of non-spacing marks.
1304             int i;
1305             int c;
1306             int firstNonspacingMark = 0;
1307             boolean haveMultipleMarks = false;
1308             UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a
1309                                                           // single combining sequence.
1310             for (i = 0; i < length;) {
1311                 c = Character.codePointAt(nfdText, i);
1312                 i = Character.offsetByCodePoints(nfdText, i, 1);
1313                 if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) {
1314                     firstNonspacingMark = 0;
1315                     if (haveMultipleMarks) {
1316                         marksSeenSoFar.clear();
1317                         haveMultipleMarks = false;
1318                     }
1319                     continue;
1320                 }
1321                 if (firstNonspacingMark == 0) {
1322                     firstNonspacingMark = c;
1323                     continue;
1324                 }
1325                 if (!haveMultipleMarks) {
1326                     marksSeenSoFar.add(firstNonspacingMark);
1327                     haveMultipleMarks = true;
1328                 }
1329                 if (marksSeenSoFar.contains(c)) {
1330                     // report the error, and stop scanning.
1331                     // No need to find more than the first failure.
1332                     result |= INVISIBLE;
1333                     break;
1334                 }
1335                 marksSeenSoFar.add(c);
1336             }
1337         }
1338         if (checkResult != null) {
1339             checkResult.checks = result;
1340         }
1341         return (0 != result);
1342     }
1343 
1344     /**
1345      * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1346      * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1347      *
1348      * @param text
1349      *            A String to be checked for possible security issues.
1350      * @return True there any issue is found with the input string.
1351      */
failsChecks(String text)1352     public boolean failsChecks(String text) {
1353         return failsChecks(text, null);
1354     }
1355 
1356     /**
1357      * Check the whether two specified strings are visually confusable. The types of confusability to be tested - single
1358      * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker.
1359      *
1360      * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE
1361      * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected.
1362      *
1363      * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
1364      * folded for comparison and display to the user, do not select the ANY_CASE option.
1365      *
1366      *
1367      * @param s1
1368      *            The first of the two strings to be compared for confusability.
1369      * @param s2
1370      *            The second of the two strings to be compared for confusability.
1371      * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability
1372      *         found, as defined by spoof check test constants.
1373      */
areConfusable(String s1, String s2)1374     public int areConfusable(String s1, String s2) {
1375         //
1376         // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
1377         // and for definitions of the types (single, whole, mixed-script) of confusables.
1378 
1379         // We only care about a few of the check flags. Ignore the others.
1380         // If no tests relevant to this function have been specified, signal an error.
1381         // TODO: is this really the right thing to do? It's probably an error on
1382         // the caller's part, but logically we would just return 0 (no error).
1383         if ((this.fChecks & CONFUSABLE) == 0) {
1384             throw new IllegalArgumentException("No confusable checks are enabled.");
1385         }
1386 
1387         // Compute the skeletons and check for confusability.
1388         String s1Skeleton = getSkeleton(s1);
1389         String s2Skeleton = getSkeleton(s2);
1390         if (!s1Skeleton.equals(s2Skeleton)) {
1391             return 0;
1392         }
1393 
1394         // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
1395         // of confusables according to UTS 39 section 4.
1396         // Start by computing the resolved script sets of s1 and s2.
1397         ScriptSet s1RSS = new ScriptSet();
1398         getResolvedScriptSet(s1, s1RSS);
1399         ScriptSet s2RSS = new ScriptSet();
1400         getResolvedScriptSet(s2, s2RSS);
1401 
1402         // Turn on all applicable flags
1403         int result = 0;
1404         if (s1RSS.intersects(s2RSS)) {
1405             result |= SINGLE_SCRIPT_CONFUSABLE;
1406         } else {
1407             result |= MIXED_SCRIPT_CONFUSABLE;
1408             if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) {
1409                 result |= WHOLE_SCRIPT_CONFUSABLE;
1410             }
1411         }
1412 
1413         // Turn off flags that the user doesn't want
1414         result &= fChecks;
1415 
1416         return result;
1417     }
1418 
1419     /**
1420      * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are
1421      * confusable if their skeletons are identical. See Unicode UAX 39 for additional information.
1422      *
1423      * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
1424      * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
1425      *
1426      * Skeletons are computed using the algorithm and data described in Unicode UAX 39.
1427      *
1428      * @param str
1429      *            The input string whose skeleton will be generated.
1430      * @return The output skeleton string.
1431      */
getSkeleton(CharSequence str)1432     public String getSkeleton(CharSequence str) {
1433         // Apply the skeleton mapping to the NFD normalized input string
1434         // Accumulate the skeleton, possibly unnormalized, in a String.
1435         String nfdId = nfdNormalizer.normalize(str);
1436         int normalizedLen = nfdId.length();
1437         StringBuilder skelSB = new StringBuilder();
1438         for (int inputIndex = 0; inputIndex < normalizedLen;) {
1439             int c = Character.codePointAt(nfdId, inputIndex);
1440             inputIndex += Character.charCount(c);
1441             this.fSpoofData.confusableLookup(c, skelSB);
1442         }
1443         String skelStr = skelSB.toString();
1444         skelStr = nfdNormalizer.normalize(skelStr);
1445         return skelStr;
1446     }
1447 
1448     /**
1449      * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been
1450      * ignored, and starting with ICU 58, this function has been deprecated.
1451      *
1452      * @param type
1453      *            No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA.
1454      * @param id
1455      *            The input identifier whose skeleton will be generated.
1456      * @return The output skeleton string.
1457      *
1458      * @deprecated ICU 58
1459      */
1460     @Deprecated
getSkeleton(int type, String id)1461     public String getSkeleton(int type, String id) {
1462         return getSkeleton(id);
1463     }
1464 
1465     /**
1466      * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have
1467      * enabled the same set of checks.
1468      *
1469      * @param other
1470      *            the SpoofChecker being compared with.
1471      * @return true if the two SpoofCheckers are equal.
1472      */
1473     @Override
equals(Object other)1474     public boolean equals(Object other) {
1475         if (!(other instanceof SpoofChecker)) {
1476             return false;
1477         }
1478         SpoofChecker otherSC = (SpoofChecker) other;
1479         if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) {
1480             return false;
1481         }
1482         if (fChecks != otherSC.fChecks) {
1483             return false;
1484         }
1485         if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null
1486                 && !fAllowedLocales.equals(otherSC.fAllowedLocales)) {
1487             return false;
1488         }
1489         if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null
1490                 && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) {
1491             return false;
1492         }
1493         if (fRestrictionLevel != otherSC.fRestrictionLevel) {
1494             return false;
1495         }
1496         return true;
1497     }
1498 
1499     /**
1500      * Overrides {@link Object#hashCode()}.
1501      */
1502     @Override
hashCode()1503     public int hashCode() {
1504         return fChecks
1505                 ^ fSpoofData.hashCode()
1506                 ^ fAllowedLocales.hashCode()
1507                 ^ fAllowedCharsSet.hashCode()
1508                 ^ fRestrictionLevel.ordinal();
1509     }
1510 
1511     /**
1512      * Computes the augmented script set for a code point, according to UTS 39 section 5.1.
1513      */
getAugmentedScriptSet(int codePoint, ScriptSet result)1514     private static void getAugmentedScriptSet(int codePoint, ScriptSet result) {
1515         result.clear();
1516         UScript.getScriptExtensions(codePoint, result);
1517 
1518         // Section 5.1 step 1
1519         if (result.get(UScript.HAN)) {
1520             result.set(UScript.HAN_WITH_BOPOMOFO);
1521             result.set(UScript.JAPANESE);
1522             result.set(UScript.KOREAN);
1523         }
1524         if (result.get(UScript.HIRAGANA)) {
1525             result.set(UScript.JAPANESE);
1526         }
1527         if (result.get(UScript.KATAKANA)) {
1528             result.set(UScript.JAPANESE);
1529         }
1530         if (result.get(UScript.HANGUL)) {
1531             result.set(UScript.KOREAN);
1532         }
1533         if (result.get(UScript.BOPOMOFO)) {
1534             result.set(UScript.HAN_WITH_BOPOMOFO);
1535         }
1536 
1537         // Section 5.1 step 2
1538         if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) {
1539             result.setAll();
1540         }
1541     }
1542 
1543     /**
1544      * Computes the resolved script set for a string, according to UTS 39 section 5.1.
1545      */
getResolvedScriptSet(CharSequence input, ScriptSet result)1546     private void getResolvedScriptSet(CharSequence input, ScriptSet result) {
1547         getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result);
1548     }
1549 
1550     /**
1551      * Computes the resolved script set for a string, omitting characters having the specified script. If
1552      * UScript.CODE_LIMIT is passed as the second argument, all characters are included.
1553      */
getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result)1554     private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) {
1555         result.setAll();
1556 
1557         ScriptSet temp = new ScriptSet();
1558         for (int utf16Offset = 0; utf16Offset < input.length();) {
1559             int codePoint = Character.codePointAt(input, utf16Offset);
1560             utf16Offset += Character.charCount(codePoint);
1561 
1562             // Compute the augmented script set for the character
1563             getAugmentedScriptSet(codePoint, temp);
1564 
1565             // Intersect the augmented script set with the resolved script set, but only if the character doesn't
1566             // have the script specified in the function call
1567             if (script == UScript.CODE_LIMIT || !temp.get(script)) {
1568                 result.and(temp);
1569             }
1570         }
1571     }
1572 
1573     /**
1574      * Computes the set of numerics for a string, according to UTS 39 section 5.3.
1575      */
getNumerics(String input, UnicodeSet result)1576     private void getNumerics(String input, UnicodeSet result) {
1577         result.clear();
1578 
1579         for (int utf16Offset = 0; utf16Offset < input.length();) {
1580             int codePoint = Character.codePointAt(input, utf16Offset);
1581             utf16Offset += Character.charCount(codePoint);
1582 
1583             // Store a representative character for each kind of decimal digit
1584             if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
1585                 // Store the zero character as a representative for comparison.
1586                 // Unicode guarantees it is codePoint - value
1587                 result.add(codePoint - UCharacter.getNumericValue(codePoint));
1588             }
1589         }
1590     }
1591 
1592     /**
1593      * Computes the restriction level of a string, according to UTS 39 section 5.2.
1594      */
getRestrictionLevel(String input)1595     private RestrictionLevel getRestrictionLevel(String input) {
1596         // Section 5.2 step 1:
1597         if (!fAllowedCharsSet.containsAll(input)) {
1598             return RestrictionLevel.UNRESTRICTIVE;
1599         }
1600 
1601         // Section 5.2 step 2:
1602         if (ASCII.containsAll(input)) {
1603             return RestrictionLevel.ASCII;
1604         }
1605 
1606         // Section 5.2 steps 3:
1607         ScriptSet resolvedScriptSet = new ScriptSet();
1608         getResolvedScriptSet(input, resolvedScriptSet);
1609 
1610         // Section 5.2 step 4:
1611         if (!resolvedScriptSet.isEmpty()) {
1612             return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE;
1613         }
1614 
1615         // Section 5.2 step 5:
1616         ScriptSet resolvedNoLatn = new ScriptSet();
1617         getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn);
1618 
1619         // Section 5.2 step 6:
1620         if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE)
1621                 || resolvedNoLatn.get(UScript.KOREAN)) {
1622             return RestrictionLevel.HIGHLY_RESTRICTIVE;
1623         }
1624 
1625         // Section 5.2 step 7:
1626         if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK)
1627                 && !resolvedNoLatn.get(UScript.CHEROKEE)) {
1628             return RestrictionLevel.MODERATELY_RESTRICTIVE;
1629         }
1630 
1631         // Section 5.2 step 8:
1632         return RestrictionLevel.MINIMALLY_RESTRICTIVE;
1633     }
1634 
findHiddenOverlay(String input)1635     int findHiddenOverlay(String input) {
1636         boolean sawLeadCharacter = false;
1637         StringBuilder sb = new StringBuilder();
1638         for (int i=0; i<input.length();) {
1639             int cp = input.codePointAt(i);
1640             if (sawLeadCharacter && cp == 0x0307) {
1641                 return i;
1642             }
1643             int combiningClass = UCharacter.getCombiningClass(cp);
1644             // Skip over characters except for those with combining class 0 (non-combining characters) or with
1645             // combining class 230 (same class as U+0307)
1646             assert UCharacter.getCombiningClass(0x0307) == 230;
1647             if (combiningClass == 0 || combiningClass == 230) {
1648                 sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp, sb);
1649             }
1650             i += UCharacter.charCount(cp);
1651         }
1652         return -1;
1653     }
1654 
isIllegalCombiningDotLeadCharacterNoLookup(int cp)1655     boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) {
1656         return cp == 'i' || cp == 'j' || cp == 'ı' || cp == 'ȷ' || cp == 'l' ||
1657                UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED);
1658     }
1659 
isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb)1660     boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) {
1661         if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
1662             return true;
1663         }
1664         sb.setLength(0);
1665         fSpoofData.confusableLookup(cp, sb);
1666         int finalCp = UCharacter.codePointBefore(sb, sb.length());
1667         if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
1668             return true;
1669         }
1670         return false;
1671     }
1672 
    // Data Members
    private int fChecks; // Bit vector of checks to perform.
    // Confusable mapping data; consulted through confusableLookup() when building skeletons.
    private SpoofData fSpoofData;
    private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
    private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
    // Level that failsChecks() compares the text's own restriction level against.
    private RestrictionLevel fRestrictionLevel;

    // Shared NFD normalizer, used by failsChecks() and getSkeleton().
    private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance();
1681 
1682     // Confusable Mappings Data Structures, version 2.0
1683     //
1684     // This description and the corresponding implementation are to be kept
1685     // in-sync with the copy in icu4c uspoof_impl.h.
1686     //
1687     // For the confusable data, we are essentially implementing a map,
1688     //     key: a code point
1689     //     value: a string. Most commonly one char in length, but can be more.
1690     //
1691     // The keys are stored as a sorted array of 32 bit ints.
1692     //         bits 0-23 a code point value
1693     //         bits 24-31 length of value string, in UChars (between 1 and 256 UChars).
1694     //     The key table is sorted in ascending code point order. (not on the
1695     //     32 bit int value, the flag bits do not participate in the sorting.)
1696     //
1697     //     Lookup is done by means of a binary search in the key table.
1698     //
1699     // The corresponding values are kept in a parallel array of 16 bit ints.
1700     //     If the value string is of length 1, it is literally in the value array.
1701     //     For longer strings, the value array contains an index into the strings
1702     //     table.
1703     //
1704     // String Table:
1705     //     The strings table contains all of the value strings (those of length two or greater)
    //     concatenated together into one long char (UTF-16) array.
1707     //
1708     //     There is no nul character or other mark between adjacent strings.
1709     //
1710     //----------------------------------------------------------------------------
1711     //
1712     //  Changes from format version 1 to format version 2:
1713     //        1) Removal of the whole-script confusable data tables.
1714     //        2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
1715     //        3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
1716     //        4) Removal of the string lengths table since 8 bits is sufficient for the
1717     //           lengths of all entries in confusables.txt.
1718     //
1719     private static final class ConfusableDataUtils {
1720         public static final int FORMAT_VERSION = 2; // version for ICU 58
1721 
keyToCodePoint(int key)1722         public static final int keyToCodePoint(int key) {
1723             return key & 0x00ffffff;
1724         }
1725 
keyToLength(int key)1726         public static final int keyToLength(int key) {
1727             return ((key & 0xff000000) >> 24) + 1;
1728         }
1729 
codePointAndLengthToKey(int codePoint, int length)1730         public static final int codePointAndLengthToKey(int codePoint, int length) {
1731             assert (codePoint & 0x00ffffff) == codePoint;
1732             assert length <= 256;
1733             return codePoint | ((length - 1) << 24);
1734         }
1735     }
1736 
1737     // -------------------------------------------------------------------------------------
1738     //
1739     // SpoofData
1740     //
1741     // This class corresponds to the ICU SpoofCheck data.
1742     //
1743     // The data can originate with the Binary ICU data that is generated in ICU4C,
1744     // or it can originate from source rules that are compiled in ICU4J.
1745     //
1746     // This class does not include the set of checks to be performed, but only
1747     // data that is serialized into the ICU binary data.
1748     //
1749     // Because Java cannot easily wrap binary data like ICU4C, the binary data is
1750     // copied into Java structures that are convenient for use by the run time code.
1751     //
1752     // ---------------------------------------------------------------------------------------
1753     private static class SpoofData {
1754 
1755         // The Confusable data, Java data structures for.
1756         int[] fCFUKeys;
1757         short[] fCFUValues;
1758         String fCFUStrings;
1759 
1760         private static final int DATA_FORMAT = 0x43667520; // "Cfu "
1761 
1762         private static final class IsAcceptable implements Authenticate {
1763             @Override
isDataVersionAcceptable(byte version[])1764             public boolean isDataVersionAcceptable(byte version[]) {
1765                 return version[0] == ConfusableDataUtils.FORMAT_VERSION || version[1] != 0 || version[2] != 0
1766                         || version[3] != 0;
1767             }
1768         }
1769 
1770         private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
1771 
1772         private static final class DefaultData {
1773             private static SpoofData INSTANCE = null;
1774             private static IOException EXCEPTION = null;
1775 
1776             static {
1777                 // Note: Although this is static, the Java runtime can delay execution of this block until
1778                 // the data is actually requested via SpoofData.getDefault().
1779                 try {
1780                     INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
1781                 } catch (IOException e) {
1782                     EXCEPTION = e;
1783                 }
1784             }
1785         }
1786 
1787         /**
1788          * @return instance for Unicode standard data
1789          */
getDefault()1790         public static SpoofData getDefault() {
1791             if (DefaultData.EXCEPTION != null) {
1792                 throw new MissingResourceException(
1793                         "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(),
1794                         "SpoofChecker", "");
1795             }
1796             return DefaultData.INSTANCE;
1797         }
1798 
1799         // SpoofChecker Data constructor for use from data builder.
1800         // Initializes a new, empty data area that will be populated later.
SpoofData()1801         private SpoofData() {
1802         }
1803 
1804         // Constructor for use when creating from prebuilt default data.
1805         // A ByteBuffer is what the ICU internal data loading functions provide.
SpoofData(ByteBuffer bytes)1806         private SpoofData(ByteBuffer bytes) throws java.io.IOException {
1807             ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE);
1808             bytes.mark();
1809             readData(bytes);
1810         }
1811 
1812         @Override
equals(Object other)1813         public boolean equals(Object other) {
1814             if (!(other instanceof SpoofData)) {
1815                 return false;
1816             }
1817             SpoofData otherData = (SpoofData) other;
1818             if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys))
1819                 return false;
1820             if (!Arrays.equals(fCFUValues, otherData.fCFUValues))
1821                 return false;
1822             if (!Utility.sameObjects(fCFUStrings, otherData.fCFUStrings) && fCFUStrings != null
1823                     && !fCFUStrings.equals(otherData.fCFUStrings))
1824                 return false;
1825             return true;
1826         }
1827 
1828         @Override
hashCode()1829         public int hashCode() {
1830             return Arrays.hashCode(fCFUKeys)
1831                     ^ Arrays.hashCode(fCFUValues)
1832                     ^ fCFUStrings.hashCode();
1833         }
1834 
1835         // Set the SpoofChecker data from pre-built binary data in a byte buffer.
1836         // The binary data format is as described for ICU4C spoof data.
1837         //
readData(ByteBuffer bytes)1838         private void readData(ByteBuffer bytes) throws java.io.IOException {
1839             int magic = bytes.getInt();
1840             if (magic != 0x3845fdef) {
1841                 throw new IllegalArgumentException("Bad Spoof Check Data.");
1842             }
1843             @SuppressWarnings("unused")
1844             int dataFormatVersion = bytes.getInt();
1845             @SuppressWarnings("unused")
1846             int dataLength = bytes.getInt();
1847 
1848             int CFUKeysOffset = bytes.getInt();
1849             int CFUKeysSize = bytes.getInt();
1850 
1851             int CFUValuesOffset = bytes.getInt();
1852             int CFUValuesSize = bytes.getInt();
1853 
1854             int CFUStringTableOffset = bytes.getInt();
1855             int CFUStringTableSize = bytes.getInt();
1856 
1857             // We have now read the file header, and obtained the position for each
1858             // of the data items. Now read each in turn, first seeking the
1859             // input stream to the position of the data item.
1860 
1861             bytes.reset();
1862             ICUBinary.skipBytes(bytes, CFUKeysOffset);
1863             fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0);
1864 
1865             bytes.reset();
1866             ICUBinary.skipBytes(bytes, CFUValuesOffset);
1867             fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0);
1868 
1869             bytes.reset();
1870             ICUBinary.skipBytes(bytes, CFUStringTableOffset);
1871             fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0);
1872         }
1873 
1874         /**
1875          * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be
1876          * appended will between 1 and 18 characters as of Unicode 9.
1877          *
1878          * This is the heart of the confusable skeleton generation implementation.
1879          */
confusableLookup(int inChar, StringBuilder dest)1880         public void confusableLookup(int inChar, StringBuilder dest) {
1881             // Perform a binary search.
1882             // [lo, hi), i.e lo is inclusive, hi is exclusive.
1883             // The result after the loop will be in lo.
1884             int lo = 0;
1885             int hi = length();
1886             do {
1887                 int mid = (lo + hi) / 2;
1888                 if (codePointAt(mid) > inChar) {
1889                     hi = mid;
1890                 } else if (codePointAt(mid) < inChar) {
1891                     lo = mid;
1892                 } else {
1893                     // Found result. Break early.
1894                     lo = mid;
1895                     break;
1896                 }
1897             } while (hi - lo > 1);
1898 
1899             // Did we find an entry? If not, the char maps to itself.
1900             if (codePointAt(lo) != inChar) {
1901                 dest.appendCodePoint(inChar);
1902                 return;
1903             }
1904 
1905             // Add the element to the string builder and return.
1906             appendValueTo(lo, dest);
1907             return;
1908         }
1909 
1910         /**
1911          * Return the number of confusable entries in this SpoofData.
1912          *
1913          * @return The number of entries.
1914          */
length()1915         public int length() {
1916             return fCFUKeys.length;
1917         }
1918 
1919         /**
1920          * Return the code point (key) at the specified index.
1921          *
1922          * @param index
1923          *            The index within the SpoofData.
1924          * @return The code point.
1925          */
codePointAt(int index)1926         public int codePointAt(int index) {
1927             return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]);
1928         }
1929 
1930         /**
1931          * Append the confusable skeleton at the specified index to the StringBuilder dest.
1932          *
1933          * @param index
1934          *            The index within the SpoofData.
1935          * @param dest
1936          *            The StringBuilder to which to append the skeleton.
1937          */
appendValueTo(int index, StringBuilder dest)1938         public void appendValueTo(int index, StringBuilder dest) {
1939             int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]);
1940 
1941             // Value is either a char (for strings of length 1) or
1942             // an index into the string table (for longer strings)
1943             short value = fCFUValues[index];
1944             if (stringLength == 1) {
1945                 dest.append((char) value);
1946             } else {
1947                 dest.append(fCFUStrings, value, value + stringLength);
1948             }
1949         }
1950     }
1951 
1952     // -------------------------------------------------------------------------------
1953     //
1954     // ScriptSet - Script code bit sets.
1955     // Extends Java BitSet with input/output support and a few helper methods.
1956     // Note: The I/O is not currently being used, so it has been commented out. If
1957     // it is needed again, the code can be restored.
1958     //
1959     // -------------------------------------------------------------------------------
1960     static class ScriptSet extends BitSet {
1961 
1962         // Eclipse default value to quell warnings:
1963         private static final long serialVersionUID = 1L;
1964 
1965         // // The serialized version of this class can hold INT_CAPACITY * 32 scripts.
1966         // private static final int INT_CAPACITY = 6;
1967         // private static final long serialVersionUID = INT_CAPACITY;
1968         // static {
1969         // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT;
1970         // }
1971         //
1972         // public ScriptSet() {
1973         // }
1974         //
1975         // public ScriptSet(ByteBuffer bytes) throws java.io.IOException {
1976         // for (int i = 0; i < INT_CAPACITY; i++) {
1977         // int bits = bytes.getInt();
1978         // for (int j = 0; j < Integer.SIZE; j++) {
1979         // if ((bits & (1 << j)) != 0) {
1980         // set(i * Integer.SIZE + j);
1981         // }
1982         // }
1983         // }
1984         // }
1985         //
1986         // public void output(DataOutputStream os) throws java.io.IOException {
1987         // for (int i = 0; i < INT_CAPACITY; i++) {
1988         // int bits = 0;
1989         // for (int j = 0; j < Integer.SIZE; j++) {
1990         // if (get(i * Integer.SIZE + j)) {
1991         // bits |= (1 << j);
1992         // }
1993         // }
1994         // os.writeInt(bits);
1995         // }
1996         // }
1997 
ScriptSet()1998         ScriptSet() {
1999         }
2000 
and(int script)2001         public void and(int script) {
2002             this.clear(0, script);
2003             this.clear(script + 1, UScript.CODE_LIMIT);
2004         }
2005 
setAll()2006         public void setAll() {
2007             this.set(0, UScript.CODE_LIMIT);
2008         }
2009 
isFull()2010         public boolean isFull() {
2011             return cardinality() == UScript.CODE_LIMIT;
2012         }
2013 
appendStringTo(StringBuilder sb)2014         public void appendStringTo(StringBuilder sb) {
2015             sb.append("{ ");
2016             if (isEmpty()) {
2017                 sb.append("- ");
2018             } else if (isFull()) {
2019                 sb.append("* ");
2020             } else {
2021                 for (int script = 0; script < UScript.CODE_LIMIT; script++) {
2022                     if (get(script)) {
2023                         sb.append(UScript.getShortName(script));
2024                         sb.append(" ");
2025                     }
2026                 }
2027             }
2028             sb.append("}");
2029         }
2030 
2031         @Override
toString()2032         public String toString() {
2033             StringBuilder sb = new StringBuilder();
2034             sb.append("<ScriptSet ");
2035             appendStringTo(sb);
2036             sb.append(">");
2037             return sb.toString();
2038         }
2039     }
2040 }
2041