• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ***************************************************************************
5  * Copyright (C) 2008-2016 International Business Machines Corporation
6  * and others. All Rights Reserved.
7  ***************************************************************************
8  *
9  * Unicode Spoof Detection
10  */
11 
12 package com.ibm.icu.text;
13 
14 import java.io.IOException;
15 import java.io.LineNumberReader;
16 import java.io.Reader;
17 import java.nio.ByteBuffer;
18 import java.text.ParseException;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.BitSet;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.HashSet;
25 import java.util.Hashtable;
26 import java.util.LinkedHashSet;
27 import java.util.Locale;
28 import java.util.MissingResourceException;
29 import java.util.Set;
30 import java.util.Vector;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33 
34 import com.ibm.icu.impl.ICUBinary;
35 import com.ibm.icu.impl.ICUBinary.Authenticate;
36 import com.ibm.icu.impl.Utility;
37 import com.ibm.icu.lang.UCharacter;
38 import com.ibm.icu.lang.UCharacter.IdentifierType;
39 import com.ibm.icu.lang.UCharacterCategory;
40 import com.ibm.icu.lang.UProperty;
41 import com.ibm.icu.lang.UScript;
42 import com.ibm.icu.util.ULocale;
43 
44 /**
45  * <p>
46  * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
47  * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
48  *
49  * <ol>
50  * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desparejado" and
51  * "ԁеѕрагејаԁо".</li>
52  * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
53  * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
54  * </ol>
55  *
56  * <p>
57  * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
58  * <code>SpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
59  * content filters.
60  *
61  * <h2>Confusables</h2>
62  *
63  * <p>
64  * The following example shows how to use <code>SpoofChecker</code> to check for confusability between two strings:
65  *
66  * <pre>
67  * <code>
68  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
69  * int result = sc.areConfusable("desparejado", "ԁеѕрагејаԁо");
70  * System.out.println(result != 0);  // true
71  * </code>
72  * </pre>
73  *
74  * <p>
75  * <code>SpoofChecker</code> uses a builder paradigm: options are specified within the context of a lightweight
76  * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading
77  * operations are performed, and an immutable <code>SpoofChecker</code> is returned.
78  *
79  * <p>
80  * The first line of the example creates a <code>SpoofChecker</code> object with confusable-checking enabled; the second
81  * line performs the confusability test. For best performance, the instance should be created once (e.g., upon
82  * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime.
83  *
84  * <p>
85  * If the paragraph direction used to display the strings is known, it should be passed to {@link SpoofChecker#areConfusable}:
86  *
87  * <pre>
88  * <code>
89  * // These strings look identical when rendered in a left-to-right context.
90  * // They look distinct in a right-to-left context.
91  * String s1 = "A1\u05D0";  // A1א
92  * String s2 = "A\u05D01";  // Aא1
93  *
94  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
95  * int result = sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, s1, s2);
96  * System.out.println(result != 0);  // true
97  * </code>
98  * </pre>
99  *
100  * <p>
101  * UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a
102  * sequence of families of confusable characters, where each family has a single exemplar character.
103  * {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is
104  * equivalent to the example above:
105  *
106  * <pre>
107  * <code>
108  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
109  * boolean result = sc.getSkeleton("desparejado").equals(sc.getSkeleton("ԁеѕрагејаԁо"));
110  * System.out.println(result);  // true
111  * </code>
112  * </pre>
113  *
114  * <p>
115  * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
116  * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as
117  * shown below:
118  *
119  * <pre>
120  * // Setup:
121  * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example
122  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
123  * HashSet&lt;String&gt; skeletons = new HashSet&lt;String&gt;();
124  * for (String word : DICTIONARY) {
125  *   skeletons.add(sc.getSkeleton(word));
126  * }
127  *
128  * // Live Check:
129  * boolean result = skeletons.contains(sc.getSkeleton("1orern"));
130  * System.out.println(result);  // true
131  * </pre>
132  *
133  * <p>
134  * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
135  * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
136  * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
137  *
138  * <h2>Spoof Detection</h2>
139  *
140  * <p>
141  * The following snippet shows a minimal example of using <code>SpoofChecker</code> to perform spoof detection on a
142  * string:
143  *
144  * <pre>
145  * SpoofChecker sc = new SpoofChecker.Builder()
146  *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
147  *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
148  *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
149  *     .build();
150  * boolean result = sc.failsChecks("pаypаl");  // with Cyrillic 'а' characters
151  * System.out.println(result);  // true
152  * </pre>
153  *
154  * <p>
155  * As in the case for confusability checking, it is good practice to create one <code>SpoofChecker</code> instance at
156  * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of
157  * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the
158  * third line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the
159  * instance to perform confusability checking.
160  *
161  * <p>
162  * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}:
163  *
164  * <pre>
165  * <code>
166  * SpoofChecker sc = new SpoofChecker.Builder()
167  *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
168  *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
169  *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
170  *     .build();
171  * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult();
172  * boolean result = sc.failsChecks("pаypаl", checkResult);
173  * System.out.println(checkResult.checks);  // 16
174  * </code>
175  * </pre>
176  *
177  * <p>
178  * The <code>checks</code> field of the result is a bitmask of the checks that failed. In this case, there was one check that failed:
179  * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
180  *
181  * <ul>
182  * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
183  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
184  * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
185  * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
186  * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
187  * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
188  * characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li>
189  * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
190  * </ul>
191  *
192  * <p>
193  * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
194  * INVISIBLE and MIXED_NUMBERS conditions, you could do:
195  *
196  * <pre>
197  * <code>
198  * SpoofChecker sc = new SpoofChecker.Builder()
199  *     .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS)
200  *     .build();
201  * boolean result = sc.failsChecks("৪8");
202  * System.out.println(result);  // true
203  * </code>
204  * </pre>
205  *
206  * <p>
207  * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
208  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
209  * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
210  * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
211  * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
212  * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
213  * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of
214  * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code
215  * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
216  * scripts.
217  *
218  * <h2>Advanced bidirectional usage</h2>
219  * If the paragraph direction with which the identifiers will be displayed is not known, there are
220  * multiple options for confusable detection depending on the circumstances.
221  *
222  * <p>
223  * In some circumstances, the only concern is confusion between identifiers displayed with the same
224  * paragraph direction.
225  *
226  * <p>
227  * An example is the case where identifiers are usernames prefixed with the @ symbol.
228  * That symbol will appear to the left in a left-to-right context, and to the right in a
229  * right-to-left context, so that an identifier displayed in a left-to-right context can never be
230  * confused with an identifier displayed in a right-to-left context:
231  * <ul>
232  * <li>
233  * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
234  * would be considered confusable, since they both appear as @A1א in a left-to-right context, and the
235  * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
236  * confusable, since they both appear as A_1א@ in a right-to-left context.
237  * </li>
238  * <li>
239  * The username "Mark_" would not be considered confusable with the username "_Mark",
240  * even though the latter would appear as Mark_@ in a right-to-left context, and the
241  * former as @Mark_ in a left-to-right context.
242  * </li>
243  * </ul>
244  * <p>
245  * In that case, the caller should check for both LTR-confusability and RTL-confusability:
246  *
247  * <pre>
248  * <code>
249  * boolean confusableInEitherDirection =
250  *     sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, id1, id2) ||
251  *     sc.areConfusable(Bidi.DIRECTION_RIGHT_TO_LEFT, id1, id2);
252  * </code>
253  * </pre>
254  *
255  * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
256  * with LTR and RTL with RTL.
257  *
258  * <p>
259  * In cases where confusability between the visual appearances of an identifier displayed in a
260  * left-to-right context with another identifier displayed in a right-to-left context is a concern,
261  * the LTR skeleton of one can be compared with the RTL skeleton of the other.  However, this
262  * very broad definition of confusability may have unexpected results; for instance, it treats the
263  * ASCII identifiers "Mark_" and "_Mark" as confusable.
264  *
265  * <h2>Additional Information</h2>
266  *
267  * <p>
268  * A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
269  *
270  * <p>
271  * <b>Thread Safety:</b> The methods on <code>SpoofChecker</code> objects are thread safe. The test functions for
272  * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called
273  * concurrently from multiple threads using the same <code>SpoofChecker</code> instance.
274  *
275  * @stable ICU 4.6
276  */
277 public class SpoofChecker {
278 
279     /**
280      * Constants from UTS 39 for use in setRestrictionLevel.
281      *
282      * @stable ICU 53
283      */
284     public enum RestrictionLevel {
285         /**
286          * All characters in the string are in the identifier profile and all characters in the string are in the ASCII
287          * range.
288          *
289          * @stable ICU 53
290          */
291         ASCII,
292         /**
293          * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the
294          * string is single-script, according to the definition in UTS 39 section 5.1.
295          *
296          * @stable ICU 53
297          */
298         SINGLE_SCRIPT_RESTRICTIVE,
299         /**
300          * The string classifies as Single Script, or all characters in the string are in the identifier profile and the
301          * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1:
302          * <ul>
303          * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
304          * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
305          * <li>Latin + Han + Hangul (or equivalently: Latn +Kore)</li>
306          * </ul>
307          *
308          * @stable ICU 53
309          */
310         HIGHLY_RESTRICTIVE,
311         /**
312          * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
313          * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
314          * Greek, and Cherokee.
315          *
316          * @stable ICU 53
317          */
318         MODERATELY_RESTRICTIVE,
319         /**
320          * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as
321          * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us.
322          *
323          * @stable ICU 53
324          */
325         MINIMALLY_RESTRICTIVE,
326         /**
327          * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
328          *
329          * @stable ICU 53
330          */
331         UNRESTRICTIVE,
332     }
333 
334     /**
335      * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
336      *
337      * @stable ICU 58
338      */
339     public static final UnicodeSet INCLUSION =
340             new UnicodeSet().
341             applyIntPropertyValue(UProperty.IDENTIFIER_TYPE, IdentifierType.INCLUSION.ordinal()).
342             freeze();
343 
344     /**
345      * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
346      *
347      * @stable ICU 58
348      */
349     public static final UnicodeSet RECOMMENDED =
350             new UnicodeSet().
351             applyIntPropertyValue(UProperty.IDENTIFIER_TYPE, IdentifierType.RECOMMENDED.ordinal()).
352             freeze();
353 
354     /**
355      * Constants for the kinds of checks that USpoofChecker can perform. These values are used both to select the set of
356      * checks that will be performed, and to report results from the check function.
357      *
358      */
359 
360     /**
361      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
362      * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
363      * 4.
364      *
365      * @stable ICU 4.6
366      */
367     public static final int SINGLE_SCRIPT_CONFUSABLE = 1;
368 
369     /**
370      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
371      * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
372      * 39 section 4.
373      *
374      * @stable ICU 4.6
375      */
376     public static final int MIXED_SCRIPT_CONFUSABLE = 2;
377 
378     /**
379      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
380      * that the two strings are visually confusable and that they are not from the same script but both of them are
381      * single-script strings, according to UTS 39 section 4.
382      *
383      * @stable ICU 4.6
384      */
385     public static final int WHOLE_SCRIPT_CONFUSABLE = 4;
386 
387     /**
388      * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the
389      * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make
390      * {@link SpoofChecker#areConfusable} return only those types of confusables.
391      *
392      * @stable ICU 58
393      */
394     public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;
395 
396     /**
397      * This flag is deprecated and no longer affects the behavior of SpoofChecker.
398      *
399      * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
400      * deprecated.
401      */
402     @Deprecated
403     public static final int ANY_CASE = 8;
404 
405     /**
406      * Check that an identifier satisfies the requirements for the restriction level specified in
407      * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is
408      * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
409      *
410      * @stable ICU 58
411      */
412     public static final int RESTRICTION_LEVEL = 16;
413 
414     /**
415      * Check that an identifier contains only characters from a single script (plus chars from the common and inherited
416      * scripts.) Applies to checks of a single identifier check only.
417      *
418      * @deprecated ICU 51 Use RESTRICTION_LEVEL
419      */
420     @Deprecated
421     public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;
422 
423     /**
424      * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
425      * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
426      * test the input string as a whole for conformance to any particular syntax for identifiers.
427      *
428      * @stable ICU 4.6
429      */
430     public static final int INVISIBLE = 32;
431 
432     /**
433      * Check that an identifier contains only characters from a specified set of acceptable characters. See
434      * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check
435      * will also fail the {@link #RESTRICTION_LEVEL} check.
436      *
437      * @stable ICU 4.6
438      */
439     public static final int CHAR_LIMIT = 64;
440 
441     /**
442      * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39
443      * section 5.3.
444      *
445      * @stable ICU 58
446      */
447     public static final int MIXED_NUMBERS = 128;
448 
449     /**
450      * Check that an identifier does not have a combining character following a character in which that
451      * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
452      * <p>
453      * More specifically, the following characters are forbidden from preceding a U+0307:
454      * <ul>
455      * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
456      * <li>Latin lowercase letter 'l'</li>
457      * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
458      * <li>Any character whose confusable prototype ends with such a character
459      * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
460      * </ul>
461      * In addition, combining characters are allowed between the above characters and U+0307 except those
462      * with combining class 0 or combining class "Above" (230, same class as U+0307).
463      * <p>
464      * This list and the number of combing characters considered by this check may grow over time.
465      *
466      * @stable ICU 62
467      */
468     public static final int HIDDEN_OVERLAY = 256;
469 
470     // Update CheckResult.toString() when a new check is added.
471 
472     /**
473      * Enable all spoof checks.
474      *
475      * @stable ICU 4.6
476      */
477     public static final int ALL_CHECKS = 0xFFFFFFFF;
478 
479     // Used for checking for ASCII-Only restriction level
480     static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
481 
482     /**
483      * private constructor: a SpoofChecker has to be built by the builder
484      */
SpoofChecker()485     private SpoofChecker() {
486     }
487 
488     /**
489      * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired
490      * checking options on the builder, then call the build() function to create a SpoofChecker instance.
491      *
492      * @stable ICU 4.6
493      */
494     public static class Builder {
495         int fChecks; // Bit vector of checks to perform.
496         SpoofData fSpoofData;
497         final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters.
498         // for this Spoof Checker. Defaults to all chars.
499         final Set<ULocale> fAllowedLocales = new LinkedHashSet<>(); // The list of allowed locales.
500         private RestrictionLevel fRestrictionLevel;
501 
502         /**
503          * Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for
504          * LOCALE_LIMIT and CHAR_LIMIT. Note that additional checks may be added in the future, resulting in the changes
505          * to the default checking behavior.
506          *
507          * @stable ICU 4.6
508          */
Builder()509         public Builder() {
510             fChecks = ALL_CHECKS;
511             fSpoofData = null;
512             fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE;
513         }
514 
515         /**
516          * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker.
517          *
518          * @param src
519          *            The existing checker.
520          * @stable ICU 4.6
521          */
Builder(SpoofChecker src)522         public Builder(SpoofChecker src) {
523             fChecks = src.fChecks;
524             fSpoofData = src.fSpoofData; // For the data, we will either use the source data
525                                          // as-is, or drop the builder's reference to it
526                                          // and generate new data, depending on what our
527                                          // caller does with the builder.
528             fAllowedCharsSet.set(src.fAllowedCharsSet);
529             fAllowedLocales.addAll(src.fAllowedLocales);
530             fRestrictionLevel = src.fRestrictionLevel;
531         }
532 
533         /**
534          * Create a SpoofChecker with current configuration.
535          *
536          * @return SpoofChecker
537          * @stable ICU 4.6
538          */
build()539         public SpoofChecker build() {
540             // TODO: Make this data loading be lazy (see #12696).
541             if (fSpoofData == null) {
542                 // read binary file
543                 fSpoofData = SpoofData.getDefault();
544             }
545 
546             // Copy all state from the builder to the new SpoofChecker.
547             // Make sure that everything is either cloned or copied, so
548             // that subsequent re-use of the builder won't modify the built
549             // SpoofChecker.
550             //
551             // One exception to this: the SpoofData is just assigned.
552             // If the builder subsequently needs to modify fSpoofData
553             // it will create a new SpoofData object first.
554 
555             SpoofChecker result = new SpoofChecker();
556             result.fChecks = this.fChecks;
557             result.fSpoofData = this.fSpoofData;
558             result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
559             result.fAllowedCharsSet.freeze();
560             result.fAllowedLocales = new HashSet<>(this.fAllowedLocales);
561             result.fRestrictionLevel = this.fRestrictionLevel;
562             return result;
563         }
564 
565         /**
566          * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data file
567          * confusables.txt as described in Unicode UAX 39. The syntax of the source data is as described in UAX 39 for
568          * these files, and the content of these files is acceptable input.
569          *
570          * @param confusables
571          *            the Reader of confusable characters definitions, as found in file confusables.txt from
572          *            unicode.org.
573          * @throws ParseException
574          *             To report syntax errors in the input.
575          *
576          * @stable ICU 58
577          */
setData(Reader confusables)578         public Builder setData(Reader confusables) throws ParseException, IOException {
579 
580             // Compile the binary data from the source (text) format.
581             // Drop the builder's reference to any pre-existing data, which may
582             // be in use in an already-built checker.
583 
584             fSpoofData = new SpoofData();
585             ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData);
586             return this;
587         }
588 
589         /**
590          * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead.
591          *
592          * @param confusables
593          *            the Reader of confusable characters definitions, as found in file confusables.txt from
594          *            unicode.org.
595          * @param confusablesWholeScript
596          *            No longer supported.
597          * @throws ParseException
598          *             To report syntax errors in the input.
599          *
600          * @deprecated ICU 58
601          */
602         @Deprecated
setData(Reader confusables, Reader confusablesWholeScript)603         public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException {
604             setData(confusables);
605             return this;
606         }
607 
608         /**
609          * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method
610          * overwrites any checks that may have already been enabled. By default, all checks are enabled.
611          *
612          * To enable specific checks and disable all others,
613          * OR together only the bit constants for the desired checks.
614          * For example, to fail strings containing characters outside of
615          * the set specified by {@link #setAllowedChars} and
616          * also strings that contain digits from mixed numbering systems:
617          *
618          * <pre>
619          * {@code
620          * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS);
621          * }
622          * </pre>
623          *
624          * To disable specific checks and enable all others,
625          * start with ALL_CHECKS and "AND away" the not-desired checks.
626          * For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality,
627          * it is good practice to disable the CONFUSABLE check:
628          *
629          * <pre>
630          * {@code
631          * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE);
632          * }
633          * </pre>
634          *
635          * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and
636          * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
637          * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
638          * methods.
639          *
640          * @param checks
641          *            The set of checks that this spoof checker will perform. The value is an 'or' of the desired
642          *            checks.
643          * @return self
644          * @stable ICU 4.6
645          */
setChecks(int checks)646         public Builder setChecks(int checks) {
647             // Verify that the requested checks are all ones (bits) that
648             // are acceptable, known values.
649             if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) {
650                 throw new IllegalArgumentException("Bad Spoof Checks value.");
651             }
652             this.fChecks = (checks & SpoofChecker.ALL_CHECKS);
653             return this;
654         }
655 
656         /**
657          * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
658          * associated with the specified locales. Any previously specified list of locales is replaced by the new
659          * settings.
660          *
661          * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is
662          * determined. Characters from this set of scripts, along with characters from the "common" and "inherited"
663          * Unicode Script categories will be permitted.
664          *
665          * Supplying an empty string removes all restrictions; characters from any script will be allowed.
666          *
667          * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a
668          * non-empty list of locales.
669          *
670          * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function.
671          * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters.
672          *
673          * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of
674          * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with
675          * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}.
676          *
677          * @param locales
678          *            A Set of ULocales, from which the language and associated script are extracted. If the locales Set
679          *            is null, no restrictions will be placed on the allowed characters.
680          *
681          * @return self
682          * @stable ICU 4.6
683          */
setAllowedLocales(Set<ULocale> locales)684         public Builder setAllowedLocales(Set<ULocale> locales) {
685             fAllowedCharsSet.clear();
686 
687             for (ULocale locale : locales) {
688                 // Add the script chars for this locale to the accumulating set
689                 // of allowed chars.
690                 addScriptChars(locale, fAllowedCharsSet);
691             }
692 
693             // If our caller provided an empty list of locales, we disable the
694             // allowed characters checking
695             fAllowedLocales.clear();
696             if (locales.size() == 0) {
697                 fAllowedCharsSet.add(0, 0x10ffff);
698                 fChecks &= ~CHAR_LIMIT;
699                 return this;
700             }
701 
702             // Add all common and inherited characters to the set of allowed
703             // chars.
704             UnicodeSet tempSet = new UnicodeSet();
705             tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON);
706             fAllowedCharsSet.addAll(tempSet);
707             tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED);
708             fAllowedCharsSet.addAll(tempSet);
709 
710             // Store the updated spoof checker state.
711             fAllowedLocales.clear();
712             fAllowedLocales.addAll(locales);
713             fChecks |= CHAR_LIMIT;
714             return this;
715         }
716 
717         /**
718          * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
719          * associated with the specified locales. Any previously specified list of locales is replaced by the new
720          * settings.
721          *
722          * @param locales
723          *            A Set of Locales, from which the language and associated script are extracted. If the locales Set
724          *            is null, no restrictions will be placed on the allowed characters.
725          *
726          * @return self
727          * @stable ICU 54
728          */
setAllowedJavaLocales(Set<Locale> locales)729         public Builder setAllowedJavaLocales(Set<Locale> locales) {
730             HashSet<ULocale> ulocales = new HashSet<>(locales.size());
731             for (Locale locale : locales) {
732                 ulocales.add(ULocale.forLocale(locale));
733             }
734             return setAllowedLocales(ulocales);
735         }
736 
737         // Add (union) to the UnicodeSet all of the characters for the scripts
738         // used for the specified locale. Part of the implementation of
739         // setAllowedLocales.
addScriptChars(ULocale locale, UnicodeSet allowedChars)740         private void addScriptChars(ULocale locale, UnicodeSet allowedChars) {
741             int scripts[] = UScript.getCode(locale);
742             if (scripts != null) {
743                 UnicodeSet tmpSet = new UnicodeSet();
744                 for (int i = 0; i < scripts.length; i++) {
745                     tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]);
746                     allowedChars.addAll(tmpSet);
747                 }
748             }
749             // else it's an unknown script.
750             // Maybe they asked for the script of "zxx", which refers to no linguistic content.
751             // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU.
752         }
753 
754         /**
755          * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit
756          * is replaced by the new settings. This includes limits on characters that were set with the
757          * setAllowedLocales() function. Note that the RESTRICTED set is useful.
758          *
759          * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function.
760          *
761          * @param chars
762          *            A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by
763          *            this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling
764          *            this function. Note that this clears the allowedLocales set.
765          * @return self
766          * @stable ICU 4.6
767          */
setAllowedChars(UnicodeSet chars)768         public Builder setAllowedChars(UnicodeSet chars) {
769             fAllowedCharsSet.set(chars);
770             fAllowedLocales.clear();
771             fChecks |= CHAR_LIMIT;
772             return this;
773         }
774 
775         /**
776          * Set the loosest restriction level allowed for strings. The default if this is not called is
777          * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and
778          * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
779          * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}.
780          *
781          * @param restrictionLevel
782          *            The loosest restriction level allowed.
783          * @return self
784          * @stable ICU 58
785          */
setRestrictionLevel(RestrictionLevel restrictionLevel)786         public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
787             fRestrictionLevel = restrictionLevel;
788             fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS;
789             return this;
790         }
791 
792         /*
793          * *****************************************************************************
794          * Internal classes for compiling confusable data into its binary (runtime) form.
795          * *****************************************************************************
796          */
797         // ---------------------------------------------------------------------
798         //
799         // buildConfusableData Compile the source confusable data, as defined by
800         // the Unicode data file confusables.txt, into the binary
801         // structures used by the confusable detector.
802         //
803         // The binary structures are described in uspoof_impl.h
804         //
805         // 1. parse the data, making a hash table mapping from a codepoint to a String.
806         //
807         // 2. Sort all of the strings encountered by length, since they will need to
808         // be stored in that order in the final string table.
809         // TODO: Sorting these strings by length is no longer needed since the removal of
810         // the string lengths table.  This logic can be removed to save processing time
811         // when building confusables data.
812         //
813         // 3. Build a list of keys (UChar32s) from the mapping table. Sort the
814         // list because that will be the ordering of our runtime table.
815         //
816         // 4. Generate the run time string table. This is generated before the key & value
817         // table because we need the string indexes when building those tables.
818         //
819         // 5. Build the run-time key and value table. These are parallel tables, and
820         // are built at the same time
821 
822         // class ConfusabledataBuilder
823         // An instance of this class exists while the confusable data is being built from source.
824         // It encapsulates the intermediate data structures that are used for building.
825         // It exports one static function, to do a confusable data build.
826         private static class ConfusabledataBuilder {
827 
828             private Hashtable<Integer, SPUString> fTable;
829             private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the
830                                         // four mapping tables.
831 
832             // The compiled data is first assembled into the following four collections,
833             // then output to the builder's SpoofData object.
834             private StringBuffer fStringTable;
835             private ArrayList<Integer> fKeyVec;
836             private ArrayList<Integer> fValueVec;
837             private SPUStringPool stringPool;
838             private Pattern fParseLine;
839             private Pattern fParseHexNum;
840             private int fLineNum;
841 
ConfusabledataBuilder()842             ConfusabledataBuilder() {
843                 fTable = new Hashtable<>();
844                 fKeySet = new UnicodeSet();
845                 fKeyVec = new ArrayList<>();
846                 fValueVec = new ArrayList<>();
847                 stringPool = new SPUStringPool();
848             }
849 
build(Reader confusables, SpoofData dest)850             void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException {
851                 StringBuffer fInput = new StringBuffer();
852 
853                 // Convert the user input data from UTF-8 to char (UTF-16)
854                 LineNumberReader lnr = new LineNumberReader(confusables);
855                 do {
856                     String line = lnr.readLine();
857                     if (line == null) {
858                         break;
859                     }
860                     fInput.append(line);
861                     fInput.append('\n');
862                 } while (true);
863 
864                 // Regular Expression to parse a line from Confusables.txt. The expression will match
865                 // any line. What was matched is determined by examining which capture groups have a match.
866                 // Capture Group 1: the source char
867                 // Capture Group 2: the replacement chars
868                 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
869                 // Capture Group 7: A blank or comment only line.
870                 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
871                 // Example Line from the confusables.txt source file:
872                 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
873                 fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char
874                         "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s)
875                         "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued)
876                         "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type
877                         "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment
878                         "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment
879                         "|^(.*?)$"); // OR match any line, which catches illegal lines.
880 
881                 // Regular expression for parsing a hex number out of a space-separated list of them.
882                 // Capture group 1 gets the number, with spaces removed.
883                 fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)");
884 
885                 // Zap any Byte Order Mark at the start of input. Changing it to a space
886                 // is benign given the syntax of the input.
887                 if (fInput.charAt(0) == 0xfeff) {
888                     fInput.setCharAt(0, (char) 0x20);
889                 }
890 
891                 // Parse the input, one line per iteration of this loop.
892                 Matcher matcher = fParseLine.matcher(fInput);
893                 while (matcher.find()) {
894                     fLineNum++;
895                     if (matcher.start(7) >= 0) {
896                         // this was a blank or comment line.
897                         continue;
898                     }
899                     if (matcher.start(8) >= 0) {
900                         // input file syntax error.
901                         // status = U_PARSE_ERROR;
902                         throw new ParseException(
903                                 "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8),
904                                 matcher.start(8));
905                     }
906 
907                     // We have a good input line. Extract the key character and mapping
908                     // string, and
909                     // put them into the appropriate mapping table.
910                     int keyChar = Integer.parseInt(matcher.group(1), 16);
911                     if (keyChar > 0x10ffff) {
912                         throw new ParseException(
913                                 "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1),
914                                 matcher.start(1));
915                     }
916                     Matcher m = fParseHexNum.matcher(matcher.group(2));
917 
918                     StringBuilder mapString = new StringBuilder();
919                     while (m.find()) {
920                         int c = Integer.parseInt(m.group(1), 16);
921                         if (c > 0x10ffff) {
922                             throw new ParseException(
923                                     "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16),
924                                     matcher.start(2));
925                         }
926                         mapString.appendCodePoint(c);
927                     }
928                     assert (mapString.length() >= 1);
929 
930                     // Put the map (value) string into the string pool
931                     // This a little like a Java intern() - any duplicates will be
932                     // eliminated.
933                     SPUString smapString = stringPool.addString(mapString.toString());
934 
935                     // Add the char . string mapping to the table.
936                     // For Unicode 8, the SL, SA and ML tables have been discontinued.
937                     // All input data from confusables.txt is tagged MA.
938                     fTable.put(keyChar, smapString);
939 
940                     fKeySet.add(keyChar);
941                 }
942 
943                 // Input data is now all parsed and collected.
944                 // Now create the run-time binary form of the data.
945                 //
946                 // This is done in two steps. First the data is assembled into vectors and strings,
947                 // for ease of construction, then the contents of these collections are copied
948                 // into the actual SpoofData object.
949 
950                 // Build up the string array, and record the index of each string therein
951                 // in the (build time only) string pool.
952                 // Strings of length one are not entered into the strings array.
953                 // (Strings in the table are sorted by length)
954 
955                 stringPool.sort();
956                 fStringTable = new StringBuffer();
957                 int poolSize = stringPool.size();
958                 int i;
959                 for (i = 0; i < poolSize; i++) {
960                     SPUString s = stringPool.getByIndex(i);
961                     int strLen = s.fStr.length();
962                     int strIndex = fStringTable.length();
963                     if (strLen == 1) {
964                         // strings of length one do not get an entry in the string table.
965                         // Keep the single string character itself here, which is the same
966                         // convention that is used in the final run-time string table index.
967                         s.fCharOrStrTableIndex = s.fStr.charAt(0);
968                     } else {
969                         s.fCharOrStrTableIndex = strIndex;
970                         fStringTable.append(s.fStr);
971                     }
972                 }
973 
974                 // Construct the compile-time Key and Value table.
975                 //
976                 // The keys in the Key table follow the format described in uspoof.h for the
977                 // Cfu confusables data structure.
978                 //
979                 // Starting in ICU 58, each code point has exactly one entry in the data
980                 // structure.
981 
982                 for (String keyCharStr : fKeySet) {
983                     int keyChar = keyCharStr.codePointAt(0);
984                     SPUString targetMapping = fTable.get(keyChar);
985                     assert targetMapping != null;
986 
987                     // Throw a sane exception if trying to consume a long string.  Otherwise,
988                     // codePointAndLengthToKey will throw an assertion error.
989                     if (targetMapping.fStr.length() > 256) {
990                         throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries.");
991                     }
992 
993                     int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length());
994                     int value = targetMapping.fCharOrStrTableIndex;
995 
996                     fKeyVec.add(key);
997                     fValueVec.add(value);
998                 }
999 
1000                 // Put the assembled data into the destination SpoofData object.
1001 
1002                 // The Key Table
1003                 // While copying the keys to the output array,
1004                 // also sanity check that the keys are sorted.
1005                 int numKeys = fKeyVec.size();
1006                 dest.fCFUKeys = new int[numKeys];
1007                 int previousCodePoint = 0;
1008                 for (i = 0; i < numKeys; i++) {
1009                     int key = fKeyVec.get(i);
1010                     int codePoint = ConfusableDataUtils.keyToCodePoint(key);
1011                     // strictly greater because there can be only one entry per code point
1012                     assert codePoint > previousCodePoint;
1013                     dest.fCFUKeys[i] = key;
1014                     previousCodePoint = codePoint;
1015                 }
1016 
1017                 // The Value Table, parallels the key table
1018                 int numValues = fValueVec.size();
1019                 assert (numKeys == numValues);
1020                 dest.fCFUValues = new short[numValues];
1021                 i = 0;
1022                 for (int value : fValueVec) {
1023                     assert (value < 0xffff);
1024                     dest.fCFUValues[i++] = (short) value;
1025                 }
1026 
1027                 // The Strings Table.
1028                 dest.fCFUStrings = fStringTable.toString();
1029             }
1030 
1031             public static void buildConfusableData(Reader confusables, SpoofData dest)
1032                     throws java.io.IOException, ParseException {
1033                 ConfusabledataBuilder builder = new ConfusabledataBuilder();
1034                 builder.build(confusables, dest);
1035             }
1036 
1037             /*
1038              * *****************************************************************************
1039              * Internal classes for compiling confusable data into its binary (runtime) form.
1040              * *****************************************************************************
1041              */
1042             // SPUString
1043             // Holds a string that is the result of one of the mappings defined
1044             // by the confusable mapping data (confusables.txt from Unicode.org)
1045             // Instances of SPUString exist during the compilation process only.
1046 
1047             private static class SPUString {
1048                 String fStr; // The actual string.
1049                 int fCharOrStrTableIndex; // Index into the final runtime data for this string.
1050                 // (or, for length 1, the single string char itself,
1051                 // there being no string table entry for it.)
1052 
1053                 SPUString(String s) {
1054                     fStr = s;
1055                     fCharOrStrTableIndex = 0;
1056                 }
1057             }
1058 
1059             // Comparison function for ordering strings in the string pool.
1060             // Compare by length first, then, within a group of the same length,
1061             // by code point order.
1062 
1063             private static class SPUStringComparator implements Comparator<SPUString> {
1064                 @Override
1065                 public int compare(SPUString sL, SPUString sR) {
1066                     int lenL = sL.fStr.length();
1067                     int lenR = sR.fStr.length();
1068                     if (lenL < lenR) {
1069                         return -1;
1070                     } else if (lenL > lenR) {
1071                         return 1;
1072                     } else {
1073                         return sL.fStr.compareTo(sR.fStr);
1074                     }
1075                 }
1076 
1077                 final static SPUStringComparator INSTANCE = new SPUStringComparator();
1078             }
1079 
1080             // String Pool A utility class for holding the strings that are the result of
1081             // the spoof mappings. These strings will utimately end up in the
1082             // run-time String Table.
1083             // This is sort of like a sorted set of strings, except that ICU's anemic
1084             // built-in collections don't support those, so it is implemented with a
1085             // combination of a uhash and a Vector.
1086             private static class SPUStringPool {
1087                 public SPUStringPool() {
1088                     fVec = new Vector<>();
1089                     fHash = new Hashtable<>();
1090                 }
1091 
1092                 public int size() {
1093                     return fVec.size();
1094                 }
1095 
1096                 // Get the n-th string in the collection.
1097                 public SPUString getByIndex(int index) {
1098                     SPUString retString = fVec.elementAt(index);
1099                     return retString;
1100                 }
1101 
1102                 // Add a string. Return the string from the table.
1103                 // If the input parameter string is already in the table, delete the
1104                 // input parameter and return the existing string.
1105                 public SPUString addString(String src) {
1106                     SPUString hashedString = fHash.get(src);
1107                     if (hashedString == null) {
1108                         hashedString = new SPUString(src);
1109                         fHash.put(src, hashedString);
1110                         fVec.addElement(hashedString);
1111                     }
1112                     return hashedString;
1113                 }
1114 
1115                 // Sort the contents; affects the ordering of getByIndex().
1116                 public void sort() {
1117                     Collections.sort(fVec, SPUStringComparator.INSTANCE);
1118                 }
1119 
1120                 private Vector<SPUString> fVec; // Elements are SPUString *
1121                 private Hashtable<String, SPUString> fHash; // Key: Value:
1122             }
1123 
1124         }
1125     }
1126 
1127     /**
1128      * Get the Restriction Level that is being tested.
1129      *
1130      * @return The restriction level
1131      * @internal
1132      * @deprecated This API is ICU internal only.
1133      */
1134     @Deprecated
1135     public RestrictionLevel getRestrictionLevel() {
1136         return fRestrictionLevel;
1137     }
1138 
1139     /**
1140      * Get the set of checks that this Spoof Checker has been configured to perform.
1141      *
1142      * @return The set of checks that this spoof checker will perform.
1143      * @stable ICU 4.6
1144      */
1145     public int getChecks() {
1146         return fChecks;
1147     }
1148 
1149     /**
1150      * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on
1151      * scripts have been specified, an empty set will be returned.
1152      *
1153      * setAllowedChars() will reset the list of allowed locales to be empty.
1154      *
1155      * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales();
1156      * the information other than languages from the originally specified locales may be omitted.
1157      *
1158      * @return A set of locales corresponding to the acceptable scripts.
1159      *
1160      * @stable ICU 4.6
1161      */
1162     public Set<ULocale> getAllowedLocales() {
1163         return Collections.unmodifiableSet(fAllowedLocales);
1164     }
1165 
1166     /**
1167      * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If
1168      * no limitations on scripts have been specified, an empty set will be returned.
1169      *
1170      * @return A set of locales corresponding to the acceptable scripts.
1171      * @stable ICU 54
1172      */
1173     public Set<Locale> getAllowedJavaLocales() {
1174         HashSet<Locale> locales = new HashSet<>(fAllowedLocales.size());
1175         for (ULocale uloc : fAllowedLocales) {
1176             locales.add(uloc.toLocale());
1177         }
1178         return locales;
1179     }
1180 
1181     /**
1182      * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set
1183      * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by
1184      * this function.
1185      *
1186      * The returned set will be frozen, meaning that it cannot be modified by the caller.
1187      *
1188      * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test.
1189      * @stable ICU 4.6
1190      */
1191     public UnicodeSet getAllowedChars() {
1192         return fAllowedCharsSet;
1193     }
1194 
1195     /**
1196      * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed.
1197      *
1198      * @stable ICU 4.6
1199      */
1200     public static class CheckResult {
1201         /**
1202          * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
1203          * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on.
1204          *
1205          * @stable ICU 4.6
1206          * @see Builder#setChecks
1207          */
1208         public int checks;
1209 
1210         /**
1211          * The index of the first string position that failed a check.
1212          *
1213          * @deprecated ICU 51. No longer supported. Always set to zero.
1214          */
1215         @Deprecated
1216         public int position;
1217 
1218         /**
1219          * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null.  The set will contain the zero
1220          * digit from each decimal number system found in the input string.
1221          *
1222          * @stable ICU 58
1223          */
1224         public UnicodeSet numerics;
1225 
1226         /**
1227          * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null.
1228          *
1229          * @stable ICU 58
1230          */
1231         public RestrictionLevel restrictionLevel;
1232 
1233         /**
1234          * Default constructor
1235          *
1236          * @stable ICU 4.6
1237          */
1238         public CheckResult() {
1239             checks = 0;
1240             position = 0;
1241         }
1242 
1243         /**
1244          * {@inheritDoc}
1245          *
1246          * @stable ICU 4.6
1247          */
1248         @Override
1249         public String toString() {
1250             StringBuilder sb = new StringBuilder();
1251             sb.append("checks:");
1252             if (checks == 0) {
1253                 sb.append(" none");
1254             } else if (checks == ALL_CHECKS) {
1255                 sb.append(" all");
1256             } else {
1257                 if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) {
1258                     sb.append(" SINGLE_SCRIPT_CONFUSABLE");
1259                 }
1260                 if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) {
1261                     sb.append(" MIXED_SCRIPT_CONFUSABLE");
1262                 }
1263                 if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) {
1264                     sb.append(" WHOLE_SCRIPT_CONFUSABLE");
1265                 }
1266                 if ((checks & ANY_CASE) != 0) {
1267                     sb.append(" ANY_CASE");
1268                 }
1269                 if ((checks & RESTRICTION_LEVEL) != 0) {
1270                     sb.append(" RESTRICTION_LEVEL");
1271                 }
1272                 if ((checks & INVISIBLE) != 0) {
1273                     sb.append(" INVISIBLE");
1274                 }
1275                 if ((checks & CHAR_LIMIT) != 0) {
1276                     sb.append(" CHAR_LIMIT");
1277                 }
1278                 if ((checks & MIXED_NUMBERS) != 0) {
1279                     sb.append(" MIXED_NUMBERS");
1280                 }
1281             }
1282             sb.append(", numerics: ").append(numerics.toPattern(false));
1283             sb.append(", position: ").append(position);
1284             sb.append(", restrictionLevel: ").append(restrictionLevel);
1285             return sb.toString();
1286         }
1287     }
1288 
1289     /**
1290      * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1291      * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1292      *
1293      * @param text
1294      *            A String to be checked for possible security issues.
1295      * @param checkResult
1296      *            Output parameter, indicates which specific tests failed. May be null if the information is not wanted.
     * @return True if any issue is found with the input string.
1298      * @stable ICU 4.8
1299      */
1300     public boolean failsChecks(String text, CheckResult checkResult) {
1301         int length = text.length();
1302 
1303         int result = 0;
1304         if (checkResult != null) {
1305             checkResult.position = 0;
1306             checkResult.numerics = null;
1307             checkResult.restrictionLevel = null;
1308         }
1309 
1310         if (0 != (this.fChecks & RESTRICTION_LEVEL)) {
1311             RestrictionLevel textRestrictionLevel = getRestrictionLevel(text);
1312             if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) {
1313                 result |= RESTRICTION_LEVEL;
1314             }
1315             if (checkResult != null) {
1316                 checkResult.restrictionLevel = textRestrictionLevel;
1317             }
1318         }
1319 
1320         if (0 != (this.fChecks & MIXED_NUMBERS)) {
1321             UnicodeSet numerics = new UnicodeSet();
1322             getNumerics(text, numerics);
1323             if (numerics.size() > 1) {
1324                 result |= MIXED_NUMBERS;
1325             }
1326             if (checkResult != null) {
1327                 checkResult.numerics = numerics;
1328             }
1329         }
1330 
1331         if (0 != (this.fChecks & HIDDEN_OVERLAY)) {
1332             int index = findHiddenOverlay(text);
1333             if (index != -1) {
1334                 result |= HIDDEN_OVERLAY;
1335             }
1336         }
1337 
1338         if (0 != (this.fChecks & CHAR_LIMIT)) {
1339             int i;
1340             int c;
1341             for (i = 0; i < length;) {
1342                 // U16_NEXT(text, i, length, c);
1343                 c = Character.codePointAt(text, i);
1344                 i = Character.offsetByCodePoints(text, i, 1);
1345                 if (!this.fAllowedCharsSet.contains(c)) {
1346                     result |= CHAR_LIMIT;
1347                     break;
1348                 }
1349             }
1350         }
1351 
1352         if (0 != (this.fChecks & INVISIBLE)) {
1353             // This check needs to be done on NFD input
1354             String nfdText = nfdNormalizer.normalize(text);
1355 
1356             // scan for more than one occurrence of the same non-spacing mark
1357             // in a sequence of non-spacing marks.
1358             int i;
1359             int c;
1360             int firstNonspacingMark = 0;
1361             boolean haveMultipleMarks = false;
1362             UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a
1363                                                           // single combining sequence.
1364             for (i = 0; i < length;) {
1365                 c = Character.codePointAt(nfdText, i);
1366                 i = Character.offsetByCodePoints(nfdText, i, 1);
1367                 if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) {
1368                     firstNonspacingMark = 0;
1369                     if (haveMultipleMarks) {
1370                         marksSeenSoFar.clear();
1371                         haveMultipleMarks = false;
1372                     }
1373                     continue;
1374                 }
1375                 if (firstNonspacingMark == 0) {
1376                     firstNonspacingMark = c;
1377                     continue;
1378                 }
1379                 if (!haveMultipleMarks) {
1380                     marksSeenSoFar.add(firstNonspacingMark);
1381                     haveMultipleMarks = true;
1382                 }
1383                 if (marksSeenSoFar.contains(c)) {
1384                     // report the error, and stop scanning.
1385                     // No need to find more than the first failure.
1386                     result |= INVISIBLE;
1387                     break;
1388                 }
1389                 marksSeenSoFar.add(c);
1390             }
1391         }
1392         if (checkResult != null) {
1393             checkResult.checks = result;
1394         }
1395         return (0 != result);
1396     }
1397 
1398     /**
1399      * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1400      * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1401      *
1402      * @param text
1403      *            A String to be checked for possible security issues.
     * @return True if any issue is found with the input string.
1405      * @stable ICU 4.8
1406      */
failsChecks(String text)1407     public boolean failsChecks(String text) {
1408         return failsChecks(text, null);
1409     }
1410 
1411     /**
1412      * Check whether two specified strings are visually confusable. The types of confusability to be tested - single
1413      * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker.
1414      *
     * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE and
     * WHOLE_SCRIPT_CONFUSABLE. At least one of these tests must be selected.
1417      *
1418      * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
1419      * folded for comparison and display to the user, do not select the ANY_CASE option.
1420      *
1421      *
1422      * @param s1
1423      *            The first of the two strings to be compared for confusability.
1424      * @param s2
1425      *            The second of the two strings to be compared for confusability.
     * @return Non-zero if s1 and s2 are confusable. If not 0, the value will indicate the type(s) of confusability
1427      *         found, as defined by spoof check test constants.
1428      * @stable ICU 4.6
1429      */
areConfusable(String s1, String s2)1430     public int areConfusable(String s1, String s2) {
1431         //
1432         // See section 4 of UTS #39 for the algorithm for checking whether two strings are confusable,
1433         // and for definitions of the types (single, whole, mixed-script) of confusables.
1434 
1435         // We only care about a few of the check flags. Ignore the others.
1436         // If no tests relevant to this function have been specified, signal an error.
1437         // TODO: is this really the right thing to do? It's probably an error on
1438         // the caller's part, but logically we would just return 0 (no error).
1439         if ((this.fChecks & CONFUSABLE) == 0) {
1440             throw new IllegalArgumentException("No confusable checks are enabled.");
1441         }
1442 
1443         // Compute the skeletons and check for confusability.
1444         String s1Skeleton = getSkeleton(s1);
1445         String s2Skeleton = getSkeleton(s2);
1446         if (!s1Skeleton.equals(s2Skeleton)) {
1447             return 0;
1448         }
1449 
1450         // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
1451         // of confusables according to UTS 39 section 4.
1452         // Start by computing the resolved script sets of s1 and s2.
1453         ScriptSet s1RSS = new ScriptSet();
1454         getResolvedScriptSet(s1, s1RSS);
1455         ScriptSet s2RSS = new ScriptSet();
1456         getResolvedScriptSet(s2, s2RSS);
1457 
1458         // Turn on all applicable flags
1459         int result = 0;
1460         if (s1RSS.intersects(s2RSS)) {
1461             result |= SINGLE_SCRIPT_CONFUSABLE;
1462         } else {
1463             result |= MIXED_SCRIPT_CONFUSABLE;
1464             if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) {
1465                 result |= WHOLE_SCRIPT_CONFUSABLE;
1466             }
1467         }
1468 
1469         // Turn off flags that the user doesn't want
1470         return result & fChecks;
1471     }
1472 
1473     /**
     * Check whether two specified strings are visually confusable when displayed in a paragraph with the given direction.
1475      * The types of confusability to be tested—single script, mixed script, or whole script—are determined by the check options set for the SpoofChecker.
1476      *
     * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE and
     * WHOLE_SCRIPT_CONFUSABLE. At least one of these tests must be selected.
1479      *
1480      * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
1481      * folded for comparison and display to the user, do not select the ANY_CASE option.
1482      *
1483      *
1484      * @param direction The paragraph direction with which the identifiers are displayed.
1485      *                  Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}.
1486      * @param s1
1487      *            The first of the two strings to be compared for confusability.
1488      * @param s2
1489      *            The second of the two strings to be compared for confusability.
     * @return Non-zero if s1 and s2 are confusable. If not 0, the value will indicate the type(s) of confusability
1491      *         found, as defined by spoof check test constants.
1492      * @draft ICU 74
1493      */
areConfusable(int direction, CharSequence s1, CharSequence s2)1494     public int areConfusable(int direction, CharSequence s1, CharSequence s2) {
1495         //
1496         // See section 4 of UTS #39 for the algorithm for checking whether two strings are confusable,
1497         // and for definitions of the types (single, whole, mixed-script) of confusables.
1498 
1499         // We only care about a few of the check flags. Ignore the others.
1500         // If no tests relevant to this function have been specified, signal an error.
1501         // TODO: is this really the right thing to do? It's probably an error on
1502         // the caller's part, but logically we would just return 0 (no error).
1503         if ((this.fChecks & CONFUSABLE) == 0) {
1504             throw new IllegalArgumentException("No confusable checks are enabled.");
1505         }
1506 
1507         // Compute the skeletons and check for confusability.
1508         String s1Skeleton = getBidiSkeleton(direction, s1);
1509         String s2Skeleton = getBidiSkeleton(direction, s2);
1510         if (!s1Skeleton.equals(s2Skeleton)) {
1511             return 0;
1512         }
1513 
1514         // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
1515         // of confusables according to UTS 39 section 4.
1516         // Start by computing the resolved script sets of s1 and s2.
1517         ScriptSet s1RSS = new ScriptSet();
1518         getResolvedScriptSet(s1, s1RSS);
1519         ScriptSet s2RSS = new ScriptSet();
1520         getResolvedScriptSet(s2, s2RSS);
1521 
1522         // Turn on all applicable flags
1523         int result = 0;
1524         if (s1RSS.intersects(s2RSS)) {
1525             result |= SINGLE_SCRIPT_CONFUSABLE;
1526         } else {
1527             result |= MIXED_SCRIPT_CONFUSABLE;
1528             if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) {
1529                 result |= WHOLE_SCRIPT_CONFUSABLE;
1530             }
1531         }
1532 
1533         // Turn off flags that the user doesn't want
1534         result &= fChecks;
1535 
1536         return result;
1537     }
1538 
1539     /**
1540      * Get the "bidiSkeleton" for an identifier string and a direction.
1541      * Skeletons are a transformation of the input string;
1542      * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1543      * they are RTL-confusable if their RTL bidiSkeletons are identical.
1544      * See Unicode Technical Standard #39 for additional information:
1545      * https://www.unicode.org/reports/tr39/#Confusable_Detection.
1546      *
1547      * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
1548      * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
1549      *
1550      * Skeletons are computed using the algorithm and data described in UTS #39.
1551      *
1552      * @param direction The paragraph direction with which the string is displayed.
1553      *                  Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}.
1554      * @param str The input string whose bidiSkeleton will be generated.
1555      * @return The output skeleton string.
1556      *
1557      * @draft ICU 74
1558      */
getBidiSkeleton(int direction, CharSequence str)1559     public String getBidiSkeleton(int direction, CharSequence str) {
1560         if (direction != Bidi.DIRECTION_LEFT_TO_RIGHT && direction != Bidi.DIRECTION_RIGHT_TO_LEFT) {
1561             throw new IllegalArgumentException("direction should be DIRECTION_LEFT_TO_RIGHT or DIRECTION_RIGHT_TO_LEFT");
1562         }
1563         Bidi bidi = new Bidi(str.toString(), direction);
1564         return getSkeleton(bidi.writeReordered(Bidi.KEEP_BASE_COMBINING | Bidi.DO_MIRRORING));
1565     }
1566 
1567     /**
     * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; two strings are
     * confusable if their skeletons are identical. See Unicode Technical Standard #39 for additional information.
     *
     * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
     * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
     *
     * Skeletons are computed using the algorithm and data described in UTS #39.
1575      *
1576      * @param str
1577      *            The input string whose skeleton will be generated.
1578      * @return The output skeleton string.
1579      *
1580      * @stable ICU 58
1581      */
getSkeleton(CharSequence str)1582     public String getSkeleton(CharSequence str) {
1583         // Apply the skeleton mapping to the NFD normalized input string
1584         // Accumulate the skeleton, possibly unnormalized, in a String.
1585         String nfdId = nfdNormalizer.normalize(str);
1586         int normalizedLen = nfdId.length();
1587         StringBuilder skelSB = new StringBuilder();
1588         for (int inputIndex = 0; inputIndex < normalizedLen;) {
1589             int c = Character.codePointAt(nfdId, inputIndex);
1590             inputIndex += Character.charCount(c);
1591             if (!UCharacter.hasBinaryProperty(c, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) {
1592                 this.fSpoofData.confusableLookup(c, skelSB);
1593             }
1594         }
1595         String skelStr = skelSB.toString();
1596         skelStr = nfdNormalizer.normalize(skelStr);
1597         return skelStr;
1598     }
1599 
1600     /**
1601      * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been
1602      * ignored, and starting with ICU 58, this function has been deprecated.
1603      *
1604      * @param type
1605      *            No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA.
1606      * @param id
1607      *            The input identifier whose skeleton will be generated.
1608      * @return The output skeleton string.
1609      *
1610      * @deprecated ICU 58
1611      */
1612     @Deprecated
getSkeleton(int type, String id)1613     public String getSkeleton(int type, String id) {
1614         return getSkeleton(id);
1615     }
1616 
1617     /**
1618      * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have
1619      * enabled the same set of checks.
1620      *
1621      * @param other
1622      *            the SpoofChecker being compared with.
1623      * @return true if the two SpoofCheckers are equal.
1624      * @stable ICU 4.6
1625      */
1626     @Override
equals(Object other)1627     public boolean equals(Object other) {
1628         if (!(other instanceof SpoofChecker)) {
1629             return false;
1630         }
1631         SpoofChecker otherSC = (SpoofChecker) other;
1632         if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) {
1633             return false;
1634         }
1635         if (fChecks != otherSC.fChecks) {
1636             return false;
1637         }
1638         if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null
1639                 && !fAllowedLocales.equals(otherSC.fAllowedLocales)) {
1640             return false;
1641         }
1642         if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null
1643                 && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) {
1644             return false;
1645         }
1646         if (fRestrictionLevel != otherSC.fRestrictionLevel) {
1647             return false;
1648         }
1649         return true;
1650     }
1651 
1652     /**
1653      * Overrides {@link Object#hashCode()}.
1654      * @stable ICU 4.6
1655      */
1656     @Override
hashCode()1657     public int hashCode() {
1658         return fChecks
1659                 ^ fSpoofData.hashCode()
1660                 ^ fAllowedLocales.hashCode()
1661                 ^ fAllowedCharsSet.hashCode()
1662                 ^ fRestrictionLevel.ordinal();
1663     }
1664 
1665     /**
1666      * Computes the augmented script set for a code point, according to UTS 39 section 5.1.
1667      */
getAugmentedScriptSet(int codePoint, ScriptSet result)1668     private static void getAugmentedScriptSet(int codePoint, ScriptSet result) {
1669         result.clear();
1670         UScript.getScriptExtensions(codePoint, result);
1671 
1672         // Section 5.1 step 1
1673         if (result.get(UScript.HAN)) {
1674             result.set(UScript.HAN_WITH_BOPOMOFO);
1675             result.set(UScript.JAPANESE);
1676             result.set(UScript.KOREAN);
1677         }
1678         if (result.get(UScript.HIRAGANA)) {
1679             result.set(UScript.JAPANESE);
1680         }
1681         if (result.get(UScript.KATAKANA)) {
1682             result.set(UScript.JAPANESE);
1683         }
1684         if (result.get(UScript.HANGUL)) {
1685             result.set(UScript.KOREAN);
1686         }
1687         if (result.get(UScript.BOPOMOFO)) {
1688             result.set(UScript.HAN_WITH_BOPOMOFO);
1689         }
1690 
1691         // Section 5.1 step 2
1692         if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) {
1693             result.setAll();
1694         }
1695     }
1696 
1697     /**
1698      * Computes the resolved script set for a string, according to UTS 39 section 5.1.
1699      */
getResolvedScriptSet(CharSequence input, ScriptSet result)1700     private void getResolvedScriptSet(CharSequence input, ScriptSet result) {
1701         getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result);
1702     }
1703 
1704     /**
1705      * Computes the resolved script set for a string, omitting characters having the specified script. If
1706      * UScript.CODE_LIMIT is passed as the second argument, all characters are included.
1707      */
getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result)1708     private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) {
1709         result.setAll();
1710 
1711         ScriptSet temp = new ScriptSet();
1712         for (int utf16Offset = 0; utf16Offset < input.length();) {
1713             int codePoint = Character.codePointAt(input, utf16Offset);
1714             utf16Offset += Character.charCount(codePoint);
1715 
1716             // Compute the augmented script set for the character
1717             getAugmentedScriptSet(codePoint, temp);
1718 
1719             // Intersect the augmented script set with the resolved script set, but only if the character doesn't
1720             // have the script specified in the function call
1721             if (script == UScript.CODE_LIMIT || !temp.get(script)) {
1722                 result.and(temp);
1723             }
1724         }
1725     }
1726 
1727     /**
1728      * Computes the set of numerics for a string, according to UTS 39 section 5.3.
1729      */
getNumerics(String input, UnicodeSet result)1730     private void getNumerics(String input, UnicodeSet result) {
1731         result.clear();
1732 
1733         for (int utf16Offset = 0; utf16Offset < input.length();) {
1734             int codePoint = Character.codePointAt(input, utf16Offset);
1735             utf16Offset += Character.charCount(codePoint);
1736 
1737             // Store a representative character for each kind of decimal digit
1738             if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
1739                 // Store the zero character as a representative for comparison.
1740                 // Unicode guarantees it is codePoint - value
1741                 result.add(codePoint - UCharacter.getNumericValue(codePoint));
1742             }
1743         }
1744     }
1745 
1746     /**
1747      * Computes the restriction level of a string, according to UTS 39 section 5.2.
1748      */
getRestrictionLevel(String input)1749     private RestrictionLevel getRestrictionLevel(String input) {
1750         // Section 5.2 step 1:
1751         if (!fAllowedCharsSet.containsAll(input)) {
1752             return RestrictionLevel.UNRESTRICTIVE;
1753         }
1754 
1755         // Section 5.2 step 2:
1756         if (ASCII.containsAll(input)) {
1757             return RestrictionLevel.ASCII;
1758         }
1759 
1760         // Section 5.2 steps 3:
1761         ScriptSet resolvedScriptSet = new ScriptSet();
1762         getResolvedScriptSet(input, resolvedScriptSet);
1763 
1764         // Section 5.2 step 4:
1765         if (!resolvedScriptSet.isEmpty()) {
1766             return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE;
1767         }
1768 
1769         // Section 5.2 step 5:
1770         ScriptSet resolvedNoLatn = new ScriptSet();
1771         getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn);
1772 
1773         // Section 5.2 step 6:
1774         if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE)
1775                 || resolvedNoLatn.get(UScript.KOREAN)) {
1776             return RestrictionLevel.HIGHLY_RESTRICTIVE;
1777         }
1778 
1779         // Section 5.2 step 7:
1780         if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK)
1781                 && !resolvedNoLatn.get(UScript.CHEROKEE)) {
1782             return RestrictionLevel.MODERATELY_RESTRICTIVE;
1783         }
1784 
1785         // Section 5.2 step 8:
1786         return RestrictionLevel.MINIMALLY_RESTRICTIVE;
1787     }
1788 
findHiddenOverlay(String input)1789     int findHiddenOverlay(String input) {
1790         boolean sawLeadCharacter = false;
1791         StringBuilder sb = new StringBuilder();
1792         for (int i=0; i<input.length();) {
1793             int cp = input.codePointAt(i);
1794             if (sawLeadCharacter && cp == 0x0307) {
1795                 return i;
1796             }
1797             int combiningClass = UCharacter.getCombiningClass(cp);
1798             // Skip over characters except for those with combining class 0 (non-combining characters) or with
1799             // combining class 230 (same class as U+0307)
1800             assert UCharacter.getCombiningClass(0x0307) == 230;
1801             if (combiningClass == 0 || combiningClass == 230) {
1802                 sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp, sb);
1803             }
1804             i += UCharacter.charCount(cp);
1805         }
1806         return -1;
1807     }
1808 
isIllegalCombiningDotLeadCharacterNoLookup(int cp)1809     boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) {
1810         return cp == 'i' || cp == 'j' || cp == 'ı' || cp == 'ȷ' || cp == 'l' ||
1811                UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED);
1812     }
1813 
isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb)1814     boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) {
1815         if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
1816             return true;
1817         }
1818         sb.setLength(0);
1819         fSpoofData.confusableLookup(cp, sb);
1820         int finalCp = UCharacter.codePointBefore(sb, sb.length());
1821         if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
1822             return true;
1823         }
1824         return false;
1825     }
1826 
    // Data Members
    private int fChecks; // Bit vector of checks to perform.
    // Confusables data; used for the per-code-point lookups that build skeletons.
    private SpoofData fSpoofData;
    private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
    // The UnicodeSet of allowed characters; consulted by the CHAR_LIMIT check and by
    // the restriction-level computation.
    private UnicodeSet fAllowedCharsSet;
    // Configured restriction level; a text whose level compares greater fails RESTRICTION_LEVEL.
    private RestrictionLevel fRestrictionLevel;

    // Shared NFD normalizer, used for skeleton generation and for the INVISIBLE check.
    private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance();
1835 
1836     // Confusable Mappings Data Structures, version 2.0
1837     //
1838     // This description and the corresponding implementation are to be kept
1839     // in-sync with the copy in icu4c uspoof_impl.h.
1840     //
1841     // For the confusable data, we are essentially implementing a map,
1842     //     key: a code point
1843     //     value: a string. Most commonly one char in length, but can be more.
1844     //
1845     // The keys are stored as a sorted array of 32 bit ints.
1846     //         bits 0-23 a code point value
    //         bits 24-31 length of value string in UChars, minus one (so lengths between 1 and 256 UChars fit).
1848     //     The key table is sorted in ascending code point order. (not on the
1849     //     32 bit int value, the flag bits do not participate in the sorting.)
1850     //
1851     //     Lookup is done by means of a binary search in the key table.
1852     //
1853     // The corresponding values are kept in a parallel array of 16 bit ints.
1854     //     If the value string is of length 1, it is literally in the value array.
1855     //     For longer strings, the value array contains an index into the strings
1856     //     table.
1857     //
1858     // String Table:
1859     //     The strings table contains all of the value strings (those of length two or greater)
1860     //     concatenated together into one long char (UTF-16) array.
1861     //
1862     //     There is no nul character or other mark between adjacent strings.
1863     //
1864     //----------------------------------------------------------------------------
1865     //
1866     //  Changes from format version 1 to format version 2:
1867     //        1) Removal of the whole-script confusable data tables.
1868     //        2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
1869     //        3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
1870     //        4) Removal of the string lengths table since 8 bits is sufficient for the
1871     //           lengths of all entries in confusables.txt.
1872     //
1873     private static final class ConfusableDataUtils {
1874         public static final int FORMAT_VERSION = 2; // version for ICU 58
1875 
keyToCodePoint(int key)1876         public static final int keyToCodePoint(int key) {
1877             return key & 0x00ffffff;
1878         }
1879 
keyToLength(int key)1880         public static final int keyToLength(int key) {
1881             return ((key & 0xff000000) >> 24) + 1;
1882         }
1883 
codePointAndLengthToKey(int codePoint, int length)1884         public static final int codePointAndLengthToKey(int codePoint, int length) {
1885             assert (codePoint & 0x00ffffff) == codePoint;
1886             assert length <= 256;
1887             return codePoint | ((length - 1) << 24);
1888         }
1889     }
1890 
1891     // -------------------------------------------------------------------------------------
1892     //
1893     // SpoofData
1894     //
1895     // This class corresponds to the ICU SpoofCheck data.
1896     //
1897     // The data can originate with the Binary ICU data that is generated in ICU4C,
1898     // or it can originate from source rules that are compiled in ICU4J.
1899     //
1900     // This class does not include the set of checks to be performed, but only
1901     // data that is serialized into the ICU binary data.
1902     //
1903     // Because Java cannot easily wrap binary data like ICU4C, the binary data is
1904     // copied into Java structures that are convenient for use by the run time code.
1905     //
1906     // ---------------------------------------------------------------------------------------
1907     private static class SpoofData {
1908 
        // Java data structures holding the confusable data.
        // Keys: each int packs a code point and the length of its value string
        // (decoded with ConfusableDataUtils).
        int[] fCFUKeys;
        // Values, kept parallel to the keys array.
        short[] fCFUValues;
        // String table holding the longer value strings, concatenated.
        String fCFUStrings;

        // Data format identifier passed to ICUBinary.readHeader when loading binary data.
        private static final int DATA_FORMAT = 0x43667520; // "Cfu "
1915 
1916         private static final class IsAcceptable implements Authenticate {
1917             @Override
isDataVersionAcceptable(byte version[])1918             public boolean isDataVersionAcceptable(byte version[]) {
1919                 return version[0] == ConfusableDataUtils.FORMAT_VERSION || version[1] != 0 || version[2] != 0
1920                         || version[3] != 0;
1921             }
1922         }
1923 
1924         private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
1925 
1926         private static final class DefaultData {
1927             private static SpoofData INSTANCE = null;
1928             private static IOException EXCEPTION = null;
1929 
1930             static {
1931                 // Note: Although this is static, the Java runtime can delay execution of this block until
1932                 // the data is actually requested via SpoofData.getDefault().
1933                 try {
1934                     INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
1935                 } catch (IOException e) {
1936                     EXCEPTION = e;
1937                 }
1938             }
1939         }
1940 
1941         /**
1942          * @return instance for Unicode standard data
1943          */
getDefault()1944         public static SpoofData getDefault() {
1945             if (DefaultData.EXCEPTION != null) {
1946                 throw new MissingResourceException(
1947                         "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(),
1948                         "SpoofChecker", "");
1949             }
1950             return DefaultData.INSTANCE;
1951         }
1952 
1953         // SpoofChecker Data constructor for use from data builder.
1954         // Initializes a new, empty data area that will be populated later.
SpoofData()1955         private SpoofData() {
1956         }
1957 
1958         // Constructor for use when creating from prebuilt default data.
1959         // A ByteBuffer is what the ICU internal data loading functions provide.
SpoofData(ByteBuffer bytes)1960         private SpoofData(ByteBuffer bytes) throws java.io.IOException {
1961             ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE);
1962             bytes.mark();
1963             readData(bytes);
1964         }
1965 
1966         @Override
equals(Object other)1967         public boolean equals(Object other) {
1968             if (!(other instanceof SpoofData)) {
1969                 return false;
1970             }
1971             SpoofData otherData = (SpoofData) other;
1972             if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys))
1973                 return false;
1974             if (!Arrays.equals(fCFUValues, otherData.fCFUValues))
1975                 return false;
1976             if (!Utility.sameObjects(fCFUStrings, otherData.fCFUStrings) && fCFUStrings != null
1977                     && !fCFUStrings.equals(otherData.fCFUStrings))
1978                 return false;
1979             return true;
1980         }
1981 
1982         @Override
hashCode()1983         public int hashCode() {
1984             return Arrays.hashCode(fCFUKeys)
1985                     ^ Arrays.hashCode(fCFUValues)
1986                     ^ fCFUStrings.hashCode();
1987         }
1988 
1989         // Set the SpoofChecker data from pre-built binary data in a byte buffer.
1990         // The binary data format is as described for ICU4C spoof data.
1991         //
readData(ByteBuffer bytes)1992         private void readData(ByteBuffer bytes) throws java.io.IOException {
1993             int magic = bytes.getInt();
1994             if (magic != 0x3845fdef) {
1995                 throw new IllegalArgumentException("Bad Spoof Check Data.");
1996             }
1997             @SuppressWarnings("unused")
1998             int dataFormatVersion = bytes.getInt();
1999             @SuppressWarnings("unused")
2000             int dataLength = bytes.getInt();
2001 
2002             int CFUKeysOffset = bytes.getInt();
2003             int CFUKeysSize = bytes.getInt();
2004 
2005             int CFUValuesOffset = bytes.getInt();
2006             int CFUValuesSize = bytes.getInt();
2007 
2008             int CFUStringTableOffset = bytes.getInt();
2009             int CFUStringTableSize = bytes.getInt();
2010 
2011             // We have now read the file header, and obtained the position for each
2012             // of the data items. Now read each in turn, first seeking the
2013             // input stream to the position of the data item.
2014 
2015             bytes.reset();
2016             ICUBinary.skipBytes(bytes, CFUKeysOffset);
2017             fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0);
2018 
2019             bytes.reset();
2020             ICUBinary.skipBytes(bytes, CFUValuesOffset);
2021             fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0);
2022 
2023             bytes.reset();
2024             ICUBinary.skipBytes(bytes, CFUStringTableOffset);
2025             fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0);
2026         }
2027 
2028         /**
2029          * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be
2030          * appended will between 1 and 18 characters as of Unicode 9.
2031          *
2032          * This is the heart of the confusable skeleton generation implementation.
2033          */
confusableLookup(int inChar, StringBuilder dest)2034         public void confusableLookup(int inChar, StringBuilder dest) {
2035             // Perform a binary search.
2036             // [lo, hi), i.e lo is inclusive, hi is exclusive.
2037             // The result after the loop will be in lo.
2038             int lo = 0;
2039             int hi = length();
2040             do {
2041                 int mid = (lo + hi) / 2;
2042                 if (codePointAt(mid) > inChar) {
2043                     hi = mid;
2044                 } else if (codePointAt(mid) < inChar) {
2045                     lo = mid;
2046                 } else {
2047                     // Found result. Break early.
2048                     lo = mid;
2049                     break;
2050                 }
2051             } while (hi - lo > 1);
2052 
2053             // Did we find an entry? If not, the char maps to itself.
2054             if (codePointAt(lo) != inChar) {
2055                 dest.appendCodePoint(inChar);
2056                 return;
2057             }
2058 
2059             // Add the element to the string builder and return.
2060             appendValueTo(lo, dest);
2061             return;
2062         }
2063 
2064         /**
2065          * Return the number of confusable entries in this SpoofData.
2066          *
2067          * @return The number of entries.
2068          */
length()2069         public int length() {
2070             return fCFUKeys.length;
2071         }
2072 
2073         /**
2074          * Return the code point (key) at the specified index.
2075          *
2076          * @param index
2077          *            The index within the SpoofData.
2078          * @return The code point.
2079          */
codePointAt(int index)2080         public int codePointAt(int index) {
2081             return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]);
2082         }
2083 
2084         /**
2085          * Append the confusable skeleton at the specified index to the StringBuilder dest.
2086          *
2087          * @param index
2088          *            The index within the SpoofData.
2089          * @param dest
2090          *            The StringBuilder to which to append the skeleton.
2091          */
appendValueTo(int index, StringBuilder dest)2092         public void appendValueTo(int index, StringBuilder dest) {
2093             int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]);
2094 
2095             // Value is either a char (for strings of length 1) or
2096             // an index into the string table (for longer strings)
2097             short value = fCFUValues[index];
2098             if (stringLength == 1) {
2099                 dest.append((char) value);
2100             } else {
2101                 dest.append(fCFUStrings, value, value + stringLength);
2102             }
2103         }
2104     }
2105 
2106     // -------------------------------------------------------------------------------
2107     //
2108     // ScriptSet - Script code bit sets.
2109     // Extends Java BitSet with input/output support and a few helper methods.
2110     // Note: The I/O is not currently being used, so it has been commented out. If
2111     // it is needed again, the code can be restored.
2112     //
2113     // -------------------------------------------------------------------------------
2114     static class ScriptSet extends BitSet {
2115 
2116         // Eclipse default value to quell warnings:
2117         private static final long serialVersionUID = 1L;
2118 
2119         // // The serialized version of this class can hold INT_CAPACITY * 32 scripts.
2120         // private static final int INT_CAPACITY = 6;
2121         // private static final long serialVersionUID = INT_CAPACITY;
2122         // static {
2123         // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT;
2124         // }
2125         //
2126         // public ScriptSet() {
2127         // }
2128         //
2129         // public ScriptSet(ByteBuffer bytes) throws java.io.IOException {
2130         // for (int i = 0; i < INT_CAPACITY; i++) {
2131         // int bits = bytes.getInt();
2132         // for (int j = 0; j < Integer.SIZE; j++) {
2133         // if ((bits & (1 << j)) != 0) {
2134         // set(i * Integer.SIZE + j);
2135         // }
2136         // }
2137         // }
2138         // }
2139         //
2140         // public void output(DataOutputStream os) throws java.io.IOException {
2141         // for (int i = 0; i < INT_CAPACITY; i++) {
2142         // int bits = 0;
2143         // for (int j = 0; j < Integer.SIZE; j++) {
2144         // if (get(i * Integer.SIZE + j)) {
2145         // bits |= (1 << j);
2146         // }
2147         // }
2148         // os.writeInt(bits);
2149         // }
2150         // }
2151 
and(int script)2152         public void and(int script) {
2153             this.clear(0, script);
2154             this.clear(script + 1, UScript.CODE_LIMIT);
2155         }
2156 
setAll()2157         public void setAll() {
2158             this.set(0, UScript.CODE_LIMIT);
2159         }
2160 
isFull()2161         public boolean isFull() {
2162             return cardinality() == UScript.CODE_LIMIT;
2163         }
2164 
appendStringTo(StringBuilder sb)2165         public void appendStringTo(StringBuilder sb) {
2166             sb.append("{ ");
2167             if (isEmpty()) {
2168                 sb.append("- ");
2169             } else if (isFull()) {
2170                 sb.append("* ");
2171             } else {
2172                 for (int script = 0; script < UScript.CODE_LIMIT; script++) {
2173                     if (get(script)) {
2174                         sb.append(UScript.getShortName(script));
2175                         sb.append(" ");
2176                     }
2177                 }
2178             }
2179             sb.append("}");
2180         }
2181 
2182         @Override
toString()2183         public String toString() {
2184             StringBuilder sb = new StringBuilder();
2185             sb.append("<ScriptSet ");
2186             appendStringTo(sb);
2187             sb.append(">");
2188             return sb.toString();
2189         }
2190     }
2191 }
2192