1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 2008-2016 International Business Machines Corporation 6 * and others. All Rights Reserved. 7 *************************************************************************** 8 * 9 * Unicode Spoof Detection 10 */ 11 12 package com.ibm.icu.text; 13 14 import java.io.IOException; 15 import java.io.LineNumberReader; 16 import java.io.Reader; 17 import java.nio.ByteBuffer; 18 import java.text.ParseException; 19 import java.util.ArrayList; 20 import java.util.Arrays; 21 import java.util.BitSet; 22 import java.util.Collections; 23 import java.util.Comparator; 24 import java.util.HashSet; 25 import java.util.Hashtable; 26 import java.util.LinkedHashSet; 27 import java.util.Locale; 28 import java.util.MissingResourceException; 29 import java.util.Set; 30 import java.util.Vector; 31 import java.util.regex.Matcher; 32 import java.util.regex.Pattern; 33 34 import com.ibm.icu.impl.ICUBinary; 35 import com.ibm.icu.impl.ICUBinary.Authenticate; 36 import com.ibm.icu.impl.Utility; 37 import com.ibm.icu.lang.UCharacter; 38 import com.ibm.icu.lang.UCharacter.IdentifierType; 39 import com.ibm.icu.lang.UCharacterCategory; 40 import com.ibm.icu.lang.UProperty; 41 import com.ibm.icu.lang.UScript; 42 import com.ibm.icu.util.ULocale; 43 44 /** 45 * <p> 46 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and 47 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: 48 * 49 * <ol> 50 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desparejado" and 51 * "ԁеѕрагејаԁо".</li> 52 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof 53 * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' 
characters.</li> 54 * </ol> 55 * 56 * <p> 57 * Although originally designed as a method for flagging suspicious identifier strings such as URLs, 58 * <code>SpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word 59 * content filters. 60 * 61 * <h2>Confusables</h2> 62 * 63 * <p> 64 * The following example shows how to use <code>SpoofChecker</code> to check for confusability between two strings: 65 * 66 * <pre> 67 * <code> 68 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 69 * int result = sc.areConfusable("desparejado", "ԁеѕрагејаԁо"); 70 * System.out.println(result != 0); // true 71 * </code> 72 * </pre> 73 * 74 * <p> 75 * <code>SpoofChecker</code> uses a builder paradigm: options are specified within the context of a lightweight 76 * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading 77 * operations are performed, and an immutable <code>SpoofChecker</code> is returned. 78 * 79 * <p> 80 * The first line of the example creates a <code>SpoofChecker</code> object with confusable-checking enabled; the second 81 * line performs the confusability test. For best performance, the instance should be created once (e.g., upon 82 * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime. 83 * 84 * <p> 85 * If the paragraph direction used to display the strings is known, it should be passed to {@link SpoofChecker#areConfusable}: 86 * 87 * <pre> 88 * <code> 89 * // These strings look identical when rendered in a left-to-right context. 90 * // They look distinct in a right-to-left context. 
91 * String s1 = "A1\u05D0"; // A1א 92 * String s2 = "A\u05D01"; // Aא1 93 * 94 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 95 * int result = sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, s1, s2); 96 * System.out.println(result != 0); // true 97 * </code> 98 * </pre> 99 * 100 * <p> 101 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a 102 * sequence of families of confusable characters, where each family has a single exemplar character. 103 * {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is 104 * equivalent to the example above: 105 * 106 * <pre> 107 * <code> 108 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 109 * boolean result = sc.getSkeleton("desparejado").equals(sc.getSkeleton("ԁеѕрагејаԁо")); 110 * System.out.println(result); // true 111 * </code> 112 * </pre> 113 * 114 * <p> 115 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling 116 * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as 117 * shown below: 118 * 119 * <pre> 120 * // Setup: 121 * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example 122 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 123 * HashSet<String> skeletons = new HashSet<String>(); 124 * for (String word : DICTIONARY) { 125 * skeletons.add(sc.getSkeleton(word)); 126 * } 127 * 128 * // Live Check: 129 * boolean result = skeletons.contains(sc.getSkeleton("1orern")); 130 * System.out.println(result); // true 131 * </pre> 132 * 133 * <p> 134 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em> 135 * guaranteed to be the same between ICU releases. 
We therefore recommend that you always compute confusable skeletons 136 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. 137 * 138 * <h2>Spoof Detection</h2> 139 * 140 * <p> 141 * The following snippet shows a minimal example of using <code>SpoofChecker</code> to perform spoof detection on a 142 * string: 143 * 144 * <pre> 145 * SpoofChecker sc = new SpoofChecker.Builder() 146 * .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION)) 147 * .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE) 148 * .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE) 149 * .build(); 150 * boolean result = sc.failsChecks("pаypаl"); // with Cyrillic 'а' characters 151 * System.out.println(result); // true 152 * </pre> 153 * 154 * <p> 155 * As in the case for confusability checking, it is good practice to create one <code>SpoofChecker</code> instance at 156 * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of 157 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the 158 * third line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the 159 * instance to perform confusability checking. 
160 * 161 * <p> 162 * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}: 163 * 164 * <pre> 165 * <code> 166 * SpoofChecker sc = new SpoofChecker.Builder() 167 * .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION)) 168 * .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE) 169 * .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE) 170 * .build(); 171 * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult(); 172 * boolean result = sc.failsChecks("pаypаl", checkResult); 173 * System.out.println(checkResult.checks); // 16 174 * </code> 175 * </pre> 176 * 177 * <p> 178 * The return value is a bitmask of the checks that failed. In this case, there was one check that failed: 179 * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are: 180 * 181 * <ul> 182 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the 183 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS 184 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> 185 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character 186 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> 187 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable 188 * characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li> 189 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> 190 * </ul> 191 * 192 * <p> 193 * These checks can be enabled independently of each other. 
For example, if you were interested in checking for only the 194 * INVISIBLE and MIXED_NUMBERS conditions, you could do: 195 * 196 * <pre> 197 * <code> 198 * SpoofChecker sc = new SpoofChecker.Builder() 199 * .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS) 200 * .build(); 201 * boolean result = sc.failsChecks("৪8"); 202 * System.out.println(result); // true 203 * </code> 204 * </pre> 205 * 206 * <p> 207 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in 208 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings 209 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have 210 * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is 211 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed 212 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on 213 * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of 214 * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code 215 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple 216 * scripts. 217 * 218 * <h2>Advanced bidirectional usage</h2> 219 * If the paragraph direction with which the identifiers will be displayed is not known, there are 220 * multiple options for confusable detection depending on the circumstances. 221 * 222 * <p> 223 * In some circumstances, the only concern is confusion between identifiers displayed with the same 224 * paragraph direction. 225 * 226 * <p> 227 * An example is the case where identifiers are usernames prefixed with the @ symbol. 
228 * That symbol will appear to the left in a left-to-right context, and to the right in a 229 * right-to-left context, so that an identifier displayed in a left-to-right context can never be 230 * confused with an identifier displayed in a right-to-left context: 231 * <ul> 232 * <li> 233 * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1) 234 * would be considered confusable, since they both appear as @A1א in a left-to-right context, and the 235 * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered 236 * confusable, since they both appear as A_1א@ in a right-to-left context. 237 * </li> 238 * <li> 239 * The username "Mark_" would not be considered confusable with the username "_Mark", 240 * even though the latter would appear as Mark_@ in a right-to-left context, and the 241 * former as @Mark_ in a left-to-right context. 242 * </li> 243 * </ul> 244 * <p> 245 * In that case, the caller should check for both LTR-confusability and RTL-confusability: 246 * 247 * <pre> 248 * <code> 249 * boolean confusableInEitherDirection = 250 * sc.areConfusable(Bidi.DIRECTION_LEFT_TO_RIGHT, id1, id2) || 251 * sc.areConfusable(Bidi.DIRECTION_RIGHT_TO_LEFT, id1, id2); 252 * </code> 253 * </pre> 254 * 255 * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR 256 * with LTR and RTL with RTL. 257 * 258 * <p> 259 * In cases where confusability between the visual appearances of an identifier displayed in a 260 * left-to-right context with another identifier displayed in a right-to-left context is a concern, 261 * the LTR skeleton of one can be compared with the RTL skeleton of the other. However, this 262 * very broad definition of confusability may have unexpected results; for instance, it treats the 263 * ASCII identifiers "Mark_" and "_Mark" as confusable. 
264 * 265 * <h2>Additional Information</h2> 266 * 267 * <p> 268 * A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers. 269 * 270 * <p> 271 * <b>Thread Safety:</b> The methods on <code>SpoofChecker</code> objects are thread safe. The test functions for 272 * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called 273 * concurrently from multiple threads using the same <code>SpoofChecker</code> instance. 274 * 275 * @stable ICU 4.6 276 */ 277 public class SpoofChecker { 278 279 /** 280 * Constants from UTS 39 for use in setRestrictionLevel. 281 * 282 * @stable ICU 53 283 */ 284 public enum RestrictionLevel { 285 /** 286 * All characters in the string are in the identifier profile and all characters in the string are in the ASCII 287 * range. 288 * 289 * @stable ICU 53 290 */ 291 ASCII, 292 /** 293 * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the 294 * string is single-script, according to the definition in UTS 39 section 5.1. 295 * 296 * @stable ICU 53 297 */ 298 SINGLE_SCRIPT_RESTRICTIVE, 299 /** 300 * The string classifies as Single Script, or all characters in the string are in the identifier profile and the 301 * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1: 302 * <ul> 303 * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li> 304 * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li> 305 * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li> 306 * </ul> 307 * 308 * @stable ICU 53 309 */ 310 HIGHLY_RESTRICTIVE, 311 /** 312 * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile 313 * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic, 314 * Greek, and Cherokee. 
315 * 316 * @stable ICU 53 317 */ 318 MODERATELY_RESTRICTIVE, 319 /** 320 * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as 321 * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. 322 * 323 * @stable ICU 53 324 */ 325 MINIMALLY_RESTRICTIVE, 326 /** 327 * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org 328 * 329 * @stable ICU 53 330 */ 331 UNRESTRICTIVE, 332 } 333 334 /** 335 * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}. 336 * 337 * @stable ICU 58 338 */ 339 public static final UnicodeSet INCLUSION = 340 new UnicodeSet(). 341 applyIntPropertyValue(UProperty.IDENTIFIER_TYPE, IdentifierType.INCLUSION.ordinal()). 342 freeze(); 343 344 /** 345 * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}. 346 * 347 * @stable ICU 58 348 */ 349 public static final UnicodeSet RECOMMENDED = 350 new UnicodeSet(). 351 applyIntPropertyValue(UProperty.IDENTIFIER_TYPE, IdentifierType.RECOMMENDED.ordinal()). 352 freeze(); 353 354 /** 355 * Constants for the kinds of checks that USpoofChecker can perform. These values are used both to select the set of 356 * checks that will be performed, and to report results from the check function. 357 * 358 */ 359 360 /** 361 * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates 362 * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section 363 * 4. 364 * 365 * @stable ICU 4.6 366 */ 367 public static final int SINGLE_SCRIPT_CONFUSABLE = 1; 368 369 /** 370 * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates 371 * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS 372 * 39 section 4. 
373 * 374 * @stable ICU 4.6 375 */ 376 public static final int MIXED_SCRIPT_CONFUSABLE = 2; 377 378 /** 379 * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates 380 * that the two strings are visually confusable and that they are not from the same script but both of them are 381 * single-script strings, according to UTS 39 section 4. 382 * 383 * @stable ICU 4.6 384 */ 385 public static final int WHOLE_SCRIPT_CONFUSABLE = 4; 386 387 /** 388 * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the 389 * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make 390 * {@link SpoofChecker#areConfusable} return only those types of confusables. 391 * 392 * @stable ICU 58 393 */ 394 public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE; 395 396 /** 397 * This flag is deprecated and no longer affects the behavior of SpoofChecker. 398 * 399 * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was 400 * deprecated. 401 */ 402 @Deprecated 403 public static final int ANY_CASE = 8; 404 405 /** 406 * Check that an identifier satisfies the requirements for the restriction level specified in 407 * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is 408 * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. 409 * 410 * @stable ICU 58 411 */ 412 public static final int RESTRICTION_LEVEL = 16; 413 414 /** 415 * Check that an identifier contains only characters from a single script (plus chars from the common and inherited 416 * scripts.) Applies to checks of a single identifier check only. 
417 * 418 * @deprecated ICU 51 Use RESTRICTION_LEVEL 419 */ 420 @Deprecated 421 public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL; 422 423 /** 424 * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences 425 * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not 426 * test the input string as a whole for conformance to any particular syntax for identifiers. 427 * 428 * @stable ICU 4.6 429 */ 430 public static final int INVISIBLE = 32; 431 432 /** 433 * Check that an identifier contains only characters from a specified set of acceptable characters. See 434 * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check 435 * will also fail the {@link #RESTRICTION_LEVEL} check. 436 * 437 * @stable ICU 4.6 438 */ 439 public static final int CHAR_LIMIT = 64; 440 441 /** 442 * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39 443 * section 5.3. 444 * 445 * @stable ICU 58 446 */ 447 public static final int MIXED_NUMBERS = 128; 448 449 /** 450 * Check that an identifier does not have a combining character following a character in which that 451 * combining character would be hidden; for example 'i' followed by a U+0307 combining dot. 
452 * <p> 453 * More specifically, the following characters are forbidden from preceding a U+0307: 454 * <ul> 455 * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li> 456 * <li>Latin lowercase letter 'l'</li> 457 * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li> 458 * <li>Any character whose confusable prototype ends with such a character 459 * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li> 460 * </ul> 461 * In addition, combining characters are allowed between the above characters and U+0307 except those 462 * with combining class 0 or combining class "Above" (230, same class as U+0307). 463 * <p> 464 * This list and the number of combining characters considered by this check may grow over time. 465 * 466 * @stable ICU 62 467 */ 468 public static final int HIDDEN_OVERLAY = 256; 469 470 // Update CheckResult.toString() when a new check is added. 471 472 /** 473 * Enable all spoof checks. 474 * 475 * @stable ICU 4.6 476 */ 477 public static final int ALL_CHECKS = 0xFFFFFFFF; 478 479 // Used for checking for ASCII-Only restriction level 480 static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze(); 481 482 /** 483 * private constructor: a SpoofChecker has to be built by the builder 484 */ SpoofChecker()485 private SpoofChecker() { 486 } 487 488 /** 489 * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired 490 * checking options on the builder, then call the build() function to create a SpoofChecker instance. 491 * 492 * @stable ICU 4.6 493 */ 494 public static class Builder { 495 int fChecks; // Bit vector of checks to perform. 496 SpoofData fSpoofData; 497 final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters. 498 // for this Spoof Checker. Defaults to all chars. 499 final Set<ULocale> fAllowedLocales = new LinkedHashSet<>(); // The list of allowed locales. 
500 private RestrictionLevel fRestrictionLevel; 501 502 /** 503 * Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for 504 * LOCALE_LIMIT and CHAR_LIMIT. Note that additional checks may be added in the future, resulting in the changes 505 * to the default checking behavior. 506 * 507 * @stable ICU 4.6 508 */ Builder()509 public Builder() { 510 fChecks = ALL_CHECKS; 511 fSpoofData = null; 512 fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE; 513 } 514 515 /** 516 * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker. 517 * 518 * @param src 519 * The existing checker. 520 * @stable ICU 4.6 521 */ Builder(SpoofChecker src)522 public Builder(SpoofChecker src) { 523 fChecks = src.fChecks; 524 fSpoofData = src.fSpoofData; // For the data, we will either use the source data 525 // as-is, or drop the builder's reference to it 526 // and generate new data, depending on what our 527 // caller does with the builder. 528 fAllowedCharsSet.set(src.fAllowedCharsSet); 529 fAllowedLocales.addAll(src.fAllowedLocales); 530 fRestrictionLevel = src.fRestrictionLevel; 531 } 532 533 /** 534 * Create a SpoofChecker with current configuration. 535 * 536 * @return SpoofChecker 537 * @stable ICU 4.6 538 */ build()539 public SpoofChecker build() { 540 // TODO: Make this data loading be lazy (see #12696). 541 if (fSpoofData == null) { 542 // read binary file 543 fSpoofData = SpoofData.getDefault(); 544 } 545 546 // Copy all state from the builder to the new SpoofChecker. 547 // Make sure that everything is either cloned or copied, so 548 // that subsequent re-use of the builder won't modify the built 549 // SpoofChecker. 550 // 551 // One exception to this: the SpoofData is just assigned. 552 // If the builder subsequently needs to modify fSpoofData 553 // it will create a new SpoofData object first. 
554 555 SpoofChecker result = new SpoofChecker(); 556 result.fChecks = this.fChecks; 557 result.fSpoofData = this.fSpoofData; 558 result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone()); 559 result.fAllowedCharsSet.freeze(); 560 result.fAllowedLocales = new HashSet<>(this.fAllowedLocales); 561 result.fRestrictionLevel = this.fRestrictionLevel; 562 return result; 563 } 564 565 /** 566 * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data file 567 * confusables.txt as described in Unicode UAX 39. The syntax of the source data is as described in UAX 39 for 568 * these files, and the content of these files is acceptable input. 569 * 570 * @param confusables 571 * the Reader of confusable characters definitions, as found in file confusables.txt from 572 * unicode.org. 573 * @throws ParseException 574 * To report syntax errors in the input. 575 * 576 * @stable ICU 58 577 */ setData(Reader confusables)578 public Builder setData(Reader confusables) throws ParseException, IOException { 579 580 // Compile the binary data from the source (text) format. 581 // Drop the builder's reference to any pre-existing data, which may 582 // be in use in an already-built checker. 583 584 fSpoofData = new SpoofData(); 585 ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData); 586 return this; 587 } 588 589 /** 590 * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead. 591 * 592 * @param confusables 593 * the Reader of confusable characters definitions, as found in file confusables.txt from 594 * unicode.org. 595 * @param confusablesWholeScript 596 * No longer supported. 597 * @throws ParseException 598 * To report syntax errors in the input. 
599 * 600 * @deprecated ICU 58 601 */ 602 @Deprecated setData(Reader confusables, Reader confusablesWholeScript)603 public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException { 604 setData(confusables); 605 return this; 606 } 607 608 /** 609 * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method 610 * overwrites any checks that may have already been enabled. By default, all checks are enabled. 611 * 612 * To enable specific checks and disable all others, 613 * OR together only the bit constants for the desired checks. 614 * For example, to fail strings containing characters outside of 615 * the set specified by {@link #setAllowedChars} and 616 * also strings that contain digits from mixed numbering systems: 617 * 618 * <pre> 619 * {@code 620 * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS); 621 * } 622 * </pre> 623 * 624 * To disable specific checks and enable all others, 625 * start with ALL_CHECKS and "AND away" the not-desired checks. 626 * For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality, 627 * it is good practice to disable the CONFUSABLE check: 628 * 629 * <pre> 630 * {@code 631 * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE); 632 * } 633 * </pre> 634 * 635 * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and 636 * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they 637 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those 638 * methods. 639 * 640 * @param checks 641 * The set of checks that this spoof checker will perform. The value is an 'or' of the desired 642 * checks. 
643 * @return self 644 * @stable ICU 4.6 645 */ setChecks(int checks)646 public Builder setChecks(int checks) { 647 // Verify that the requested checks are all ones (bits) that 648 // are acceptable, known values. 649 if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) { 650 throw new IllegalArgumentException("Bad Spoof Checks value."); 651 } 652 this.fChecks = (checks & SpoofChecker.ALL_CHECKS); 653 return this; 654 } 655 656 /** 657 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages 658 * associated with the specified locales. Any previously specified list of locales is replaced by the new 659 * settings. 660 * 661 * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is 662 * determined. Characters from this set of scripts, along with characters from the "common" and "inherited" 663 * Unicode Script categories will be permitted. 664 * 665 * Supplying an empty string removes all restrictions; characters from any script will be allowed. 666 * 667 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a 668 * non-empty list of locales. 669 * 670 * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function. 671 * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters. 672 * 673 * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of 674 * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with 675 * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}. 676 * 677 * @param locales 678 * A Set of ULocales, from which the language and associated script are extracted. If the locales Set 679 * is null, no restrictions will be placed on the allowed characters. 
680 * 681 * @return self 682 * @stable ICU 4.6 683 */ setAllowedLocales(Set<ULocale> locales)684 public Builder setAllowedLocales(Set<ULocale> locales) { 685 fAllowedCharsSet.clear(); 686 687 for (ULocale locale : locales) { 688 // Add the script chars for this locale to the accumulating set 689 // of allowed chars. 690 addScriptChars(locale, fAllowedCharsSet); 691 } 692 693 // If our caller provided an empty list of locales, we disable the 694 // allowed characters checking 695 fAllowedLocales.clear(); 696 if (locales.size() == 0) { 697 fAllowedCharsSet.add(0, 0x10ffff); 698 fChecks &= ~CHAR_LIMIT; 699 return this; 700 } 701 702 // Add all common and inherited characters to the set of allowed 703 // chars. 704 UnicodeSet tempSet = new UnicodeSet(); 705 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON); 706 fAllowedCharsSet.addAll(tempSet); 707 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED); 708 fAllowedCharsSet.addAll(tempSet); 709 710 // Store the updated spoof checker state. 711 fAllowedLocales.clear(); 712 fAllowedLocales.addAll(locales); 713 fChecks |= CHAR_LIMIT; 714 return this; 715 } 716 717 /** 718 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages 719 * associated with the specified locales. Any previously specified list of locales is replaced by the new 720 * settings. 721 * 722 * @param locales 723 * A Set of Locales, from which the language and associated script are extracted. If the locales Set 724 * is null, no restrictions will be placed on the allowed characters. 
725 * 726 * @return self 727 * @stable ICU 54 728 */ setAllowedJavaLocales(Set<Locale> locales)729 public Builder setAllowedJavaLocales(Set<Locale> locales) { 730 HashSet<ULocale> ulocales = new HashSet<>(locales.size()); 731 for (Locale locale : locales) { 732 ulocales.add(ULocale.forLocale(locale)); 733 } 734 return setAllowedLocales(ulocales); 735 } 736 737 // Add (union) to the UnicodeSet all of the characters for the scripts 738 // used for the specified locale. Part of the implementation of 739 // setAllowedLocales. addScriptChars(ULocale locale, UnicodeSet allowedChars)740 private void addScriptChars(ULocale locale, UnicodeSet allowedChars) { 741 int scripts[] = UScript.getCode(locale); 742 if (scripts != null) { 743 UnicodeSet tmpSet = new UnicodeSet(); 744 for (int i = 0; i < scripts.length; i++) { 745 tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]); 746 allowedChars.addAll(tmpSet); 747 } 748 } 749 // else it's an unknown script. 750 // Maybe they asked for the script of "zxx", which refers to no linguistic content. 751 // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU. 752 } 753 754 /** 755 * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit 756 * is replaced by the new settings. This includes limits on characters that were set with the 757 * setAllowedLocales() function. Note that the RESTRICTED set is useful. 758 * 759 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function. 760 * 761 * @param chars 762 * A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by 763 * this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling 764 * this function. Note that this clears the allowedLocales set. 
765 * @return self 766 * @stable ICU 4.6 767 */ setAllowedChars(UnicodeSet chars)768 public Builder setAllowedChars(UnicodeSet chars) { 769 fAllowedCharsSet.set(chars); 770 fAllowedLocales.clear(); 771 fChecks |= CHAR_LIMIT; 772 return this; 773 } 774 775 /** 776 * Set the loosest restriction level allowed for strings. The default if this is not called is 777 * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and 778 * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are 779 * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}. 780 * 781 * @param restrictionLevel 782 * The loosest restriction level allowed. 783 * @return self 784 * @stable ICU 58 785 */ setRestrictionLevel(RestrictionLevel restrictionLevel)786 public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) { 787 fRestrictionLevel = restrictionLevel; 788 fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS; 789 return this; 790 } 791 792 /* 793 * ***************************************************************************** 794 * Internal classes for compiling confusable data into its binary (runtime) form. 795 * ***************************************************************************** 796 */ 797 // --------------------------------------------------------------------- 798 // 799 // buildConfusableData Compile the source confusable data, as defined by 800 // the Unicode data file confusables.txt, into the binary 801 // structures used by the confusable detector. 802 // 803 // The binary structures are described in uspoof_impl.h 804 // 805 // 1. parse the data, making a hash table mapping from a codepoint to a String. 806 // 807 // 2. Sort all of the strings encountered by length, since they will need to 808 // be stored in that order in the final string table. 
809 // TODO: Sorting these strings by length is no longer needed since the removal of 810 // the string lengths table. This logic can be removed to save processing time 811 // when building confusables data. 812 // 813 // 3. Build a list of keys (UChar32s) from the mapping table. Sort the 814 // list because that will be the ordering of our runtime table. 815 // 816 // 4. Generate the run time string table. This is generated before the key & value 817 // table because we need the string indexes when building those tables. 818 // 819 // 5. Build the run-time key and value table. These are parallel tables, and 820 // are built at the same time 821 822 // class ConfusabledataBuilder 823 // An instance of this class exists while the confusable data is being built from source. 824 // It encapsulates the intermediate data structures that are used for building. 825 // It exports one static function, to do a confusable data build. 826 private static class ConfusabledataBuilder { 827 828 private Hashtable<Integer, SPUString> fTable; 829 private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the 830 // four mapping tables. 831 832 // The compiled data is first assembled into the following four collections, 833 // then output to the builder's SpoofData object. 
834 private StringBuffer fStringTable; 835 private ArrayList<Integer> fKeyVec; 836 private ArrayList<Integer> fValueVec; 837 private SPUStringPool stringPool; 838 private Pattern fParseLine; 839 private Pattern fParseHexNum; 840 private int fLineNum; 841 ConfusabledataBuilder()842 ConfusabledataBuilder() { 843 fTable = new Hashtable<>(); 844 fKeySet = new UnicodeSet(); 845 fKeyVec = new ArrayList<>(); 846 fValueVec = new ArrayList<>(); 847 stringPool = new SPUStringPool(); 848 } 849 build(Reader confusables, SpoofData dest)850 void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException { 851 StringBuffer fInput = new StringBuffer(); 852 853 // Convert the user input data from UTF-8 to char (UTF-16) 854 LineNumberReader lnr = new LineNumberReader(confusables); 855 do { 856 String line = lnr.readLine(); 857 if (line == null) { 858 break; 859 } 860 fInput.append(line); 861 fInput.append('\n'); 862 } while (true); 863 864 // Regular Expression to parse a line from Confusables.txt. The expression will match 865 // any line. What was matched is determined by examining which capture groups have a match. 866 // Capture Group 1: the source char 867 // Capture Group 2: the replacement chars 868 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated) 869 // Capture Group 7: A blank or comment only line. 870 // Capture Group 8: A syntactically invalid line. Anything that didn't match before. 871 // Example Line from the confusables.txt source file: 872 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... 
" 873 fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char 874 "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s) 875 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued) 876 "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type 877 "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment 878 "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment 879 "|^(.*?)$"); // OR match any line, which catches illegal lines. 880 881 // Regular expression for parsing a hex number out of a space-separated list of them. 882 // Capture group 1 gets the number, with spaces removed. 883 fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)"); 884 885 // Zap any Byte Order Mark at the start of input. Changing it to a space 886 // is benign given the syntax of the input. 887 if (fInput.charAt(0) == 0xfeff) { 888 fInput.setCharAt(0, (char) 0x20); 889 } 890 891 // Parse the input, one line per iteration of this loop. 892 Matcher matcher = fParseLine.matcher(fInput); 893 while (matcher.find()) { 894 fLineNum++; 895 if (matcher.start(7) >= 0) { 896 // this was a blank or comment line. 897 continue; 898 } 899 if (matcher.start(8) >= 0) { 900 // input file syntax error. 901 // status = U_PARSE_ERROR; 902 throw new ParseException( 903 "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8), 904 matcher.start(8)); 905 } 906 907 // We have a good input line. Extract the key character and mapping 908 // string, and 909 // put them into the appropriate mapping table. 
910 int keyChar = Integer.parseInt(matcher.group(1), 16); 911 if (keyChar > 0x10ffff) { 912 throw new ParseException( 913 "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1), 914 matcher.start(1)); 915 } 916 Matcher m = fParseHexNum.matcher(matcher.group(2)); 917 918 StringBuilder mapString = new StringBuilder(); 919 while (m.find()) { 920 int c = Integer.parseInt(m.group(1), 16); 921 if (c > 0x10ffff) { 922 throw new ParseException( 923 "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16), 924 matcher.start(2)); 925 } 926 mapString.appendCodePoint(c); 927 } 928 assert (mapString.length() >= 1); 929 930 // Put the map (value) string into the string pool 931 // This a little like a Java intern() - any duplicates will be 932 // eliminated. 933 SPUString smapString = stringPool.addString(mapString.toString()); 934 935 // Add the char . string mapping to the table. 936 // For Unicode 8, the SL, SA and ML tables have been discontinued. 937 // All input data from confusables.txt is tagged MA. 938 fTable.put(keyChar, smapString); 939 940 fKeySet.add(keyChar); 941 } 942 943 // Input data is now all parsed and collected. 944 // Now create the run-time binary form of the data. 945 // 946 // This is done in two steps. First the data is assembled into vectors and strings, 947 // for ease of construction, then the contents of these collections are copied 948 // into the actual SpoofData object. 949 950 // Build up the string array, and record the index of each string therein 951 // in the (build time only) string pool. 952 // Strings of length one are not entered into the strings array. 
953 // (Strings in the table are sorted by length) 954 955 stringPool.sort(); 956 fStringTable = new StringBuffer(); 957 int poolSize = stringPool.size(); 958 int i; 959 for (i = 0; i < poolSize; i++) { 960 SPUString s = stringPool.getByIndex(i); 961 int strLen = s.fStr.length(); 962 int strIndex = fStringTable.length(); 963 if (strLen == 1) { 964 // strings of length one do not get an entry in the string table. 965 // Keep the single string character itself here, which is the same 966 // convention that is used in the final run-time string table index. 967 s.fCharOrStrTableIndex = s.fStr.charAt(0); 968 } else { 969 s.fCharOrStrTableIndex = strIndex; 970 fStringTable.append(s.fStr); 971 } 972 } 973 974 // Construct the compile-time Key and Value table. 975 // 976 // The keys in the Key table follow the format described in uspoof.h for the 977 // Cfu confusables data structure. 978 // 979 // Starting in ICU 58, each code point has exactly one entry in the data 980 // structure. 981 982 for (String keyCharStr : fKeySet) { 983 int keyChar = keyCharStr.codePointAt(0); 984 SPUString targetMapping = fTable.get(keyChar); 985 assert targetMapping != null; 986 987 // Throw a sane exception if trying to consume a long string. Otherwise, 988 // codePointAndLengthToKey will throw an assertion error. 989 if (targetMapping.fStr.length() > 256) { 990 throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries."); 991 } 992 993 int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length()); 994 int value = targetMapping.fCharOrStrTableIndex; 995 996 fKeyVec.add(key); 997 fValueVec.add(value); 998 } 999 1000 // Put the assembled data into the destination SpoofData object. 1001 1002 // The Key Table 1003 // While copying the keys to the output array, 1004 // also sanity check that the keys are sorted. 
1005 int numKeys = fKeyVec.size(); 1006 dest.fCFUKeys = new int[numKeys]; 1007 int previousCodePoint = 0; 1008 for (i = 0; i < numKeys; i++) { 1009 int key = fKeyVec.get(i); 1010 int codePoint = ConfusableDataUtils.keyToCodePoint(key); 1011 // strictly greater because there can be only one entry per code point 1012 assert codePoint > previousCodePoint; 1013 dest.fCFUKeys[i] = key; 1014 previousCodePoint = codePoint; 1015 } 1016 1017 // The Value Table, parallels the key table 1018 int numValues = fValueVec.size(); 1019 assert (numKeys == numValues); 1020 dest.fCFUValues = new short[numValues]; 1021 i = 0; 1022 for (int value : fValueVec) { 1023 assert (value < 0xffff); 1024 dest.fCFUValues[i++] = (short) value; 1025 } 1026 1027 // The Strings Table. 1028 dest.fCFUStrings = fStringTable.toString(); 1029 } 1030 1031 public static void buildConfusableData(Reader confusables, SpoofData dest) 1032 throws java.io.IOException, ParseException { 1033 ConfusabledataBuilder builder = new ConfusabledataBuilder(); 1034 builder.build(confusables, dest); 1035 } 1036 1037 /* 1038 * ***************************************************************************** 1039 * Internal classes for compiling confusable data into its binary (runtime) form. 1040 * ***************************************************************************** 1041 */ 1042 // SPUString 1043 // Holds a string that is the result of one of the mappings defined 1044 // by the confusable mapping data (confusables.txt from Unicode.org) 1045 // Instances of SPUString exist during the compilation process only. 1046 1047 private static class SPUString { 1048 String fStr; // The actual string. 1049 int fCharOrStrTableIndex; // Index into the final runtime data for this string. 1050 // (or, for length 1, the single string char itself, 1051 // there being no string table entry for it.) 
1052 1053 SPUString(String s) { 1054 fStr = s; 1055 fCharOrStrTableIndex = 0; 1056 } 1057 } 1058 1059 // Comparison function for ordering strings in the string pool. 1060 // Compare by length first, then, within a group of the same length, 1061 // by code point order. 1062 1063 private static class SPUStringComparator implements Comparator<SPUString> { 1064 @Override 1065 public int compare(SPUString sL, SPUString sR) { 1066 int lenL = sL.fStr.length(); 1067 int lenR = sR.fStr.length(); 1068 if (lenL < lenR) { 1069 return -1; 1070 } else if (lenL > lenR) { 1071 return 1; 1072 } else { 1073 return sL.fStr.compareTo(sR.fStr); 1074 } 1075 } 1076 1077 final static SPUStringComparator INSTANCE = new SPUStringComparator(); 1078 } 1079 1080 // String Pool A utility class for holding the strings that are the result of 1081 // the spoof mappings. These strings will utimately end up in the 1082 // run-time String Table. 1083 // This is sort of like a sorted set of strings, except that ICU's anemic 1084 // built-in collections don't support those, so it is implemented with a 1085 // combination of a uhash and a Vector. 1086 private static class SPUStringPool { 1087 public SPUStringPool() { 1088 fVec = new Vector<>(); 1089 fHash = new Hashtable<>(); 1090 } 1091 1092 public int size() { 1093 return fVec.size(); 1094 } 1095 1096 // Get the n-th string in the collection. 1097 public SPUString getByIndex(int index) { 1098 SPUString retString = fVec.elementAt(index); 1099 return retString; 1100 } 1101 1102 // Add a string. Return the string from the table. 1103 // If the input parameter string is already in the table, delete the 1104 // input parameter and return the existing string. 
1105 public SPUString addString(String src) { 1106 SPUString hashedString = fHash.get(src); 1107 if (hashedString == null) { 1108 hashedString = new SPUString(src); 1109 fHash.put(src, hashedString); 1110 fVec.addElement(hashedString); 1111 } 1112 return hashedString; 1113 } 1114 1115 // Sort the contents; affects the ordering of getByIndex(). 1116 public void sort() { 1117 Collections.sort(fVec, SPUStringComparator.INSTANCE); 1118 } 1119 1120 private Vector<SPUString> fVec; // Elements are SPUString * 1121 private Hashtable<String, SPUString> fHash; // Key: Value: 1122 } 1123 1124 } 1125 } 1126 1127 /** 1128 * Get the Restriction Level that is being tested. 1129 * 1130 * @return The restriction level 1131 * @internal 1132 * @deprecated This API is ICU internal only. 1133 */ 1134 @Deprecated 1135 public RestrictionLevel getRestrictionLevel() { 1136 return fRestrictionLevel; 1137 } 1138 1139 /** 1140 * Get the set of checks that this Spoof Checker has been configured to perform. 1141 * 1142 * @return The set of checks that this spoof checker will perform. 1143 * @stable ICU 4.6 1144 */ 1145 public int getChecks() { 1146 return fChecks; 1147 } 1148 1149 /** 1150 * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on 1151 * scripts have been specified, an empty set will be returned. 1152 * 1153 * setAllowedChars() will reset the list of allowed locales to be empty. 1154 * 1155 * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales(); 1156 * the information other than languages from the originally specified locales may be omitted. 1157 * 1158 * @return A set of locales corresponding to the acceptable scripts. 
1159 * 1160 * @stable ICU 4.6 1161 */ 1162 public Set<ULocale> getAllowedLocales() { 1163 return Collections.unmodifiableSet(fAllowedLocales); 1164 } 1165 1166 /** 1167 * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If 1168 * no limitations on scripts have been specified, an empty set will be returned. 1169 * 1170 * @return A set of locales corresponding to the acceptable scripts. 1171 * @stable ICU 54 1172 */ 1173 public Set<Locale> getAllowedJavaLocales() { 1174 HashSet<Locale> locales = new HashSet<>(fAllowedLocales.size()); 1175 for (ULocale uloc : fAllowedLocales) { 1176 locales.add(uloc.toLocale()); 1177 } 1178 return locales; 1179 } 1180 1181 /** 1182 * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set 1183 * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by 1184 * this function. 1185 * 1186 * The returned set will be frozen, meaning that it cannot be modified by the caller. 1187 * 1188 * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test. 1189 * @stable ICU 4.6 1190 */ 1191 public UnicodeSet getAllowedChars() { 1192 return fAllowedCharsSet; 1193 } 1194 1195 /** 1196 * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed. 1197 * 1198 * @stable ICU 4.6 1199 */ 1200 public static class CheckResult { 1201 /** 1202 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests 1203 * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on. 1204 * 1205 * @stable ICU 4.6 1206 * @see Builder#setChecks 1207 */ 1208 public int checks; 1209 1210 /** 1211 * The index of the first string position that failed a check. 1212 * 1213 * @deprecated ICU 51. No longer supported. Always set to zero. 
1214 */ 1215 @Deprecated 1216 public int position; 1217 1218 /** 1219 * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null. The set will contain the zero 1220 * digit from each decimal number system found in the input string. 1221 * 1222 * @stable ICU 58 1223 */ 1224 public UnicodeSet numerics; 1225 1226 /** 1227 * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null. 1228 * 1229 * @stable ICU 58 1230 */ 1231 public RestrictionLevel restrictionLevel; 1232 1233 /** 1234 * Default constructor 1235 * 1236 * @stable ICU 4.6 1237 */ 1238 public CheckResult() { 1239 checks = 0; 1240 position = 0; 1241 } 1242 1243 /** 1244 * {@inheritDoc} 1245 * 1246 * @stable ICU 4.6 1247 */ 1248 @Override 1249 public String toString() { 1250 StringBuilder sb = new StringBuilder(); 1251 sb.append("checks:"); 1252 if (checks == 0) { 1253 sb.append(" none"); 1254 } else if (checks == ALL_CHECKS) { 1255 sb.append(" all"); 1256 } else { 1257 if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) { 1258 sb.append(" SINGLE_SCRIPT_CONFUSABLE"); 1259 } 1260 if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) { 1261 sb.append(" MIXED_SCRIPT_CONFUSABLE"); 1262 } 1263 if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) { 1264 sb.append(" WHOLE_SCRIPT_CONFUSABLE"); 1265 } 1266 if ((checks & ANY_CASE) != 0) { 1267 sb.append(" ANY_CASE"); 1268 } 1269 if ((checks & RESTRICTION_LEVEL) != 0) { 1270 sb.append(" RESTRICTION_LEVEL"); 1271 } 1272 if ((checks & INVISIBLE) != 0) { 1273 sb.append(" INVISIBLE"); 1274 } 1275 if ((checks & CHAR_LIMIT) != 0) { 1276 sb.append(" CHAR_LIMIT"); 1277 } 1278 if ((checks & MIXED_NUMBERS) != 0) { 1279 sb.append(" MIXED_NUMBERS"); 1280 } 1281 } 1282 sb.append(", numerics: ").append(numerics.toPattern(false)); 1283 sb.append(", position: ").append(position); 1284 sb.append(", restrictionLevel: ").append(restrictionLevel); 1285 return sb.toString(); 1286 } 1287 } 1288 1289 /** 1290 * Check the specified string for possible security issues. 
The text to be checked will typically be an identifier 1291 * of some sort. The set of checks to be performed was specified when building the SpoofChecker. 1292 * 1293 * @param text 1294 * A String to be checked for possible security issues. 1295 * @param checkResult 1296 * Output parameter, indicates which specific tests failed. May be null if the information is not wanted. 1297 * @return True there any issue is found with the input string. 1298 * @stable ICU 4.8 1299 */ 1300 public boolean failsChecks(String text, CheckResult checkResult) { 1301 int length = text.length(); 1302 1303 int result = 0; 1304 if (checkResult != null) { 1305 checkResult.position = 0; 1306 checkResult.numerics = null; 1307 checkResult.restrictionLevel = null; 1308 } 1309 1310 if (0 != (this.fChecks & RESTRICTION_LEVEL)) { 1311 RestrictionLevel textRestrictionLevel = getRestrictionLevel(text); 1312 if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) { 1313 result |= RESTRICTION_LEVEL; 1314 } 1315 if (checkResult != null) { 1316 checkResult.restrictionLevel = textRestrictionLevel; 1317 } 1318 } 1319 1320 if (0 != (this.fChecks & MIXED_NUMBERS)) { 1321 UnicodeSet numerics = new UnicodeSet(); 1322 getNumerics(text, numerics); 1323 if (numerics.size() > 1) { 1324 result |= MIXED_NUMBERS; 1325 } 1326 if (checkResult != null) { 1327 checkResult.numerics = numerics; 1328 } 1329 } 1330 1331 if (0 != (this.fChecks & HIDDEN_OVERLAY)) { 1332 int index = findHiddenOverlay(text); 1333 if (index != -1) { 1334 result |= HIDDEN_OVERLAY; 1335 } 1336 } 1337 1338 if (0 != (this.fChecks & CHAR_LIMIT)) { 1339 int i; 1340 int c; 1341 for (i = 0; i < length;) { 1342 // U16_NEXT(text, i, length, c); 1343 c = Character.codePointAt(text, i); 1344 i = Character.offsetByCodePoints(text, i, 1); 1345 if (!this.fAllowedCharsSet.contains(c)) { 1346 result |= CHAR_LIMIT; 1347 break; 1348 } 1349 } 1350 } 1351 1352 if (0 != (this.fChecks & INVISIBLE)) { 1353 // This check needs to be done on NFD input 1354 String 
nfdText = nfdNormalizer.normalize(text); 1355 1356 // scan for more than one occurrence of the same non-spacing mark 1357 // in a sequence of non-spacing marks. 1358 int i; 1359 int c; 1360 int firstNonspacingMark = 0; 1361 boolean haveMultipleMarks = false; 1362 UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a 1363 // single combining sequence. 1364 for (i = 0; i < length;) { 1365 c = Character.codePointAt(nfdText, i); 1366 i = Character.offsetByCodePoints(nfdText, i, 1); 1367 if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) { 1368 firstNonspacingMark = 0; 1369 if (haveMultipleMarks) { 1370 marksSeenSoFar.clear(); 1371 haveMultipleMarks = false; 1372 } 1373 continue; 1374 } 1375 if (firstNonspacingMark == 0) { 1376 firstNonspacingMark = c; 1377 continue; 1378 } 1379 if (!haveMultipleMarks) { 1380 marksSeenSoFar.add(firstNonspacingMark); 1381 haveMultipleMarks = true; 1382 } 1383 if (marksSeenSoFar.contains(c)) { 1384 // report the error, and stop scanning. 1385 // No need to find more than the first failure. 1386 result |= INVISIBLE; 1387 break; 1388 } 1389 marksSeenSoFar.add(c); 1390 } 1391 } 1392 if (checkResult != null) { 1393 checkResult.checks = result; 1394 } 1395 return (0 != result); 1396 } 1397 1398 /** 1399 * Check the specified string for possible security issues. The text to be checked will typically be an identifier 1400 * of some sort. The set of checks to be performed was specified when building the SpoofChecker. 1401 * 1402 * @param text 1403 * A String to be checked for possible security issues. 1404 * @return True there any issue is found with the input string. 1405 * @stable ICU 4.8 1406 */ failsChecks(String text)1407 public boolean failsChecks(String text) { 1408 return failsChecks(text, null); 1409 } 1410 1411 /** 1412 * Check whether two specified strings are visually confusable. 
The types of confusability to be tested - single 1413 * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker. 1414 * 1415 * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE 1416 * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected. 1417 * 1418 * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case 1419 * folded for comparison and display to the user, do not select the ANY_CASE option. 1420 * 1421 * 1422 * @param s1 1423 * The first of the two strings to be compared for confusability. 1424 * @param s2 1425 * The second of the two strings to be compared for confusability. 1426 * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability 1427 * found, as defined by spoof check test constants. 1428 * @stable ICU 4.6 1429 */ areConfusable(String s1, String s2)1430 public int areConfusable(String s1, String s2) { 1431 // 1432 // See section 4 of UTS #39 for the algorithm for checking whether two strings are confusable, 1433 // and for definitions of the types (single, whole, mixed-script) of confusables. 1434 1435 // We only care about a few of the check flags. Ignore the others. 1436 // If no tests relevant to this function have been specified, signal an error. 1437 // TODO: is this really the right thing to do? It's probably an error on 1438 // the caller's part, but logically we would just return 0 (no error). 1439 if ((this.fChecks & CONFUSABLE) == 0) { 1440 throw new IllegalArgumentException("No confusable checks are enabled."); 1441 } 1442 1443 // Compute the skeletons and check for confusability. 1444 String s1Skeleton = getSkeleton(s1); 1445 String s2Skeleton = getSkeleton(s2); 1446 if (!s1Skeleton.equals(s2Skeleton)) { 1447 return 0; 1448 } 1449 1450 // If we get here, the strings are confusable. 
Now we just need to set the flags for the appropriate classes 1451 // of confusables according to UTS 39 section 4. 1452 // Start by computing the resolved script sets of s1 and s2. 1453 ScriptSet s1RSS = new ScriptSet(); 1454 getResolvedScriptSet(s1, s1RSS); 1455 ScriptSet s2RSS = new ScriptSet(); 1456 getResolvedScriptSet(s2, s2RSS); 1457 1458 // Turn on all applicable flags 1459 int result = 0; 1460 if (s1RSS.intersects(s2RSS)) { 1461 result |= SINGLE_SCRIPT_CONFUSABLE; 1462 } else { 1463 result |= MIXED_SCRIPT_CONFUSABLE; 1464 if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) { 1465 result |= WHOLE_SCRIPT_CONFUSABLE; 1466 } 1467 } 1468 1469 // Turn off flags that the user doesn't want 1470 return result & fChecks; 1471 } 1472 1473 /** 1474 * Check whether two specified strings are visually when displayed in a paragraph with the given direction. 1475 * The types of confusability to be tested—single script, mixed script, or whole script—are determined by the check options set for the SpoofChecker. 1476 * 1477 * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE 1478 * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected. 1479 * 1480 * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case 1481 * folded for comparison and display to the user, do not select the ANY_CASE option. 1482 * 1483 * 1484 * @param direction The paragraph direction with which the identifiers are displayed. 1485 * Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}. 1486 * @param s1 1487 * The first of the two strings to be compared for confusability. 1488 * @param s2 1489 * The second of the two strings to be compared for confusability. 1490 * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability 1491 * found, as defined by spoof check test constants. 
1492 * @draft ICU 74 1493 */ areConfusable(int direction, CharSequence s1, CharSequence s2)1494 public int areConfusable(int direction, CharSequence s1, CharSequence s2) { 1495 // 1496 // See section 4 of UTS #39 for the algorithm for checking whether two strings are confusable, 1497 // and for definitions of the types (single, whole, mixed-script) of confusables. 1498 1499 // We only care about a few of the check flags. Ignore the others. 1500 // If no tests relevant to this function have been specified, signal an error. 1501 // TODO: is this really the right thing to do? It's probably an error on 1502 // the caller's part, but logically we would just return 0 (no error). 1503 if ((this.fChecks & CONFUSABLE) == 0) { 1504 throw new IllegalArgumentException("No confusable checks are enabled."); 1505 } 1506 1507 // Compute the skeletons and check for confusability. 1508 String s1Skeleton = getBidiSkeleton(direction, s1); 1509 String s2Skeleton = getBidiSkeleton(direction, s2); 1510 if (!s1Skeleton.equals(s2Skeleton)) { 1511 return 0; 1512 } 1513 1514 // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes 1515 // of confusables according to UTS 39 section 4. 1516 // Start by computing the resolved script sets of s1 and s2. 1517 ScriptSet s1RSS = new ScriptSet(); 1518 getResolvedScriptSet(s1, s1RSS); 1519 ScriptSet s2RSS = new ScriptSet(); 1520 getResolvedScriptSet(s2, s2RSS); 1521 1522 // Turn on all applicable flags 1523 int result = 0; 1524 if (s1RSS.intersects(s2RSS)) { 1525 result |= SINGLE_SCRIPT_CONFUSABLE; 1526 } else { 1527 result |= MIXED_SCRIPT_CONFUSABLE; 1528 if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) { 1529 result |= WHOLE_SCRIPT_CONFUSABLE; 1530 } 1531 } 1532 1533 // Turn off flags that the user doesn't want 1534 result &= fChecks; 1535 1536 return result; 1537 } 1538 1539 /** 1540 * Get the "bidiSkeleton" for an identifier string and a direction. 
1541 * Skeletons are a transformation of the input string; 1542 * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical; 1543 * they are RTL-confusable if their RTL bidiSkeletons are identical. 1544 * See Unicode Technical Standard #39 for additional information: 1545 * https://www.unicode.org/reports/tr39/#Confusable_Detection. 1546 * 1547 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some 1548 * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons. 1549 * 1550 * Skeletons are computed using the algorithm and data described in UTS #39. 1551 * 1552 * @param direction The paragraph direction with which the string is displayed. 1553 * Must be either {@link Bidi#DIRECTION_LEFT_TO_RIGHT} or {@link Bidi#DIRECTION_RIGHT_TO_LEFT}. 1554 * @param str The input string whose bidiSkeleton will be generated. 1555 * @return The output skeleton string. 1556 * 1557 * @draft ICU 74 1558 */ getBidiSkeleton(int direction, CharSequence str)1559 public String getBidiSkeleton(int direction, CharSequence str) { 1560 if (direction != Bidi.DIRECTION_LEFT_TO_RIGHT && direction != Bidi.DIRECTION_RIGHT_TO_LEFT) { 1561 throw new IllegalArgumentException("direction should be DIRECTION_LEFT_TO_RIGHT or DIRECTION_RIGHT_TO_LEFT"); 1562 } 1563 Bidi bidi = new Bidi(str.toString(), direction); 1564 return getSkeleton(bidi.writeReordered(Bidi.KEEP_BASE_COMBINING | Bidi.DO_MIRRORING)); 1565 } 1566 1567 /** 1568 * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are 1569 * confusable if their skeletons are identical. See Unicode UAX 39 for additional information. 1570 * 1571 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some 1572 * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons. 
1573 * 1574 * Skeletons are computed using the algorithm and data described in Unicode UAX 39. 1575 * 1576 * @param str 1577 * The input string whose skeleton will be generated. 1578 * @return The output skeleton string. 1579 * 1580 * @stable ICU 58 1581 */ getSkeleton(CharSequence str)1582 public String getSkeleton(CharSequence str) { 1583 // Apply the skeleton mapping to the NFD normalized input string 1584 // Accumulate the skeleton, possibly unnormalized, in a String. 1585 String nfdId = nfdNormalizer.normalize(str); 1586 int normalizedLen = nfdId.length(); 1587 StringBuilder skelSB = new StringBuilder(); 1588 for (int inputIndex = 0; inputIndex < normalizedLen;) { 1589 int c = Character.codePointAt(nfdId, inputIndex); 1590 inputIndex += Character.charCount(c); 1591 if (!UCharacter.hasBinaryProperty(c, UProperty.DEFAULT_IGNORABLE_CODE_POINT)) { 1592 this.fSpoofData.confusableLookup(c, skelSB); 1593 } 1594 } 1595 String skelStr = skelSB.toString(); 1596 skelStr = nfdNormalizer.normalize(skelStr); 1597 return skelStr; 1598 } 1599 1600 /** 1601 * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been 1602 * ignored, and starting with ICU 58, this function has been deprecated. 1603 * 1604 * @param type 1605 * No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA. 1606 * @param id 1607 * The input identifier whose skeleton will be generated. 1608 * @return The output skeleton string. 1609 * 1610 * @deprecated ICU 58 1611 */ 1612 @Deprecated getSkeleton(int type, String id)1613 public String getSkeleton(int type, String id) { 1614 return getSkeleton(id); 1615 } 1616 1617 /** 1618 * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have 1619 * enabled the same set of checks. 1620 * 1621 * @param other 1622 * the SpoofChecker being compared with. 1623 * @return true if the two SpoofCheckers are equal. 
1624 * @stable ICU 4.6 1625 */ 1626 @Override equals(Object other)1627 public boolean equals(Object other) { 1628 if (!(other instanceof SpoofChecker)) { 1629 return false; 1630 } 1631 SpoofChecker otherSC = (SpoofChecker) other; 1632 if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) { 1633 return false; 1634 } 1635 if (fChecks != otherSC.fChecks) { 1636 return false; 1637 } 1638 if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null 1639 && !fAllowedLocales.equals(otherSC.fAllowedLocales)) { 1640 return false; 1641 } 1642 if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null 1643 && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) { 1644 return false; 1645 } 1646 if (fRestrictionLevel != otherSC.fRestrictionLevel) { 1647 return false; 1648 } 1649 return true; 1650 } 1651 1652 /** 1653 * Overrides {@link Object#hashCode()}. 1654 * @stable ICU 4.6 1655 */ 1656 @Override hashCode()1657 public int hashCode() { 1658 return fChecks 1659 ^ fSpoofData.hashCode() 1660 ^ fAllowedLocales.hashCode() 1661 ^ fAllowedCharsSet.hashCode() 1662 ^ fRestrictionLevel.ordinal(); 1663 } 1664 1665 /** 1666 * Computes the augmented script set for a code point, according to UTS 39 section 5.1. 
1667 */ getAugmentedScriptSet(int codePoint, ScriptSet result)1668 private static void getAugmentedScriptSet(int codePoint, ScriptSet result) { 1669 result.clear(); 1670 UScript.getScriptExtensions(codePoint, result); 1671 1672 // Section 5.1 step 1 1673 if (result.get(UScript.HAN)) { 1674 result.set(UScript.HAN_WITH_BOPOMOFO); 1675 result.set(UScript.JAPANESE); 1676 result.set(UScript.KOREAN); 1677 } 1678 if (result.get(UScript.HIRAGANA)) { 1679 result.set(UScript.JAPANESE); 1680 } 1681 if (result.get(UScript.KATAKANA)) { 1682 result.set(UScript.JAPANESE); 1683 } 1684 if (result.get(UScript.HANGUL)) { 1685 result.set(UScript.KOREAN); 1686 } 1687 if (result.get(UScript.BOPOMOFO)) { 1688 result.set(UScript.HAN_WITH_BOPOMOFO); 1689 } 1690 1691 // Section 5.1 step 2 1692 if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) { 1693 result.setAll(); 1694 } 1695 } 1696 1697 /** 1698 * Computes the resolved script set for a string, according to UTS 39 section 5.1. 1699 */ getResolvedScriptSet(CharSequence input, ScriptSet result)1700 private void getResolvedScriptSet(CharSequence input, ScriptSet result) { 1701 getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result); 1702 } 1703 1704 /** 1705 * Computes the resolved script set for a string, omitting characters having the specified script. If 1706 * UScript.CODE_LIMIT is passed as the second argument, all characters are included. 
1707 */ getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result)1708 private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) { 1709 result.setAll(); 1710 1711 ScriptSet temp = new ScriptSet(); 1712 for (int utf16Offset = 0; utf16Offset < input.length();) { 1713 int codePoint = Character.codePointAt(input, utf16Offset); 1714 utf16Offset += Character.charCount(codePoint); 1715 1716 // Compute the augmented script set for the character 1717 getAugmentedScriptSet(codePoint, temp); 1718 1719 // Intersect the augmented script set with the resolved script set, but only if the character doesn't 1720 // have the script specified in the function call 1721 if (script == UScript.CODE_LIMIT || !temp.get(script)) { 1722 result.and(temp); 1723 } 1724 } 1725 } 1726 1727 /** 1728 * Computes the set of numerics for a string, according to UTS 39 section 5.3. 1729 */ getNumerics(String input, UnicodeSet result)1730 private void getNumerics(String input, UnicodeSet result) { 1731 result.clear(); 1732 1733 for (int utf16Offset = 0; utf16Offset < input.length();) { 1734 int codePoint = Character.codePointAt(input, utf16Offset); 1735 utf16Offset += Character.charCount(codePoint); 1736 1737 // Store a representative character for each kind of decimal digit 1738 if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { 1739 // Store the zero character as a representative for comparison. 1740 // Unicode guarantees it is codePoint - value 1741 result.add(codePoint - UCharacter.getNumericValue(codePoint)); 1742 } 1743 } 1744 } 1745 1746 /** 1747 * Computes the restriction level of a string, according to UTS 39 section 5.2. 
1748 */ getRestrictionLevel(String input)1749 private RestrictionLevel getRestrictionLevel(String input) { 1750 // Section 5.2 step 1: 1751 if (!fAllowedCharsSet.containsAll(input)) { 1752 return RestrictionLevel.UNRESTRICTIVE; 1753 } 1754 1755 // Section 5.2 step 2: 1756 if (ASCII.containsAll(input)) { 1757 return RestrictionLevel.ASCII; 1758 } 1759 1760 // Section 5.2 steps 3: 1761 ScriptSet resolvedScriptSet = new ScriptSet(); 1762 getResolvedScriptSet(input, resolvedScriptSet); 1763 1764 // Section 5.2 step 4: 1765 if (!resolvedScriptSet.isEmpty()) { 1766 return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE; 1767 } 1768 1769 // Section 5.2 step 5: 1770 ScriptSet resolvedNoLatn = new ScriptSet(); 1771 getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn); 1772 1773 // Section 5.2 step 6: 1774 if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE) 1775 || resolvedNoLatn.get(UScript.KOREAN)) { 1776 return RestrictionLevel.HIGHLY_RESTRICTIVE; 1777 } 1778 1779 // Section 5.2 step 7: 1780 if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK) 1781 && !resolvedNoLatn.get(UScript.CHEROKEE)) { 1782 return RestrictionLevel.MODERATELY_RESTRICTIVE; 1783 } 1784 1785 // Section 5.2 step 8: 1786 return RestrictionLevel.MINIMALLY_RESTRICTIVE; 1787 } 1788 findHiddenOverlay(String input)1789 int findHiddenOverlay(String input) { 1790 boolean sawLeadCharacter = false; 1791 StringBuilder sb = new StringBuilder(); 1792 for (int i=0; i<input.length();) { 1793 int cp = input.codePointAt(i); 1794 if (sawLeadCharacter && cp == 0x0307) { 1795 return i; 1796 } 1797 int combiningClass = UCharacter.getCombiningClass(cp); 1798 // Skip over characters except for those with combining class 0 (non-combining characters) or with 1799 // combining class 230 (same class as U+0307) 1800 assert UCharacter.getCombiningClass(0x0307) == 230; 1801 if (combiningClass == 0 || combiningClass == 230) { 1802 
sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp, sb); 1803 } 1804 i += UCharacter.charCount(cp); 1805 } 1806 return -1; 1807 } 1808 isIllegalCombiningDotLeadCharacterNoLookup(int cp)1809 boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) { 1810 return cp == 'i' || cp == 'j' || cp == 'ı' || cp == 'ȷ' || cp == 'l' || 1811 UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED); 1812 } 1813 isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb)1814 boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) { 1815 if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) { 1816 return true; 1817 } 1818 sb.setLength(0); 1819 fSpoofData.confusableLookup(cp, sb); 1820 int finalCp = UCharacter.codePointBefore(sb, sb.length()); 1821 if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) { 1822 return true; 1823 } 1824 return false; 1825 } 1826 1827 // Data Members 1828 private int fChecks; // Bit vector of checks to perform. 1829 private SpoofData fSpoofData; 1830 private Set<ULocale> fAllowedLocales; // The Set of allowed locales. 1831 private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters. 1832 private RestrictionLevel fRestrictionLevel; 1833 1834 private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance(); 1835 1836 // Confusable Mappings Data Structures, version 2.0 1837 // 1838 // This description and the corresponding implementation are to be kept 1839 // in-sync with the copy in icu4c uspoof_impl.h. 1840 // 1841 // For the confusable data, we are essentially implementing a map, 1842 // key: a code point 1843 // value: a string. Most commonly one char in length, but can be more. 1844 // 1845 // The keys are stored as a sorted array of 32 bit ints. 1846 // bits 0-23 a code point value 1847 // bits 24-31 length of value string, in UChars (between 1 and 256 UChars). 1848 // The key table is sorted in ascending code point order. 
// (Sorting is not on the 32 bit int value; the flag bits do not participate in the sorting.)
    //
    // Lookup is done by means of a binary search in the key table.
    //
    // The corresponding values are kept in a parallel array of 16 bit ints.
    // If the value string is of length 1, it is literally in the value array.
    // For longer strings, the value array contains an index into the strings
    // table.
    //
    // String Table:
    // The strings table contains all of the value strings (those of length two or greater)
    // concatenated together into one long char (UTF-16) array.
    //
    // There is no nul character or other mark between adjacent strings.
    //
    //----------------------------------------------------------------------------
    //
    // Changes from format version 1 to format version 2:
    //     1) Removal of the whole-script confusable data tables.
    //     2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
    //     3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
    //     4) Removal of the string lengths table since 8 bits is sufficient for the
    //        lengths of all entries in confusables.txt.
1872 // 1873 private static final class ConfusableDataUtils { 1874 public static final int FORMAT_VERSION = 2; // version for ICU 58 1875 keyToCodePoint(int key)1876 public static final int keyToCodePoint(int key) { 1877 return key & 0x00ffffff; 1878 } 1879 keyToLength(int key)1880 public static final int keyToLength(int key) { 1881 return ((key & 0xff000000) >> 24) + 1; 1882 } 1883 codePointAndLengthToKey(int codePoint, int length)1884 public static final int codePointAndLengthToKey(int codePoint, int length) { 1885 assert (codePoint & 0x00ffffff) == codePoint; 1886 assert length <= 256; 1887 return codePoint | ((length - 1) << 24); 1888 } 1889 } 1890 1891 // ------------------------------------------------------------------------------------- 1892 // 1893 // SpoofData 1894 // 1895 // This class corresponds to the ICU SpoofCheck data. 1896 // 1897 // The data can originate with the Binary ICU data that is generated in ICU4C, 1898 // or it can originate from source rules that are compiled in ICU4J. 1899 // 1900 // This class does not include the set of checks to be performed, but only 1901 // data that is serialized into the ICU binary data. 1902 // 1903 // Because Java cannot easily wrap binary data like ICU4C, the binary data is 1904 // copied into Java structures that are convenient for use by the run time code. 1905 // 1906 // --------------------------------------------------------------------------------------- 1907 private static class SpoofData { 1908 1909 // The Confusable data, Java data structures for. 
int[] fCFUKeys;       // Sorted key table: code point plus packed value length.
        short[] fCFUValues;   // Parallel to fCFUKeys: a single char, or an index into fCFUStrings.
        String fCFUStrings;   // Concatenation of all value strings longer than one char.

        private static final int DATA_FORMAT = 0x43667520; // "Cfu "

        private static final class IsAcceptable implements Authenticate {
            /**
             * Accepts data only when its major format version is the one this code implements.
             *
             * The previous condition combined "major version matches" with "any minor version byte is
             * non-zero" using OR, so data with a different major version was accepted whenever a minor
             * byte was set. Data with the expected major version is accepted exactly as before.
             */
            @Override
            public boolean isDataVersionAcceptable(byte version[]) {
                return version[0] == ConfusableDataUtils.FORMAT_VERSION;
            }
        }

        private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();

        private static final class DefaultData {
            private static SpoofData INSTANCE = null;
            private static IOException EXCEPTION = null;

            static {
                // Note: Although this is static, the Java runtime can delay execution of this block until
                // the data is actually requested via SpoofData.getDefault().
                try {
                    INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
                } catch (IOException e) {
                    EXCEPTION = e;
                }
            }
        }

        /**
         * @return instance for Unicode standard data
         * @throws MissingResourceException if loading the default data failed; the original
         *         IOException is attached as the cause.
         */
        public static SpoofData getDefault() {
            if (DefaultData.EXCEPTION != null) {
                MissingResourceException failure = new MissingResourceException(
                        "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(),
                        "SpoofChecker", "");
                // MissingResourceException has no public constructor taking a cause.
                failure.initCause(DefaultData.EXCEPTION);
                throw failure;
            }
            return DefaultData.INSTANCE;
        }

        // SpoofChecker Data constructor for use from data builder.
        // Initializes a new, empty data area that will be populated later.
        private SpoofData() {
        }

        // Constructor for use when creating from prebuilt default data.
        // A ByteBuffer is what the ICU internal data loading functions provide.
SpoofData(ByteBuffer bytes)1960 private SpoofData(ByteBuffer bytes) throws java.io.IOException { 1961 ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE); 1962 bytes.mark(); 1963 readData(bytes); 1964 } 1965 1966 @Override equals(Object other)1967 public boolean equals(Object other) { 1968 if (!(other instanceof SpoofData)) { 1969 return false; 1970 } 1971 SpoofData otherData = (SpoofData) other; 1972 if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys)) 1973 return false; 1974 if (!Arrays.equals(fCFUValues, otherData.fCFUValues)) 1975 return false; 1976 if (!Utility.sameObjects(fCFUStrings, otherData.fCFUStrings) && fCFUStrings != null 1977 && !fCFUStrings.equals(otherData.fCFUStrings)) 1978 return false; 1979 return true; 1980 } 1981 1982 @Override hashCode()1983 public int hashCode() { 1984 return Arrays.hashCode(fCFUKeys) 1985 ^ Arrays.hashCode(fCFUValues) 1986 ^ fCFUStrings.hashCode(); 1987 } 1988 1989 // Set the SpoofChecker data from pre-built binary data in a byte buffer. 1990 // The binary data format is as described for ICU4C spoof data. 1991 // readData(ByteBuffer bytes)1992 private void readData(ByteBuffer bytes) throws java.io.IOException { 1993 int magic = bytes.getInt(); 1994 if (magic != 0x3845fdef) { 1995 throw new IllegalArgumentException("Bad Spoof Check Data."); 1996 } 1997 @SuppressWarnings("unused") 1998 int dataFormatVersion = bytes.getInt(); 1999 @SuppressWarnings("unused") 2000 int dataLength = bytes.getInt(); 2001 2002 int CFUKeysOffset = bytes.getInt(); 2003 int CFUKeysSize = bytes.getInt(); 2004 2005 int CFUValuesOffset = bytes.getInt(); 2006 int CFUValuesSize = bytes.getInt(); 2007 2008 int CFUStringTableOffset = bytes.getInt(); 2009 int CFUStringTableSize = bytes.getInt(); 2010 2011 // We have now read the file header, and obtained the position for each 2012 // of the data items. Now read each in turn, first seeking the 2013 // input stream to the position of the data item. 
2014 2015 bytes.reset(); 2016 ICUBinary.skipBytes(bytes, CFUKeysOffset); 2017 fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0); 2018 2019 bytes.reset(); 2020 ICUBinary.skipBytes(bytes, CFUValuesOffset); 2021 fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0); 2022 2023 bytes.reset(); 2024 ICUBinary.skipBytes(bytes, CFUStringTableOffset); 2025 fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0); 2026 } 2027 2028 /** 2029 * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be 2030 * appended will between 1 and 18 characters as of Unicode 9. 2031 * 2032 * This is the heart of the confusable skeleton generation implementation. 2033 */ confusableLookup(int inChar, StringBuilder dest)2034 public void confusableLookup(int inChar, StringBuilder dest) { 2035 // Perform a binary search. 2036 // [lo, hi), i.e lo is inclusive, hi is exclusive. 2037 // The result after the loop will be in lo. 2038 int lo = 0; 2039 int hi = length(); 2040 do { 2041 int mid = (lo + hi) / 2; 2042 if (codePointAt(mid) > inChar) { 2043 hi = mid; 2044 } else if (codePointAt(mid) < inChar) { 2045 lo = mid; 2046 } else { 2047 // Found result. Break early. 2048 lo = mid; 2049 break; 2050 } 2051 } while (hi - lo > 1); 2052 2053 // Did we find an entry? If not, the char maps to itself. 2054 if (codePointAt(lo) != inChar) { 2055 dest.appendCodePoint(inChar); 2056 return; 2057 } 2058 2059 // Add the element to the string builder and return. 2060 appendValueTo(lo, dest); 2061 return; 2062 } 2063 2064 /** 2065 * Return the number of confusable entries in this SpoofData. 2066 * 2067 * @return The number of entries. 2068 */ length()2069 public int length() { 2070 return fCFUKeys.length; 2071 } 2072 2073 /** 2074 * Return the code point (key) at the specified index. 2075 * 2076 * @param index 2077 * The index within the SpoofData. 2078 * @return The code point. 
2079 */ codePointAt(int index)2080 public int codePointAt(int index) { 2081 return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]); 2082 } 2083 2084 /** 2085 * Append the confusable skeleton at the specified index to the StringBuilder dest. 2086 * 2087 * @param index 2088 * The index within the SpoofData. 2089 * @param dest 2090 * The StringBuilder to which to append the skeleton. 2091 */ appendValueTo(int index, StringBuilder dest)2092 public void appendValueTo(int index, StringBuilder dest) { 2093 int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]); 2094 2095 // Value is either a char (for strings of length 1) or 2096 // an index into the string table (for longer strings) 2097 short value = fCFUValues[index]; 2098 if (stringLength == 1) { 2099 dest.append((char) value); 2100 } else { 2101 dest.append(fCFUStrings, value, value + stringLength); 2102 } 2103 } 2104 } 2105 2106 // ------------------------------------------------------------------------------- 2107 // 2108 // ScriptSet - Script code bit sets. 2109 // Extends Java BitSet with input/output support and a few helper methods. 2110 // Note: The I/O is not currently being used, so it has been commented out. If 2111 // it is needed again, the code can be restored. 2112 // 2113 // ------------------------------------------------------------------------------- 2114 static class ScriptSet extends BitSet { 2115 2116 // Eclipse default value to quell warnings: 2117 private static final long serialVersionUID = 1L; 2118 2119 // // The serialized version of this class can hold INT_CAPACITY * 32 scripts. 
2120 // private static final int INT_CAPACITY = 6; 2121 // private static final long serialVersionUID = INT_CAPACITY; 2122 // static { 2123 // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT; 2124 // } 2125 // 2126 // public ScriptSet() { 2127 // } 2128 // 2129 // public ScriptSet(ByteBuffer bytes) throws java.io.IOException { 2130 // for (int i = 0; i < INT_CAPACITY; i++) { 2131 // int bits = bytes.getInt(); 2132 // for (int j = 0; j < Integer.SIZE; j++) { 2133 // if ((bits & (1 << j)) != 0) { 2134 // set(i * Integer.SIZE + j); 2135 // } 2136 // } 2137 // } 2138 // } 2139 // 2140 // public void output(DataOutputStream os) throws java.io.IOException { 2141 // for (int i = 0; i < INT_CAPACITY; i++) { 2142 // int bits = 0; 2143 // for (int j = 0; j < Integer.SIZE; j++) { 2144 // if (get(i * Integer.SIZE + j)) { 2145 // bits |= (1 << j); 2146 // } 2147 // } 2148 // os.writeInt(bits); 2149 // } 2150 // } 2151 and(int script)2152 public void and(int script) { 2153 this.clear(0, script); 2154 this.clear(script + 1, UScript.CODE_LIMIT); 2155 } 2156 setAll()2157 public void setAll() { 2158 this.set(0, UScript.CODE_LIMIT); 2159 } 2160 isFull()2161 public boolean isFull() { 2162 return cardinality() == UScript.CODE_LIMIT; 2163 } 2164 appendStringTo(StringBuilder sb)2165 public void appendStringTo(StringBuilder sb) { 2166 sb.append("{ "); 2167 if (isEmpty()) { 2168 sb.append("- "); 2169 } else if (isFull()) { 2170 sb.append("* "); 2171 } else { 2172 for (int script = 0; script < UScript.CODE_LIMIT; script++) { 2173 if (get(script)) { 2174 sb.append(UScript.getShortName(script)); 2175 sb.append(" "); 2176 } 2177 } 2178 } 2179 sb.append("}"); 2180 } 2181 2182 @Override toString()2183 public String toString() { 2184 StringBuilder sb = new StringBuilder(); 2185 sb.append("<ScriptSet "); 2186 appendStringTo(sb); 2187 sb.append(">"); 2188 return sb.toString(); 2189 } 2190 } 2191 } 2192