1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 2008-2016 International Business Machines Corporation 6 * and others. All Rights Reserved. 7 *************************************************************************** 8 * 9 * Unicode Spoof Detection 10 */ 11 12 package com.ibm.icu.text; 13 14 import java.io.IOException; 15 import java.io.LineNumberReader; 16 import java.io.Reader; 17 import java.nio.ByteBuffer; 18 import java.text.ParseException; 19 import java.util.ArrayList; 20 import java.util.Arrays; 21 import java.util.BitSet; 22 import java.util.Collections; 23 import java.util.Comparator; 24 import java.util.HashSet; 25 import java.util.Hashtable; 26 import java.util.LinkedHashSet; 27 import java.util.Locale; 28 import java.util.MissingResourceException; 29 import java.util.Set; 30 import java.util.Vector; 31 import java.util.regex.Matcher; 32 import java.util.regex.Pattern; 33 34 import com.ibm.icu.impl.ICUBinary; 35 import com.ibm.icu.impl.ICUBinary.Authenticate; 36 import com.ibm.icu.impl.Utility; 37 import com.ibm.icu.lang.UCharacter; 38 import com.ibm.icu.lang.UCharacterCategory; 39 import com.ibm.icu.lang.UProperty; 40 import com.ibm.icu.lang.UScript; 41 import com.ibm.icu.util.ULocale; 42 43 /** 44 * <p> 45 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and 46 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: 47 * 48 * <ol> 49 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desparejado" and 50 * "ԁеѕрагејаԁо".</li> 51 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof 52 * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li> 53 * </ol> 54 * 55 * <p> 56 * 
Although originally designed as a method for flagging suspicious identifier strings such as URLs, 57 * <code>SpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word 58 * content filters. 59 * 60 * <h2>Confusables</h2> 61 * 62 * <p> 63 * The following example shows how to use <code>SpoofChecker</code> to check for confusability between two strings: 64 * 65 * <pre> 66 * <code> 67 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 68 * int result = sc.areConfusable("desparejado", "ԁеѕрагејаԁо"); 69 * System.out.println(result != 0); // true 70 * </code> 71 * </pre> 72 * 73 * <p> 74 * <code>SpoofChecker</code> uses a builder paradigm: options are specified within the context of a lightweight 75 * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading 76 * operations are performed, and an immutable <code>SpoofChecker</code> is returned. 77 * 78 * <p> 79 * The first line of the example creates a <code>SpoofChecker</code> object with confusable-checking enabled; the second 80 * line performs the confusability test. For best performance, the instance should be created once (e.g., upon 81 * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime. 82 * 83 * <p> 84 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a 85 * sequence of families of confusable characters, where each family has a single exemplar character. 
 * {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is
 * equivalent to the example above:
 *
 * <pre>
 * <code>
 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
 * boolean result = sc.getSkeleton("desparejado").equals(sc.getSkeleton("ԁеѕрагејаԁо"));
 * System.out.println(result); // true
 * </code>
 * </pre>
 *
 * <p>
 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
 * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as
 * shown below:
 *
 * <pre>
 * // Setup:
 * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example
 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
 * HashSet&lt;String&gt; skeletons = new HashSet&lt;String&gt;();
 * for (String word : DICTIONARY) {
 *   skeletons.add(sc.getSkeleton(word));
 * }
 *
 * // Live Check:
 * boolean result = skeletons.contains(sc.getSkeleton("1orern"));
 * System.out.println(result); // true
 * </pre>
 *
 * <p>
 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
120 * 121 * <h2>Spoof Detection</h2> 122 * 123 * <p> 124 * The following snippet shows a minimal example of using <code>SpoofChecker</code> to perform spoof detection on a 125 * string: 126 * 127 * <pre> 128 * SpoofChecker sc = new SpoofChecker.Builder() 129 * .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION)) 130 * .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE) 131 * .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE) 132 * .build(); 133 * boolean result = sc.failsChecks("pаypаl"); // with Cyrillic 'а' characters 134 * System.out.println(result); // true 135 * </pre> 136 * 137 * <p> 138 * As in the case for confusability checking, it is good practice to create one <code>SpoofChecker</code> instance at 139 * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of 140 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the 141 * third line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the 142 * instance to perform confusability checking. 143 * 144 * <p> 145 * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}: 146 * 147 * <pre> 148 * <code> 149 * SpoofChecker sc = new SpoofChecker.Builder() 150 * .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION)) 151 * .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE) 152 * .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE) 153 * .build(); 154 * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult(); 155 * boolean result = sc.failsChecks("pаypаl", checkResult); 156 * System.out.println(checkResult.checks); // 16 157 * </code> 158 * </pre> 159 * 160 * <p> 161 * The return value is a bitmask of the checks that failed. 
In this case, there was one check that failed: 162 * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are: 163 * 164 * <ul> 165 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the 166 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS 167 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> 168 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character 169 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> 170 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable 171 * characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li> 172 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> 173 * </ul> 174 * 175 * <p> 176 * These checks can be enabled independently of each other. For example, if you were interested in checking for only the 177 * INVISIBLE and MIXED_NUMBERS conditions, you could do: 178 * 179 * <pre> 180 * <code> 181 * SpoofChecker sc = new SpoofChecker.Builder() 182 * .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS) 183 * .build(); 184 * boolean result = sc.failsChecks("৪8"); 185 * System.out.println(result); // true 186 * </code> 187 * </pre> 188 * 189 * <p> 190 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in 191 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings 192 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have 193 * Latin characters interspersed. 
Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
 * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of
 * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code
 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
 * scripts.
 *
 * <h2>Additional Information</h2>
 *
 * <p>
 * A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
 *
 * <p>
 * <b>Thread Safety:</b> The methods on <code>SpoofChecker</code> objects are thread safe. The test functions for
 * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called
 * concurrently from multiple threads using the same <code>SpoofChecker</code> instance.
 *
 * @stable ICU 4.6
 */
public class SpoofChecker {

    /**
     * Constants from UTS 39 for use in {@link SpoofChecker.Builder#setRestrictionLevel}.
     *
     * <p>
     * The constants are declared from the most restrictive level ({@link #ASCII}) to the least restrictive one
     * ({@link #UNRESTRICTIVE}); as described on each constant, every level also accepts the strings that classify
     * under the level declared before it.
     *
     * @stable ICU 53
     */
    public enum RestrictionLevel {
        /**
         * All characters in the string are in the identifier profile and all characters in the string are in the ASCII
         * range.
         *
         * @stable ICU 53
         */
        ASCII,
        /**
         * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the
         * string is single-script, according to the definition in UTS 39 section 5.1.
         *
         * @stable ICU 53
         */
        SINGLE_SCRIPT_RESTRICTIVE,
        /**
         * The string classifies as Single Script, or all characters in the string are in the identifier profile and the
         * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1:
         * <ul>
         * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
         * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
         * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
         * </ul>
         *
         * @stable ICU 53
         */
        HIGHLY_RESTRICTIVE,
        /**
         * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
         * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
         * Greek, and Cherokee.
         *
         * @stable ICU 53
         */
        MODERATELY_RESTRICTIVE,
        /**
         * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as
         * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us.
         *
         * @stable ICU 53
         */
        MINIMALLY_RESTRICTIVE,
        /**
         * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
         *
         * @stable ICU 53
         */
        UNRESTRICTIVE,
    }

    /**
     * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}. The set is
     * frozen, so callers that want to extend it must work on a thawed clone.
     *
     * @stable ICU 58
     */
    public static final UnicodeSet INCLUSION = new UnicodeSet(
            "['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u2010"
            + "\\u2019\\u2027\\u30A0\\u30FB]"
            ).freeze();
    // Note: data from IdentifierStatus.txt & IdentifierType.txt
    // There is tooling to generate this constant in the unicodetools project:
    //      org.unicode.text.tools.RecommendedSetGenerator
    // It will print the Java and C++ code to the console for easy copy-paste into this file.
/**
     * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}. The set is
     * frozen, so callers that want to extend it must work on a thawed clone.
     *
     * <p>
     * The pattern below is generated data (see the note after the declaration); regenerate it rather than editing
     * individual ranges by hand.
     *
     * @stable ICU 58
     */
    public static final UnicodeSet RECOMMENDED = new UnicodeSet(
            "[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E"
            + "\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-"
            + "\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E"
            + "\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\\u0306-"
            + "\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u0328\\u032D\\u032E"
            + "\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386"
            + "\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-"
            + "\\u04FF\\u0510-\\u0529\\u052E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586"
            + "\\u05B4\\u05D0-\\u05EA\\u05EF-\\u05F2\\u0620-\\u063F\\u0641-\\u0655\\u0660-"
            + "\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06A0\\u06A2-\\u06D3"
            + "\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u0870-\\u0887"
            + "\\u0889-\\u088E\\u08A0-\\u08AC\\u08B2\\u08B5-\\u08C9\\u0901-\\u094D\\u094F"
            + "\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096F\\u0971-\\u0977\\u0979-"
            + "\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-"
            + "\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE"
            + "\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u09FE\\u0A01-\\u0A03\\u0A05-\\u0A0A"
            + "\\u0A0F\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39"
            + "\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74"
            + "\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0"
            + "\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD"
            + "\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0AFA-\\u0AFF\\u0B01-\\u0B03\\u0B05-"
            + "\\u0B0C\\u0B0F\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-"
            + "\\u0B39\\u0B3C-\\u0B43\\u0B47\\u0B48\\u0B4B-\\u0B4D\\u0B55-\\u0B57\\u0B5F-"
            + "\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90"
            + "\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-"
            + "\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0"
            + "\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-"
            + "\\u0C33\\u0C35-\\u0C39\\u0C3C-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55"
            + "\\u0C56\\u0C5D\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-"
            + "\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-"
            + "\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5\\u0CD6\\u0CDD\\u0CE0-\\u0CE3"
            + "\\u0CE6-\\u0CEF\\u0CF1-\\u0CF3\\u0D00\\u0D02\\u0D03\\u0D05-\\u0D0C\\u0D0E-"
            + "\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-"
            + "\\u0D57\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-"
            + "\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD"
            + "\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-"
            + "\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59\\u0E81\\u0E82\\u0E84"
            + "\\u0E86-\\u0E8A\\u0E8C-\\u0EA3\\u0EA5\\u0EA7-\\u0EB2\\u0EB4-\\u0EBD\\u0EC0-"
            + "\\u0EC4\\u0EC6\\u0EC8-\\u0ECE\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00\\u0F20-"
            + "\\u0F29\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-"
            + "\\u0F51\\u0F53-\\u0F56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71"
            + "\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97"
            + "\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8"
            + "\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10C7\\u10CD\\u10D0-"
            + "\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u1250-"
            + "\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0"
            + "\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-"
            + "\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-"
            + "\\u17A2\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CD\\u17D0\\u17D2\\u17D7"
            + "\\u17DC\\u17E0-\\u17E9\\u1C90-\\u1CBA\\u1CBD-\\u1CBF\\u1E00-\\u1E99\\u1E9E"
            + "\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D"
            + "\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76"
            + "\\u1F78\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4"
            + "\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2"
            + "\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA\\u1FFC\\u2D27"
            + "\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-"
            + "\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u3005-"
            + "\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-\\u30FA\\u30FC-"
            + "\\u30FE\\u3105-\\u312D\\u312F\\u31A0-\\u31BF\\u3400-\\u4DBF\\u4E00-\\u9FFF"
            + "\\uA67F\\uA717-\\uA71F\\uA788\\uA78D\\uA792\\uA793\\uA7AA\\uA7C0-\\uA7CA"
            + "\\uA7D0\\uA7D1\\uA7D3\\uA7D5-\\uA7D9\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-"
            + "\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-"
            + "\\uAB2E\\uAB66\\uAB67\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13\\uFA14"
            + "\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00011301\\U00011303"
            + "\\U0001133B\\U0001133C\\U00016FF0\\U00016FF1\\U0001B11F-\\U0001B122"
            + "\\U0001B132\\U0001B150-\\U0001B152\\U0001B155\\U0001B164-\\U0001B167"
            + "\\U0001DF00-\\U0001DF1E\\U0001DF25-\\U0001DF2A\\U0001E08F\\U0001E7E0-"
            + "\\U0001E7E6\\U0001E7E8-\\U0001E7EB\\U0001E7ED\\U0001E7EE\\U0001E7F0-"
            + "\\U0001E7FE\\U00020000-\\U0002A6DF\\U0002A700-\\U0002B739\\U0002B740-"
            + "\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-\\U0002EBE0\\U00030000-"
            + "\\U0003134A\\U00031350-\\U000323AF]"
            ).freeze();
    // Note: data from IdentifierStatus.txt & IdentifierType.txt
    // There is tooling to generate this constant in the unicodetools project:
    //      org.unicode.text.tools.RecommendedSetGenerator
    // It will print the Java and C++ code to the console for easy copy-paste into this file.

    /**
     * Constants for the kinds of checks that SpoofChecker can perform. These values are used both to select the set of
     * checks that will be performed, and to report results from the check function.
     *
     * (This comment introduces the group of int bit-flag constants that follows; it is not attached to a declaration.)
     */

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
     * 4.
     *
     * @stable ICU 4.6
     */
    public static final int SINGLE_SCRIPT_CONFUSABLE = 1;

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
     * 39 section 4.
     *
     * @stable ICU 4.6
     */
    public static final int MIXED_SCRIPT_CONFUSABLE = 2;

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are not from the same script but both of them are
     * single-script strings, according to UTS 39 section 4.
     *
     * @stable ICU 4.6
     */
    public static final int WHOLE_SCRIPT_CONFUSABLE = 4;

    /**
     * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables.
You may set the
     * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make
     * {@link SpoofChecker#areConfusable} return only those types of confusables.
     *
     * @stable ICU 58
     */
    public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;

    /**
     * This flag is deprecated and no longer affects the behavior of SpoofChecker.
     *
     * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
     *             deprecated.
     */
    @Deprecated
    public static final int ANY_CASE = 8;

    /**
     * Check that an identifier satisfies the requirements for the restriction level specified in
     * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is
     * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
     *
     * @stable ICU 58
     */
    public static final int RESTRICTION_LEVEL = 16;

    /**
     * Check that an identifier contains only characters from a single script (plus chars from the common and inherited
     * scripts.) Applies only to checks of a single identifier. This constant is an alias: it has the same value as
     * {@link #RESTRICTION_LEVEL}.
     *
     * @deprecated ICU 51 Use RESTRICTION_LEVEL
     */
    @Deprecated
    public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;

    /**
     * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
     * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
     * test the input string as a whole for conformance to any particular syntax for identifiers.
     *
     * @stable ICU 4.6
     */
    public static final int INVISIBLE = 32;

    /**
     * Check that an identifier contains only characters from a specified set of acceptable characters. See
     * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}.
Note that a string that fails this check
     * will also fail the {@link #RESTRICTION_LEVEL} check.
     *
     * @stable ICU 4.6
     */
    public static final int CHAR_LIMIT = 64;

    /**
     * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39
     * section 5.3.
     *
     * @stable ICU 58
     */
    public static final int MIXED_NUMBERS = 128;

    /**
     * Check that an identifier does not have a combining character following a character in which that
     * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
     * <p>
     * More specifically, the following characters are forbidden from preceding a U+0307:
     * <ul>
     * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
     * <li>Latin lowercase letter 'l'</li>
     * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
     * <li>Any character whose confusable prototype ends with such a character
     * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
     * </ul>
     * In addition, combining characters are allowed between the above characters and U+0307 except those
     * with combining class 0 or combining class "Above" (230, same class as U+0307).
     * <p>
     * This list and the number of combining characters considered by this check may grow over time.
     *
     * @stable ICU 62
     */
    public static final int HIDDEN_OVERLAY = 256;

    // Update CheckResult.toString() when a new check is added.

    /**
     * Enable all spoof checks. The value has every bit of the int set, so it also covers bits that no named check
     * constant uses yet.
     *
     * @stable ICU 4.6
     */
    public static final int ALL_CHECKS = 0xFFFFFFFF;

    // Used for checking for ASCII-Only restriction level: the code points U+0000..U+007F, frozen.
    static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();

    /**
     * private constructor: a SpoofChecker has to be built by the builder
     */
    private SpoofChecker() {
    }

    /**
     * SpoofChecker Builder.
To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired 503 * checking options on the builder, then call the build() function to create a SpoofChecker instance. 504 * 505 * @stable ICU 4.6 506 */ 507 public static class Builder { 508 int fChecks; // Bit vector of checks to perform. 509 SpoofData fSpoofData; 510 final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters. 511 // for this Spoof Checker. Defaults to all chars. 512 final Set<ULocale> fAllowedLocales = new LinkedHashSet<>(); // The list of allowed locales. 513 private RestrictionLevel fRestrictionLevel; 514 515 /** 516 * Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for 517 * LOCALE_LIMIT and CHAR_LIMIT. Note that additional checks may be added in the future, resulting in the changes 518 * to the default checking behavior. 519 * 520 * @stable ICU 4.6 521 */ Builder()522 public Builder() { 523 fChecks = ALL_CHECKS; 524 fSpoofData = null; 525 fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE; 526 } 527 528 /** 529 * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker. 530 * 531 * @param src 532 * The existing checker. 533 * @stable ICU 4.6 534 */ Builder(SpoofChecker src)535 public Builder(SpoofChecker src) { 536 fChecks = src.fChecks; 537 fSpoofData = src.fSpoofData; // For the data, we will either use the source data 538 // as-is, or drop the builder's reference to it 539 // and generate new data, depending on what our 540 // caller does with the builder. 541 fAllowedCharsSet.set(src.fAllowedCharsSet); 542 fAllowedLocales.addAll(src.fAllowedLocales); 543 fRestrictionLevel = src.fRestrictionLevel; 544 } 545 546 /** 547 * Create a SpoofChecker with current configuration. 
548 * 549 * @return SpoofChecker 550 * @stable ICU 4.6 551 */ build()552 public SpoofChecker build() { 553 // TODO: Make this data loading be lazy (see #12696). 554 if (fSpoofData == null) { 555 // read binary file 556 fSpoofData = SpoofData.getDefault(); 557 } 558 559 // Copy all state from the builder to the new SpoofChecker. 560 // Make sure that everything is either cloned or copied, so 561 // that subsequent re-use of the builder won't modify the built 562 // SpoofChecker. 563 // 564 // One exception to this: the SpoofData is just assigned. 565 // If the builder subsequently needs to modify fSpoofData 566 // it will create a new SpoofData object first. 567 568 SpoofChecker result = new SpoofChecker(); 569 result.fChecks = this.fChecks; 570 result.fSpoofData = this.fSpoofData; 571 result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone()); 572 result.fAllowedCharsSet.freeze(); 573 result.fAllowedLocales = new HashSet<>(this.fAllowedLocales); 574 result.fRestrictionLevel = this.fRestrictionLevel; 575 return result; 576 } 577 578 /** 579 * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data file 580 * confusables.txt as described in Unicode UAX 39. The syntax of the source data is as described in UAX 39 for 581 * these files, and the content of these files is acceptable input. 582 * 583 * @param confusables 584 * the Reader of confusable characters definitions, as found in file confusables.txt from 585 * unicode.org. 586 * @throws ParseException 587 * To report syntax errors in the input. 588 * 589 * @stable ICU 58 590 */ setData(Reader confusables)591 public Builder setData(Reader confusables) throws ParseException, IOException { 592 593 // Compile the binary data from the source (text) format. 594 // Drop the builder's reference to any pre-existing data, which may 595 // be in use in an already-built checker. 
596 597 fSpoofData = new SpoofData(); 598 ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData); 599 return this; 600 } 601 602 /** 603 * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead. 604 * 605 * @param confusables 606 * the Reader of confusable characters definitions, as found in file confusables.txt from 607 * unicode.org. 608 * @param confusablesWholeScript 609 * No longer supported. 610 * @throws ParseException 611 * To report syntax errors in the input. 612 * 613 * @deprecated ICU 58 614 */ 615 @Deprecated setData(Reader confusables, Reader confusablesWholeScript)616 public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException { 617 setData(confusables); 618 return this; 619 } 620 621 /** 622 * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method 623 * overwrites any checks that may have already been enabled. By default, all checks are enabled. 624 * 625 * To enable specific checks and disable all others, 626 * OR together only the bit constants for the desired checks. 627 * For example, to fail strings containing characters outside of 628 * the set specified by {@link #setAllowedChars} and 629 * also strings that contain digits from mixed numbering systems: 630 * 631 * <pre> 632 * {@code 633 * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS); 634 * } 635 * </pre> 636 * 637 * To disable specific checks and enable all others, 638 * start with ALL_CHECKS and "AND away" the not-desired checks. 
639 * For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality, 640 * it is good practice to disable the CONFUSABLE check: 641 * 642 * <pre> 643 * {@code 644 * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE); 645 * } 646 * </pre> 647 * 648 * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and 649 * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they 650 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those 651 * methods. 652 * 653 * @param checks 654 * The set of checks that this spoof checker will perform. The value is an 'or' of the desired 655 * checks. 656 * @return self 657 * @stable ICU 4.6 658 */ setChecks(int checks)659 public Builder setChecks(int checks) { 660 // Verify that the requested checks are all ones (bits) that 661 // are acceptable, known values. 662 if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) { 663 throw new IllegalArgumentException("Bad Spoof Checks value."); 664 } 665 this.fChecks = (checks & SpoofChecker.ALL_CHECKS); 666 return this; 667 } 668 669 /** 670 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages 671 * associated with the specified locales. Any previously specified list of locales is replaced by the new 672 * settings. 673 * 674 * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is 675 * determined. Characters from this set of scripts, along with characters from the "common" and "inherited" 676 * Unicode Script categories will be permitted. 677 * 678 * Supplying an empty string removes all restrictions; characters from any script will be allowed. 679 * 680 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a 681 * non-empty list of locales. 
682 * 683 * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function. 684 * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters. 685 * 686 * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of 687 * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with 688 * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}. 689 * 690 * @param locales 691 * A Set of ULocales, from which the language and associated script are extracted. If the locales Set 692 * is null, no restrictions will be placed on the allowed characters. 693 * 694 * @return self 695 * @stable ICU 4.6 696 */ setAllowedLocales(Set<ULocale> locales)697 public Builder setAllowedLocales(Set<ULocale> locales) { 698 fAllowedCharsSet.clear(); 699 700 for (ULocale locale : locales) { 701 // Add the script chars for this locale to the accumulating set 702 // of allowed chars. 703 addScriptChars(locale, fAllowedCharsSet); 704 } 705 706 // If our caller provided an empty list of locales, we disable the 707 // allowed characters checking 708 fAllowedLocales.clear(); 709 if (locales.size() == 0) { 710 fAllowedCharsSet.add(0, 0x10ffff); 711 fChecks &= ~CHAR_LIMIT; 712 return this; 713 } 714 715 // Add all common and inherited characters to the set of allowed 716 // chars. 717 UnicodeSet tempSet = new UnicodeSet(); 718 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON); 719 fAllowedCharsSet.addAll(tempSet); 720 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED); 721 fAllowedCharsSet.addAll(tempSet); 722 723 // Store the updated spoof checker state. 
724 fAllowedLocales.clear(); 725 fAllowedLocales.addAll(locales); 726 fChecks |= CHAR_LIMIT; 727 return this; 728 } 729 730 /** 731 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages 732 * associated with the specified locales. Any previously specified list of locales is replaced by the new 733 * settings. 734 * 735 * @param locales 736 * A Set of Locales, from which the language and associated script are extracted. If the locales Set 737 * is null, no restrictions will be placed on the allowed characters. 738 * 739 * @return self 740 * @stable ICU 54 741 */ setAllowedJavaLocales(Set<Locale> locales)742 public Builder setAllowedJavaLocales(Set<Locale> locales) { 743 HashSet<ULocale> ulocales = new HashSet<>(locales.size()); 744 for (Locale locale : locales) { 745 ulocales.add(ULocale.forLocale(locale)); 746 } 747 return setAllowedLocales(ulocales); 748 } 749 750 // Add (union) to the UnicodeSet all of the characters for the scripts 751 // used for the specified locale. Part of the implementation of 752 // setAllowedLocales. addScriptChars(ULocale locale, UnicodeSet allowedChars)753 private void addScriptChars(ULocale locale, UnicodeSet allowedChars) { 754 int scripts[] = UScript.getCode(locale); 755 if (scripts != null) { 756 UnicodeSet tmpSet = new UnicodeSet(); 757 for (int i = 0; i < scripts.length; i++) { 758 tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]); 759 allowedChars.addAll(tmpSet); 760 } 761 } 762 // else it's an unknown script. 763 // Maybe they asked for the script of "zxx", which refers to no linguistic content. 764 // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU. 765 } 766 767 /** 768 * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit 769 * is replaced by the new settings. This includes limits on characters that were set with the 770 * setAllowedLocales() function. 
Note that the RESTRICTED set is useful. 771 * 772 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function. 773 * 774 * @param chars 775 * A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by 776 * this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling 777 * this function. Note that this clears the allowedLocales set. 778 * @return self 779 * @stable ICU 4.6 780 */ setAllowedChars(UnicodeSet chars)781 public Builder setAllowedChars(UnicodeSet chars) { 782 fAllowedCharsSet.set(chars); 783 fAllowedLocales.clear(); 784 fChecks |= CHAR_LIMIT; 785 return this; 786 } 787 788 /** 789 * Set the loosest restriction level allowed for strings. The default if this is not called is 790 * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and 791 * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are 792 * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}. 793 * 794 * @param restrictionLevel 795 * The loosest restriction level allowed. 796 * @return self 797 * @stable ICU 58 798 */ setRestrictionLevel(RestrictionLevel restrictionLevel)799 public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) { 800 fRestrictionLevel = restrictionLevel; 801 fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS; 802 return this; 803 } 804 805 /* 806 * ***************************************************************************** 807 * Internal classes for compiling confusable data into its binary (runtime) form. 
808 * ***************************************************************************** 809 */ 810 // --------------------------------------------------------------------- 811 // 812 // buildConfusableData Compile the source confusable data, as defined by 813 // the Unicode data file confusables.txt, into the binary 814 // structures used by the confusable detector. 815 // 816 // The binary structures are described in uspoof_impl.h 817 // 818 // 1. parse the data, making a hash table mapping from a codepoint to a String. 819 // 820 // 2. Sort all of the strings encountered by length, since they will need to 821 // be stored in that order in the final string table. 822 // TODO: Sorting these strings by length is no longer needed since the removal of 823 // the string lengths table. This logic can be removed to save processing time 824 // when building confusables data. 825 // 826 // 3. Build a list of keys (UChar32s) from the mapping table. Sort the 827 // list because that will be the ordering of our runtime table. 828 // 829 // 4. Generate the run time string table. This is generated before the key & value 830 // table because we need the string indexes when building those tables. 831 // 832 // 5. Build the run-time key and value table. These are parallel tables, and 833 // are built at the same time 834 835 // class ConfusabledataBuilder 836 // An instance of this class exists while the confusable data is being built from source. 837 // It encapsulates the intermediate data structures that are used for building. 838 // It exports one static function, to do a confusable data build. 839 private static class ConfusabledataBuilder { 840 841 private Hashtable<Integer, SPUString> fTable; 842 private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the 843 // four mapping tables. 844 845 // The compiled data is first assembled into the following four collections, 846 // then output to the builder's SpoofData object. 
847 private StringBuffer fStringTable; 848 private ArrayList<Integer> fKeyVec; 849 private ArrayList<Integer> fValueVec; 850 private SPUStringPool stringPool; 851 private Pattern fParseLine; 852 private Pattern fParseHexNum; 853 private int fLineNum; 854 ConfusabledataBuilder()855 ConfusabledataBuilder() { 856 fTable = new Hashtable<>(); 857 fKeySet = new UnicodeSet(); 858 fKeyVec = new ArrayList<>(); 859 fValueVec = new ArrayList<>(); 860 stringPool = new SPUStringPool(); 861 } 862 build(Reader confusables, SpoofData dest)863 void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException { 864 StringBuffer fInput = new StringBuffer(); 865 866 // Convert the user input data from UTF-8 to char (UTF-16) 867 LineNumberReader lnr = new LineNumberReader(confusables); 868 do { 869 String line = lnr.readLine(); 870 if (line == null) { 871 break; 872 } 873 fInput.append(line); 874 fInput.append('\n'); 875 } while (true); 876 877 // Regular Expression to parse a line from Confusables.txt. The expression will match 878 // any line. What was matched is determined by examining which capture groups have a match. 879 // Capture Group 1: the source char 880 // Capture Group 2: the replacement chars 881 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated) 882 // Capture Group 7: A blank or comment only line. 883 // Capture Group 8: A syntactically invalid line. Anything that didn't match before. 884 // Example Line from the confusables.txt source file: 885 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... 
" 886 fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char 887 "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s) 888 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued) 889 "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type 890 "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment 891 "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment 892 "|^(.*?)$"); // OR match any line, which catches illegal lines. 893 894 // Regular expression for parsing a hex number out of a space-separated list of them. 895 // Capture group 1 gets the number, with spaces removed. 896 fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)"); 897 898 // Zap any Byte Order Mark at the start of input. Changing it to a space 899 // is benign given the syntax of the input. 900 if (fInput.charAt(0) == 0xfeff) { 901 fInput.setCharAt(0, (char) 0x20); 902 } 903 904 // Parse the input, one line per iteration of this loop. 905 Matcher matcher = fParseLine.matcher(fInput); 906 while (matcher.find()) { 907 fLineNum++; 908 if (matcher.start(7) >= 0) { 909 // this was a blank or comment line. 910 continue; 911 } 912 if (matcher.start(8) >= 0) { 913 // input file syntax error. 914 // status = U_PARSE_ERROR; 915 throw new ParseException( 916 "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8), 917 matcher.start(8)); 918 } 919 920 // We have a good input line. Extract the key character and mapping 921 // string, and 922 // put them into the appropriate mapping table. 
923 int keyChar = Integer.parseInt(matcher.group(1), 16); 924 if (keyChar > 0x10ffff) { 925 throw new ParseException( 926 "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1), 927 matcher.start(1)); 928 } 929 Matcher m = fParseHexNum.matcher(matcher.group(2)); 930 931 StringBuilder mapString = new StringBuilder(); 932 while (m.find()) { 933 int c = Integer.parseInt(m.group(1), 16); 934 if (c > 0x10ffff) { 935 throw new ParseException( 936 "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16), 937 matcher.start(2)); 938 } 939 mapString.appendCodePoint(c); 940 } 941 assert (mapString.length() >= 1); 942 943 // Put the map (value) string into the string pool 944 // This a little like a Java intern() - any duplicates will be 945 // eliminated. 946 SPUString smapString = stringPool.addString(mapString.toString()); 947 948 // Add the char . string mapping to the table. 949 // For Unicode 8, the SL, SA and ML tables have been discontinued. 950 // All input data from confusables.txt is tagged MA. 951 fTable.put(keyChar, smapString); 952 953 fKeySet.add(keyChar); 954 } 955 956 // Input data is now all parsed and collected. 957 // Now create the run-time binary form of the data. 958 // 959 // This is done in two steps. First the data is assembled into vectors and strings, 960 // for ease of construction, then the contents of these collections are copied 961 // into the actual SpoofData object. 962 963 // Build up the string array, and record the index of each string therein 964 // in the (build time only) string pool. 965 // Strings of length one are not entered into the strings array. 
966 // (Strings in the table are sorted by length) 967 968 stringPool.sort(); 969 fStringTable = new StringBuffer(); 970 int poolSize = stringPool.size(); 971 int i; 972 for (i = 0; i < poolSize; i++) { 973 SPUString s = stringPool.getByIndex(i); 974 int strLen = s.fStr.length(); 975 int strIndex = fStringTable.length(); 976 if (strLen == 1) { 977 // strings of length one do not get an entry in the string table. 978 // Keep the single string character itself here, which is the same 979 // convention that is used in the final run-time string table index. 980 s.fCharOrStrTableIndex = s.fStr.charAt(0); 981 } else { 982 s.fCharOrStrTableIndex = strIndex; 983 fStringTable.append(s.fStr); 984 } 985 } 986 987 // Construct the compile-time Key and Value table. 988 // 989 // The keys in the Key table follow the format described in uspoof.h for the 990 // Cfu confusables data structure. 991 // 992 // Starting in ICU 58, each code point has exactly one entry in the data 993 // structure. 994 995 for (String keyCharStr : fKeySet) { 996 int keyChar = keyCharStr.codePointAt(0); 997 SPUString targetMapping = fTable.get(keyChar); 998 assert targetMapping != null; 999 1000 // Throw a sane exception if trying to consume a long string. Otherwise, 1001 // codePointAndLengthToKey will throw an assertion error. 1002 if (targetMapping.fStr.length() > 256) { 1003 throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries."); 1004 } 1005 1006 int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length()); 1007 int value = targetMapping.fCharOrStrTableIndex; 1008 1009 fKeyVec.add(key); 1010 fValueVec.add(value); 1011 } 1012 1013 // Put the assembled data into the destination SpoofData object. 1014 1015 // The Key Table 1016 // While copying the keys to the output array, 1017 // also sanity check that the keys are sorted. 
1018 int numKeys = fKeyVec.size(); 1019 dest.fCFUKeys = new int[numKeys]; 1020 int previousCodePoint = 0; 1021 for (i = 0; i < numKeys; i++) { 1022 int key = fKeyVec.get(i); 1023 int codePoint = ConfusableDataUtils.keyToCodePoint(key); 1024 // strictly greater because there can be only one entry per code point 1025 assert codePoint > previousCodePoint; 1026 dest.fCFUKeys[i] = key; 1027 previousCodePoint = codePoint; 1028 } 1029 1030 // The Value Table, parallels the key table 1031 int numValues = fValueVec.size(); 1032 assert (numKeys == numValues); 1033 dest.fCFUValues = new short[numValues]; 1034 i = 0; 1035 for (int value : fValueVec) { 1036 assert (value < 0xffff); 1037 dest.fCFUValues[i++] = (short) value; 1038 } 1039 1040 // The Strings Table. 1041 dest.fCFUStrings = fStringTable.toString(); 1042 } 1043 1044 public static void buildConfusableData(Reader confusables, SpoofData dest) 1045 throws java.io.IOException, ParseException { 1046 ConfusabledataBuilder builder = new ConfusabledataBuilder(); 1047 builder.build(confusables, dest); 1048 } 1049 1050 /* 1051 * ***************************************************************************** 1052 * Internal classes for compiling confusable data into its binary (runtime) form. 1053 * ***************************************************************************** 1054 */ 1055 // SPUString 1056 // Holds a string that is the result of one of the mappings defined 1057 // by the confusable mapping data (confusables.txt from Unicode.org) 1058 // Instances of SPUString exist during the compilation process only. 1059 1060 private static class SPUString { 1061 String fStr; // The actual string. 1062 int fCharOrStrTableIndex; // Index into the final runtime data for this string. 1063 // (or, for length 1, the single string char itself, 1064 // there being no string table entry for it.) 
1065 1066 SPUString(String s) { 1067 fStr = s; 1068 fCharOrStrTableIndex = 0; 1069 } 1070 } 1071 1072 // Comparison function for ordering strings in the string pool. 1073 // Compare by length first, then, within a group of the same length, 1074 // by code point order. 1075 1076 private static class SPUStringComparator implements Comparator<SPUString> { 1077 @Override 1078 public int compare(SPUString sL, SPUString sR) { 1079 int lenL = sL.fStr.length(); 1080 int lenR = sR.fStr.length(); 1081 if (lenL < lenR) { 1082 return -1; 1083 } else if (lenL > lenR) { 1084 return 1; 1085 } else { 1086 return sL.fStr.compareTo(sR.fStr); 1087 } 1088 } 1089 1090 final static SPUStringComparator INSTANCE = new SPUStringComparator(); 1091 } 1092 1093 // String Pool A utility class for holding the strings that are the result of 1094 // the spoof mappings. These strings will utimately end up in the 1095 // run-time String Table. 1096 // This is sort of like a sorted set of strings, except that ICU's anemic 1097 // built-in collections don't support those, so it is implemented with a 1098 // combination of a uhash and a Vector. 1099 private static class SPUStringPool { 1100 public SPUStringPool() { 1101 fVec = new Vector<>(); 1102 fHash = new Hashtable<>(); 1103 } 1104 1105 public int size() { 1106 return fVec.size(); 1107 } 1108 1109 // Get the n-th string in the collection. 1110 public SPUString getByIndex(int index) { 1111 SPUString retString = fVec.elementAt(index); 1112 return retString; 1113 } 1114 1115 // Add a string. Return the string from the table. 1116 // If the input parameter string is already in the table, delete the 1117 // input parameter and return the existing string. 
1118 public SPUString addString(String src) { 1119 SPUString hashedString = fHash.get(src); 1120 if (hashedString == null) { 1121 hashedString = new SPUString(src); 1122 fHash.put(src, hashedString); 1123 fVec.addElement(hashedString); 1124 } 1125 return hashedString; 1126 } 1127 1128 // Sort the contents; affects the ordering of getByIndex(). 1129 public void sort() { 1130 Collections.sort(fVec, SPUStringComparator.INSTANCE); 1131 } 1132 1133 private Vector<SPUString> fVec; // Elements are SPUString * 1134 private Hashtable<String, SPUString> fHash; // Key: Value: 1135 } 1136 1137 } 1138 } 1139 1140 /** 1141 * Get the Restriction Level that is being tested. 1142 * 1143 * @return The restriction level 1144 * @internal 1145 * @deprecated This API is ICU internal only. 1146 */ 1147 @Deprecated 1148 public RestrictionLevel getRestrictionLevel() { 1149 return fRestrictionLevel; 1150 } 1151 1152 /** 1153 * Get the set of checks that this Spoof Checker has been configured to perform. 1154 * 1155 * @return The set of checks that this spoof checker will perform. 1156 * @stable ICU 4.6 1157 */ 1158 public int getChecks() { 1159 return fChecks; 1160 } 1161 1162 /** 1163 * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on 1164 * scripts have been specified, an empty set will be returned. 1165 * 1166 * setAllowedChars() will reset the list of allowed locales to be empty. 1167 * 1168 * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales(); 1169 * the information other than languages from the originally specified locales may be omitted. 1170 * 1171 * @return A set of locales corresponding to the acceptable scripts. 
1172 * 1173 * @stable ICU 4.6 1174 */ 1175 public Set<ULocale> getAllowedLocales() { 1176 return Collections.unmodifiableSet(fAllowedLocales); 1177 } 1178 1179 /** 1180 * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If 1181 * no limitations on scripts have been specified, an empty set will be returned. 1182 * 1183 * @return A set of locales corresponding to the acceptable scripts. 1184 * @stable ICU 54 1185 */ 1186 public Set<Locale> getAllowedJavaLocales() { 1187 HashSet<Locale> locales = new HashSet<>(fAllowedLocales.size()); 1188 for (ULocale uloc : fAllowedLocales) { 1189 locales.add(uloc.toLocale()); 1190 } 1191 return locales; 1192 } 1193 1194 /** 1195 * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set 1196 * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by 1197 * this function. 1198 * 1199 * The returned set will be frozen, meaning that it cannot be modified by the caller. 1200 * 1201 * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test. 1202 * @stable ICU 4.6 1203 */ 1204 public UnicodeSet getAllowedChars() { 1205 return fAllowedCharsSet; 1206 } 1207 1208 /** 1209 * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed. 1210 * 1211 * @stable ICU 4.6 1212 */ 1213 public static class CheckResult { 1214 /** 1215 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests 1216 * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on. 1217 * 1218 * @stable ICU 4.6 1219 * @see Builder#setChecks 1220 */ 1221 public int checks; 1222 1223 /** 1224 * The index of the first string position that failed a check. 1225 * 1226 * @deprecated ICU 51. No longer supported. Always set to zero. 
1227 */ 1228 @Deprecated 1229 public int position; 1230 1231 /** 1232 * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null. The set will contain the zero 1233 * digit from each decimal number system found in the input string. 1234 * 1235 * @stable ICU 58 1236 */ 1237 public UnicodeSet numerics; 1238 1239 /** 1240 * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null. 1241 * 1242 * @stable ICU 58 1243 */ 1244 public RestrictionLevel restrictionLevel; 1245 1246 /** 1247 * Default constructor 1248 * 1249 * @stable ICU 4.6 1250 */ 1251 public CheckResult() { 1252 checks = 0; 1253 position = 0; 1254 } 1255 1256 /** 1257 * {@inheritDoc} 1258 * 1259 * @stable ICU 4.6 1260 */ 1261 @Override 1262 public String toString() { 1263 StringBuilder sb = new StringBuilder(); 1264 sb.append("checks:"); 1265 if (checks == 0) { 1266 sb.append(" none"); 1267 } else if (checks == ALL_CHECKS) { 1268 sb.append(" all"); 1269 } else { 1270 if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) { 1271 sb.append(" SINGLE_SCRIPT_CONFUSABLE"); 1272 } 1273 if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) { 1274 sb.append(" MIXED_SCRIPT_CONFUSABLE"); 1275 } 1276 if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) { 1277 sb.append(" WHOLE_SCRIPT_CONFUSABLE"); 1278 } 1279 if ((checks & ANY_CASE) != 0) { 1280 sb.append(" ANY_CASE"); 1281 } 1282 if ((checks & RESTRICTION_LEVEL) != 0) { 1283 sb.append(" RESTRICTION_LEVEL"); 1284 } 1285 if ((checks & INVISIBLE) != 0) { 1286 sb.append(" INVISIBLE"); 1287 } 1288 if ((checks & CHAR_LIMIT) != 0) { 1289 sb.append(" CHAR_LIMIT"); 1290 } 1291 if ((checks & MIXED_NUMBERS) != 0) { 1292 sb.append(" MIXED_NUMBERS"); 1293 } 1294 } 1295 sb.append(", numerics: ").append(numerics.toPattern(false)); 1296 sb.append(", position: ").append(position); 1297 sb.append(", restrictionLevel: ").append(restrictionLevel); 1298 return sb.toString(); 1299 } 1300 } 1301 1302 /** 1303 * Check the specified string for possible security issues. 
The text to be checked will typically be an identifier 1304 * of some sort. The set of checks to be performed was specified when building the SpoofChecker. 1305 * 1306 * @param text 1307 * A String to be checked for possible security issues. 1308 * @param checkResult 1309 * Output parameter, indicates which specific tests failed. May be null if the information is not wanted. 1310 * @return True there any issue is found with the input string. 1311 * @stable ICU 4.8 1312 */ 1313 public boolean failsChecks(String text, CheckResult checkResult) { 1314 int length = text.length(); 1315 1316 int result = 0; 1317 if (checkResult != null) { 1318 checkResult.position = 0; 1319 checkResult.numerics = null; 1320 checkResult.restrictionLevel = null; 1321 } 1322 1323 if (0 != (this.fChecks & RESTRICTION_LEVEL)) { 1324 RestrictionLevel textRestrictionLevel = getRestrictionLevel(text); 1325 if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) { 1326 result |= RESTRICTION_LEVEL; 1327 } 1328 if (checkResult != null) { 1329 checkResult.restrictionLevel = textRestrictionLevel; 1330 } 1331 } 1332 1333 if (0 != (this.fChecks & MIXED_NUMBERS)) { 1334 UnicodeSet numerics = new UnicodeSet(); 1335 getNumerics(text, numerics); 1336 if (numerics.size() > 1) { 1337 result |= MIXED_NUMBERS; 1338 } 1339 if (checkResult != null) { 1340 checkResult.numerics = numerics; 1341 } 1342 } 1343 1344 if (0 != (this.fChecks & HIDDEN_OVERLAY)) { 1345 int index = findHiddenOverlay(text); 1346 if (index != -1) { 1347 result |= HIDDEN_OVERLAY; 1348 } 1349 } 1350 1351 if (0 != (this.fChecks & CHAR_LIMIT)) { 1352 int i; 1353 int c; 1354 for (i = 0; i < length;) { 1355 // U16_NEXT(text, i, length, c); 1356 c = Character.codePointAt(text, i); 1357 i = Character.offsetByCodePoints(text, i, 1); 1358 if (!this.fAllowedCharsSet.contains(c)) { 1359 result |= CHAR_LIMIT; 1360 break; 1361 } 1362 } 1363 } 1364 1365 if (0 != (this.fChecks & INVISIBLE)) { 1366 // This check needs to be done on NFD input 1367 String 
nfdText = nfdNormalizer.normalize(text); 1368 1369 // scan for more than one occurrence of the same non-spacing mark 1370 // in a sequence of non-spacing marks. 1371 int i; 1372 int c; 1373 int firstNonspacingMark = 0; 1374 boolean haveMultipleMarks = false; 1375 UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a 1376 // single combining sequence. 1377 for (i = 0; i < length;) { 1378 c = Character.codePointAt(nfdText, i); 1379 i = Character.offsetByCodePoints(nfdText, i, 1); 1380 if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) { 1381 firstNonspacingMark = 0; 1382 if (haveMultipleMarks) { 1383 marksSeenSoFar.clear(); 1384 haveMultipleMarks = false; 1385 } 1386 continue; 1387 } 1388 if (firstNonspacingMark == 0) { 1389 firstNonspacingMark = c; 1390 continue; 1391 } 1392 if (!haveMultipleMarks) { 1393 marksSeenSoFar.add(firstNonspacingMark); 1394 haveMultipleMarks = true; 1395 } 1396 if (marksSeenSoFar.contains(c)) { 1397 // report the error, and stop scanning. 1398 // No need to find more than the first failure. 1399 result |= INVISIBLE; 1400 break; 1401 } 1402 marksSeenSoFar.add(c); 1403 } 1404 } 1405 if (checkResult != null) { 1406 checkResult.checks = result; 1407 } 1408 return (0 != result); 1409 } 1410 1411 /** 1412 * Check the specified string for possible security issues. The text to be checked will typically be an identifier 1413 * of some sort. The set of checks to be performed was specified when building the SpoofChecker. 1414 * 1415 * @param text 1416 * A String to be checked for possible security issues. 1417 * @return True there any issue is found with the input string. 1418 * @stable ICU 4.8 1419 */ failsChecks(String text)1420 public boolean failsChecks(String text) { 1421 return failsChecks(text, null); 1422 } 1423 1424 /** 1425 * Check the whether two specified strings are visually confusable. 
The types of confusability to be tested - single 1426 * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker. 1427 * 1428 * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE 1429 * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected. 1430 * 1431 * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case 1432 * folded for comparison and display to the user, do not select the ANY_CASE option. 1433 * 1434 * 1435 * @param s1 1436 * The first of the two strings to be compared for confusability. 1437 * @param s2 1438 * The second of the two strings to be compared for confusability. 1439 * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability 1440 * found, as defined by spoof check test constants. 1441 * @stable ICU 4.6 1442 */ areConfusable(String s1, String s2)1443 public int areConfusable(String s1, String s2) { 1444 // 1445 // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, 1446 // and for definitions of the types (single, whole, mixed-script) of confusables. 1447 1448 // We only care about a few of the check flags. Ignore the others. 1449 // If no tests relevant to this function have been specified, signal an error. 1450 // TODO: is this really the right thing to do? It's probably an error on 1451 // the caller's part, but logically we would just return 0 (no error). 1452 if ((this.fChecks & CONFUSABLE) == 0) { 1453 throw new IllegalArgumentException("No confusable checks are enabled."); 1454 } 1455 1456 // Compute the skeletons and check for confusability. 1457 String s1Skeleton = getSkeleton(s1); 1458 String s2Skeleton = getSkeleton(s2); 1459 if (!s1Skeleton.equals(s2Skeleton)) { 1460 return 0; 1461 } 1462 1463 // If we get here, the strings are confusable. 
Now we just need to set the flags for the appropriate classes 1464 // of confusables according to UTS 39 section 4. 1465 // Start by computing the resolved script sets of s1 and s2. 1466 ScriptSet s1RSS = new ScriptSet(); 1467 getResolvedScriptSet(s1, s1RSS); 1468 ScriptSet s2RSS = new ScriptSet(); 1469 getResolvedScriptSet(s2, s2RSS); 1470 1471 // Turn on all applicable flags 1472 int result = 0; 1473 if (s1RSS.intersects(s2RSS)) { 1474 result |= SINGLE_SCRIPT_CONFUSABLE; 1475 } else { 1476 result |= MIXED_SCRIPT_CONFUSABLE; 1477 if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) { 1478 result |= WHOLE_SCRIPT_CONFUSABLE; 1479 } 1480 } 1481 1482 // Turn off flags that the user doesn't want 1483 result &= fChecks; 1484 1485 return result; 1486 } 1487 1488 /** 1489 * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are 1490 * confusable if their skeletons are identical. See Unicode UAX 39 for additional information. 1491 * 1492 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some 1493 * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons. 1494 * 1495 * Skeletons are computed using the algorithm and data described in Unicode UAX 39. 1496 * 1497 * @param str 1498 * The input string whose skeleton will be generated. 1499 * @return The output skeleton string. 1500 * 1501 * @stable ICU 58 1502 */ getSkeleton(CharSequence str)1503 public String getSkeleton(CharSequence str) { 1504 // Apply the skeleton mapping to the NFD normalized input string 1505 // Accumulate the skeleton, possibly unnormalized, in a String. 
1506 String nfdId = nfdNormalizer.normalize(str); 1507 int normalizedLen = nfdId.length(); 1508 StringBuilder skelSB = new StringBuilder(); 1509 for (int inputIndex = 0; inputIndex < normalizedLen;) { 1510 int c = Character.codePointAt(nfdId, inputIndex); 1511 inputIndex += Character.charCount(c); 1512 this.fSpoofData.confusableLookup(c, skelSB); 1513 } 1514 String skelStr = skelSB.toString(); 1515 skelStr = nfdNormalizer.normalize(skelStr); 1516 return skelStr; 1517 } 1518 1519 /** 1520 * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been 1521 * ignored, and starting with ICU 58, this function has been deprecated. 1522 * 1523 * @param type 1524 * No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA. 1525 * @param id 1526 * The input identifier whose skeleton will be generated. 1527 * @return The output skeleton string. 1528 * 1529 * @deprecated ICU 58 1530 */ 1531 @Deprecated getSkeleton(int type, String id)1532 public String getSkeleton(int type, String id) { 1533 return getSkeleton(id); 1534 } 1535 1536 /** 1537 * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have 1538 * enabled the same set of checks. 1539 * 1540 * @param other 1541 * the SpoofChecker being compared with. 1542 * @return true if the two SpoofCheckers are equal. 
1543 * @stable ICU 4.6 1544 */ 1545 @Override equals(Object other)1546 public boolean equals(Object other) { 1547 if (!(other instanceof SpoofChecker)) { 1548 return false; 1549 } 1550 SpoofChecker otherSC = (SpoofChecker) other; 1551 if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) { 1552 return false; 1553 } 1554 if (fChecks != otherSC.fChecks) { 1555 return false; 1556 } 1557 if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null 1558 && !fAllowedLocales.equals(otherSC.fAllowedLocales)) { 1559 return false; 1560 } 1561 if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null 1562 && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) { 1563 return false; 1564 } 1565 if (fRestrictionLevel != otherSC.fRestrictionLevel) { 1566 return false; 1567 } 1568 return true; 1569 } 1570 1571 /** 1572 * Overrides {@link Object#hashCode()}. 1573 * @stable ICU 4.6 1574 */ 1575 @Override hashCode()1576 public int hashCode() { 1577 return fChecks 1578 ^ fSpoofData.hashCode() 1579 ^ fAllowedLocales.hashCode() 1580 ^ fAllowedCharsSet.hashCode() 1581 ^ fRestrictionLevel.ordinal(); 1582 } 1583 1584 /** 1585 * Computes the augmented script set for a code point, according to UTS 39 section 5.1. 
1586 */ getAugmentedScriptSet(int codePoint, ScriptSet result)1587 private static void getAugmentedScriptSet(int codePoint, ScriptSet result) { 1588 result.clear(); 1589 UScript.getScriptExtensions(codePoint, result); 1590 1591 // Section 5.1 step 1 1592 if (result.get(UScript.HAN)) { 1593 result.set(UScript.HAN_WITH_BOPOMOFO); 1594 result.set(UScript.JAPANESE); 1595 result.set(UScript.KOREAN); 1596 } 1597 if (result.get(UScript.HIRAGANA)) { 1598 result.set(UScript.JAPANESE); 1599 } 1600 if (result.get(UScript.KATAKANA)) { 1601 result.set(UScript.JAPANESE); 1602 } 1603 if (result.get(UScript.HANGUL)) { 1604 result.set(UScript.KOREAN); 1605 } 1606 if (result.get(UScript.BOPOMOFO)) { 1607 result.set(UScript.HAN_WITH_BOPOMOFO); 1608 } 1609 1610 // Section 5.1 step 2 1611 if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) { 1612 result.setAll(); 1613 } 1614 } 1615 1616 /** 1617 * Computes the resolved script set for a string, according to UTS 39 section 5.1. 1618 */ getResolvedScriptSet(CharSequence input, ScriptSet result)1619 private void getResolvedScriptSet(CharSequence input, ScriptSet result) { 1620 getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result); 1621 } 1622 1623 /** 1624 * Computes the resolved script set for a string, omitting characters having the specified script. If 1625 * UScript.CODE_LIMIT is passed as the second argument, all characters are included. 
1626 */ getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result)1627 private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) { 1628 result.setAll(); 1629 1630 ScriptSet temp = new ScriptSet(); 1631 for (int utf16Offset = 0; utf16Offset < input.length();) { 1632 int codePoint = Character.codePointAt(input, utf16Offset); 1633 utf16Offset += Character.charCount(codePoint); 1634 1635 // Compute the augmented script set for the character 1636 getAugmentedScriptSet(codePoint, temp); 1637 1638 // Intersect the augmented script set with the resolved script set, but only if the character doesn't 1639 // have the script specified in the function call 1640 if (script == UScript.CODE_LIMIT || !temp.get(script)) { 1641 result.and(temp); 1642 } 1643 } 1644 } 1645 1646 /** 1647 * Computes the set of numerics for a string, according to UTS 39 section 5.3. 1648 */ getNumerics(String input, UnicodeSet result)1649 private void getNumerics(String input, UnicodeSet result) { 1650 result.clear(); 1651 1652 for (int utf16Offset = 0; utf16Offset < input.length();) { 1653 int codePoint = Character.codePointAt(input, utf16Offset); 1654 utf16Offset += Character.charCount(codePoint); 1655 1656 // Store a representative character for each kind of decimal digit 1657 if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { 1658 // Store the zero character as a representative for comparison. 1659 // Unicode guarantees it is codePoint - value 1660 result.add(codePoint - UCharacter.getNumericValue(codePoint)); 1661 } 1662 } 1663 } 1664 1665 /** 1666 * Computes the restriction level of a string, according to UTS 39 section 5.2. 
1667 */ getRestrictionLevel(String input)1668 private RestrictionLevel getRestrictionLevel(String input) { 1669 // Section 5.2 step 1: 1670 if (!fAllowedCharsSet.containsAll(input)) { 1671 return RestrictionLevel.UNRESTRICTIVE; 1672 } 1673 1674 // Section 5.2 step 2: 1675 if (ASCII.containsAll(input)) { 1676 return RestrictionLevel.ASCII; 1677 } 1678 1679 // Section 5.2 steps 3: 1680 ScriptSet resolvedScriptSet = new ScriptSet(); 1681 getResolvedScriptSet(input, resolvedScriptSet); 1682 1683 // Section 5.2 step 4: 1684 if (!resolvedScriptSet.isEmpty()) { 1685 return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE; 1686 } 1687 1688 // Section 5.2 step 5: 1689 ScriptSet resolvedNoLatn = new ScriptSet(); 1690 getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn); 1691 1692 // Section 5.2 step 6: 1693 if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE) 1694 || resolvedNoLatn.get(UScript.KOREAN)) { 1695 return RestrictionLevel.HIGHLY_RESTRICTIVE; 1696 } 1697 1698 // Section 5.2 step 7: 1699 if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK) 1700 && !resolvedNoLatn.get(UScript.CHEROKEE)) { 1701 return RestrictionLevel.MODERATELY_RESTRICTIVE; 1702 } 1703 1704 // Section 5.2 step 8: 1705 return RestrictionLevel.MINIMALLY_RESTRICTIVE; 1706 } 1707 findHiddenOverlay(String input)1708 int findHiddenOverlay(String input) { 1709 boolean sawLeadCharacter = false; 1710 StringBuilder sb = new StringBuilder(); 1711 for (int i=0; i<input.length();) { 1712 int cp = input.codePointAt(i); 1713 if (sawLeadCharacter && cp == 0x0307) { 1714 return i; 1715 } 1716 int combiningClass = UCharacter.getCombiningClass(cp); 1717 // Skip over characters except for those with combining class 0 (non-combining characters) or with 1718 // combining class 230 (same class as U+0307) 1719 assert UCharacter.getCombiningClass(0x0307) == 230; 1720 if (combiningClass == 0 || combiningClass == 230) { 1721 
sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp, sb); 1722 } 1723 i += UCharacter.charCount(cp); 1724 } 1725 return -1; 1726 } 1727 isIllegalCombiningDotLeadCharacterNoLookup(int cp)1728 boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) { 1729 return cp == 'i' || cp == 'j' || cp == 'ı' || cp == 'ȷ' || cp == 'l' || 1730 UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED); 1731 } 1732 isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb)1733 boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) { 1734 if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) { 1735 return true; 1736 } 1737 sb.setLength(0); 1738 fSpoofData.confusableLookup(cp, sb); 1739 int finalCp = UCharacter.codePointBefore(sb, sb.length()); 1740 if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) { 1741 return true; 1742 } 1743 return false; 1744 } 1745 1746 // Data Members 1747 private int fChecks; // Bit vector of checks to perform. 1748 private SpoofData fSpoofData; 1749 private Set<ULocale> fAllowedLocales; // The Set of allowed locales. 1750 private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters. 1751 private RestrictionLevel fRestrictionLevel; 1752 1753 private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance(); 1754 1755 // Confusable Mappings Data Structures, version 2.0 1756 // 1757 // This description and the corresponding implementation are to be kept 1758 // in-sync with the copy in icu4c uspoof_impl.h. 1759 // 1760 // For the confusable data, we are essentially implementing a map, 1761 // key: a code point 1762 // value: a string. Most commonly one char in length, but can be more. 1763 // 1764 // The keys are stored as a sorted array of 32 bit ints. 1765 // bits 0-23 a code point value 1766 // bits 24-31 length of value string, in UChars (between 1 and 256 UChars). 1767 // The key table is sorted in ascending code point order. 
(not on the 1768 // 32 bit int value, the flag bits do not participate in the sorting.) 1769 // 1770 // Lookup is done by means of a binary search in the key table. 1771 // 1772 // The corresponding values are kept in a parallel array of 16 bit ints. 1773 // If the value string is of length 1, it is literally in the value array. 1774 // For longer strings, the value array contains an index into the strings 1775 // table. 1776 // 1777 // String Table: 1778 // The strings table contains all of the value strings (those of length two or greater) 1779 // concatenated together into one long char (UTF-16) array. 1780 // 1781 // There is no nul character or other mark between adjacent strings. 1782 // 1783 //---------------------------------------------------------------------------- 1784 // 1785 // Changes from format version 1 to format version 2: 1786 // 1) Removal of the whole-script confusable data tables. 1787 // 2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask. 1788 // 3) Expansion of string length value in the key bitmask from 2 bits to 8 bits. 1789 // 4) Removal of the string lengths table since 8 bits is sufficient for the 1790 // lengths of all entries in confusables.txt. 
1791 // 1792 private static final class ConfusableDataUtils { 1793 public static final int FORMAT_VERSION = 2; // version for ICU 58 1794 keyToCodePoint(int key)1795 public static final int keyToCodePoint(int key) { 1796 return key & 0x00ffffff; 1797 } 1798 keyToLength(int key)1799 public static final int keyToLength(int key) { 1800 return ((key & 0xff000000) >> 24) + 1; 1801 } 1802 codePointAndLengthToKey(int codePoint, int length)1803 public static final int codePointAndLengthToKey(int codePoint, int length) { 1804 assert (codePoint & 0x00ffffff) == codePoint; 1805 assert length <= 256; 1806 return codePoint | ((length - 1) << 24); 1807 } 1808 } 1809 1810 // ------------------------------------------------------------------------------------- 1811 // 1812 // SpoofData 1813 // 1814 // This class corresponds to the ICU SpoofCheck data. 1815 // 1816 // The data can originate with the Binary ICU data that is generated in ICU4C, 1817 // or it can originate from source rules that are compiled in ICU4J. 1818 // 1819 // This class does not include the set of checks to be performed, but only 1820 // data that is serialized into the ICU binary data. 1821 // 1822 // Because Java cannot easily wrap binary data like ICU4C, the binary data is 1823 // copied into Java structures that are convenient for use by the run time code. 1824 // 1825 // --------------------------------------------------------------------------------------- 1826 private static class SpoofData { 1827 1828 // The Confusable data, Java data structures for. 
int[] fCFUKeys;
        short[] fCFUValues;
        String fCFUStrings;

        private static final int DATA_FORMAT = 0x43667520; // "Cfu "

        private static final class IsAcceptable implements Authenticate {
            /**
             * Accepts data whose major format version (version[0]) is the one this code
             * implements.
             *
             * The previous expression OR-ed in "version[n] != 0" for the three remaining bytes,
             * which accepted data of ANY major version as long as one of those bytes was
             * non-zero.
             * NOTE(review): if an exact match on all four bytes is wanted instead, add
             * "&& version[n] == 0" clauses here - confirm against the data generator.
             */
            @Override
            public boolean isDataVersionAcceptable(byte version[]) {
                return version[0] == ConfusableDataUtils.FORMAT_VERSION;
            }
        }

        private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();

        // Holder for the default data: the load is attempted once, in the static initializer,
        // and either the instance or the failure is kept for getDefault().
        private static final class DefaultData {
            private static SpoofData INSTANCE = null;
            private static IOException EXCEPTION = null;

            static {
                // Note: Although this is static, the Java runtime can delay execution of this block until
                // the data is actually requested via SpoofData.getDefault().
                try {
                    INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
                } catch (IOException e) {
                    EXCEPTION = e;
                }
            }
        }

        /**
         * @return instance for Unicode standard data
         * @throws MissingResourceException
         *             if the default confusables data could not be loaded; the original
         *             IOException is attached as the cause.
         */
        public static SpoofData getDefault() {
            if (DefaultData.EXCEPTION != null) {
                MissingResourceException failure = new MissingResourceException(
                        "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(),
                        "SpoofChecker", "");
                // MissingResourceException has no cause-taking constructor; attach the cause
                // explicitly so the original stack trace is not lost.
                failure.initCause(DefaultData.EXCEPTION);
                throw failure;
            }
            return DefaultData.INSTANCE;
        }

        // SpoofChecker Data constructor for use from data builder.
        // Initializes a new, empty data area that will be populated later.
        private SpoofData() {
        }

        // Constructor for use when creating from prebuilt default data.
        // A ByteBuffer is what the ICU internal data loading functions provide.
SpoofData(ByteBuffer bytes)1879 private SpoofData(ByteBuffer bytes) throws java.io.IOException { 1880 ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE); 1881 bytes.mark(); 1882 readData(bytes); 1883 } 1884 1885 @Override equals(Object other)1886 public boolean equals(Object other) { 1887 if (!(other instanceof SpoofData)) { 1888 return false; 1889 } 1890 SpoofData otherData = (SpoofData) other; 1891 if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys)) 1892 return false; 1893 if (!Arrays.equals(fCFUValues, otherData.fCFUValues)) 1894 return false; 1895 if (!Utility.sameObjects(fCFUStrings, otherData.fCFUStrings) && fCFUStrings != null 1896 && !fCFUStrings.equals(otherData.fCFUStrings)) 1897 return false; 1898 return true; 1899 } 1900 1901 @Override hashCode()1902 public int hashCode() { 1903 return Arrays.hashCode(fCFUKeys) 1904 ^ Arrays.hashCode(fCFUValues) 1905 ^ fCFUStrings.hashCode(); 1906 } 1907 1908 // Set the SpoofChecker data from pre-built binary data in a byte buffer. 1909 // The binary data format is as described for ICU4C spoof data. 1910 // readData(ByteBuffer bytes)1911 private void readData(ByteBuffer bytes) throws java.io.IOException { 1912 int magic = bytes.getInt(); 1913 if (magic != 0x3845fdef) { 1914 throw new IllegalArgumentException("Bad Spoof Check Data."); 1915 } 1916 @SuppressWarnings("unused") 1917 int dataFormatVersion = bytes.getInt(); 1918 @SuppressWarnings("unused") 1919 int dataLength = bytes.getInt(); 1920 1921 int CFUKeysOffset = bytes.getInt(); 1922 int CFUKeysSize = bytes.getInt(); 1923 1924 int CFUValuesOffset = bytes.getInt(); 1925 int CFUValuesSize = bytes.getInt(); 1926 1927 int CFUStringTableOffset = bytes.getInt(); 1928 int CFUStringTableSize = bytes.getInt(); 1929 1930 // We have now read the file header, and obtained the position for each 1931 // of the data items. Now read each in turn, first seeking the 1932 // input stream to the position of the data item. 
1933 1934 bytes.reset(); 1935 ICUBinary.skipBytes(bytes, CFUKeysOffset); 1936 fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0); 1937 1938 bytes.reset(); 1939 ICUBinary.skipBytes(bytes, CFUValuesOffset); 1940 fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0); 1941 1942 bytes.reset(); 1943 ICUBinary.skipBytes(bytes, CFUStringTableOffset); 1944 fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0); 1945 } 1946 1947 /** 1948 * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be 1949 * appended will between 1 and 18 characters as of Unicode 9. 1950 * 1951 * This is the heart of the confusable skeleton generation implementation. 1952 */ confusableLookup(int inChar, StringBuilder dest)1953 public void confusableLookup(int inChar, StringBuilder dest) { 1954 // Perform a binary search. 1955 // [lo, hi), i.e lo is inclusive, hi is exclusive. 1956 // The result after the loop will be in lo. 1957 int lo = 0; 1958 int hi = length(); 1959 do { 1960 int mid = (lo + hi) / 2; 1961 if (codePointAt(mid) > inChar) { 1962 hi = mid; 1963 } else if (codePointAt(mid) < inChar) { 1964 lo = mid; 1965 } else { 1966 // Found result. Break early. 1967 lo = mid; 1968 break; 1969 } 1970 } while (hi - lo > 1); 1971 1972 // Did we find an entry? If not, the char maps to itself. 1973 if (codePointAt(lo) != inChar) { 1974 dest.appendCodePoint(inChar); 1975 return; 1976 } 1977 1978 // Add the element to the string builder and return. 1979 appendValueTo(lo, dest); 1980 return; 1981 } 1982 1983 /** 1984 * Return the number of confusable entries in this SpoofData. 1985 * 1986 * @return The number of entries. 1987 */ length()1988 public int length() { 1989 return fCFUKeys.length; 1990 } 1991 1992 /** 1993 * Return the code point (key) at the specified index. 1994 * 1995 * @param index 1996 * The index within the SpoofData. 1997 * @return The code point. 
*/
        public int codePointAt(int index) {
            return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]);
        }

        /**
         * Append the confusable skeleton at the specified index to the StringBuilder dest.
         *
         * @param index
         *            The index within the SpoofData.
         * @param dest
         *            The StringBuilder to which to append the skeleton.
         */
        public void appendValueTo(int index, StringBuilder dest) {
            int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]);

            // Value is either a char (for strings of length 1) or
            // an index into the string table (for longer strings).
            // The stored 16 bits are unsigned; masking keeps a string-table index of 0x8000 or
            // above from turning into a negative offset. The char case is unaffected.
            int value = fCFUValues[index] & 0xffff;
            if (stringLength == 1) {
                dest.append((char) value);
            } else {
                dest.append(fCFUStrings, value, value + stringLength);
            }
        }
    }

    // -------------------------------------------------------------------------------
    //
    // ScriptSet - Script code bit sets.
    // Extends Java BitSet with input/output support and a few helper methods.
    // Note: The I/O is not currently being used, so it has been commented out. If
    // it is needed again, the code can be restored.
    //
    // -------------------------------------------------------------------------------
    static class ScriptSet extends BitSet {

        // Eclipse default value to quell warnings:
        private static final long serialVersionUID = 1L;

        // // The serialized version of this class can hold INT_CAPACITY * 32 scripts.
        // private static final int INT_CAPACITY = 6;
        // private static final long serialVersionUID = INT_CAPACITY;
        // static {
        // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT;
        // }
        //
        // public ScriptSet() {
        // }
        //
        // public ScriptSet(ByteBuffer bytes) throws java.io.IOException {
        // for (int i = 0; i < INT_CAPACITY; i++) {
        // int bits = bytes.getInt();
        // for (int j = 0; j < Integer.SIZE; j++) {
        // if ((bits & (1 << j)) != 0) {
        // set(i * Integer.SIZE + j);
        // }
        // }
        // }
        // }
        //
        // public void output(DataOutputStream os) throws java.io.IOException {
        // for (int i = 0; i < INT_CAPACITY; i++) {
        // int bits = 0;
        // for (int j = 0; j < Integer.SIZE; j++) {
        // if (get(i * Integer.SIZE + j)) {
        // bits |= (1 << j);
        // }
        // }
        // os.writeInt(bits);
        // }
        // }

        /**
         * Intersects this set with a single script: every bit below UScript.CODE_LIMIT other
         * than the given one is cleared; the given script's own bit is left as it was.
         */
        public void and(int script) {
            this.clear(0, script);
            this.clear(script + 1, UScript.CODE_LIMIT);
        }

        /**
         * Sets the bit of every script code in the range [0, UScript.CODE_LIMIT).
         */
        public void setAll() {
            this.set(0, UScript.CODE_LIMIT);
        }

        /**
         * @return true if the number of set bits equals UScript.CODE_LIMIT.
         */
        public boolean isFull() {
            return cardinality() == UScript.CODE_LIMIT;
        }

        /**
         * Appends a braced description of this set to sb: "- " for the empty set, "* " for the
         * full set, otherwise the short name of each member script followed by a space.
         */
        public void appendStringTo(StringBuilder sb) {
            sb.append("{ ");
            if (isEmpty()) {
                sb.append("- ");
            } else if (isFull()) {
                sb.append("* ");
            } else {
                for (int script = 0; script < UScript.CODE_LIMIT; script++) {
                    if (get(script)) {
                        sb.append(UScript.getShortName(script));
                        sb.append(" ");
                    }
                }
            }
            sb.append("}");
        }

        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("<ScriptSet ");
            appendStringTo(sb);
            sb.append(">");
            return sb.toString();
        }
    }
}