1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 *************************************************************************** 6 * Copyright (C) 2008-2016 International Business Machines Corporation 7 * and others. All Rights Reserved. 8 *************************************************************************** 9 * 10 * Unicode Spoof Detection 11 */ 12 13 package ohos.global.icu.text; 14 15 import java.io.IOException; 16 import java.io.LineNumberReader; 17 import java.io.Reader; 18 import java.nio.ByteBuffer; 19 import java.text.ParseException; 20 import java.util.ArrayList; 21 import java.util.Arrays; 22 import java.util.BitSet; 23 import java.util.Collections; 24 import java.util.Comparator; 25 import java.util.HashSet; 26 import java.util.Hashtable; 27 import java.util.LinkedHashSet; 28 import java.util.Locale; 29 import java.util.MissingResourceException; 30 import java.util.Set; 31 import java.util.Vector; 32 import java.util.regex.Matcher; 33 import java.util.regex.Pattern; 34 35 import ohos.global.icu.impl.ICUBinary; 36 import ohos.global.icu.impl.ICUBinary.Authenticate; 37 import ohos.global.icu.impl.Utility; 38 import ohos.global.icu.lang.UCharacter; 39 import ohos.global.icu.lang.UCharacterCategory; 40 import ohos.global.icu.lang.UProperty; 41 import ohos.global.icu.lang.UScript; 42 import ohos.global.icu.util.ULocale; 43 44 /** 45 * <p> 46 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and 47 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: 48 * 49 * <ol> 50 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desparejado" and 51 * "ԁеѕрагејаԁо".</li> 52 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof 53 * detection</em>), such as 
"pаypаl" spelled with Cyrillic 'а' characters.</li> 54 * </ol> 55 * 56 * <p> 57 * Although originally designed as a method for flagging suspicious identifier strings such as URLs, 58 * <code>SpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word 59 * content filters. 60 * 61 * <h2>Confusables</h2> 62 * 63 * <p> 64 * The following example shows how to use <code>SpoofChecker</code> to check for confusability between two strings: 65 * 66 * <pre> 67 * <code> 68 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 69 * int result = sc.areConfusable("desparejado", "ԁеѕрагејаԁо"); 70 * System.out.println(result != 0); // true 71 * </code> 72 * </pre> 73 * 74 * <p> 75 * <code>SpoofChecker</code> uses a builder paradigm: options are specified within the context of a lightweight 76 * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading 77 * operations are performed, and an immutable <code>SpoofChecker</code> is returned. 78 * 79 * <p> 80 * The first line of the example creates a <code>SpoofChecker</code> object with confusable-checking enabled; the second 81 * line performs the confusability test. For best performance, the instance should be created once (e.g., upon 82 * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime. 83 * 84 * <p> 85 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a 86 * sequence of families of confusable characters, where each family has a single exemplar character. 
87 * {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is 88 * equivalent to the example above: 89 * 90 * <pre> 91 * <code> 92 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 93 * boolean result = sc.getSkeleton("desparejado").equals(sc.getSkeleton("ԁеѕрагејаԁо")); 94 * System.out.println(result); // true 95 * </code> 96 * </pre> 97 * 98 * <p> 99 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling 100 * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as 101 * shown below: 102 * 103 * <pre> 104 * // Setup: 105 * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example 106 * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build(); 107 * HashSet<String> skeletons = new HashSet<String>(); 108 * for (String word : DICTIONARY) { 109 * skeletons.add(sc.getSkeleton(word)); 110 * } 111 * 112 * // Live Check: 113 * boolean result = skeletons.contains(sc.getSkeleton("1orern")); 114 * System.out.println(result); // true 115 * </pre> 116 * 117 * <p> 118 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em> 119 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons 120 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. 
 *
 * <h2>Spoof Detection</h2>
 *
 * <p>
 * The following snippet shows a minimal example of using <code>SpoofChecker</code> to perform spoof detection on a
 * string:
 *
 * <pre>
 * SpoofChecker sc = new SpoofChecker.Builder()
 *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
 *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
 *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
 *     .build();
 * boolean result = sc.failsChecks("pаypаl");  // with Cyrillic 'а' characters
 * System.out.println(result);  // true
 * </pre>
 *
 * <p>
 * As in the case for confusability checking, it is good practice to create one <code>SpoofChecker</code> instance at
 * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of
 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the
 * third line, the restriction level is set, and in the fourth line, the CONFUSABLE checks are disabled. It is good
 * practice to disable them if you won't be using the instance to perform confusability checking.
 *
 * <p>
 * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}:
 *
 * <pre>
 * <code>
 * SpoofChecker sc = new SpoofChecker.Builder()
 *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
 *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
 *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
 *     .build();
 * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult();
 * boolean result = sc.failsChecks("pаypаl", checkResult);
 * System.out.println(checkResult.checks);  // 16
 * </code>
 * </pre>
 *
 * <p>
 * The <code>checks</code> field of the result object is a bitmask of the checks that failed.
In this case, there was one check that failed: 163 * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are: 164 * 165 * <ul> 166 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the 167 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS 168 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> 169 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character 170 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> 171 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable 172 * characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li> 173 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> 174 * </ul> 175 * 176 * <p> 177 * These checks can be enabled independently of each other. For example, if you were interested in checking for only the 178 * INVISIBLE and MIXED_NUMBERS conditions, you could do: 179 * 180 * <pre> 181 * <code> 182 * SpoofChecker sc = new SpoofChecker.Builder() 183 * .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS) 184 * .build(); 185 * boolean result = sc.failsChecks("৪8"); 186 * System.out.println(result); // true 187 * </code> 188 * </pre> 189 * 190 * <p> 191 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in 192 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings 193 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have 194 * Latin characters interspersed. 
Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
 * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of
 * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code
 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
 * scripts.
 *
 * <h2>Additional Information</h2>
 *
 * <p>
 * A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
 *
 * <p>
 * <b>Thread Safety:</b> The methods on <code>SpoofChecker</code> objects are thread safe. The test functions for
 * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called
 * concurrently from multiple threads using the same <code>SpoofChecker</code> instance.
 *
 * @hide exposed on OHOS
 */
public class SpoofChecker {

    /**
     * Constants from UTS 39 for use in {@link SpoofChecker.Builder#setRestrictionLevel}. The constants are declared
     * from the most restrictive level ({@link #ASCII}) to the least restrictive one ({@link #UNRESTRICTIVE}).
     *
     * @hide exposed on OHOS
     */
    public enum RestrictionLevel {
        /**
         * All characters in the string are in the identifier profile and all characters in the string are in the ASCII
         * range.
         */
        ASCII,
        /**
         * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the
         * string is single-script, according to the definition in UTS 39 section 5.1.
         */
        SINGLE_SCRIPT_RESTRICTIVE,
        /**
         * The string classifies as Single Script, or all characters in the string are in the identifier profile and the
         * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1:
         * <ul>
         * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
         * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
         * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
         * </ul>
         */
        HIGHLY_RESTRICTIVE,
        /**
         * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
         * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
         * Greek, and Cherokee.
         */
        MODERATELY_RESTRICTIVE,
        /**
         * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as
         * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us.
         */
        MINIMALLY_RESTRICTIVE,
        /**
         * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
         */
        UNRESTRICTIVE,
    }

    /**
     * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
     * The set is frozen; call {@code cloneAsThawed()} on it to obtain a modifiable copy.
     */
    public static final UnicodeSet INCLUSION = new UnicodeSet(
            "['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C"
            + "\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]"
            ).freeze();
    // Note: data from IdentifierStatus.txt & IdentifierType.txt
    // There is tooling to generate this constant in the unicodetools project:
    //      org.unicode.text.tools.RecommendedSetGenerator
    // It will print the Java and C++ code to the console for easy copy-paste into this file.

    /**
     * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
* The set is frozen; call {@code cloneAsThawed()} on it to obtain a modifiable copy.
     */
    public static final UnicodeSet RECOMMENDED = new UnicodeSet(
            "[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E"
            + "\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-"
            + "\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E"
            + "\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\\u0306-"
            + "\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u0328\\u032D\\u032E"
            + "\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386"
            + "\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-"
            + "\\u04FF\\u0510-\\u0529\\u052E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586"
            + "\\u05B4\\u05D0-\\u05EA\\u05EF-\\u05F2\\u0620-\\u063F\\u0641-\\u0655\\u0660-"
            + "\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06A0\\u06A2-\\u06D3"
            + "\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC"
            + "\\u08B2\\u08B6-\\u08C7\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-"
            + "\\u0963\\u0966-\\u096F\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-"
            + "\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9"
            + "\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-"
            + "\\u09F1\\u09FE\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-\\u0A28"
            + "\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47"
            + "\\u0A48\\u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D"
            + "\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9"
            + "\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-"
            + "\\u0AEF\\u0AFA-\\u0AFF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13-"
            + "\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47"
            + "\\u0B48\\u0B4B-\\u0B4D\\u0B55-\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71"
            + "\\u0B82\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A"
            + "\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-"
            + "\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-"
            + "\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-"
            + "\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56\\u0C60\\u0C61\\u0C66-"
            + "\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8"
            + "\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD"
            + "\\u0CD5\\u0CD6\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D00\\u0D02"
            + "\\u0D03\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-"
            + "\\u0D48\\u0D4A-\\u0D4E\\u0D54-\\u0D57\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-"
            + "\\u0D7F\\u0D82\\u0D83\\u0D85-\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5\\u0DA7-"
            + "\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6"
            + "\\u0DD8-\\u0DDE\\u0DF2\\u0E01-\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-"
            + "\\u0E59\\u0E81\\u0E82\\u0E84\\u0E86-\\u0E8A\\u0E8C-\\u0EA3\\u0EA5\\u0EA7-"
            + "\\u0EB2\\u0EB4-\\u0EBD\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9"
            + "\\u0EDE\\u0EDF\\u0F00\\u0F20-\\u0F29\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-"
            + "\\u0F47\\u0F49-\\u0F4C\\u0F4E-\\u0F51\\u0F53-\\u0F56\\u0F58-\\u0F5B\\u0F5D-"
            + "\\u0F68\\u0F6A-\\u0F6C\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0F82-\\u0F84"
            + "\\u0F86-\\u0F92\\u0F94-\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6"
            + "\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-"
            + "\\u109D\\u10C7\\u10CD\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-"
            + "\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288"
            + "\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-"
            + "\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u135D-"
            + "\\u135F\\u1380-\\u138F\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-"
            + "\\u17CA\\u17D2\\u17D7\\u17DC\\u17E0-\\u17E9\\u1C90-\\u1CBA\\u1CBD-\\u1CBF"
            + "\\u1E00-\\u1E99\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-"
            + "\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70"
            + "\\u1F72\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA"
            + "\\u1FBC\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-"
            + "\\u1FDA\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8"
            + "\\u1FFA\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE"
            + "\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6"
            + "\\u2DD8-\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E"
            + "\\u30A1-\\u30FA\\u30FC-\\u30FE\\u3105-\\u312D\\u312F\\u31A0-\\u31BF\\u3400-"
            + "\\u4DBF\\u4E00-\\u9FFC\\uA67F\\uA717-\\uA71F\\uA788\\uA78D\\uA792\\uA793"
            + "\\uA7AA\\uA7AE\\uA7B8\\uA7B9\\uA7C2-\\uA7CA\\uA9E7-\\uA9FE\\uAA60-\\uAA76"
            + "\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26"
            + "\\uAB28-\\uAB2E\\uAB66\\uAB67\\uAC00-\\uD7A3\\uFA0E\\uFA0F\\uFA11\\uFA13"
            + "\\uFA14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00011301\\U00011303"
            + "\\U0001133B\\U0001133C\\U00016FF0\\U00016FF1\\U0001B150-\\U0001B152"
            + "\\U0001B164-\\U0001B167\\U00020000-\\U0002A6DD\\U0002A700-\\U0002B734"
            + "\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-\\U0002EBE0"
            + "\\U00030000-\\U0003134A]"
            ).freeze();
    // Note: data from IdentifierStatus.txt & IdentifierType.txt
    // There is tooling to generate this constant in the unicodetools project:
    //      org.unicode.text.tools.RecommendedSetGenerator
    // It will print the Java and C++ code to the console for easy copy-paste into this file.
    // Do not hand-edit the literal above; regenerate it with that tool instead.

    /**
     * Constants for the kinds of checks that USpoofChecker can perform.
These values are used both to select the set of
     * checks that will be performed, and to report results from the check function.
     *
     */

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
     * 4. Bit value 1.
     */
    public static final int SINGLE_SCRIPT_CONFUSABLE = 1;

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
     * 39 section 4. Bit value 2.
     */
    public static final int MIXED_SCRIPT_CONFUSABLE = 2;

    /**
     * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
     * that the two strings are visually confusable and that they are not from the same script but both of them are
     * single-script strings, according to UTS 39 section 4. Bit value 4.
     */
    public static final int WHOLE_SCRIPT_CONFUSABLE = 4;

    /**
     * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the
     * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make
     * {@link SpoofChecker#areConfusable} return only those types of confusables. The value is the bitwise OR of those
     * three flags.
     */
    public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;

    /**
     * This flag is deprecated and no longer affects the behavior of SpoofChecker.
     *
     * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
     *             deprecated.
*/
    @Deprecated
    public static final int ANY_CASE = 8;

    /**
     * Check that an identifier satisfies the requirements for the restriction level specified in
     * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is
     * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
     */
    public static final int RESTRICTION_LEVEL = 16;

    /**
     * Check that an identifier contains only characters from a single script (plus chars from the common and inherited
     * scripts.) Applies to checks of a single identifier check only. This constant is an alias: it has the same value
     * as {@link #RESTRICTION_LEVEL}.
     *
     * @deprecated ICU 51 Use RESTRICTION_LEVEL
     */
    @Deprecated
    public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;

    /**
     * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
     * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
     * test the input string as a whole for conformance to any particular syntax for identifiers.
     */
    public static final int INVISIBLE = 32;

    /**
     * Check that an identifier contains only characters from a specified set of acceptable characters. See
     * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check
     * will also fail the {@link #RESTRICTION_LEVEL} check.
     */
    public static final int CHAR_LIMIT = 64;

    /**
     * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39
     * section 5.3.
     */
    public static final int MIXED_NUMBERS = 128;

    /**
     * Check that an identifier does not have a combining character following a character in which that
     * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
* <p>
     * More specifically, the following characters are forbidden from preceding a U+0307:
     * <ul>
     * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
     * <li>Latin lowercase letter 'l'</li>
     * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
     * <li>Any character whose confusable prototype ends with such a character
     * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
     * </ul>
     * In addition, combining characters are allowed between the above characters and U+0307 except those
     * with combining class 0 or combining class "Above" (230, same class as U+0307).
     * <p>
     * This list and the number of combining characters considered by this check may grow over time.
     */
    public static final int HIDDEN_OVERLAY = 256;

    // Update CheckResult.toString() when a new check is added.

    /**
     * Enable all spoof checks. Every bit of the int is set, so the mask also covers bit positions that have no
     * check constant assigned in this class.
     */
    public static final int ALL_CHECKS = 0xFFFFFFFF;

    // Used for checking for ASCII-Only restriction level
    // (code points U+0000..U+007F; frozen so it can be shared safely).
    static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();

    /**
     * private constructor: a SpoofChecker has to be built by the builder
     */
    private SpoofChecker() {
    }

    /**
     * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired
     * checking options on the builder, then call the build() function to create a SpoofChecker instance.
     *
     * @hide exposed on OHOS
     */
    public static class Builder {
        int fChecks; // Bit vector of checks to perform.
        SpoofData fSpoofData; // Confusable data; null until setData() or build() supplies it.
        final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff); // The UnicodeSet of allowed characters.
        // for this Spoof Checker. Defaults to all chars.
        final Set<ULocale> fAllowedLocales = new LinkedHashSet<>(); // The list of allowed locales.
474 private RestrictionLevel fRestrictionLevel; 475 476 /** 477 * Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for 478 * LOCALE_LIMIT and CHAR_LIMIT. Note that additional checks may be added in the future, resulting in the changes 479 * to the default checking behavior. 480 */ Builder()481 public Builder() { 482 fChecks = ALL_CHECKS; 483 fSpoofData = null; 484 fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE; 485 } 486 487 /** 488 * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker. 489 * 490 * @param src 491 * The existing checker. 492 */ Builder(SpoofChecker src)493 public Builder(SpoofChecker src) { 494 fChecks = src.fChecks; 495 fSpoofData = src.fSpoofData; // For the data, we will either use the source data 496 // as-is, or drop the builder's reference to it 497 // and generate new data, depending on what our 498 // caller does with the builder. 499 fAllowedCharsSet.set(src.fAllowedCharsSet); 500 fAllowedLocales.addAll(src.fAllowedLocales); 501 fRestrictionLevel = src.fRestrictionLevel; 502 } 503 504 /** 505 * Create a SpoofChecker with current configuration. 506 * 507 * @return SpoofChecker 508 */ build()509 public SpoofChecker build() { 510 // TODO: Make this data loading be lazy (see #12696). 511 if (fSpoofData == null) { 512 // read binary file 513 fSpoofData = SpoofData.getDefault(); 514 } 515 516 // Copy all state from the builder to the new SpoofChecker. 517 // Make sure that everything is either cloned or copied, so 518 // that subsequent re-use of the builder won't modify the built 519 // SpoofChecker. 520 // 521 // One exception to this: the SpoofData is just assigned. 522 // If the builder subsequently needs to modify fSpoofData 523 // it will create a new SpoofData object first. 
524 525 SpoofChecker result = new SpoofChecker(); 526 result.fChecks = this.fChecks; 527 result.fSpoofData = this.fSpoofData; 528 result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone()); 529 result.fAllowedCharsSet.freeze(); 530 result.fAllowedLocales = new HashSet<>(this.fAllowedLocales); 531 result.fRestrictionLevel = this.fRestrictionLevel; 532 return result; 533 } 534 535 /** 536 * Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data file 537 * confusables.txt as described in Unicode UAX 39. The syntax of the source data is as described in UAX 39 for 538 * these files, and the content of these files is acceptable input. 539 * 540 * @param confusables 541 * the Reader of confusable characters definitions, as found in file confusables.txt from 542 * unicode.org. 543 * @throws ParseException 544 * To report syntax errors in the input. 545 */ setData(Reader confusables)546 public Builder setData(Reader confusables) throws ParseException, IOException { 547 548 // Compile the binary data from the source (text) format. 549 // Drop the builder's reference to any pre-existing data, which may 550 // be in use in an already-built checker. 551 552 fSpoofData = new SpoofData(); 553 ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData); 554 return this; 555 } 556 557 /** 558 * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead. 559 * 560 * @param confusables 561 * the Reader of confusable characters definitions, as found in file confusables.txt from 562 * unicode.org. 563 * @param confusablesWholeScript 564 * No longer supported. 565 * @throws ParseException 566 * To report syntax errors in the input. 
567 * 568 * @deprecated ICU 58 569 */ 570 @Deprecated setData(Reader confusables, Reader confusablesWholeScript)571 public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException { 572 setData(confusables); 573 return this; 574 } 575 576 /** 577 * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method 578 * overwrites any checks that may have already been enabled. By default, all checks are enabled. 579 * 580 * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. For 581 * example, to fail strings containing characters outside of the set specified by {@link #setAllowedChars} and 582 * also strings that contain digits from mixed numbering systems: 583 * 584 * <pre> 585 * {@code 586 * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS); 587 * } 588 * </pre> 589 * 590 * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from 591 * ALL_CHECKS. For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality, 592 * it is good practice to disable the CONFUSABLE check: 593 * 594 * <pre> 595 * {@code 596 * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE); 597 * } 598 * </pre> 599 * 600 * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and 601 * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they 602 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those 603 * methods. 604 * 605 * @param checks 606 * The set of checks that this spoof checker will perform. The value is an 'or' of the desired 607 * checks. 
608 * @return self 609 */ setChecks(int checks)610 public Builder setChecks(int checks) { 611 // Verify that the requested checks are all ones (bits) that 612 // are acceptable, known values. 613 if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) { 614 throw new IllegalArgumentException("Bad Spoof Checks value."); 615 } 616 this.fChecks = (checks & SpoofChecker.ALL_CHECKS); 617 return this; 618 } 619 620 /** 621 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages 622 * associated with the specified locales. Any previously specified list of locales is replaced by the new 623 * settings. 624 * 625 * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is 626 * determined. Characters from this set of scripts, along with characters from the "common" and "inherited" 627 * Unicode Script categories will be permitted. 628 * 629 * Supplying an empty string removes all restrictions; characters from any script will be allowed. 630 * 631 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a 632 * non-empty list of locales. 633 * 634 * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function. 635 * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters. 636 * 637 * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of 638 * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with 639 * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}. 640 * 641 * @param locales 642 * A Set of ULocales, from which the language and associated script are extracted. If the locales Set 643 * is null, no restrictions will be placed on the allowed characters. 
644 * 645 * @return self 646 */ setAllowedLocales(Set<ULocale> locales)647 public Builder setAllowedLocales(Set<ULocale> locales) { 648 fAllowedCharsSet.clear(); 649 650 for (ULocale locale : locales) { 651 // Add the script chars for this locale to the accumulating set 652 // of allowed chars. 653 addScriptChars(locale, fAllowedCharsSet); 654 } 655 656 // If our caller provided an empty list of locales, we disable the 657 // allowed characters checking 658 fAllowedLocales.clear(); 659 if (locales.size() == 0) { 660 fAllowedCharsSet.add(0, 0x10ffff); 661 fChecks &= ~CHAR_LIMIT; 662 return this; 663 } 664 665 // Add all common and inherited characters to the set of allowed 666 // chars. 667 UnicodeSet tempSet = new UnicodeSet(); 668 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON); 669 fAllowedCharsSet.addAll(tempSet); 670 tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED); 671 fAllowedCharsSet.addAll(tempSet); 672 673 // Store the updated spoof checker state. 674 fAllowedLocales.clear(); 675 fAllowedLocales.addAll(locales); 676 fChecks |= CHAR_LIMIT; 677 return this; 678 } 679 680 /** 681 * Limit characters that are acceptable in identifiers being checked to those normally used with the languages 682 * associated with the specified locales. Any previously specified list of locales is replaced by the new 683 * settings. 684 * 685 * @param locales 686 * A Set of Locales, from which the language and associated script are extracted. If the locales Set 687 * is null, no restrictions will be placed on the allowed characters. 
688 * 689 * @return self 690 */ setAllowedJavaLocales(Set<Locale> locales)691 public Builder setAllowedJavaLocales(Set<Locale> locales) { 692 HashSet<ULocale> ulocales = new HashSet<>(locales.size()); 693 for (Locale locale : locales) { 694 ulocales.add(ULocale.forLocale(locale)); 695 } 696 return setAllowedLocales(ulocales); 697 } 698 699 // Add (union) to the UnicodeSet all of the characters for the scripts 700 // used for the specified locale. Part of the implementation of 701 // setAllowedLocales. addScriptChars(ULocale locale, UnicodeSet allowedChars)702 private void addScriptChars(ULocale locale, UnicodeSet allowedChars) { 703 int scripts[] = UScript.getCode(locale); 704 if (scripts != null) { 705 UnicodeSet tmpSet = new UnicodeSet(); 706 for (int i = 0; i < scripts.length; i++) { 707 tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]); 708 allowedChars.addAll(tmpSet); 709 } 710 } 711 // else it's an unknown script. 712 // Maybe they asked for the script of "zxx", which refers to no linguistic content. 713 // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU. 714 } 715 716 /** 717 * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit 718 * is is replaced by the new settings. This includes limits on characters that were set with the 719 * setAllowedLocales() function. Note that the RESTRICTED set is useful. 720 * 721 * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function. 722 * 723 * @param chars 724 * A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by 725 * this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling 726 * this function. Note that this clears the allowedLocales set. 
727 * @return self 728 */ setAllowedChars(UnicodeSet chars)729 public Builder setAllowedChars(UnicodeSet chars) { 730 fAllowedCharsSet.set(chars); 731 fAllowedLocales.clear(); 732 fChecks |= CHAR_LIMIT; 733 return this; 734 } 735 736 /** 737 * Set the loosest restriction level allowed for strings. The default if this is not called is 738 * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and 739 * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are 740 * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}. 741 * 742 * @param restrictionLevel 743 * The loosest restriction level allowed. 744 * @return self 745 * @hide draft / provisional / internal are hidden on OHOS 746 */ setRestrictionLevel(RestrictionLevel restrictionLevel)747 public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) { 748 fRestrictionLevel = restrictionLevel; 749 fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS; 750 return this; 751 } 752 753 /* 754 * ***************************************************************************** 755 * Internal classes for compililing confusable data into its binary (runtime) form. 756 * ***************************************************************************** 757 */ 758 // --------------------------------------------------------------------- 759 // 760 // buildConfusableData Compile the source confusable data, as defined by 761 // the Unicode data file confusables.txt, into the binary 762 // structures used by the confusable detector. 763 // 764 // The binary structures are described in uspoof_impl.h 765 // 766 // 1. parse the data, making a hash table mapping from a codepoint to a String. 767 // 768 // 2. Sort all of the strings encountered by length, since they will need to 769 // be stored in that order in the final string table. 
770 // TODO: Sorting these strings by length is no longer needed since the removal of 771 // the string lengths table. This logic can be removed to save processing time 772 // when building confusables data. 773 // 774 // 3. Build a list of keys (UChar32s) from the mapping table. Sort the 775 // list because that will be the ordering of our runtime table. 776 // 777 // 4. Generate the run time string table. This is generated before the key & value 778 // table because we need the string indexes when building those tables. 779 // 780 // 5. Build the run-time key and value table. These are parallel tables, and 781 // are built at the same time 782 783 // class ConfusabledataBuilder 784 // An instance of this class exists while the confusable data is being built from source. 785 // It encapsulates the intermediate data structures that are used for building. 786 // It exports one static function, to do a confusable data build. 787 private static class ConfusabledataBuilder { 788 789 private Hashtable<Integer, SPUString> fTable; 790 private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the 791 // four mapping tables. 792 793 // The compiled data is first assembled into the following four collections, 794 // then output to the builder's SpoofData object. 
795 private StringBuffer fStringTable; 796 private ArrayList<Integer> fKeyVec; 797 private ArrayList<Integer> fValueVec; 798 private SPUStringPool stringPool; 799 private Pattern fParseLine; 800 private Pattern fParseHexNum; 801 private int fLineNum; 802 ConfusabledataBuilder()803 ConfusabledataBuilder() { 804 fTable = new Hashtable<>(); 805 fKeySet = new UnicodeSet(); 806 fKeyVec = new ArrayList<>(); 807 fValueVec = new ArrayList<>(); 808 stringPool = new SPUStringPool(); 809 } 810 build(Reader confusables, SpoofData dest)811 void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException { 812 StringBuffer fInput = new StringBuffer(); 813 814 // Convert the user input data from UTF-8 to char (UTF-16) 815 LineNumberReader lnr = new LineNumberReader(confusables); 816 do { 817 String line = lnr.readLine(); 818 if (line == null) { 819 break; 820 } 821 fInput.append(line); 822 fInput.append('\n'); 823 } while (true); 824 825 // Regular Expression to parse a line from Confusables.txt. The expression will match 826 // any line. What was matched is determined by examining which capture groups have a match. 827 // Capture Group 1: the source char 828 // Capture Group 2: the replacement chars 829 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated) 830 // Capture Group 7: A blank or comment only line. 831 // Capture Group 8: A syntactically invalid line. Anything that didn't match before. 832 // Example Line from the confusables.txt source file: 833 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... 
" 834 fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char 835 "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s) 836 "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued) 837 "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type 838 "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment 839 "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment 840 "|^(.*?)$"); // OR match any line, which catches illegal lines. 841 842 // Regular expression for parsing a hex number out of a space-separated list of them. 843 // Capture group 1 gets the number, with spaces removed. 844 fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)"); 845 846 // Zap any Byte Order Mark at the start of input. Changing it to a space 847 // is benign given the syntax of the input. 848 if (fInput.charAt(0) == 0xfeff) { 849 fInput.setCharAt(0, (char) 0x20); 850 } 851 852 // Parse the input, one line per iteration of this loop. 853 Matcher matcher = fParseLine.matcher(fInput); 854 while (matcher.find()) { 855 fLineNum++; 856 if (matcher.start(7) >= 0) { 857 // this was a blank or comment line. 858 continue; 859 } 860 if (matcher.start(8) >= 0) { 861 // input file syntax error. 862 // status = U_PARSE_ERROR; 863 throw new ParseException( 864 "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8), 865 matcher.start(8)); 866 } 867 868 // We have a good input line. Extract the key character and mapping 869 // string, and 870 // put them into the appropriate mapping table. 
871 int keyChar = Integer.parseInt(matcher.group(1), 16); 872 if (keyChar > 0x10ffff) { 873 throw new ParseException( 874 "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1), 875 matcher.start(1)); 876 } 877 Matcher m = fParseHexNum.matcher(matcher.group(2)); 878 879 StringBuilder mapString = new StringBuilder(); 880 while (m.find()) { 881 int c = Integer.parseInt(m.group(1), 16); 882 if (c > 0x10ffff) { 883 throw new ParseException( 884 "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16), 885 matcher.start(2)); 886 } 887 mapString.appendCodePoint(c); 888 } 889 assert (mapString.length() >= 1); 890 891 // Put the map (value) string into the string pool 892 // This a little like a Java intern() - any duplicates will be 893 // eliminated. 894 SPUString smapString = stringPool.addString(mapString.toString()); 895 896 // Add the char . string mapping to the table. 897 // For Unicode 8, the SL, SA and ML tables have been discontinued. 898 // All input data from confusables.txt is tagged MA. 899 fTable.put(keyChar, smapString); 900 901 fKeySet.add(keyChar); 902 } 903 904 // Input data is now all parsed and collected. 905 // Now create the run-time binary form of the data. 906 // 907 // This is done in two steps. First the data is assembled into vectors and strings, 908 // for ease of construction, then the contents of these collections are copied 909 // into the actual SpoofData object. 910 911 // Build up the string array, and record the index of each string therein 912 // in the (build time only) string pool. 913 // Strings of length one are not entered into the strings array. 
914 // (Strings in the table are sorted by length) 915 916 stringPool.sort(); 917 fStringTable = new StringBuffer(); 918 int poolSize = stringPool.size(); 919 int i; 920 for (i = 0; i < poolSize; i++) { 921 SPUString s = stringPool.getByIndex(i); 922 int strLen = s.fStr.length(); 923 int strIndex = fStringTable.length(); 924 if (strLen == 1) { 925 // strings of length one do not get an entry in the string table. 926 // Keep the single string character itself here, which is the same 927 // convention that is used in the final run-time string table index. 928 s.fCharOrStrTableIndex = s.fStr.charAt(0); 929 } else { 930 s.fCharOrStrTableIndex = strIndex; 931 fStringTable.append(s.fStr); 932 } 933 } 934 935 // Construct the compile-time Key and Value table. 936 // 937 // The keys in the Key table follow the format described in uspoof.h for the 938 // Cfu confusables data structure. 939 // 940 // Starting in ICU 58, each code point has exactly one entry in the data 941 // structure. 942 943 for (String keyCharStr : fKeySet) { 944 int keyChar = keyCharStr.codePointAt(0); 945 SPUString targetMapping = fTable.get(keyChar); 946 assert targetMapping != null; 947 948 // Throw a sane exception if trying to consume a long string. Otherwise, 949 // codePointAndLengthToKey will throw an assertion error. 950 if (targetMapping.fStr.length() > 256) { 951 throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries."); 952 } 953 954 int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length()); 955 int value = targetMapping.fCharOrStrTableIndex; 956 957 fKeyVec.add(key); 958 fValueVec.add(value); 959 } 960 961 // Put the assembled data into the destination SpoofData object. 962 963 // The Key Table 964 // While copying the keys to the output array, 965 // also sanity check that the keys are sorted. 
966 int numKeys = fKeyVec.size(); 967 dest.fCFUKeys = new int[numKeys]; 968 int previousCodePoint = 0; 969 for (i = 0; i < numKeys; i++) { 970 int key = fKeyVec.get(i); 971 int codePoint = ConfusableDataUtils.keyToCodePoint(key); 972 // strictly greater because there can be only one entry per code point 973 assert codePoint > previousCodePoint; 974 dest.fCFUKeys[i] = key; 975 previousCodePoint = codePoint; 976 } 977 978 // The Value Table, parallels the key table 979 int numValues = fValueVec.size(); 980 assert (numKeys == numValues); 981 dest.fCFUValues = new short[numValues]; 982 i = 0; 983 for (int value : fValueVec) { 984 assert (value < 0xffff); 985 dest.fCFUValues[i++] = (short) value; 986 } 987 988 // The Strings Table. 989 dest.fCFUStrings = fStringTable.toString(); 990 } 991 992 public static void buildConfusableData(Reader confusables, SpoofData dest) 993 throws java.io.IOException, ParseException { 994 ConfusabledataBuilder builder = new ConfusabledataBuilder(); 995 builder.build(confusables, dest); 996 } 997 998 /* 999 * ***************************************************************************** 1000 * Internal classes for compiling confusable data into its binary (runtime) form. 1001 * ***************************************************************************** 1002 */ 1003 // SPUString 1004 // Holds a string that is the result of one of the mappings defined 1005 // by the confusable mapping data (confusables.txt from Unicode.org) 1006 // Instances of SPUString exist during the compilation process only. 1007 1008 private static class SPUString { 1009 String fStr; // The actual string. 1010 int fCharOrStrTableIndex; // Index into the final runtime data for this string. 1011 // (or, for length 1, the single string char itself, 1012 // there being no string table entry for it.) 1013 1014 SPUString(String s) { 1015 fStr = s; 1016 fCharOrStrTableIndex = 0; 1017 } 1018 } 1019 1020 // Comparison function for ordering strings in the string pool. 
1021 // Compare by length first, then, within a group of the same length, 1022 // by code point order. 1023 1024 private static class SPUStringComparator implements Comparator<SPUString> { 1025 @Override 1026 public int compare(SPUString sL, SPUString sR) { 1027 int lenL = sL.fStr.length(); 1028 int lenR = sR.fStr.length(); 1029 if (lenL < lenR) { 1030 return -1; 1031 } else if (lenL > lenR) { 1032 return 1; 1033 } else { 1034 return sL.fStr.compareTo(sR.fStr); 1035 } 1036 } 1037 1038 final static SPUStringComparator INSTANCE = new SPUStringComparator(); 1039 } 1040 1041 // String Pool A utility class for holding the strings that are the result of 1042 // the spoof mappings. These strings will utimately end up in the 1043 // run-time String Table. 1044 // This is sort of like a sorted set of strings, except that ICU's anemic 1045 // built-in collections don't support those, so it is implemented with a 1046 // combination of a uhash and a Vector. 1047 private static class SPUStringPool { 1048 public SPUStringPool() { 1049 fVec = new Vector<>(); 1050 fHash = new Hashtable<>(); 1051 } 1052 1053 public int size() { 1054 return fVec.size(); 1055 } 1056 1057 // Get the n-th string in the collection. 1058 public SPUString getByIndex(int index) { 1059 SPUString retString = fVec.elementAt(index); 1060 return retString; 1061 } 1062 1063 // Add a string. Return the string from the table. 1064 // If the input parameter string is already in the table, delete the 1065 // input parameter and return the existing string. 1066 public SPUString addString(String src) { 1067 SPUString hashedString = fHash.get(src); 1068 if (hashedString == null) { 1069 hashedString = new SPUString(src); 1070 fHash.put(src, hashedString); 1071 fVec.addElement(hashedString); 1072 } 1073 return hashedString; 1074 } 1075 1076 // Sort the contents; affects the ordering of getByIndex(). 
1077 public void sort() { 1078 Collections.sort(fVec, SPUStringComparator.INSTANCE); 1079 } 1080 1081 private Vector<SPUString> fVec; // Elements are SPUString * 1082 private Hashtable<String, SPUString> fHash; // Key: Value: 1083 } 1084 1085 } 1086 } 1087 1088 /** 1089 * Get the Restriction Level that is being tested. 1090 * 1091 * @return The restriction level 1092 * @deprecated This API is ICU internal only. 1093 * @hide draft / provisional / internal are hidden on OHOS 1094 */ 1095 @Deprecated 1096 public RestrictionLevel getRestrictionLevel() { 1097 return fRestrictionLevel; 1098 } 1099 1100 /** 1101 * Get the set of checks that this Spoof Checker has been configured to perform. 1102 * 1103 * @return The set of checks that this spoof checker will perform. 1104 */ 1105 public int getChecks() { 1106 return fChecks; 1107 } 1108 1109 /** 1110 * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on 1111 * scripts have been specified, an empty set will be returned. 1112 * 1113 * setAllowedChars() will reset the list of allowed locales to be empty. 1114 * 1115 * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales(); 1116 * the information other than languages from the originally specified locales may be omitted. 1117 * 1118 * @return A set of locales corresponding to the acceptable scripts. 1119 */ 1120 public Set<ULocale> getAllowedLocales() { 1121 return Collections.unmodifiableSet(fAllowedLocales); 1122 } 1123 1124 /** 1125 * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If 1126 * no limitations on scripts have been specified, an empty set will be returned. 1127 * 1128 * @return A set of locales corresponding to the acceptable scripts. 
1129 */ 1130 public Set<Locale> getAllowedJavaLocales() { 1131 HashSet<Locale> locales = new HashSet<>(fAllowedLocales.size()); 1132 for (ULocale uloc : fAllowedLocales) { 1133 locales.add(uloc.toLocale()); 1134 } 1135 return locales; 1136 } 1137 1138 /** 1139 * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set 1140 * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by 1141 * this function. 1142 * 1143 * The returned set will be frozen, meaning that it cannot be modified by the caller. 1144 * 1145 * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test. 1146 */ 1147 public UnicodeSet getAllowedChars() { 1148 return fAllowedCharsSet; 1149 } 1150 1151 /** 1152 * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed. 1153 * 1154 * @hide exposed on OHOS 1155 */ 1156 public static class CheckResult { 1157 /** 1158 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests 1159 * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on. 1160 * 1161 * @see Builder#setChecks 1162 */ 1163 public int checks; 1164 1165 /** 1166 * The index of the first string position that failed a check. 1167 * 1168 * @deprecated ICU 51. No longer supported. Always set to zero. 1169 */ 1170 @Deprecated 1171 public int position; 1172 1173 /** 1174 * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null. The set will contain the zero 1175 * digit from each decimal number system found in the input string. 1176 */ 1177 public UnicodeSet numerics; 1178 1179 /** 1180 * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null. 
1181 */ 1182 public RestrictionLevel restrictionLevel; 1183 1184 /** 1185 * Default constructor 1186 */ 1187 public CheckResult() { 1188 checks = 0; 1189 position = 0; 1190 } 1191 1192 /** 1193 * {@inheritDoc} 1194 */ 1195 @Override 1196 public String toString() { 1197 StringBuilder sb = new StringBuilder(); 1198 sb.append("checks:"); 1199 if (checks == 0) { 1200 sb.append(" none"); 1201 } else if (checks == ALL_CHECKS) { 1202 sb.append(" all"); 1203 } else { 1204 if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) { 1205 sb.append(" SINGLE_SCRIPT_CONFUSABLE"); 1206 } 1207 if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) { 1208 sb.append(" MIXED_SCRIPT_CONFUSABLE"); 1209 } 1210 if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) { 1211 sb.append(" WHOLE_SCRIPT_CONFUSABLE"); 1212 } 1213 if ((checks & ANY_CASE) != 0) { 1214 sb.append(" ANY_CASE"); 1215 } 1216 if ((checks & RESTRICTION_LEVEL) != 0) { 1217 sb.append(" RESTRICTION_LEVEL"); 1218 } 1219 if ((checks & INVISIBLE) != 0) { 1220 sb.append(" INVISIBLE"); 1221 } 1222 if ((checks & CHAR_LIMIT) != 0) { 1223 sb.append(" CHAR_LIMIT"); 1224 } 1225 if ((checks & MIXED_NUMBERS) != 0) { 1226 sb.append(" MIXED_NUMBERS"); 1227 } 1228 } 1229 sb.append(", numerics: ").append(numerics.toPattern(false)); 1230 sb.append(", position: ").append(position); 1231 sb.append(", restrictionLevel: ").append(restrictionLevel); 1232 return sb.toString(); 1233 } 1234 } 1235 1236 /** 1237 * Check the specified string for possible security issues. The text to be checked will typically be an identifier 1238 * of some sort. The set of checks to be performed was specified when building the SpoofChecker. 1239 * 1240 * @param text 1241 * A String to be checked for possible security issues. 1242 * @param checkResult 1243 * Output parameter, indicates which specific tests failed. May be null if the information is not wanted. 1244 * @return True there any issue is found with the input string. 
1245 */ 1246 public boolean failsChecks(String text, CheckResult checkResult) { 1247 int length = text.length(); 1248 1249 int result = 0; 1250 if (checkResult != null) { 1251 checkResult.position = 0; 1252 checkResult.numerics = null; 1253 checkResult.restrictionLevel = null; 1254 } 1255 1256 if (0 != (this.fChecks & RESTRICTION_LEVEL)) { 1257 RestrictionLevel textRestrictionLevel = getRestrictionLevel(text); 1258 if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) { 1259 result |= RESTRICTION_LEVEL; 1260 } 1261 if (checkResult != null) { 1262 checkResult.restrictionLevel = textRestrictionLevel; 1263 } 1264 } 1265 1266 if (0 != (this.fChecks & MIXED_NUMBERS)) { 1267 UnicodeSet numerics = new UnicodeSet(); 1268 getNumerics(text, numerics); 1269 if (numerics.size() > 1) { 1270 result |= MIXED_NUMBERS; 1271 } 1272 if (checkResult != null) { 1273 checkResult.numerics = numerics; 1274 } 1275 } 1276 1277 if (0 != (this.fChecks & HIDDEN_OVERLAY)) { 1278 int index = findHiddenOverlay(text); 1279 if (index != -1) { 1280 result |= HIDDEN_OVERLAY; 1281 } 1282 } 1283 1284 if (0 != (this.fChecks & CHAR_LIMIT)) { 1285 int i; 1286 int c; 1287 for (i = 0; i < length;) { 1288 // U16_NEXT(text, i, length, c); 1289 c = Character.codePointAt(text, i); 1290 i = Character.offsetByCodePoints(text, i, 1); 1291 if (!this.fAllowedCharsSet.contains(c)) { 1292 result |= CHAR_LIMIT; 1293 break; 1294 } 1295 } 1296 } 1297 1298 if (0 != (this.fChecks & INVISIBLE)) { 1299 // This check needs to be done on NFD input 1300 String nfdText = nfdNormalizer.normalize(text); 1301 1302 // scan for more than one occurrence of the same non-spacing mark 1303 // in a sequence of non-spacing marks. 1304 int i; 1305 int c; 1306 int firstNonspacingMark = 0; 1307 boolean haveMultipleMarks = false; 1308 UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a 1309 // single combining sequence. 
1310 for (i = 0; i < length;) { 1311 c = Character.codePointAt(nfdText, i); 1312 i = Character.offsetByCodePoints(nfdText, i, 1); 1313 if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) { 1314 firstNonspacingMark = 0; 1315 if (haveMultipleMarks) { 1316 marksSeenSoFar.clear(); 1317 haveMultipleMarks = false; 1318 } 1319 continue; 1320 } 1321 if (firstNonspacingMark == 0) { 1322 firstNonspacingMark = c; 1323 continue; 1324 } 1325 if (!haveMultipleMarks) { 1326 marksSeenSoFar.add(firstNonspacingMark); 1327 haveMultipleMarks = true; 1328 } 1329 if (marksSeenSoFar.contains(c)) { 1330 // report the error, and stop scanning. 1331 // No need to find more than the first failure. 1332 result |= INVISIBLE; 1333 break; 1334 } 1335 marksSeenSoFar.add(c); 1336 } 1337 } 1338 if (checkResult != null) { 1339 checkResult.checks = result; 1340 } 1341 return (0 != result); 1342 } 1343 1344 /** 1345 * Check the specified string for possible security issues. The text to be checked will typically be an identifier 1346 * of some sort. The set of checks to be performed was specified when building the SpoofChecker. 1347 * 1348 * @param text 1349 * A String to be checked for possible security issues. 1350 * @return True there any issue is found with the input string. 1351 */ failsChecks(String text)1352 public boolean failsChecks(String text) { 1353 return failsChecks(text, null); 1354 } 1355 1356 /** 1357 * Check the whether two specified strings are visually confusable. The types of confusability to be tested - single 1358 * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker. 1359 * 1360 * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE 1361 * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected. 1362 * 1363 * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. 
If identifiers are case 1364 * folded for comparison and display to the user, do not select the ANY_CASE option. 1365 * 1366 * 1367 * @param s1 1368 * The first of the two strings to be compared for confusability. 1369 * @param s2 1370 * The second of the two strings to be compared for confusability. 1371 * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability 1372 * found, as defined by spoof check test constants. 1373 */ areConfusable(String s1, String s2)1374 public int areConfusable(String s1, String s2) { 1375 // 1376 // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, 1377 // and for definitions of the types (single, whole, mixed-script) of confusables. 1378 1379 // We only care about a few of the check flags. Ignore the others. 1380 // If no tests relevant to this function have been specified, signal an error. 1381 // TODO: is this really the right thing to do? It's probably an error on 1382 // the caller's part, but logically we would just return 0 (no error). 1383 if ((this.fChecks & CONFUSABLE) == 0) { 1384 throw new IllegalArgumentException("No confusable checks are enabled."); 1385 } 1386 1387 // Compute the skeletons and check for confusability. 1388 String s1Skeleton = getSkeleton(s1); 1389 String s2Skeleton = getSkeleton(s2); 1390 if (!s1Skeleton.equals(s2Skeleton)) { 1391 return 0; 1392 } 1393 1394 // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes 1395 // of confusables according to UTS 39 section 4. 1396 // Start by computing the resolved script sets of s1 and s2. 
1397 ScriptSet s1RSS = new ScriptSet(); 1398 getResolvedScriptSet(s1, s1RSS); 1399 ScriptSet s2RSS = new ScriptSet(); 1400 getResolvedScriptSet(s2, s2RSS); 1401 1402 // Turn on all applicable flags 1403 int result = 0; 1404 if (s1RSS.intersects(s2RSS)) { 1405 result |= SINGLE_SCRIPT_CONFUSABLE; 1406 } else { 1407 result |= MIXED_SCRIPT_CONFUSABLE; 1408 if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) { 1409 result |= WHOLE_SCRIPT_CONFUSABLE; 1410 } 1411 } 1412 1413 // Turn off flags that the user doesn't want 1414 result &= fChecks; 1415 1416 return result; 1417 } 1418 1419 /** 1420 * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are 1421 * confusable if their skeletons are identical. See Unicode UAX 39 for additional information. 1422 * 1423 * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some 1424 * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons. 1425 * 1426 * Skeletons are computed using the algorithm and data described in Unicode UAX 39. 1427 * 1428 * @param str 1429 * The input string whose skeleton will be generated. 1430 * @return The output skeleton string. 1431 */ getSkeleton(CharSequence str)1432 public String getSkeleton(CharSequence str) { 1433 // Apply the skeleton mapping to the NFD normalized input string 1434 // Accumulate the skeleton, possibly unnormalized, in a String. 
1435 String nfdId = nfdNormalizer.normalize(str); 1436 int normalizedLen = nfdId.length(); 1437 StringBuilder skelSB = new StringBuilder(); 1438 for (int inputIndex = 0; inputIndex < normalizedLen;) { 1439 int c = Character.codePointAt(nfdId, inputIndex); 1440 inputIndex += Character.charCount(c); 1441 this.fSpoofData.confusableLookup(c, skelSB); 1442 } 1443 String skelStr = skelSB.toString(); 1444 skelStr = nfdNormalizer.normalize(skelStr); 1445 return skelStr; 1446 } 1447 1448 /** 1449 * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been 1450 * ignored, and starting with ICU 58, this function has been deprecated. 1451 * 1452 * @param type 1453 * No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA. 1454 * @param id 1455 * The input identifier whose skeleton will be generated. 1456 * @return The output skeleton string. 1457 * 1458 * @deprecated ICU 58 1459 */ 1460 @Deprecated getSkeleton(int type, String id)1461 public String getSkeleton(int type, String id) { 1462 return getSkeleton(id); 1463 } 1464 1465 /** 1466 * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have 1467 * enabled the same set of checks. 1468 * 1469 * @param other 1470 * the SpoofChecker being compared with. 1471 * @return true if the two SpoofCheckers are equal. 
1472 */ 1473 @Override equals(Object other)1474 public boolean equals(Object other) { 1475 if (!(other instanceof SpoofChecker)) { 1476 return false; 1477 } 1478 SpoofChecker otherSC = (SpoofChecker) other; 1479 if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) { 1480 return false; 1481 } 1482 if (fChecks != otherSC.fChecks) { 1483 return false; 1484 } 1485 if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null 1486 && !fAllowedLocales.equals(otherSC.fAllowedLocales)) { 1487 return false; 1488 } 1489 if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null 1490 && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) { 1491 return false; 1492 } 1493 if (fRestrictionLevel != otherSC.fRestrictionLevel) { 1494 return false; 1495 } 1496 return true; 1497 } 1498 1499 /** 1500 * Overrides {@link Object#hashCode()}. 1501 */ 1502 @Override hashCode()1503 public int hashCode() { 1504 return fChecks 1505 ^ fSpoofData.hashCode() 1506 ^ fAllowedLocales.hashCode() 1507 ^ fAllowedCharsSet.hashCode() 1508 ^ fRestrictionLevel.ordinal(); 1509 } 1510 1511 /** 1512 * Computes the augmented script set for a code point, according to UTS 39 section 5.1. 
1513 */ getAugmentedScriptSet(int codePoint, ScriptSet result)1514 private static void getAugmentedScriptSet(int codePoint, ScriptSet result) { 1515 result.clear(); 1516 UScript.getScriptExtensions(codePoint, result); 1517 1518 // Section 5.1 step 1 1519 if (result.get(UScript.HAN)) { 1520 result.set(UScript.HAN_WITH_BOPOMOFO); 1521 result.set(UScript.JAPANESE); 1522 result.set(UScript.KOREAN); 1523 } 1524 if (result.get(UScript.HIRAGANA)) { 1525 result.set(UScript.JAPANESE); 1526 } 1527 if (result.get(UScript.KATAKANA)) { 1528 result.set(UScript.JAPANESE); 1529 } 1530 if (result.get(UScript.HANGUL)) { 1531 result.set(UScript.KOREAN); 1532 } 1533 if (result.get(UScript.BOPOMOFO)) { 1534 result.set(UScript.HAN_WITH_BOPOMOFO); 1535 } 1536 1537 // Section 5.1 step 2 1538 if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) { 1539 result.setAll(); 1540 } 1541 } 1542 1543 /** 1544 * Computes the resolved script set for a string, according to UTS 39 section 5.1. 1545 */ getResolvedScriptSet(CharSequence input, ScriptSet result)1546 private void getResolvedScriptSet(CharSequence input, ScriptSet result) { 1547 getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result); 1548 } 1549 1550 /** 1551 * Computes the resolved script set for a string, omitting characters having the specified script. If 1552 * UScript.CODE_LIMIT is passed as the second argument, all characters are included. 
1553 */ getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result)1554 private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) { 1555 result.setAll(); 1556 1557 ScriptSet temp = new ScriptSet(); 1558 for (int utf16Offset = 0; utf16Offset < input.length();) { 1559 int codePoint = Character.codePointAt(input, utf16Offset); 1560 utf16Offset += Character.charCount(codePoint); 1561 1562 // Compute the augmented script set for the character 1563 getAugmentedScriptSet(codePoint, temp); 1564 1565 // Intersect the augmented script set with the resolved script set, but only if the character doesn't 1566 // have the script specified in the function call 1567 if (script == UScript.CODE_LIMIT || !temp.get(script)) { 1568 result.and(temp); 1569 } 1570 } 1571 } 1572 1573 /** 1574 * Computes the set of numerics for a string, according to UTS 39 section 5.3. 1575 */ getNumerics(String input, UnicodeSet result)1576 private void getNumerics(String input, UnicodeSet result) { 1577 result.clear(); 1578 1579 for (int utf16Offset = 0; utf16Offset < input.length();) { 1580 int codePoint = Character.codePointAt(input, utf16Offset); 1581 utf16Offset += Character.charCount(codePoint); 1582 1583 // Store a representative character for each kind of decimal digit 1584 if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) { 1585 // Store the zero character as a representative for comparison. 1586 // Unicode guarantees it is codePoint - value 1587 result.add(codePoint - UCharacter.getNumericValue(codePoint)); 1588 } 1589 } 1590 } 1591 1592 /** 1593 * Computes the restriction level of a string, according to UTS 39 section 5.2. 
1594 */ getRestrictionLevel(String input)1595 private RestrictionLevel getRestrictionLevel(String input) { 1596 // Section 5.2 step 1: 1597 if (!fAllowedCharsSet.containsAll(input)) { 1598 return RestrictionLevel.UNRESTRICTIVE; 1599 } 1600 1601 // Section 5.2 step 2: 1602 if (ASCII.containsAll(input)) { 1603 return RestrictionLevel.ASCII; 1604 } 1605 1606 // Section 5.2 steps 3: 1607 ScriptSet resolvedScriptSet = new ScriptSet(); 1608 getResolvedScriptSet(input, resolvedScriptSet); 1609 1610 // Section 5.2 step 4: 1611 if (!resolvedScriptSet.isEmpty()) { 1612 return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE; 1613 } 1614 1615 // Section 5.2 step 5: 1616 ScriptSet resolvedNoLatn = new ScriptSet(); 1617 getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn); 1618 1619 // Section 5.2 step 6: 1620 if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE) 1621 || resolvedNoLatn.get(UScript.KOREAN)) { 1622 return RestrictionLevel.HIGHLY_RESTRICTIVE; 1623 } 1624 1625 // Section 5.2 step 7: 1626 if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK) 1627 && !resolvedNoLatn.get(UScript.CHEROKEE)) { 1628 return RestrictionLevel.MODERATELY_RESTRICTIVE; 1629 } 1630 1631 // Section 5.2 step 8: 1632 return RestrictionLevel.MINIMALLY_RESTRICTIVE; 1633 } 1634 findHiddenOverlay(String input)1635 int findHiddenOverlay(String input) { 1636 boolean sawLeadCharacter = false; 1637 StringBuilder sb = new StringBuilder(); 1638 for (int i=0; i<input.length();) { 1639 int cp = input.codePointAt(i); 1640 if (sawLeadCharacter && cp == 0x0307) { 1641 return i; 1642 } 1643 int combiningClass = UCharacter.getCombiningClass(cp); 1644 // Skip over characters except for those with combining class 0 (non-combining characters) or with 1645 // combining class 230 (same class as U+0307) 1646 assert UCharacter.getCombiningClass(0x0307) == 230; 1647 if (combiningClass == 0 || combiningClass == 230) { 1648 
sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp, sb); 1649 } 1650 i += UCharacter.charCount(cp); 1651 } 1652 return -1; 1653 } 1654 isIllegalCombiningDotLeadCharacterNoLookup(int cp)1655 boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) { 1656 return cp == 'i' || cp == 'j' || cp == 'ı' || cp == 'ȷ' || cp == 'l' || 1657 UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED); 1658 } 1659 isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb)1660 boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) { 1661 if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) { 1662 return true; 1663 } 1664 sb.setLength(0); 1665 fSpoofData.confusableLookup(cp, sb); 1666 int finalCp = UCharacter.codePointBefore(sb, sb.length()); 1667 if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) { 1668 return true; 1669 } 1670 return false; 1671 } 1672 1673 // Data Members 1674 private int fChecks; // Bit vector of checks to perform. 1675 private SpoofData fSpoofData; 1676 private Set<ULocale> fAllowedLocales; // The Set of allowed locales. 1677 private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters. 1678 private RestrictionLevel fRestrictionLevel; 1679 1680 private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance(); 1681 1682 // Confusable Mappings Data Structures, version 2.0 1683 // 1684 // This description and the corresponding implementation are to be kept 1685 // in-sync with the copy in icu4c uspoof_impl.h. 1686 // 1687 // For the confusable data, we are essentially implementing a map, 1688 // key: a code point 1689 // value: a string. Most commonly one char in length, but can be more. 1690 // 1691 // The keys are stored as a sorted array of 32 bit ints. 1692 // bits 0-23 a code point value 1693 // bits 24-31 length of value string, in UChars (between 1 and 256 UChars). 1694 // The key table is sorted in ascending code point order. 
    //     (not on the 32 bit int value, the flag bits do not participate in the sorting.)
    //
    //     Lookup is done by means of a binary search in the key table.
    //
    // The corresponding values are kept in a parallel array of 16 bit ints.
    //     If the value string is of length 1, it is literally in the value array.
    //     For longer strings, the value array contains an index into the strings
    //     table.
    //
    // String Table:
    //     The strings table contains all of the value strings (those of length two or greater)
    //     concatenated together into one long char (UTF-16) array.
    //
    //     There is no nul character or other mark between adjacent strings.
    //
    //----------------------------------------------------------------------------
    //
    // Changes from format version 1 to format version 2:
    //     1) Removal of the whole-script confusable data tables.
    //     2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
    //     3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
    //     4) Removal of the string lengths table since 8 bits is sufficient for the
    //        lengths of all entries in confusables.txt.
1718 // 1719 private static final class ConfusableDataUtils { 1720 public static final int FORMAT_VERSION = 2; // version for ICU 58 1721 keyToCodePoint(int key)1722 public static final int keyToCodePoint(int key) { 1723 return key & 0x00ffffff; 1724 } 1725 keyToLength(int key)1726 public static final int keyToLength(int key) { 1727 return ((key & 0xff000000) >> 24) + 1; 1728 } 1729 codePointAndLengthToKey(int codePoint, int length)1730 public static final int codePointAndLengthToKey(int codePoint, int length) { 1731 assert (codePoint & 0x00ffffff) == codePoint; 1732 assert length <= 256; 1733 return codePoint | ((length - 1) << 24); 1734 } 1735 } 1736 1737 // ------------------------------------------------------------------------------------- 1738 // 1739 // SpoofData 1740 // 1741 // This class corresponds to the ICU SpoofCheck data. 1742 // 1743 // The data can originate with the Binary ICU data that is generated in ICU4C, 1744 // or it can originate from source rules that are compiled in ICU4J. 1745 // 1746 // This class does not include the set of checks to be performed, but only 1747 // data that is serialized into the ICU binary data. 1748 // 1749 // Because Java cannot easily wrap binary data like ICU4C, the binary data is 1750 // copied into Java structures that are convenient for use by the run time code. 1751 // 1752 // --------------------------------------------------------------------------------------- 1753 private static class SpoofData { 1754 1755 // The Confusable data, Java data structures for. 
int[] fCFUKeys;       // packed keys, see ConfusableDataUtils
        short[] fCFUValues;   // parallel to fCFUKeys: a single char, or an index into fCFUStrings
        String fCFUStrings;   // string table holding the values longer than one char

        private static final int DATA_FORMAT = 0x43667520; // "Cfu "

        /** Accepts only binary data whose format version is exactly FORMAT_VERSION.0.0.0. */
        private static final class IsAcceptable implements Authenticate {
            @Override
            public boolean isDataVersionAcceptable(byte[] version) {
                // All four bytes must match. The previous expression OR-ed the checks together
                // and therefore accepted any version that had a non-zero minor byte.
                return version[0] == ConfusableDataUtils.FORMAT_VERSION
                        && version[1] == 0
                        && version[2] == 0
                        && version[3] == 0;
            }
        }

        private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();

        /** Holder for the default data; its static initializer performs the actual load. */
        private static final class DefaultData {
            private static SpoofData INSTANCE = null;
            private static IOException EXCEPTION = null;

            static {
                // Note: Although this is static, the Java runtime can delay execution of this block until
                // the data is actually requested via SpoofData.getDefault().
                try {
                    INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
                } catch (IOException e) {
                    // Remembered here and reported by getDefault().
                    EXCEPTION = e;
                }
            }
        }

        /**
         * @return instance for Unicode standard data
         * @throws MissingResourceException if loading the default confusables data failed; the
         *         original IOException is attached as the cause
         */
        public static SpoofData getDefault() {
            final IOException loadFailure = DefaultData.EXCEPTION;
            if (loadFailure != null) {
                MissingResourceException missing = new MissingResourceException(
                        "Could not load default confusables data: " + loadFailure.getMessage(),
                        "SpoofChecker", "");
                // MissingResourceException has no public constructor taking a cause.
                missing.initCause(loadFailure);
                throw missing;
            }
            return DefaultData.INSTANCE;
        }

        // SpoofChecker Data constructor for use from data builder.
        // Initializes a new, empty data area that will be populated later.
        private SpoofData() {
        }

        // Constructor for use when creating from prebuilt default data.
        // A ByteBuffer is what the ICU internal data loading functions provide.
SpoofData(ByteBuffer bytes)1806 private SpoofData(ByteBuffer bytes) throws java.io.IOException { 1807 ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE); 1808 bytes.mark(); 1809 readData(bytes); 1810 } 1811 1812 @Override equals(Object other)1813 public boolean equals(Object other) { 1814 if (!(other instanceof SpoofData)) { 1815 return false; 1816 } 1817 SpoofData otherData = (SpoofData) other; 1818 if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys)) 1819 return false; 1820 if (!Arrays.equals(fCFUValues, otherData.fCFUValues)) 1821 return false; 1822 if (!Utility.sameObjects(fCFUStrings, otherData.fCFUStrings) && fCFUStrings != null 1823 && !fCFUStrings.equals(otherData.fCFUStrings)) 1824 return false; 1825 return true; 1826 } 1827 1828 @Override hashCode()1829 public int hashCode() { 1830 return Arrays.hashCode(fCFUKeys) 1831 ^ Arrays.hashCode(fCFUValues) 1832 ^ fCFUStrings.hashCode(); 1833 } 1834 1835 // Set the SpoofChecker data from pre-built binary data in a byte buffer. 1836 // The binary data format is as described for ICU4C spoof data. 1837 // readData(ByteBuffer bytes)1838 private void readData(ByteBuffer bytes) throws java.io.IOException { 1839 int magic = bytes.getInt(); 1840 if (magic != 0x3845fdef) { 1841 throw new IllegalArgumentException("Bad Spoof Check Data."); 1842 } 1843 @SuppressWarnings("unused") 1844 int dataFormatVersion = bytes.getInt(); 1845 @SuppressWarnings("unused") 1846 int dataLength = bytes.getInt(); 1847 1848 int CFUKeysOffset = bytes.getInt(); 1849 int CFUKeysSize = bytes.getInt(); 1850 1851 int CFUValuesOffset = bytes.getInt(); 1852 int CFUValuesSize = bytes.getInt(); 1853 1854 int CFUStringTableOffset = bytes.getInt(); 1855 int CFUStringTableSize = bytes.getInt(); 1856 1857 // We have now read the file header, and obtained the position for each 1858 // of the data items. Now read each in turn, first seeking the 1859 // input stream to the position of the data item. 
1860 1861 bytes.reset(); 1862 ICUBinary.skipBytes(bytes, CFUKeysOffset); 1863 fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0); 1864 1865 bytes.reset(); 1866 ICUBinary.skipBytes(bytes, CFUValuesOffset); 1867 fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0); 1868 1869 bytes.reset(); 1870 ICUBinary.skipBytes(bytes, CFUStringTableOffset); 1871 fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0); 1872 } 1873 1874 /** 1875 * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be 1876 * appended will between 1 and 18 characters as of Unicode 9. 1877 * 1878 * This is the heart of the confusable skeleton generation implementation. 1879 */ confusableLookup(int inChar, StringBuilder dest)1880 public void confusableLookup(int inChar, StringBuilder dest) { 1881 // Perform a binary search. 1882 // [lo, hi), i.e lo is inclusive, hi is exclusive. 1883 // The result after the loop will be in lo. 1884 int lo = 0; 1885 int hi = length(); 1886 do { 1887 int mid = (lo + hi) / 2; 1888 if (codePointAt(mid) > inChar) { 1889 hi = mid; 1890 } else if (codePointAt(mid) < inChar) { 1891 lo = mid; 1892 } else { 1893 // Found result. Break early. 1894 lo = mid; 1895 break; 1896 } 1897 } while (hi - lo > 1); 1898 1899 // Did we find an entry? If not, the char maps to itself. 1900 if (codePointAt(lo) != inChar) { 1901 dest.appendCodePoint(inChar); 1902 return; 1903 } 1904 1905 // Add the element to the string builder and return. 1906 appendValueTo(lo, dest); 1907 return; 1908 } 1909 1910 /** 1911 * Return the number of confusable entries in this SpoofData. 1912 * 1913 * @return The number of entries. 1914 */ length()1915 public int length() { 1916 return fCFUKeys.length; 1917 } 1918 1919 /** 1920 * Return the code point (key) at the specified index. 1921 * 1922 * @param index 1923 * The index within the SpoofData. 1924 * @return The code point. 
1925 */ codePointAt(int index)1926 public int codePointAt(int index) { 1927 return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]); 1928 } 1929 1930 /** 1931 * Append the confusable skeleton at the specified index to the StringBuilder dest. 1932 * 1933 * @param index 1934 * The index within the SpoofData. 1935 * @param dest 1936 * The StringBuilder to which to append the skeleton. 1937 */ appendValueTo(int index, StringBuilder dest)1938 public void appendValueTo(int index, StringBuilder dest) { 1939 int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]); 1940 1941 // Value is either a char (for strings of length 1) or 1942 // an index into the string table (for longer strings) 1943 short value = fCFUValues[index]; 1944 if (stringLength == 1) { 1945 dest.append((char) value); 1946 } else { 1947 dest.append(fCFUStrings, value, value + stringLength); 1948 } 1949 } 1950 } 1951 1952 // ------------------------------------------------------------------------------- 1953 // 1954 // ScriptSet - Script code bit sets. 1955 // Extends Java BitSet with input/output support and a few helper methods. 1956 // Note: The I/O is not currently being used, so it has been commented out. If 1957 // it is needed again, the code can be restored. 1958 // 1959 // ------------------------------------------------------------------------------- 1960 static class ScriptSet extends BitSet { 1961 1962 // Eclipse default value to quell warnings: 1963 private static final long serialVersionUID = 1L; 1964 1965 // // The serialized version of this class can hold INT_CAPACITY * 32 scripts. 
1966 // private static final int INT_CAPACITY = 6; 1967 // private static final long serialVersionUID = INT_CAPACITY; 1968 // static { 1969 // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT; 1970 // } 1971 // 1972 // public ScriptSet() { 1973 // } 1974 // 1975 // public ScriptSet(ByteBuffer bytes) throws java.io.IOException { 1976 // for (int i = 0; i < INT_CAPACITY; i++) { 1977 // int bits = bytes.getInt(); 1978 // for (int j = 0; j < Integer.SIZE; j++) { 1979 // if ((bits & (1 << j)) != 0) { 1980 // set(i * Integer.SIZE + j); 1981 // } 1982 // } 1983 // } 1984 // } 1985 // 1986 // public void output(DataOutputStream os) throws java.io.IOException { 1987 // for (int i = 0; i < INT_CAPACITY; i++) { 1988 // int bits = 0; 1989 // for (int j = 0; j < Integer.SIZE; j++) { 1990 // if (get(i * Integer.SIZE + j)) { 1991 // bits |= (1 << j); 1992 // } 1993 // } 1994 // os.writeInt(bits); 1995 // } 1996 // } 1997 ScriptSet()1998 ScriptSet() { 1999 } 2000 and(int script)2001 public void and(int script) { 2002 this.clear(0, script); 2003 this.clear(script + 1, UScript.CODE_LIMIT); 2004 } 2005 setAll()2006 public void setAll() { 2007 this.set(0, UScript.CODE_LIMIT); 2008 } 2009 isFull()2010 public boolean isFull() { 2011 return cardinality() == UScript.CODE_LIMIT; 2012 } 2013 appendStringTo(StringBuilder sb)2014 public void appendStringTo(StringBuilder sb) { 2015 sb.append("{ "); 2016 if (isEmpty()) { 2017 sb.append("- "); 2018 } else if (isFull()) { 2019 sb.append("* "); 2020 } else { 2021 for (int script = 0; script < UScript.CODE_LIMIT; script++) { 2022 if (get(script)) { 2023 sb.append(UScript.getShortName(script)); 2024 sb.append(" "); 2025 } 2026 } 2027 } 2028 sb.append("}"); 2029 } 2030 2031 @Override toString()2032 public String toString() { 2033 StringBuilder sb = new StringBuilder(); 2034 sb.append("<ScriptSet "); 2035 appendStringTo(sb); 2036 sb.append(">"); 2037 return sb.toString(); 2038 } 2039 } 2040 } 2041