1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 *************************************************************************** 5 * Copyright (C) 2008-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 *************************************************************************** 8 * file name: uspoof.h 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2008Feb13 14 * created by: Andy Heninger 15 * 16 * Unicode Spoof Detection 17 */ 18 19 #ifndef USPOOF_H 20 #define USPOOF_H 21 22 #include "unicode/utypes.h" 23 #include "unicode/uset.h" 24 #include "unicode/parseerr.h" 25 #include "unicode/localpointer.h" 26 27 #if !UCONFIG_NO_NORMALIZATION 28 29 30 #if U_SHOW_CPLUSPLUS_API 31 #include "unicode/unistr.h" 32 #include "unicode/uniset.h" 33 #endif 34 35 36 /** 37 * \file 38 * \brief Unicode Security and Spoofing Detection, C API. 39 * 40 * <p> 41 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and 42 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: 43 * 44 * <ol> 45 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and 46 * "Ηarvest", where the second string starts with the Greek capital letter Eta.</li> 47 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof 48 * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li> 49 * </ol> 50 * 51 * <p> 52 * Although originally designed as a method for flagging suspicious identifier strings such as URLs, 53 * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word 54 * content filters. 
55 * 56 * <p> 57 * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++. 58 * 59 * <h2>Confusables</h2> 60 * 61 * <p> 62 * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings: 63 * 64 * \code{.c} 65 * UErrorCode status = U_ZERO_ERROR; 66 * UChar* str1 = (UChar*) u"Harvest"; 67 * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA 68 * 69 * USpoofChecker* sc = uspoof_open(&status); 70 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 71 * 72 * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status); 73 * UBool result = bitmask != 0; 74 * // areConfusable: 1 (status: U_ZERO_ERROR) 75 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); 76 * uspoof_close(sc); 77 * \endcode 78 * 79 * <p> 80 * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks} 81 * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the 82 * confusability test; and the following line extracts the result out of the return value. For best performance, 83 * the instance should be created once (e.g., upon application startup), and the efficient 84 * {@link uspoof_areConfusable} method can be used at runtime. 85 * 86 * <p> 87 * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call 88 * {@link uspoof_close} when the object goes out of scope: 89 * 90 * \code{.cpp} 91 * UErrorCode status = U_ZERO_ERROR; 92 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 93 * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status); 94 * // ... 95 * \endcode 96 * 97 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can 98 * be thought of as a "hash code". 
{@link uspoof_getSkeleton} computes the skeleton for a particular string, so 99 * the following snippet is equivalent to the example above: 100 * 101 * \code{.c} 102 * UErrorCode status = U_ZERO_ERROR; 103 * UChar* str1 = (UChar*) u"Harvest"; 104 * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA 105 * 106 * USpoofChecker* sc = uspoof_open(&status); 107 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 108 * 109 * // Get skeleton 1 110 * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status); 111 * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar)); 112 * status = U_ZERO_ERROR; 113 * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status); 114 * 115 * // Get skeleton 2 116 * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status); 117 * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar)); 118 * status = U_ZERO_ERROR; 119 * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status); 120 * 121 * // Are the skeletons the same? 
122 * UBool result = u_strcmp(skel1, skel2) == 0; 123 * // areConfusable: 1 (status: U_ZERO_ERROR) 124 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); 125 * uspoof_close(sc); 126 * free(skel1); 127 * free(skel2); 128 * \endcode 129 * 130 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling 131 * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below: 132 * 133 * \code{.c} 134 * UErrorCode status = U_ZERO_ERROR; 135 * #define DICTIONARY_LENGTH 2 136 * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; 137 * UChar* skeletons[DICTIONARY_LENGTH]; 138 * UChar* str = (UChar*) u"1orern"; 139 * 140 * // Setup: 141 * USpoofChecker* sc = uspoof_open(&status); 142 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 143 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 144 * UChar* word = dictionary[i]; 145 * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status); 146 * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar)); 147 * status = U_ZERO_ERROR; 148 * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status); 149 * } 150 * 151 * // Live Check: 152 * { 153 * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status); 154 * UChar* skel = (UChar*) malloc(++len * sizeof(UChar)); 155 * status = U_ZERO_ERROR; 156 * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status); 157 * UBool result = FALSE; 158 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 159 * result = u_strcmp(skel, skeletons[i]) == 0; 160 * if (result == TRUE) { break; } 161 * } 162 * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR) 163 * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status)); 164 * free(skel); 165 * } 166 * 167 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 168 * free(skeletons[i]); 169 * } 170 * uspoof_close(sc); 171 * \endcode 172 * 173 * <b>Note:</b> 
Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em> 174 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons 175 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. 176 * 177 * <h2>Spoof Detection</h2> 178 * 179 * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a 180 * string: 181 * 182 * \code{.c} 183 * UErrorCode status = U_ZERO_ERROR; 184 * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A 185 * 186 * // Get the default set of allowable characters: 187 * USet* allowed = uset_openEmpty(); 188 * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); 189 * uset_addAll(allowed, uspoof_getInclusionSet(&status)); 190 * 191 * USpoofChecker* sc = uspoof_open(&status); 192 * uspoof_setAllowedChars(sc, allowed, &status); 193 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); 194 * 195 * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status); 196 * UBool result = bitmask != 0; 197 * // fails checks: 1 (status: U_ZERO_ERROR) 198 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); 199 * uspoof_close(sc); 200 * uset_close(allowed); 201 * \endcode 202 * 203 * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at 204 * startup, and call the cheaper {@link uspoof_check} online. We specify the set of 205 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. 206 * 207 * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings, 208 * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers. 
209 * 210 * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks 211 * is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions 212 * with a {@link USpoofCheckResult} parameter: 213 * 214 * \code{.c} 215 * UErrorCode status = U_ZERO_ERROR; 216 * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A 217 * 218 * // Get the default set of allowable characters: 219 * USet* allowed = uset_openEmpty(); 220 * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); 221 * uset_addAll(allowed, uspoof_getInclusionSet(&status)); 222 * 223 * USpoofChecker* sc = uspoof_open(&status); 224 * uspoof_setAllowedChars(sc, allowed, &status); 225 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); 226 * 227 * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status); 228 * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status); 229 * 230 * int32_t failures1 = bitmask; 231 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status); 232 * assert(failures1 == failures2); 233 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) 234 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); 235 * 236 * // Cleanup: 237 * uspoof_close(sc); 238 * uset_close(allowed); 239 * uspoof_closeCheckResult(checkResult); 240 * \endcode 241 * 242 * C++ users can take advantage of a few syntactical conveniences. 
The following snippet is functionally 243 * equivalent to the one above: 244 * 245 * \code{.cpp} 246 * UErrorCode status = U_ZERO_ERROR; 247 * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A 248 * 249 * // Get the default set of allowable characters: 250 * UnicodeSet allowed; 251 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); 252 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); 253 * 254 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 255 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); 256 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); 257 * 258 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); 259 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); 260 * 261 * int32_t failures1 = bitmask; 262 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status); 263 * assert(failures1 == failures2); 264 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) 265 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); 266 * 267 * // Explicit cleanup not necessary. 268 * \endcode 269 * 270 * The return value is a bitmask of the checks that failed. In this case, there was one check that failed: 271 * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). 
The possible checks are: 272 * 273 * <ul> 274 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the 275 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS 276 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> 277 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character 278 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> 279 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable 280 * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li> 281 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> 282 * </ul> 283 * 284 * <p> 285 * These checks can be enabled independently of each other. 
For example, if you were interested in checking for only the 286 * INVISIBLE and MIXED_NUMBERS conditions, you could do: 287 * 288 * \code{.c} 289 * UErrorCode status = U_ZERO_ERROR; 290 * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR 291 * 292 * USpoofChecker* sc = uspoof_open(&status); 293 * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status); 294 * 295 * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status); 296 * UBool result = bitmask != 0; 297 * // fails checks: 1 (status: U_ZERO_ERROR) 298 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); 299 * uspoof_close(sc); 300 * \endcode 301 * 302 * Here is an example in C++ showing how to compute the restriction level of a string: 303 * 304 * \code{.cpp} 305 * UErrorCode status = U_ZERO_ERROR; 306 * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A 307 * 308 * // Get the default set of allowable characters: 309 * UnicodeSet allowed; 310 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); 311 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); 312 * 313 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 314 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); 315 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); 316 * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status); 317 * 318 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); 319 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); 320 * 321 * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status); 322 * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask: 323 * assert((restrictionLevel & bitmask) == restrictionLevel); 324 * // Restriction level: 0x50000000 (status: U_ZERO_ERROR) 
325 * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status)); 326 * \endcode 327 * 328 * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since 329 * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check. 330 * 331 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in 332 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings 333 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have 334 * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is 335 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed 336 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on 337 * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of 338 * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code 339 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple 340 * scripts. 341 * 342 * <h2>Additional Information</h2> 343 * 344 * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers. 345 * 346 * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether 347 * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads, 348 * using the same USpoofChecker instance. 349 * 350 * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are 351 * thread safe.
Those that take a non-const USpoofChecker are not thread safe. 352 * 353 * @stable ICU 4.6 354 */ 355 356 struct USpoofChecker; 357 /** 358 * @stable ICU 4.2 359 */ 360 typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */ 361 362 struct USpoofCheckResult; 363 /** 364 * @see uspoof_openCheckResult 365 * @stable ICU 58 366 */ 367 typedef struct USpoofCheckResult USpoofCheckResult; 368 369 /** 370 * Enum for the kinds of checks that USpoofChecker can perform. 371 * These enum values are used both to select the set of checks that 372 * will be performed, and to report results from the check function. 373 * 374 * @stable ICU 4.2 375 */ 376 typedef enum USpoofChecks { 377 /** 378 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 379 * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section 380 * 4. 381 * 382 * @see uspoof_areConfusable 383 * @stable ICU 4.2 384 */ 385 USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1, 386 387 /** 388 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 389 * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS 390 * 39 section 4. 391 * 392 * @see uspoof_areConfusable 393 * @stable ICU 4.2 394 */ 395 USPOOF_MIXED_SCRIPT_CONFUSABLE = 2, 396 397 /** 398 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 399 * that the two strings are visually confusable and that they are not from the same script but both of them are 400 * single-script strings, according to UTS 39 section 4. 401 * 402 * @see uspoof_areConfusable 403 * @stable ICU 4.2 404 */ 405 USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4, 406 407 /** 408 * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables.
You may set 409 * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to 410 * make {@link uspoof_areConfusable} return only those types of confusables. 411 * 412 * @see uspoof_areConfusable 413 * @see uspoof_getSkeleton 414 * @stable ICU 58 415 */ 416 USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, 417 418 #ifndef U_HIDE_DEPRECATED_API 419 /** 420 * This flag is deprecated and no longer affects the behavior of SpoofChecker. 421 * 422 * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated. 423 */ 424 USPOOF_ANY_CASE = 8, 425 #endif /* U_HIDE_DEPRECATED_API */ 426 427 /** 428 * Check that an identifier is no looser than the specified RestrictionLevel. 429 * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE. 430 * 431 * If USPOOF_AUX_INFO is enabled the actual restriction level of the 432 * identifier being tested will also be returned by uspoof_check(). 433 * 434 * @see URestrictionLevel 435 * @see uspoof_setRestrictionLevel 436 * @see USPOOF_AUX_INFO 437 * 438 * @stable ICU 51 439 */ 440 USPOOF_RESTRICTION_LEVEL = 16, 441 442 #ifndef U_HIDE_DEPRECATED_API 443 /** Check that an identifier contains only characters from a 444 * single script (plus chars from the common and inherited scripts.) 445 * Applies to checks of a single identifier only. 446 * @deprecated ICU 51 Use RESTRICTION_LEVEL instead. 447 */ 448 USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL, 449 #endif /* U_HIDE_DEPRECATED_API */ 450 451 /** Check an identifier for the presence of invisible characters, 452 * such as zero-width spaces, or character sequences that are 453 * likely not to display, such as multiple occurrences of the same 454 * non-spacing mark. This check does not test the input string as a whole 455 * for conformance to any particular syntax for identifiers.
456 */ 457 USPOOF_INVISIBLE = 32, 458 459 /** Check that an identifier contains only characters from a specified set 460 * of acceptable characters. See {@link uspoof_setAllowedChars} and 461 * {@link uspoof_setAllowedLocales}. Note that a string that fails this check 462 * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check. 463 */ 464 USPOOF_CHAR_LIMIT = 64, 465 466 /** 467 * Check that an identifier does not mix numbers from different numbering systems. 468 * For more information, see UTS 39 section 5.3. 469 * 470 * @stable ICU 51 471 */ 472 USPOOF_MIXED_NUMBERS = 128, 473 474 #ifndef U_HIDE_DRAFT_API 475 /** 476 * Check that an identifier does not have a combining character following a character in which that 477 * combining character would be hidden; for example 'i' followed by a U+0307 combining dot. 478 * 479 * More specifically, the following characters are forbidden from preceding a U+0307: 480 * <ul> 481 * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li> 482 * <li>Latin lowercase letter 'l'</li> 483 * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li> 484 * <li>Any character whose confusable prototype ends with such a character 485 * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li> 486 * </ul> 487 * In addition, combining characters are allowed between the above characters and U+0307 except those 488 * with combining class 0 or combining class "Above" (230, same class as U+0307). 489 * 490 * This list and the number of combining characters considered by this check may grow over time. 491 * 492 * @draft ICU 62 493 */ 494 USPOOF_HIDDEN_OVERLAY = 256, 495 #endif /* U_HIDE_DRAFT_API */ 496 497 /** 498 * Enable all spoof checks. 499 * 500 * @stable ICU 4.6 501 */ 502 USPOOF_ALL_CHECKS = 0xFFFF, 503 504 /** 505 * Enable the return of auxiliary (non-error) information in the 506 * upper bits of the check results value.
507 * 508 * If this "check" is not enabled, the results of {@link uspoof_check} will be 509 * zero when an identifier passes all of the enabled checks. 510 * 511 * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will 512 * be zero when an identifier passes all checks. 513 * 514 * @stable ICU 51 515 */ 516 USPOOF_AUX_INFO = 0x40000000 517 518 } USpoofChecks; 519 520 521 /** 522 * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and 523 * for returned identifier restriction levels in check results. 524 * 525 * @stable ICU 51 526 * 527 * @see uspoof_setRestrictionLevel 528 * @see uspoof_check 529 */ 530 typedef enum URestrictionLevel { 531 /** 532 * All characters in the string are in the identifier profile and all characters in the string are in the 533 * ASCII range. 534 * 535 * @stable ICU 51 536 */ 537 USPOOF_ASCII = 0x10000000, 538 /** 539 * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and 540 * the string is single-script, according to the definition in UTS 39 section 5.1. 541 * 542 * @stable ICU 53 543 */ 544 USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000, 545 /** 546 * The string classifies as Single Script, or all characters in the string are in the identifier profile and 547 * the string is covered by any of the following sets of scripts, according to the definition in UTS 39 548 * section 5.1: 549 * <ul> 550 * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li> 551 * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li> 552 * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li> 553 * </ul> 554 * This is the default restriction in ICU.
555 * 556 * @stable ICU 51 557 */ 558 USPOOF_HIGHLY_RESTRICTIVE = 0x30000000, 559 /** 560 * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile 561 * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic, 562 * Greek, and Cherokee. 563 * 564 * @stable ICU 51 565 */ 566 USPOOF_MODERATELY_RESTRICTIVE = 0x40000000, 567 /** 568 * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts. 569 * 570 * @stable ICU 51 571 */ 572 USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000, 573 /** 574 * Any valid identifiers, including characters outside of the Identifier Profile. 575 * 576 * @stable ICU 51 577 */ 578 USPOOF_UNRESTRICTIVE = 0x60000000, 579 /** 580 * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}. 581 * 582 * @stable ICU 53 583 */ 584 USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000, 585 #ifndef U_HIDE_INTERNAL_API 586 /** 587 * An undefined restriction level. 588 * @internal 589 */ 590 USPOOF_UNDEFINED_RESTRICTIVE = -1 591 #endif /* U_HIDE_INTERNAL_API */ 592 } URestrictionLevel; 593 594 /** 595 * Create a Unicode Spoof Checker, configured to perform all 596 * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT. 597 * Note that additional checks may be added in the future, 598 * resulting in changes to the default checking behavior. 599 * 600 * @param status The error code, set if this function encounters a problem. 601 * @return the newly created Spoof Checker 602 * @stable ICU 4.2 603 */ 604 U_STABLE USpoofChecker * U_EXPORT2 605 uspoof_open(UErrorCode *status); 606 607 608 /** 609 * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory. 610 * Inverse of uspoof_serialize(). 611 * The memory containing the serialized data must remain valid and unchanged 612 * as long as the spoof checker, or any cloned copies of the spoof checker, 613 * are in use.
Ownership of the memory remains with the caller. 614 * The spoof checker (and any clones) must be closed prior to deleting the 615 * serialized data. 616 * 617 * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data 618 * @param length the number of bytes available at data; 619 * can be more than necessary 620 * @param pActualLength receives the actual number of bytes at data taken up by the data; 621 * can be NULL 622 * @param pErrorCode ICU error code 623 * @return the spoof checker. 624 * 625 * @see uspoof_open 626 * @see uspoof_serialize 627 * @stable ICU 4.2 628 */ 629 U_STABLE USpoofChecker * U_EXPORT2 630 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, 631 UErrorCode *pErrorCode); 632 633 /** 634 * Open a Spoof Checker from the source form of the spoof data. 635 * The input corresponds to the Unicode data file confusables.txt 636 * as described in Unicode UTS #39. The syntax of the source data 637 * is as described in UTS #39 for this file, and the content of 638 * this file is acceptable input. 639 * 640 * The character encoding of the (char *) input text is UTF-8. 641 * 642 * @param confusables a pointer to the confusable characters definitions, 643 * as found in file confusables.txt from unicode.org. 644 * @param confusablesLen The length of the confusables text, or -1 if the 645 * input string is zero terminated. 646 * @param confusablesWholeScript 647 * Deprecated in ICU 58. No longer used. 648 * @param confusablesWholeScriptLen 649 * Deprecated in ICU 58. No longer used. 650 * @param errType In the event of an error in the input, indicates 651 * which of the input files contains the error. 652 * The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or 653 * USPOOF_WHOLE_SCRIPT_CONFUSABLE, or 654 * zero if no errors are found. 655 * @param pe In the event of an error in the input, receives the position 656 * in the input text (line, offset) of the error.
657 * @param status an in/out ICU UErrorCode. Among the possible errors is 658 * U_PARSE_ERROR, which is used to report syntax errors 659 * in the input. 660 * @return A spoof checker that uses the rules from the input files. 661 * @stable ICU 4.2 662 */ 663 U_STABLE USpoofChecker * U_EXPORT2 664 uspoof_openFromSource(const char *confusables, int32_t confusablesLen, 665 const char *confusablesWholeScript, int32_t confusablesWholeScriptLen, 666 int32_t *errType, UParseError *pe, UErrorCode *status); 667 668 669 /** 670 * Close a Spoof Checker, freeing any memory that was being held by 671 * its implementation. 672 * @stable ICU 4.2 673 */ 674 U_STABLE void U_EXPORT2 675 uspoof_close(USpoofChecker *sc); 676 677 #if U_SHOW_CPLUSPLUS_API 678 679 U_NAMESPACE_BEGIN 680 681 /** 682 * \class LocalUSpoofCheckerPointer 683 * "Smart pointer" class, closes a USpoofChecker via uspoof_close(). 684 * For most methods see the LocalPointerBase base class. 685 * 686 * @see LocalPointerBase 687 * @see LocalPointer 688 * @stable ICU 4.4 689 */ 690 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close); 691 692 U_NAMESPACE_END 693 694 #endif 695 696 /** 697 * Clone a Spoof Checker. The clone will be set to perform the same checks 698 * as the original source. 699 * 700 * @param sc The source USpoofChecker 701 * @param status The error code, set if this function encounters a problem. 702 * @return The cloned Spoof Checker. 703 * @stable ICU 4.2 704 */ 705 U_STABLE USpoofChecker * U_EXPORT2 706 uspoof_clone(const USpoofChecker *sc, UErrorCode *status); 707 708 709 /** 710 * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method 711 * overwrites any checks that may have already been enabled. By default, all checks are enabled. 712 * 713 * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together.
For 714 * example, to fail strings containing characters outside of the set specified by {@link uspoof_setAllowedChars} and 715 * also strings that contain digits from mixed numbering systems: 716 * 717 * <pre> 718 * {@code 719 * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status); 720 * } 721 * </pre> 722 * 723 * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from 724 * ALL_CHECKS. For example, if you are not planning to use the {@link uspoof_areConfusable} functionality, 725 * it is good practice to disable the CONFUSABLE check: 726 * 727 * <pre> 728 * {@code 729 * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status); 730 * } 731 * </pre> 732 * 733 * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and 734 * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they 735 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those 736 * methods. 737 * 738 * @param sc The USpoofChecker 739 * @param checks The set of checks that this spoof checker will perform. 740 * The value is a bit set, obtained by OR-ing together 741 * values from enum USpoofChecks. 742 * @param status The error code, set if this function encounters a problem. 743 * @stable ICU 4.2 744 * 745 */ 746 U_STABLE void U_EXPORT2 747 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status); 748 749 /** 750 * Get the set of checks that this Spoof Checker has been configured to perform. 751 * 752 * @param sc The USpoofChecker 753 * @param status The error code, set if this function encounters a problem. 754 * @return The set of checks that this spoof checker will perform. 755 * The value is a bit set, obtained by OR-ing together 756 * values from enum USpoofChecks.
757 * @stable ICU 4.2 758 * 759 */ 760 U_STABLE int32_t U_EXPORT2 761 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); 762 763 /** 764 * Set the loosest restriction level allowed for strings. The default if this is not called is 765 * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and 766 * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are 767 * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}. 768 * 769 * @param sc The USpoofChecker 770 * @param restrictionLevel The loosest restriction level allowed. 771 * @see URestrictionLevel 772 * @stable ICU 51 773 */ 774 U_STABLE void U_EXPORT2 775 uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); 776 777 778 /** 779 * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}. 780 * @param sc The USpoofChecker 781 * @return The restriction level 782 * @see URestrictionLevel 783 * @stable ICU 51 784 */ 785 U_STABLE URestrictionLevel U_EXPORT2 786 uspoof_getRestrictionLevel(const USpoofChecker *sc); 787 788 /** 789 * Limit characters that are acceptable in identifiers being checked to those 790 * normally used with the languages associated with the specified locales. 791 * Any previously specified list of locales is replaced by the new settings. 792 * 793 * A set of languages is determined from the locale(s), and 794 * from those a set of acceptable Unicode scripts is determined. 795 * Characters from this set of scripts, along with characters from 796 * the "common" and "inherited" Unicode Script categories 797 * will be permitted. 798 * 799 * Supplying an empty string removes all restrictions; 800 * characters from any script will be allowed. 801 * 802 * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this 803 * USpoofChecker when calling this function with a non-empty list 804 * of locales.
805 * 806 * The Unicode Set of characters that will be allowed is accessible 807 * via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales() 808 * will <i>replace</i> any previously applied set of allowed characters. 809 * 810 * Adjustments, such as additions or deletions of certain classes of characters, 811 * can be made to the result of uspoof_setAllowedLocales() by 812 * fetching the resulting set with uspoof_getAllowedChars(), 813 * manipulating it with the Unicode Set API, then resetting the 814 * spoof detector's limits with uspoof_setAllowedChars(). 815 * 816 * @param sc The USpoofChecker 817 * @param localesList A list of locales, from which the language 818 * and associated script are extracted. The locales 819 * are comma-separated if there is more than one. 820 * White space may not appear within an individual locale, 821 * but is ignored otherwise. 822 * The locales are syntactically like those from the 823 * HTTP Accept-Language header. 824 * If the localesList is empty, no restrictions will be placed on 825 * the allowed characters. 826 * 827 * @param status The error code, set if this function encounters a problem. 828 * @stable ICU 4.2 829 */ 830 U_STABLE void U_EXPORT2 831 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status); 832 833 /** 834 * Get a list of locales for the scripts that are acceptable in strings 835 * to be checked. If no limitations on scripts have been specified, 836 * an empty string will be returned. 837 * 838 * uspoof_setAllowedChars() will reset the list of allowed locales to be empty. 839 * 840 * The format of the returned list is the same as that supplied to 841 * uspoof_setAllowedLocales(), but the returned list may not be identical 842 * to the originally specified string; the string may be reformatted, 843 * and information other than languages from 844 * the originally specified locales may be omitted. 
845 * 846 * @param sc The USpoofChecker 847 * @param status The error code, set if this function encounters a problem. 848 * @return A string containing a list of locales corresponding 849 * to the acceptable scripts, formatted like an 850 * HTTP Accept-Language value. 851 * 852 * @stable ICU 4.2 853 */ 854 U_STABLE const char * U_EXPORT2 855 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status); 856 857 858 /** 859 * Limit the acceptable characters to those specified by a Unicode Set. 860 * Any previously specified character limit is 861 * replaced by the new settings. This includes limits on 862 * characters that were set with the uspoof_setAllowedLocales() function. 863 * 864 * The USPOOF_CHAR_LIMIT test is automatically enabled for this 865 * USpoofChecker by this function. 866 * 867 * @param sc The USpoofChecker 868 * @param chars A Unicode Set containing the list of 869 * characters that are permitted. Ownership of the set 870 * remains with the caller. The incoming set is cloned by 871 * this function, so there are no restrictions on modifying 872 * or deleting the USet after calling this function. 873 * @param status The error code, set if this function encounters a problem. 874 * @stable ICU 4.2 875 */ 876 U_STABLE void U_EXPORT2 877 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status); 878 879 880 /** 881 * Get a USet for the characters permitted in an identifier. 882 * This corresponds to the limits imposed by the Set Allowed Characters 883 * functions. Limitations imposed by other checks will not be 884 * reflected in the set returned by this function. 885 * 886 * The returned set will be frozen, meaning that it cannot be modified 887 * by the caller. 888 * 889 * Ownership of the returned set remains with the Spoof Detector. The 890 * returned set will become invalid if the spoof detector is closed, 891 * or if a new set of allowed characters is specified. 
892 * 893 * 894 * @param sc The USpoofChecker 895 * @param status The error code, set if this function encounters a problem. 896 * @return A USet containing the characters that are permitted by 897 * the USPOOF_CHAR_LIMIT test. 898 * @stable ICU 4.2 899 */ 900 U_STABLE const USet * U_EXPORT2 901 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status); 902 903 904 #if U_SHOW_CPLUSPLUS_API 905 /** 906 * Limit the acceptable characters to those specified by a Unicode Set. 907 * Any previously specified character limit is 908 * replaced by the new settings. This includes limits on 909 * characters that were set with the uspoof_setAllowedLocales() function. 910 * 911 * The USPOOF_CHAR_LIMIT test is automatically enabled for this 912 * USpoofChecker by this function. 913 * 914 * @param sc The USpoofChecker 915 * @param chars A Unicode Set containing the list of 916 * characters that are permitted. Ownership of the set 917 * remains with the caller. The incoming set is cloned by 918 * this function, so there are no restrictions on modifying 919 * or deleting the UnicodeSet after calling this function. 920 * @param status The error code, set if this function encounters a problem. 921 * @stable ICU 4.2 922 */ 923 U_STABLE void U_EXPORT2 924 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status); 925 926 927 /** 928 * Get a UnicodeSet for the characters permitted in an identifier. 929 * This corresponds to the limits imposed by the Set Allowed Characters / 930 * UnicodeSet functions. Limitations imposed by other checks will not be 931 * reflected in the set returned by this function. 932 * 933 * The returned set will be frozen, meaning that it cannot be modified 934 * by the caller. 935 * 936 * Ownership of the returned set remains with the Spoof Detector. The 937 * returned set will become invalid if the spoof detector is closed, 938 * or if a new set of allowed characters is specified. 
939 * 940 * 941 * @param sc The USpoofChecker 942 * @param status The error code, set if this function encounters a problem. 943 * @return A UnicodeSet containing the characters that are permitted by 944 * the USPOOF_CHAR_LIMIT test. 945 * @stable ICU 4.2 946 */ 947 U_STABLE const icu::UnicodeSet * U_EXPORT2 948 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status); 949 #endif 950 951 952 /** 953 * Check the specified string for possible security issues. 954 * The text to be checked will typically be an identifier of some sort. 955 * The set of checks to be performed is specified with uspoof_setChecks(). 956 * 957 * \note 958 * Consider using the newer API, {@link uspoof_check2}, instead. 959 * The newer API exposes additional information from the check procedure 960 * and is otherwise identical to this method. 961 * 962 * @param sc The USpoofChecker 963 * @param id The identifier to be checked for possible security issues, 964 * in UTF-16 format. 965 * @param length the length of the string to be checked, expressed in 966 * 16 bit UTF-16 code units, or -1 if the string is 967 * zero terminated. 968 * @param position Deprecated in ICU 51. Always returns zero. 969 * Originally, an out parameter for the index of the first 970 * string position that failed a check. 971 * This parameter may be NULL. 972 * @param status The error code, set if an error occurred while attempting to 973 * perform the check. 974 * Spoofing or security issues detected with the input string are 975 * not reported here, but through the function's return value. 976 * @return An integer value with bits set for any potential security 977 * or spoofing issues detected. The bits are defined by 978 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 979 * will be zero if the input string passes all of the 980 * enabled checks. 
981 * @see uspoof_check2 982 * @stable ICU 4.2 983 */ 984 U_STABLE int32_t U_EXPORT2 985 uspoof_check(const USpoofChecker *sc, 986 const UChar *id, int32_t length, 987 int32_t *position, 988 UErrorCode *status); 989 990 991 /** 992 * Check the specified string for possible security issues. 993 * The text to be checked will typically be an identifier of some sort. 994 * The set of checks to be performed is specified with uspoof_setChecks(). 995 * 996 * \note 997 * Consider using the newer API, {@link uspoof_check2UTF8}, instead. 998 * The newer API exposes additional information from the check procedure 999 * and is otherwise identical to this method. 1000 * 1001 * @param sc The USpoofChecker 1002 * @param id An identifier to be checked for possible security issues, in UTF-8 format. 1003 * @param length the length of the string to be checked, or -1 if the string is 1004 * zero terminated. 1005 * @param position Deprecated in ICU 51. Always returns zero. 1006 * Originally, an out parameter for the index of the first 1007 * string position that failed a check. 1008 * This parameter may be NULL. 1009 * @param status The error code, set if an error occurred while attempting to 1010 * perform the check. 1011 * Spoofing or security issues detected with the input string are 1012 * not reported here, but through the function's return value. 1013 * If the input contains invalid UTF-8 sequences, 1014 * a status of U_INVALID_CHAR_FOUND will be returned. 1015 * @return An integer value with bits set for any potential security 1016 * or spoofing issues detected. The bits are defined by 1017 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1018 * will be zero if the input string passes all of the 1019 * enabled checks. 
1020 * @see uspoof_check2UTF8 1021 * @stable ICU 4.2 1022 */ 1023 U_STABLE int32_t U_EXPORT2 1024 uspoof_checkUTF8(const USpoofChecker *sc, 1025 const char *id, int32_t length, 1026 int32_t *position, 1027 UErrorCode *status); 1028 1029 1030 #if U_SHOW_CPLUSPLUS_API 1031 /** 1032 * Check the specified string for possible security issues. 1033 * The text to be checked will typically be an identifier of some sort. 1034 * The set of checks to be performed is specified with uspoof_setChecks(). 1035 * 1036 * \note 1037 * Consider using the newer API, {@link uspoof_check2UnicodeString}, instead. 1038 * The newer API exposes additional information from the check procedure 1039 * and is otherwise identical to this method. 1040 * 1041 * @param sc The USpoofChecker 1042 * @param id An identifier to be checked for possible security issues. 1043 * @param position Deprecated in ICU 51. Always returns zero. 1044 * Originally, an out parameter for the index of the first 1045 * string position that failed a check. 1046 * This parameter may be NULL. 1047 * @param status The error code, set if an error occurred while attempting to 1048 * perform the check. 1049 * Spoofing or security issues detected with the input string are 1050 * not reported here, but through the function's return value. 1051 * @return An integer value with bits set for any potential security 1052 * or spoofing issues detected. The bits are defined by 1053 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1054 * will be zero if the input string passes all of the 1055 * enabled checks. 1056 * @see uspoof_check2UnicodeString 1057 * @stable ICU 4.2 1058 */ 1059 U_STABLE int32_t U_EXPORT2 1060 uspoof_checkUnicodeString(const USpoofChecker *sc, 1061 const icu::UnicodeString &id, 1062 int32_t *position, 1063 UErrorCode *status); 1064 #endif 1065 1066 1067 /** 1068 * Check the specified string for possible security issues. 1069 * The text to be checked will typically be an identifier of some sort. 
1070 * The set of checks to be performed is specified with uspoof_setChecks(). 1071 * 1072 * @param sc The USpoofChecker 1073 * @param id The identifier to be checked for possible security issues, 1074 * in UTF-16 format. 1075 * @param length the length of the string to be checked, or -1 if the string is 1076 * zero terminated. 1077 * @param checkResult An instance of USpoofCheckResult to be filled with 1078 * details about the identifier. Can be NULL. 1079 * @param status The error code, set if an error occurred while attempting to 1080 * perform the check. 1081 * Spoofing or security issues detected with the input string are 1082 * not reported here, but through the function's return value. 1083 * @return An integer value with bits set for any potential security 1084 * or spoofing issues detected. The bits are defined by 1085 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1086 * will be zero if the input string passes all of the 1087 * enabled checks. Any information in this bitmask will be 1088 * consistent with the information saved in the optional 1089 * checkResult parameter. 1090 * @see uspoof_openCheckResult 1091 * @see uspoof_check2UTF8 1092 * @see uspoof_check2UnicodeString 1093 * @stable ICU 58 1094 */ 1095 U_STABLE int32_t U_EXPORT2 1096 uspoof_check2(const USpoofChecker *sc, 1097 const UChar* id, int32_t length, 1098 USpoofCheckResult* checkResult, 1099 UErrorCode *status); 1100 1101 /** 1102 * Check the specified string for possible security issues. 1103 * The text to be checked will typically be an identifier of some sort. 1104 * The set of checks to be performed is specified with uspoof_setChecks(). 1105 * 1106 * This version of {@link uspoof_check} accepts a USpoofCheckResult, which 1107 * returns additional information about the identifier. For more 1108 * information, see {@link uspoof_openCheckResult}. 1109 * 1110 * @param sc The USpoofChecker 1111 * @param id An identifier to be checked for possible security issues, in UTF-8 format. 
1112 * @param length the length of the string to be checked, or -1 if the string is 1113 * zero terminated. 1114 * @param checkResult An instance of USpoofCheckResult to be filled with 1115 * details about the identifier. Can be NULL. 1116 * @param status The error code, set if an error occurred while attempting to 1117 * perform the check. 1118 * Spoofing or security issues detected with the input string are 1119 * not reported here, but through the function's return value. 1120 * @return An integer value with bits set for any potential security 1121 * or spoofing issues detected. The bits are defined by 1122 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1123 * will be zero if the input string passes all of the 1124 * enabled checks. Any information in this bitmask will be 1125 * consistent with the information saved in the optional 1126 * checkResult parameter. 1127 * @see uspoof_openCheckResult 1128 * @see uspoof_check2 1129 * @see uspoof_check2UnicodeString 1130 * @stable ICU 58 1131 */ 1132 U_STABLE int32_t U_EXPORT2 1133 uspoof_check2UTF8(const USpoofChecker *sc, 1134 const char *id, int32_t length, 1135 USpoofCheckResult* checkResult, 1136 UErrorCode *status); 1137 1138 #if U_SHOW_CPLUSPLUS_API 1139 /** 1140 * Check the specified string for possible security issues. 1141 * The text to be checked will typically be an identifier of some sort. 1142 * The set of checks to be performed is specified with uspoof_setChecks(). 1143 * 1144 * @param sc The USpoofChecker 1145 * @param id An identifier to be checked for possible security issues. 1146 * @param checkResult An instance of USpoofCheckResult to be filled with 1147 * details about the identifier. Can be NULL. 1148 * @param status The error code, set if an error occurred while attempting to 1149 * perform the check. 1150 * Spoofing or security issues detected with the input string are 1151 * not reported here, but through the function's return value. 
1152 * @return An integer value with bits set for any potential security 1153 * or spoofing issues detected. The bits are defined by 1154 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1155 * will be zero if the input string passes all of the 1156 * enabled checks. Any information in this bitmask will be 1157 * consistent with the information saved in the optional 1158 * checkResult parameter. 1159 * @see uspoof_openCheckResult 1160 * @see uspoof_check2 1161 * @see uspoof_check2UTF8 1162 * @stable ICU 58 1163 */ 1164 U_STABLE int32_t U_EXPORT2 1165 uspoof_check2UnicodeString(const USpoofChecker *sc, 1166 const icu::UnicodeString &id, 1167 USpoofCheckResult* checkResult, 1168 UErrorCode *status); 1169 #endif 1170 1171 /** 1172 * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return 1173 * information about the identifier. Information includes: 1174 * <ul> 1175 * <li>A bitmask of the checks that failed</li> 1176 * <li>The identifier's restriction level (UTS 39 section 5.2)</li> 1177 * <li>The set of numerics in the string (UTS 39 section 5.3)</li> 1178 * </ul> 1179 * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call 1180 * of {@link uspoof_check2}. 1181 * 1182 * @param status The error code, set if this function encounters a problem. 1183 * @return the newly created USpoofCheckResult 1184 * @see uspoof_check2 1185 * @see uspoof_check2UTF8 1186 * @see uspoof_check2UnicodeString 1187 * @stable ICU 58 1188 */ 1189 U_STABLE USpoofCheckResult* U_EXPORT2 1190 uspoof_openCheckResult(UErrorCode *status); 1191 1192 /** 1193 * Close a USpoofCheckResult, freeing any memory that was being held by 1194 * its implementation. 
1195 * 1196 * @param checkResult The instance of USpoofCheckResult to close 1197 * @stable ICU 58 1198 */ 1199 U_STABLE void U_EXPORT2 1200 uspoof_closeCheckResult(USpoofCheckResult *checkResult); 1201 1202 #if U_SHOW_CPLUSPLUS_API 1203 1204 U_NAMESPACE_BEGIN 1205 1206 /** 1207 * \class LocalUSpoofCheckResultPointer 1208 * "Smart pointer" class, closes a USpoofCheckResult via `uspoof_closeCheckResult()`. 1209 * For most methods see the LocalPointerBase base class. 1210 * 1211 * @see LocalPointerBase 1212 * @see LocalPointer 1213 * @stable ICU 58 1214 */ 1215 1216 /** 1217 * \cond 1218 * Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER. 1219 * For now, suppress with a Doxygen cond 1220 */ 1221 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult); 1222 /** \endcond */ 1223 1224 U_NAMESPACE_END 1225 1226 #endif 1227 1228 /** 1229 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests 1230 * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on. 1231 * 1232 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1233 * @param status The error code, set if an error occurred. 1234 * @return An integer value with bits set for any potential security 1235 * or spoofing issues detected. The bits are defined by 1236 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1237 * will be zero if the input string passes all of the 1238 * enabled checks. 1239 * @see uspoof_setChecks 1240 * @stable ICU 58 1241 */ 1242 U_STABLE int32_t U_EXPORT2 1243 uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status); 1244 1245 /** 1246 * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check 1247 * was enabled; otherwise, undefined. 
1248 * 1249 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1250 * @param status The error code, set if an error occurred. 1251 * @return The restriction level contained in the USpoofCheckResult 1252 * @see uspoof_setRestrictionLevel 1253 * @stable ICU 58 1254 */ 1255 U_STABLE URestrictionLevel U_EXPORT2 1256 uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status); 1257 1258 /** 1259 * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled; 1260 * otherwise, undefined. The set will contain the zero digit from each decimal number system found 1261 * in the input string. Ownership of the returned USet remains with the USpoofCheckResult. 1262 * The USet will be freed when {@link uspoof_closeCheckResult} is called. 1263 * 1264 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1265 * @return The set of numerics contained in the USpoofCheckResult 1266 * @param status The error code, set if an error occurred. 1267 * @stable ICU 58 1268 */ 1269 U_STABLE const USet* U_EXPORT2 1270 uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status); 1271 1272 1273 /** 1274 * Check whether two specified strings are visually confusable. 1275 * 1276 * If the strings are confusable, the return value will be nonzero, as long as 1277 * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks(). 1278 * 1279 * The bits in the return value correspond to flags for each of the classes of 1280 * confusables applicable to the two input strings. 
According to UTS 39 1281 * section 4, the possible flags are: 1282 * 1283 * <ul> 1284 * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li> 1285 * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li> 1286 * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li> 1287 * </ul> 1288 * 1289 * If one or more of the above flags were not listed in uspoof_setChecks(), this 1290 * function will never report that class of confusable. The check 1291 * {@link USPOOF_CONFUSABLE} enables all three flags. 1292 * 1293 * 1294 * @param sc The USpoofChecker 1295 * @param id1 The first of the two identifiers to be compared for 1296 * confusability. The strings are in UTF-16 format. 1297 * @param length1 the length of the first identifier, expressed in 1298 * 16 bit UTF-16 code units, or -1 if the string is 1299 * nul terminated. 1300 * @param id2 The second of the two identifiers to be compared for 1301 * confusability. The identifiers are in UTF-16 format. 1302 * @param length2 The length of the second identifier, expressed in 1303 * 16 bit UTF-16 code units, or -1 if the string is 1304 * nul terminated. 1305 * @param status The error code, set if an error occurred while attempting to 1306 * perform the check. 1307 * Confusability of the identifiers is not reported here, 1308 * but through this function's return value. 1309 * @return An integer value with bit(s) set corresponding to 1310 * the type of confusability found, as defined by 1311 * enum USpoofChecks. Zero is returned if the identifiers 1312 * are not confusable. 1313 * 1314 * @stable ICU 4.2 1315 */ 1316 U_STABLE int32_t U_EXPORT2 1317 uspoof_areConfusable(const USpoofChecker *sc, 1318 const UChar *id1, int32_t length1, 1319 const UChar *id2, int32_t length2, 1320 UErrorCode *status); 1321 1322 1323 1324 /** 1325 * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format. 1326 * 1327 * @param sc The USpoofChecker 1328 * @param id1 The first of the two identifiers to be compared for 1329 * confusability. 
The strings are in UTF-8 format. 1330 * @param length1 the length of the first identifier, in bytes, or -1 1331 * if the string is nul terminated. 1332 * @param id2 The second of the two identifiers to be compared for 1333 * confusability. The strings are in UTF-8 format. 1334 * @param length2 The length of the second string in bytes, or -1 1335 * if the string is nul terminated. 1336 * @param status The error code, set if an error occurred while attempting to 1337 * perform the check. 1338 * Confusability of the strings is not reported here, 1339 * but through this function's return value. 1340 * @return An integer value with bit(s) set corresponding to 1341 * the type of confusability found, as defined by 1342 * enum USpoofChecks. Zero is returned if the strings 1343 * are not confusable. 1344 * 1345 * @stable ICU 4.2 1346 * 1347 * @see uspoof_areConfusable 1348 */ 1349 U_STABLE int32_t U_EXPORT2 1350 uspoof_areConfusableUTF8(const USpoofChecker *sc, 1351 const char *id1, int32_t length1, 1352 const char *id2, int32_t length2, 1353 UErrorCode *status); 1354 1355 1356 1357 1358 #if U_SHOW_CPLUSPLUS_API 1359 /** 1360 * A version of {@link uspoof_areConfusable} accepting UnicodeStrings. 1361 * 1362 * @param sc The USpoofChecker 1363 * @param s1 The first of the two identifiers to be compared for 1364 * confusability, passed as a UnicodeString. 1365 * @param s2 The second of the two identifiers to be compared for 1366 * confusability, passed as a UnicodeString. 1367 * @param status The error code, set if an error occurred while attempting to 1368 * perform the check. 1369 * Confusability of the identifiers is not reported here, 1370 * but through this function's return value. 1371 * @return An integer value with bit(s) set corresponding to 1372 * the type of confusability found, as defined by 1373 * enum USpoofChecks. Zero is returned if the identifiers 1374 * are not confusable. 
1375 * 1376 * @stable ICU 4.2 1377 * 1378 * @see uspoof_areConfusable 1379 */ 1380 U_STABLE int32_t U_EXPORT2 1381 uspoof_areConfusableUnicodeString(const USpoofChecker *sc, 1382 const icu::UnicodeString &s1, 1383 const icu::UnicodeString &s2, 1384 UErrorCode *status); 1385 #endif 1386 1387 1388 /** 1389 * Get the "skeleton" for an identifier. 1390 * Skeletons are a transformation of the input identifier; 1391 * two identifiers are confusable if their skeletons are identical. 1392 * See Unicode UTS #39 for additional information. 1393 * 1394 * Using skeletons directly makes it possible to quickly check 1395 * whether an identifier is confusable with any of some large 1396 * set of existing identifiers, by creating an efficiently 1397 * searchable collection of the skeletons. 1398 * 1399 * @param sc The USpoofChecker 1400 * @param type Deprecated in ICU 58. You may pass any number. 1401 * Originally, controlled which of the Unicode confusable data 1402 * tables to use. 1403 * @param id The input identifier whose skeleton will be computed. 1404 * @param length The length of the input identifier, expressed in 16 bit 1405 * UTF-16 code units, or -1 if the string is zero terminated. 1406 * @param dest The output buffer, to receive the skeleton string. 1407 * @param destCapacity The length of the output buffer, in 16 bit units. 1408 * The destCapacity may be zero, in which case the function will 1409 * return the actual length of the skeleton. 1410 * @param status The error code, set if an error occurred while attempting to 1411 * perform the check. 1412 * @return The length of the skeleton string. 
The returned length 1413 * is always that of the complete skeleton, even when the 1414 * supplied buffer is too small (or of zero length) 1415 * 1416 * @stable ICU 4.2 1417 * @see uspoof_areConfusable 1418 */ 1419 U_STABLE int32_t U_EXPORT2 1420 uspoof_getSkeleton(const USpoofChecker *sc, 1421 uint32_t type, 1422 const UChar *id, int32_t length, 1423 UChar *dest, int32_t destCapacity, 1424 UErrorCode *status); 1425 1426 /** 1427 * Get the "skeleton" for an identifier. 1428 * Skeletons are a transformation of the input identifier; 1429 * two identifiers are confusable if their skeletons are identical. 1430 * See Unicode UTS #39 for additional information. 1431 * 1432 * Using skeletons directly makes it possible to quickly check 1433 * whether an identifier is confusable with any of some large 1434 * set of existing identifiers, by creating an efficiently 1435 * searchable collection of the skeletons. 1436 * 1437 * @param sc The USpoofChecker 1438 * @param type Deprecated in ICU 58. You may pass any number. 1439 * Originally, controlled which of the Unicode confusable data 1440 * tables to use. 1441 * @param id The UTF-8 format identifier whose skeleton will be computed. 1442 * @param length The length of the input string, in bytes, 1443 * or -1 if the string is zero terminated. 1444 * @param dest The output buffer, to receive the skeleton string. 1445 * @param destCapacity The length of the output buffer, in bytes. 1446 * The destCapacity may be zero, in which case the function will 1447 * return the actual length of the skeleton. 1448 * @param status The error code, set if an error occurred while attempting to 1449 * perform the check. Possible Errors include U_INVALID_CHAR_FOUND 1450 * for invalid UTF-8 sequences, and 1451 * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small 1452 * to hold the complete skeleton. 1453 * @return The length of the skeleton string, in bytes. 
The returned length 1454 * is always that of the complete skeleton, even when the 1455 * supplied buffer is too small (or of zero length) 1456 * 1457 * @stable ICU 4.2 1458 */ 1459 U_STABLE int32_t U_EXPORT2 1460 uspoof_getSkeletonUTF8(const USpoofChecker *sc, 1461 uint32_t type, 1462 const char *id, int32_t length, 1463 char *dest, int32_t destCapacity, 1464 UErrorCode *status); 1465 1466 #if U_SHOW_CPLUSPLUS_API 1467 /** 1468 * Get the "skeleton" for an identifier. 1469 * Skeletons are a transformation of the input identifier; 1470 * two identifiers are confusable if their skeletons are identical. 1471 * See Unicode UTS #39 for additional information. 1472 * 1473 * Using skeletons directly makes it possible to quickly check 1474 * whether an identifier is confusable with any of some large 1475 * set of existing identifiers, by creating an efficiently 1476 * searchable collection of the skeletons. 1477 * 1478 * @param sc The USpoofChecker. 1479 * @param type Deprecated in ICU 58. You may pass any number. 1480 * Originally, controlled which of the Unicode confusable data 1481 * tables to use. 1482 * @param id The input identifier whose skeleton will be computed. 1483 * @param dest The output identifier, to receive the skeleton string. 1484 * @param status The error code, set if an error occurred while attempting to 1485 * perform the check. 1486 * @return A reference to the destination (skeleton) string. 1487 * 1488 * @stable ICU 4.2 1489 */ 1490 U_I18N_API icu::UnicodeString & U_EXPORT2 1491 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, 1492 uint32_t type, 1493 const icu::UnicodeString &id, 1494 icu::UnicodeString &dest, 1495 UErrorCode *status); 1496 #endif /* U_SHOW_CPLUSPLUS_API */ 1497 1498 /** 1499 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined 1500 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1501 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 
1502 * 1503 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1504 * be deleted by the caller. 1505 * 1506 * @param status The error code, set if a problem occurs while creating the set. 1507 * 1508 * @stable ICU 51 1509 */ 1510 U_STABLE const USet * U_EXPORT2 1511 uspoof_getInclusionSet(UErrorCode *status); 1512 1513 /** 1514 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined 1515 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1516 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1517 * 1518 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1519 * be deleted by the caller. 1520 * 1521 * @param status The error code, set if a problem occurs while creating the set. 1522 * 1523 * @stable ICU 51 1524 */ 1525 U_STABLE const USet * U_EXPORT2 1526 uspoof_getRecommendedSet(UErrorCode *status); 1527 1528 #if U_SHOW_CPLUSPLUS_API 1529 1530 /** 1531 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined 1532 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1533 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1534 * 1535 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1536 * be deleted by the caller. 1537 * 1538 * @param status The error code, set if a problem occurs while creating the set. 1539 * 1540 * @stable ICU 51 1541 */ 1542 U_STABLE const icu::UnicodeSet * U_EXPORT2 1543 uspoof_getInclusionUnicodeSet(UErrorCode *status); 1544 1545 /** 1546 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined 1547 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1548 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1549 * 1550 * The returned set is frozen. 
Ownership of the set remains with the ICU library; it must not 1551 * be deleted by the caller. 1552 * 1553 * @param status The error code, set if a problem occurs while creating the set. 1554 * 1555 * @stable ICU 51 1556 */ 1557 U_STABLE const icu::UnicodeSet * U_EXPORT2 1558 uspoof_getRecommendedUnicodeSet(UErrorCode *status); 1559 1560 #endif /* U_SHOW_CPLUSPLUS_API */ 1561 1562 /** 1563 * Serialize the data for a spoof detector into a chunk of memory. 1564 * The flattened spoof detection tables can later be used to efficiently 1565 * instantiate a new Spoof Detector. 1566 * 1567 * The serialized spoof checker includes only the data compiled from the 1568 * Unicode data tables by uspoof_openFromSource(); it does not 1569 * include any other state or configuration that may have been set. 1570 * 1571 * @param sc the Spoof Detector whose data is to be serialized. 1572 * @param data a pointer to 32-bit-aligned memory to be filled with the data, 1573 * can be NULL if capacity==0 1574 * @param capacity the number of bytes available at data, 1575 * or 0 for preflighting 1576 * @param status an in/out ICU UErrorCode; possible errors include: 1577 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization 1578 * - U_ILLEGAL_ARGUMENT_ERROR the data or capacity parameters are bad 1579 * @return the number of bytes written or needed for the spoof data 1580 * 1581 * @see uspoof_openFromSerialized() 1582 * @stable ICU 4.2 1583 */ 1584 U_STABLE int32_t U_EXPORT2 1585 uspoof_serialize(USpoofChecker *sc, 1586 void *data, int32_t capacity, 1587 UErrorCode *status); 1588 1589 1590 #endif 1591 1592 #endif /* USPOOF_H */ 1593