1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2008-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11 #include "unicode/uspoof.h"
12 #include "unicode/uchar.h"
13 #include "unicode/uniset.h"
14 #include "unicode/utf16.h"
15 #include "utrie2.h"
16 #include "cmemory.h"
17 #include "cstring.h"
18 #include "scriptset.h"
19 #include "umutex.h"
20 #include "udataswp.h"
21 #include "uassert.h"
22 #include "ucln_in.h"
23 #include "uspoof_impl.h"
24
25 #if !UCONFIG_NO_NORMALIZATION
26
27
28 U_NAMESPACE_BEGIN
29
// Expands to the ICU run-time type identification boilerplate for SpoofImpl.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
31
32 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
33 construct(status);
34 fSpoofData = data;
35 }
36
SpoofImpl(UErrorCode & status)37 SpoofImpl::SpoofImpl(UErrorCode& status) {
38 construct(status);
39
40 // TODO: Call this method where it is actually needed, instead of in the
41 // constructor, to allow for lazy data loading. See #12696.
42 fSpoofData = SpoofData::getDefault(status);
43 }
44
SpoofImpl()45 SpoofImpl::SpoofImpl() {
46 UErrorCode status = U_ZERO_ERROR;
47 construct(status);
48
49 // TODO: Call this method where it is actually needed, instead of in the
50 // constructor, to allow for lazy data loading. See #12696.
51 fSpoofData = SpoofData::getDefault(status);
52 }
53
construct(UErrorCode & status)54 void SpoofImpl::construct(UErrorCode& status) {
55 fChecks = USPOOF_ALL_CHECKS;
56 fSpoofData = NULL;
57 fAllowedCharsSet = NULL;
58 fAllowedLocales = NULL;
59 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
60
61 if (U_FAILURE(status)) { return; }
62
63 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
64 fAllowedCharsSet = allowedCharsSet;
65 fAllowedLocales = uprv_strdup("");
66 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
67 status = U_MEMORY_ALLOCATION_ERROR;
68 return;
69 }
70 allowedCharsSet->freeze();
71 }
72
73
74 // Copy Constructor, used by the user level clone() function.
SpoofImpl(const SpoofImpl & src,UErrorCode & status)75 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
76 fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
77 fAllowedLocales(NULL) {
78 if (U_FAILURE(status)) {
79 return;
80 }
81 fChecks = src.fChecks;
82 if (src.fSpoofData != NULL) {
83 fSpoofData = src.fSpoofData->addReference();
84 }
85 fAllowedCharsSet = src.fAllowedCharsSet->clone();
86 fAllowedLocales = uprv_strdup(src.fAllowedLocales);
87 if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
88 status = U_MEMORY_ALLOCATION_ERROR;
89 }
90 fRestrictionLevel = src.fRestrictionLevel;
91 }
92
~SpoofImpl()93 SpoofImpl::~SpoofImpl() {
94 if (fSpoofData != NULL) {
95 fSpoofData->removeReference(); // Will delete if refCount goes to zero.
96 }
97 delete fAllowedCharsSet;
98 uprv_free((void *)fAllowedLocales);
99 }
100
101 // Cast this instance as a USpoofChecker for the C API.
asUSpoofChecker()102 USpoofChecker *SpoofImpl::asUSpoofChecker() {
103 return exportForC();
104 }
105
106 //
107 // Incoming parameter check on Status and the SpoofChecker object
108 // received from the C API.
109 //
validateThis(const USpoofChecker * sc,UErrorCode & status)110 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
111 auto* This = validate(sc, status);
112 if (U_FAILURE(status)) {
113 return NULL;
114 }
115 if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
116 return NULL;
117 }
118 return This;
119 }
120
validateThis(USpoofChecker * sc,UErrorCode & status)121 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
122 return const_cast<SpoofImpl *>
123 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
124 }
125
126
setAllowedLocales(const char * localesList,UErrorCode & status)127 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
128 UnicodeSet allowedChars;
129 UnicodeSet *tmpSet = NULL;
130 const char *locStart = localesList;
131 const char *locEnd = NULL;
132 const char *localesListEnd = localesList + uprv_strlen(localesList);
133 int32_t localeListCount = 0; // Number of locales provided by caller.
134
135 // Loop runs once per locale from the localesList, a comma separated list of locales.
136 do {
137 locEnd = uprv_strchr(locStart, ',');
138 if (locEnd == NULL) {
139 locEnd = localesListEnd;
140 }
141 while (*locStart == ' ') {
142 locStart++;
143 }
144 const char *trimmedEnd = locEnd-1;
145 while (trimmedEnd > locStart && *trimmedEnd == ' ') {
146 trimmedEnd--;
147 }
148 if (trimmedEnd <= locStart) {
149 break;
150 }
151 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
152 localeListCount++;
153
154 // We have one locale from the locales list.
155 // Add the script chars for this locale to the accumulating set of allowed chars.
156 // If the locale is no good, we will be notified back via status.
157 addScriptChars(locale, &allowedChars, status);
158 uprv_free((void *)locale);
159 if (U_FAILURE(status)) {
160 break;
161 }
162 locStart = locEnd + 1;
163 } while (locStart < localesListEnd);
164
165 // If our caller provided an empty list of locales, we disable the allowed characters checking
166 if (localeListCount == 0) {
167 uprv_free((void *)fAllowedLocales);
168 fAllowedLocales = uprv_strdup("");
169 tmpSet = new UnicodeSet(0, 0x10ffff);
170 if (fAllowedLocales == NULL || tmpSet == NULL) {
171 status = U_MEMORY_ALLOCATION_ERROR;
172 return;
173 }
174 tmpSet->freeze();
175 delete fAllowedCharsSet;
176 fAllowedCharsSet = tmpSet;
177 fChecks &= ~USPOOF_CHAR_LIMIT;
178 return;
179 }
180
181
182 // Add all common and inherited characters to the set of allowed chars.
183 UnicodeSet tempSet;
184 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
185 allowedChars.addAll(tempSet);
186 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
187 allowedChars.addAll(tempSet);
188
189 // If anything went wrong, we bail out without changing
190 // the state of the spoof checker.
191 if (U_FAILURE(status)) {
192 return;
193 }
194
195 // Store the updated spoof checker state.
196 tmpSet = allowedChars.clone();
197 const char *tmpLocalesList = uprv_strdup(localesList);
198 if (tmpSet == NULL || tmpLocalesList == NULL) {
199 status = U_MEMORY_ALLOCATION_ERROR;
200 return;
201 }
202 uprv_free((void *)fAllowedLocales);
203 fAllowedLocales = tmpLocalesList;
204 tmpSet->freeze();
205 delete fAllowedCharsSet;
206 fAllowedCharsSet = tmpSet;
207 fChecks |= USPOOF_CHAR_LIMIT;
208 }
209
210
getAllowedLocales(UErrorCode &)211 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
212 return fAllowedLocales;
213 }
214
215
216 // Given a locale (a language), add all the characters from all of the scripts used with that language
217 // to the allowedChars UnicodeSet
218
addScriptChars(const char * locale,UnicodeSet * allowedChars,UErrorCode & status)219 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
220 UScriptCode scripts[30];
221
222 int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
223 if (U_FAILURE(status)) {
224 return;
225 }
226 if (status == U_USING_DEFAULT_WARNING) {
227 status = U_ILLEGAL_ARGUMENT_ERROR;
228 return;
229 }
230 UnicodeSet tmpSet;
231 int32_t i;
232 for (i=0; i<numScripts; i++) {
233 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
234 allowedChars->addAll(tmpSet);
235 }
236 }
237
238 // Computes the augmented script set for a code point, according to UTS 39 section 5.1.
getAugmentedScriptSet(UChar32 codePoint,ScriptSet & result,UErrorCode & status)239 void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
240 result.resetAll();
241 result.setScriptExtensions(codePoint, status);
242 if (U_FAILURE(status)) { return; }
243
244 // Section 5.1 step 1
245 if (result.test(USCRIPT_HAN, status)) {
246 result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
247 result.set(USCRIPT_JAPANESE, status);
248 result.set(USCRIPT_KOREAN, status);
249 }
250 if (result.test(USCRIPT_HIRAGANA, status)) {
251 result.set(USCRIPT_JAPANESE, status);
252 }
253 if (result.test(USCRIPT_KATAKANA, status)) {
254 result.set(USCRIPT_JAPANESE, status);
255 }
256 if (result.test(USCRIPT_HANGUL, status)) {
257 result.set(USCRIPT_KOREAN, status);
258 }
259 if (result.test(USCRIPT_BOPOMOFO, status)) {
260 result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
261 }
262
263 // Section 5.1 step 2
264 if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
265 result.setAll();
266 }
267 }
268
269 // Computes the resolved script set for a string, according to UTS 39 section 5.1.
getResolvedScriptSet(const UnicodeString & input,ScriptSet & result,UErrorCode & status) const270 void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
271 getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
272 }
273
274 // Computes the resolved script set for a string, omitting characters having the specified script.
275 // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
getResolvedScriptSetWithout(const UnicodeString & input,UScriptCode script,ScriptSet & result,UErrorCode & status) const276 void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
277 result.setAll();
278
279 ScriptSet temp;
280 UChar32 codePoint;
281 for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
282 codePoint = input.char32At(i);
283
284 // Compute the augmented script set for the character
285 getAugmentedScriptSet(codePoint, temp, status);
286 if (U_FAILURE(status)) { return; }
287
288 // Intersect the augmented script set with the resolved script set, but only if the character doesn't
289 // have the script specified in the function call
290 if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
291 result.intersect(temp);
292 }
293 }
294 }
295
296 // Computes the set of numerics for a string, according to UTS 39 section 5.3.
getNumerics(const UnicodeString & input,UnicodeSet & result,UErrorCode &) const297 void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
298 result.clear();
299
300 UChar32 codePoint;
301 for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
302 codePoint = input.char32At(i);
303
304 // Store a representative character for each kind of decimal digit
305 if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
306 // Store the zero character as a representative for comparison.
307 // Unicode guarantees it is codePoint - value
308 result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
309 }
310 }
311 }
312
313 // Computes the restriction level of a string, according to UTS 39 section 5.2.
getRestrictionLevel(const UnicodeString & input,UErrorCode & status) const314 URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
315 // Section 5.2 step 1:
316 if (!fAllowedCharsSet->containsAll(input)) {
317 return USPOOF_UNRESTRICTIVE;
318 }
319
320 // Section 5.2 step 2
321 // Java use a static UnicodeSet for this test. In C++, avoid the static variable
322 // and just do a simple for loop.
323 UBool allASCII = true;
324 for (int32_t i=0, length=input.length(); i<length; i++) {
325 if (input.charAt(i) > 0x7f) {
326 allASCII = false;
327 break;
328 }
329 }
330 if (allASCII) {
331 return USPOOF_ASCII;
332 }
333
334 // Section 5.2 steps 3:
335 ScriptSet resolvedScriptSet;
336 getResolvedScriptSet(input, resolvedScriptSet, status);
337 if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
338
339 // Section 5.2 step 4:
340 if (!resolvedScriptSet.isEmpty()) {
341 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
342 }
343
344 // Section 5.2 step 5:
345 ScriptSet resolvedNoLatn;
346 getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
347 if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
348
349 // Section 5.2 step 6:
350 if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
351 || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
352 || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
353 return USPOOF_HIGHLY_RESTRICTIVE;
354 }
355
356 // Section 5.2 step 7:
357 if (!resolvedNoLatn.isEmpty()
358 && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
359 && !resolvedNoLatn.test(USCRIPT_GREEK, status)
360 && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
361 return USPOOF_MODERATELY_RESTRICTIVE;
362 }
363
364 // Section 5.2 step 8:
365 return USPOOF_MINIMALLY_RESTRICTIVE;
366 }
367
findHiddenOverlay(const UnicodeString & input,UErrorCode &) const368 int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
369 bool sawLeadCharacter = false;
370 for (int32_t i=0; i<input.length();) {
371 UChar32 cp = input.char32At(i);
372 if (sawLeadCharacter && cp == 0x0307) {
373 return i;
374 }
375 uint8_t combiningClass = u_getCombiningClass(cp);
376 // Skip over characters except for those with combining class 0 (non-combining characters) or with
377 // combining class 230 (same class as U+0307)
378 U_ASSERT(u_getCombiningClass(0x0307) == 230);
379 if (combiningClass == 0 || combiningClass == 230) {
380 sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
381 }
382 i += U16_LENGTH(cp);
383 }
384 return -1;
385 }
386
isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp)387 static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
388 return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' ||
389 u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
390 }
391
isIllegalCombiningDotLeadCharacter(UChar32 cp) const392 bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
393 if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
394 return true;
395 }
396 UnicodeString skelStr;
397 fSpoofData->confusableLookup(cp, skelStr);
398 UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
399 if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
400 return true;
401 }
402 return false;
403 }
404
405
406
407 // Convert a text format hex number. Utility function used by builder code. Static.
408 // Input: UChar *string text. Output: a UChar32
409 // Input has been pre-checked, and will have no non-hex chars.
410 // The number must fall in the code point range of 0..0x10ffff
411 // Static Function.
ScanHex(const UChar * s,int32_t start,int32_t limit,UErrorCode & status)412 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
413 if (U_FAILURE(status)) {
414 return 0;
415 }
416 U_ASSERT(limit-start > 0);
417 uint32_t val = 0;
418 int i;
419 for (i=start; i<limit; i++) {
420 int digitVal = s[i] - 0x30;
421 if (digitVal>9) {
422 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
423 }
424 if (digitVal>15) {
425 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
426 }
427 U_ASSERT(digitVal <= 0xf);
428 val <<= 4;
429 val += digitVal;
430 }
431 if (val > 0x10ffff) {
432 status = U_PARSE_ERROR;
433 val = 0;
434 }
435 return (UChar32)val;
436 }
437
438
439 //-----------------------------------------
440 //
441 // class CheckResult Implementation
442 //
443 //-----------------------------------------
444
CheckResult()445 CheckResult::CheckResult() {
446 clear();
447 }
448
asUSpoofCheckResult()449 USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
450 return exportForC();
451 }
452
453 //
454 // Incoming parameter check on Status and the CheckResult object
455 // received from the C API.
456 //
validateThis(const USpoofCheckResult * ptr,UErrorCode & status)457 const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
458 return validate(ptr, status);
459 }
460
validateThis(USpoofCheckResult * ptr,UErrorCode & status)461 CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
462 return validate(ptr, status);
463 }
464
clear()465 void CheckResult::clear() {
466 fChecks = 0;
467 fNumerics.clear();
468 fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
469 }
470
toCombinedBitmask(int32_t enabledChecks)471 int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
472 if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
473 return fChecks | fRestrictionLevel;
474 } else {
475 return fChecks;
476 }
477 }
478
~CheckResult()479 CheckResult::~CheckResult() {
480 }
481
482 //----------------------------------------------------------------------------------------------
483 //
484 // class SpoofData Implementation
485 //
486 //----------------------------------------------------------------------------------------------
487
488
validateDataVersion(UErrorCode & status) const489 UBool SpoofData::validateDataVersion(UErrorCode &status) const {
490 if (U_FAILURE(status) ||
491 fRawData == NULL ||
492 fRawData->fMagic != USPOOF_MAGIC ||
493 fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
494 fRawData->fFormatVersion[1] != 0 ||
495 fRawData->fFormatVersion[2] != 0 ||
496 fRawData->fFormatVersion[3] != 0) {
497 status = U_INVALID_FORMAT_ERROR;
498 return false;
499 }
500 return true;
501 }
502
503 static UBool U_CALLCONV
spoofDataIsAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)504 spoofDataIsAcceptable(void *context,
505 const char * /* type */, const char * /*name*/,
506 const UDataInfo *pInfo) {
507 if(
508 pInfo->size >= 20 &&
509 pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
510 pInfo->charsetFamily == U_CHARSET_FAMILY &&
511 pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "
512 pInfo->dataFormat[1] == 0x66 &&
513 pInfo->dataFormat[2] == 0x75 &&
514 pInfo->dataFormat[3] == 0x20 &&
515 pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
516 ) {
517 UVersionInfo *version = static_cast<UVersionInfo *>(context);
518 if(version != NULL) {
519 uprv_memcpy(version, pInfo->dataVersion, 4);
520 }
521 return true;
522 } else {
523 return false;
524 }
525 }
526
527 // Methods for the loading of the default confusables data file. The confusable
528 // data is loaded only when it is needed.
529 //
530 // SpoofData::getDefault() - Return the default confusables data, and call the
531 // initOnce() if it is not available. Adds a reference
532 // to the SpoofData that the caller is responsible for
533 // decrementing when they are done with the data.
534 //
535 // uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
536 // is shared by all spoof checkers using the default data.
537 //
538 // uspoof_cleanupDefaultData - Called during cleanup.
539 //
540
541 static UInitOnce gSpoofInitDefaultOnce {};
542 static SpoofData* gDefaultSpoofData;
543
544 static UBool U_CALLCONV
uspoof_cleanupDefaultData(void)545 uspoof_cleanupDefaultData(void) {
546 if (gDefaultSpoofData) {
547 // Will delete, assuming all user-level spoof checkers were closed.
548 gDefaultSpoofData->removeReference();
549 gDefaultSpoofData = nullptr;
550 gSpoofInitDefaultOnce.reset();
551 }
552 return true;
553 }
554
uspoof_loadDefaultData(UErrorCode & status)555 static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
556 UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
557 spoofDataIsAcceptable,
558 nullptr, // context, would receive dataVersion if supplied.
559 &status);
560 if (U_FAILURE(status)) { return; }
561 gDefaultSpoofData = new SpoofData(udm, status);
562 if (U_FAILURE(status)) {
563 delete gDefaultSpoofData;
564 gDefaultSpoofData = nullptr;
565 return;
566 }
567 if (gDefaultSpoofData == nullptr) {
568 status = U_MEMORY_ALLOCATION_ERROR;
569 return;
570 }
571 ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
572 }
573
getDefault(UErrorCode & status)574 SpoofData* SpoofData::getDefault(UErrorCode& status) {
575 umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
576 if (U_FAILURE(status)) { return NULL; }
577 gDefaultSpoofData->addReference();
578 return gDefaultSpoofData;
579 }
580
581
582
SpoofData(UDataMemory * udm,UErrorCode & status)583 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
584 {
585 reset();
586 if (U_FAILURE(status)) {
587 return;
588 }
589 fUDM = udm;
590 // fRawData is non-const because it may be constructed by the data builder.
591 fRawData = reinterpret_cast<SpoofDataHeader *>(
592 const_cast<void *>(udata_getMemory(udm)));
593 validateDataVersion(status);
594 initPtrs(status);
595 }
596
597
SpoofData(const void * data,int32_t length,UErrorCode & status)598 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
599 {
600 reset();
601 if (U_FAILURE(status)) {
602 return;
603 }
604 if ((size_t)length < sizeof(SpoofDataHeader)) {
605 status = U_INVALID_FORMAT_ERROR;
606 return;
607 }
608 if (data == NULL) {
609 status = U_ILLEGAL_ARGUMENT_ERROR;
610 return;
611 }
612 void *ncData = const_cast<void *>(data);
613 fRawData = static_cast<SpoofDataHeader *>(ncData);
614 if (length < fRawData->fLength) {
615 status = U_INVALID_FORMAT_ERROR;
616 return;
617 }
618 validateDataVersion(status);
619 initPtrs(status);
620 }
621
622
623 // Spoof Data constructor for use from data builder.
624 // Initializes a new, empty data area that will be populated later.
SpoofData(UErrorCode & status)625 SpoofData::SpoofData(UErrorCode &status) {
626 reset();
627 if (U_FAILURE(status)) {
628 return;
629 }
630 fDataOwned = true;
631
632 // The spoof header should already be sized to be a multiple of 16 bytes.
633 // Just in case it's not, round it up.
634 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
635 U_ASSERT(initialSize == sizeof(SpoofDataHeader));
636
637 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
638 fMemLimit = initialSize;
639 if (fRawData == NULL) {
640 status = U_MEMORY_ALLOCATION_ERROR;
641 return;
642 }
643 uprv_memset(fRawData, 0, initialSize);
644
645 fRawData->fMagic = USPOOF_MAGIC;
646 fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
647 fRawData->fFormatVersion[1] = 0;
648 fRawData->fFormatVersion[2] = 0;
649 fRawData->fFormatVersion[3] = 0;
650 initPtrs(status);
651 }
652
653 // reset() - initialize all fields.
654 // Should be updated if any new fields are added.
655 // Called by constructors to put things in a known initial state.
reset()656 void SpoofData::reset() {
657 fRawData = NULL;
658 fDataOwned = false;
659 fUDM = NULL;
660 fMemLimit = 0;
661 fRefCount = 1;
662 fCFUKeys = NULL;
663 fCFUValues = NULL;
664 fCFUStrings = NULL;
665 }
666
667
668 // SpoofData::initPtrs()
669 // Initialize the pointers to the various sections of the raw data.
670 //
671 // This function is used both during the Trie building process (multiple
672 // times, as the individual data sections are added), and
673 // during the opening of a Spoof Checker from prebuilt data.
674 //
675 // The pointers for non-existent data sections (identified by an offset of 0)
676 // are set to NULL.
677 //
678 // Note: During building the data, adding each new data section
679 // reallocs the raw data area, which likely relocates it, which
680 // in turn requires reinitializing all of the pointers into it, hence
681 // multiple calls to this function during building.
682 //
initPtrs(UErrorCode & status)683 void SpoofData::initPtrs(UErrorCode &status) {
684 fCFUKeys = NULL;
685 fCFUValues = NULL;
686 fCFUStrings = NULL;
687 if (U_FAILURE(status)) {
688 return;
689 }
690 if (fRawData->fCFUKeys != 0) {
691 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
692 }
693 if (fRawData->fCFUStringIndex != 0) {
694 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
695 }
696 if (fRawData->fCFUStringTable != 0) {
697 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
698 }
699 }
700
701
~SpoofData()702 SpoofData::~SpoofData() {
703 if (fDataOwned) {
704 uprv_free(fRawData);
705 }
706 fRawData = NULL;
707 if (fUDM != NULL) {
708 udata_close(fUDM);
709 }
710 fUDM = NULL;
711 }
712
713
removeReference()714 void SpoofData::removeReference() {
715 if (umtx_atomic_dec(&fRefCount) == 0) {
716 delete this;
717 }
718 }
719
720
addReference()721 SpoofData *SpoofData::addReference() {
722 umtx_atomic_inc(&fRefCount);
723 return this;
724 }
725
726
reserveSpace(int32_t numBytes,UErrorCode & status)727 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
728 if (U_FAILURE(status)) {
729 return NULL;
730 }
731 if (!fDataOwned) {
732 UPRV_UNREACHABLE_EXIT;
733 }
734
735 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
736 uint32_t returnOffset = fMemLimit;
737 fMemLimit += numBytes;
738 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
739 fRawData->fLength = fMemLimit;
740 uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
741 initPtrs(status);
742 return (char *)fRawData + returnOffset;
743 }
744
serialize(void * buf,int32_t capacity,UErrorCode & status) const745 int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
746 int32_t dataSize = fRawData->fLength;
747 if (capacity < dataSize) {
748 status = U_BUFFER_OVERFLOW_ERROR;
749 return dataSize;
750 }
751 uprv_memcpy(buf, fRawData, dataSize);
752 return dataSize;
753 }
754
size() const755 int32_t SpoofData::size() const {
756 return fRawData->fLength;
757 }
758
759 //-------------------------------
760 //
761 // Front-end APIs for SpoofData
762 //
763 //-------------------------------
764
confusableLookup(UChar32 inChar,UnicodeString & dest) const765 int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
766 // Perform a binary search.
767 // [lo, hi), i.e lo is inclusive, hi is exclusive.
768 // The result after the loop will be in lo.
769 int32_t lo = 0;
770 int32_t hi = length();
771 do {
772 int32_t mid = (lo + hi) / 2;
773 if (codePointAt(mid) > inChar) {
774 hi = mid;
775 } else if (codePointAt(mid) < inChar) {
776 lo = mid;
777 } else {
778 // Found result. Break early.
779 lo = mid;
780 break;
781 }
782 } while (hi - lo > 1);
783
784 // Did we find an entry? If not, the char maps to itself.
785 if (codePointAt(lo) != inChar) {
786 dest.append(inChar);
787 return 1;
788 }
789
790 // Add the element to the string builder and return.
791 return appendValueTo(lo, dest);
792 }
793
length() const794 int32_t SpoofData::length() const {
795 return fRawData->fCFUKeysSize;
796 }
797
codePointAt(int32_t index) const798 UChar32 SpoofData::codePointAt(int32_t index) const {
799 return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
800 }
801
appendValueTo(int32_t index,UnicodeString & dest) const802 int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
803 int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
804
805 // Value is either a char (for strings of length 1) or
806 // an index into the string table (for longer strings)
807 uint16_t value = fCFUValues[index];
808 if (stringLength == 1) {
809 dest.append((UChar)value);
810 } else {
811 dest.append(fCFUStrings + value, stringLength);
812 }
813
814 return stringLength;
815 }
816
817
818 U_NAMESPACE_END
819
820 U_NAMESPACE_USE
821
822 //-----------------------------------------------------------------------------
823 //
824 // uspoof_swap - byte swap and char encoding swap of spoof data
825 //
826 //-----------------------------------------------------------------------------
827 U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)828 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
829 UErrorCode *status) {
830
831 if (status == NULL || U_FAILURE(*status)) {
832 return 0;
833 }
834 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
835 *status=U_ILLEGAL_ARGUMENT_ERROR;
836 return 0;
837 }
838
839 //
840 // Check that the data header is for spoof data.
841 // (Header contents are defined in gencfu.cpp)
842 //
843 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
844 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
845 pInfo->dataFormat[1]==0x66 &&
846 pInfo->dataFormat[2]==0x75 &&
847 pInfo->dataFormat[3]==0x20 &&
848 pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
849 pInfo->formatVersion[1]==0 &&
850 pInfo->formatVersion[2]==0 &&
851 pInfo->formatVersion[3]==0 )) {
852 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
853 "(format version %02x %02x %02x %02x) is not recognized\n",
854 pInfo->dataFormat[0], pInfo->dataFormat[1],
855 pInfo->dataFormat[2], pInfo->dataFormat[3],
856 pInfo->formatVersion[0], pInfo->formatVersion[1],
857 pInfo->formatVersion[2], pInfo->formatVersion[3]);
858 *status=U_UNSUPPORTED_ERROR;
859 return 0;
860 }
861
862 //
863 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
864 // header). This swap also conveniently gets us
865 // the size of the ICU d.h., which lets us locate the start
866 // of the uspoof specific data.
867 //
868 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
869
870
871 //
872 // Get the Spoof Data Header, and check that it appears to be OK.
873 //
874 //
875 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
876 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
877 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
878 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
879 {
880 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
881 *status=U_UNSUPPORTED_ERROR;
882 return 0;
883 }
884
885 //
886 // Prefight operation? Just return the size
887 //
888 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
889 int32_t totalSize = headerSize + spoofDataLength;
890 if (length < 0) {
891 return totalSize;
892 }
893
894 //
895 // Check that length passed in is consistent with length from Spoof data header.
896 //
897 if (length < totalSize) {
898 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
899 spoofDataLength);
900 *status=U_INDEX_OUTOFBOUNDS_ERROR;
901 return 0;
902 }
903
904
905 //
906 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
907 // we need to reference the header to locate the data, and an
908 // inplace swap of the header leaves it unusable.
909 //
910 uint8_t *outBytes = (uint8_t *)outData + headerSize;
911 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
912
913 int32_t sectionStart;
914 int32_t sectionLength;
915
916 //
917 // If not swapping in place, zero out the output buffer before starting.
918 // Gaps may exist between the individual sections, and these must be zeroed in
919 // the output buffer. The simplest way to do that is to just zero the whole thing.
920 //
921 if (inBytes != outBytes) {
922 uprv_memset(outBytes, 0, spoofDataLength);
923 }
924
925 // Confusables Keys Section (fCFUKeys)
926 sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
927 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
928 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
929
930 // String Index Section
931 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
932 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
933 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
934
935 // String Table Section
936 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
937 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
938 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
939
940 // And, last, swap the header itself.
941 // int32_t fMagic // swap this
942 // uint8_t fFormatVersion[4] // Do not swap this, just copy
943 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
944 //
945 uint32_t magic = ds->readUInt32(spoofDH->fMagic);
946 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
947
948 if (inBytes != outBytes) {
949 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
950 }
951 // swap starting at fLength
952 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
953
954 return totalSize;
955 }
956
957 #endif
958
959
960