1 /*
2 **********************************************************************
3 * Copyright (C) 2008-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9 #include "unicode/uspoof.h"
10 #include "unicode/unorm.h"
11 #include "unicode/uchar.h"
12 #include "unicode/uniset.h"
13 #include "utrie2.h"
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "udatamem.h"
17 #include "umutex.h"
18 #include "udataswp.h"
19 #include "uassert.h"
20 #include "uspoof_impl.h"
21
22 #if !UCONFIG_NO_NORMALIZATION
23
24
25 U_NAMESPACE_BEGIN
26
// Expand ICU's RTTI implementation macro for the SpoofImpl class.
// NOTE(review): presumably this provides the class-ID functions that
// SpoofImpl declares in uspoof_impl.h -- confirm against unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
28
29 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
30 fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) , fAllowedLocales(uprv_strdup("")) {
31 if (U_FAILURE(status)) {
32 return;
33 }
34 fMagic = USPOOF_MAGIC;
35 fSpoofData = data;
36 fChecks = USPOOF_ALL_CHECKS;
37 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
38 if (allowedCharsSet == NULL || fAllowedLocales == NULL) {
39 status = U_MEMORY_ALLOCATION_ERROR;
40 return;
41 }
42 allowedCharsSet->freeze();
43 fAllowedCharsSet = allowedCharsSet;
44 }
45
46
SpoofImpl()47 SpoofImpl::SpoofImpl() {
48 fMagic = USPOOF_MAGIC;
49 fSpoofData = NULL;
50 fChecks = USPOOF_ALL_CHECKS;
51 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
52 allowedCharsSet->freeze();
53 fAllowedCharsSet = allowedCharsSet;
54 fAllowedLocales = uprv_strdup("");
55 }
56
57
58 // Copy Constructor, used by the user level clone() function.
//   src     The checker to copy.  Its SpoofData is shared by taking an extra
//           reference; the allowed-characters set and the allowed-locales
//           string are duplicated.
//   status  ICU in/out error code.  If it already indicates a failure nothing
//           is copied and fMagic stays 0.  Set to U_MEMORY_ALLOCATION_ERROR
//           when the set or the locales string could not be duplicated
//           (including the case where src itself lacks one of them).
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
        fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL), fAllowedLocales(NULL) {
    // fAllowedLocales is initialized to NULL above so that the destructor
    // never hands an indeterminate pointer to uprv_free() after an early return.
    fChecks = src.fChecks;
    if (U_FAILURE(status)) {
        return;
    }
    fMagic = src.fMagic;
    if (src.fSpoofData != NULL) {
        fSpoofData = src.fSpoofData->addReference();
    }
    if (src.fAllowedCharsSet != NULL) {
        fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
    }
    if (src.fAllowedLocales != NULL) {
        fAllowedLocales = uprv_strdup(src.fAllowedLocales);
    }
    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
75
~SpoofImpl()76 SpoofImpl::~SpoofImpl() {
77 fMagic = 0; // head off application errors by preventing use of
78 // of deleted objects.
79 if (fSpoofData != NULL) {
80 fSpoofData->removeReference(); // Will delete if refCount goes to zero.
81 }
82 delete fAllowedCharsSet;
83 uprv_free((void *)fAllowedLocales);
84 }
85
86 //
87 // Incoming parameter check on Status and the SpoofChecker object
88 // received from the C API.
89 //
validateThis(const USpoofChecker * sc,UErrorCode & status)90 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
91 if (U_FAILURE(status)) {
92 return NULL;
93 }
94 if (sc == NULL) {
95 status = U_ILLEGAL_ARGUMENT_ERROR;
96 return NULL;
97 };
98 SpoofImpl *This = (SpoofImpl *)sc;
99 if (This->fMagic != USPOOF_MAGIC ||
100 This->fSpoofData == NULL) {
101 status = U_INVALID_FORMAT_ERROR;
102 return NULL;
103 }
104 if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
105 return NULL;
106 }
107 return This;
108 }
109
validateThis(USpoofChecker * sc,UErrorCode & status)110 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
111 return const_cast<SpoofImpl *>
112 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
113 }
114
115
116
117 //--------------------------------------------------------------------------------------
118 //
119 // confusableLookup() This is the heart of the confusable skeleton generation
120 // implementation.
121 //
122 // Given a source character, produce the corresponding
123 // replacement character(s)
124 //
125 //---------------------------------------------------------------------------------------
confusableLookup(UChar32 inChar,int32_t tableMask,UChar * destBuf) const126 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const {
127
128 // Binary search the spoof data key table for the inChar
129 int32_t *low = fSpoofData->fCFUKeys;
130 int32_t *mid = NULL;
131 int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize;
132 UChar32 midc;
133 do {
134 int32_t delta = ((int32_t)(limit-low))/2;
135 mid = low + delta;
136 midc = *mid & 0x1fffff;
137 if (inChar == midc) {
138 goto foundChar;
139 } else if (inChar < midc) {
140 limit = mid;
141 } else {
142 low = mid;
143 }
144 } while (low < limit-1);
145 mid = low;
146 midc = *mid & 0x1fffff;
147 if (inChar != midc) {
148 // Char not found. It maps to itself.
149 int i = 0;
150 U16_APPEND_UNSAFE(destBuf, i, inChar)
151 return i;
152 }
153 foundChar:
154 int32_t keyFlags = *mid & 0xff000000;
155 if ((keyFlags & tableMask) == 0) {
156 // We found the right key char, but the entry doesn't pertain to the
157 // table we need. See if there is an adjacent key that does
158 if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
159 int32_t *altMid;
160 for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
161 keyFlags = *altMid & 0xff000000;
162 if (keyFlags & tableMask) {
163 mid = altMid;
164 goto foundKey;
165 }
166 }
167 for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
168 keyFlags = *altMid & 0xff000000;
169 if (keyFlags & tableMask) {
170 mid = altMid;
171 goto foundKey;
172 }
173 }
174 }
175 // No key entry for this char & table.
176 // The input char maps to itself.
177 int i = 0;
178 U16_APPEND_UNSAFE(destBuf, i, inChar)
179 return i;
180 }
181
182 foundKey:
183 int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
184 int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
185
186 // Value is either a UChar (for strings of length 1) or
187 // an index into the string table (for longer strings)
188 uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
189 if (stringLen == 1) {
190 destBuf[0] = value;
191 return 1;
192 }
193
194 // String length of 4 from the above lookup is used for all strings of length >= 4.
195 // For these, get the real length from the string lengths table,
196 // which maps string table indexes to lengths.
197 // All strings of the same length are stored contiguously in the string table.
198 // 'value' from the lookup above is the starting index for the desired string.
199
200 int32_t ix;
201 if (stringLen == 4) {
202 int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
203 for (ix = 0; ix < stringLengthsLimit; ix++) {
204 if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
205 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
206 break;
207 }
208 }
209 U_ASSERT(ix < stringLengthsLimit);
210 }
211
212 U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
213 UChar *src = &fSpoofData->fCFUStrings[value];
214 for (ix=0; ix<stringLen; ix++) {
215 destBuf[ix] = src[ix];
216 }
217 return stringLen;
218 }
219
220
221 //---------------------------------------------------------------------------------------
222 //
223 // wholeScriptCheck()
224 //
225 // Input text is already normalized to NFD
226 // Return the set of scripts, each of which can represent something that is
227 // confusable with the input text. The script of the input text
228 // is included; input consisting of characters from a single script will
229 // always produce a result consisting of a set containing that script.
230 //
231 //---------------------------------------------------------------------------------------
wholeScriptCheck(const UChar * text,int32_t length,ScriptSet * result,UErrorCode & status) const232 void SpoofImpl::wholeScriptCheck(
233 const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const {
234
235 int32_t inputIdx = 0;
236 UChar32 c;
237
238 UTrie2 *table =
239 (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
240 result->setAll();
241 while (inputIdx < length) {
242 U16_NEXT(text, inputIdx, length, c);
243 uint32_t index = utrie2_get32(table, c);
244 if (index == 0) {
245 // No confusables in another script for this char.
246 // TODO: we should change the data to have sets with just the single script
247 // bit for the script of this char. Gets rid of this special case.
248 // Until then, grab the script from the char and intersect it with the set.
249 UScriptCode cpScript = uscript_getScript(c, &status);
250 U_ASSERT(cpScript > USCRIPT_INHERITED);
251 result->intersect(cpScript);
252 } else if (index == 1) {
253 // Script == Common or Inherited. Nothing to do.
254 } else {
255 result->intersect(fSpoofData->fScriptSets[index]);
256 }
257 }
258 }
259
260
setAllowedLocales(const char * localesList,UErrorCode & status)261 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
262 UnicodeSet allowedChars;
263 UnicodeSet *tmpSet = NULL;
264 const char *locStart = localesList;
265 const char *locEnd = NULL;
266 const char *localesListEnd = localesList + uprv_strlen(localesList);
267 int32_t localeListCount = 0; // Number of locales provided by caller.
268
269 // Loop runs once per locale from the localesList, a comma separated list of locales.
270 do {
271 locEnd = uprv_strchr(locStart, ',');
272 if (locEnd == NULL) {
273 locEnd = localesListEnd;
274 }
275 while (*locStart == ' ') {
276 locStart++;
277 }
278 const char *trimmedEnd = locEnd-1;
279 while (trimmedEnd > locStart && *trimmedEnd == ' ') {
280 trimmedEnd--;
281 }
282 if (trimmedEnd <= locStart) {
283 break;
284 }
285 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
286 localeListCount++;
287
288 // We have one locale from the locales list.
289 // Add the script chars for this locale to the accumulating set of allowed chars.
290 // If the locale is no good, we will be notified back via status.
291 addScriptChars(locale, &allowedChars, status);
292 uprv_free((void *)locale);
293 if (U_FAILURE(status)) {
294 break;
295 }
296 locStart = locEnd + 1;
297 } while (locStart < localesListEnd);
298
299 // If our caller provided an empty list of locales, we disable the allowed characters checking
300 if (localeListCount == 0) {
301 uprv_free((void *)fAllowedLocales);
302 fAllowedLocales = uprv_strdup("");
303 tmpSet = new UnicodeSet(0, 0x10ffff);
304 if (fAllowedLocales == NULL || tmpSet == NULL) {
305 status = U_MEMORY_ALLOCATION_ERROR;
306 return;
307 }
308 tmpSet->freeze();
309 delete fAllowedCharsSet;
310 fAllowedCharsSet = tmpSet;
311 fChecks &= ~USPOOF_CHAR_LIMIT;
312 return;
313 }
314
315
316 // Add all common and inherited characters to the set of allowed chars.
317 UnicodeSet tempSet;
318 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
319 allowedChars.addAll(tempSet);
320 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
321 allowedChars.addAll(tempSet);
322
323 // If anything went wrong, we bail out without changing
324 // the state of the spoof checker.
325 if (U_FAILURE(status)) {
326 return;
327 }
328
329 // Store the updated spoof checker state.
330 tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
331 const char *tmpLocalesList = uprv_strdup(localesList);
332 if (tmpSet == NULL || tmpLocalesList == NULL) {
333 status = U_MEMORY_ALLOCATION_ERROR;
334 return;
335 }
336 uprv_free((void *)fAllowedLocales);
337 fAllowedLocales = tmpLocalesList;
338 tmpSet->freeze();
339 delete fAllowedCharsSet;
340 fAllowedCharsSet = tmpSet;
341 fChecks |= USPOOF_CHAR_LIMIT;
342 }
343
344
getAllowedLocales(UErrorCode &)345 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
346 return fAllowedLocales;
347 }
348
349
350 // Given a locale (a language), add all the characters from all of the scripts used with that language
351 // to the allowedChars UnicodeSet
352
addScriptChars(const char * locale,UnicodeSet * allowedChars,UErrorCode & status)353 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
354 UScriptCode scripts[30];
355
356 int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
357 if (U_FAILURE(status)) {
358 return;
359 }
360 if (status == U_USING_DEFAULT_WARNING) {
361 status = U_ILLEGAL_ARGUMENT_ERROR;
362 return;
363 }
364 UnicodeSet tmpSet;
365 int32_t i;
366 for (i=0; i<numScripts; i++) {
367 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
368 allowedChars->addAll(tmpSet);
369 }
370 }
371
372
scriptScan(const UChar * text,int32_t length,int32_t & pos,UErrorCode & status) const373 int32_t SpoofImpl::scriptScan
374 (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const {
375 if (U_FAILURE(status)) {
376 return 0;
377 }
378 int32_t inputIdx = 0;
379 UChar32 c;
380 int32_t scriptCount = 0;
381 UScriptCode lastScript = USCRIPT_INVALID_CODE;
382 UScriptCode sc = USCRIPT_INVALID_CODE;
383 while ((inputIdx < length || length == -1) && scriptCount < 2) {
384 U16_NEXT(text, inputIdx, length, c);
385 if (c == 0 && length == -1) {
386 break;
387 }
388 sc = uscript_getScript(c, &status);
389 if (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN) {
390 continue;
391 }
392
393 // Temporary fix: fold Japanese Hiragana and Katakana into Han.
394 // Names are allowed to mix these scripts.
395 // A more general solution will follow later for characters that are
396 // used with multiple scripts.
397
398 if (sc == USCRIPT_HIRAGANA || sc == USCRIPT_KATAKANA || sc == USCRIPT_HANGUL) {
399 sc = USCRIPT_HAN;
400 }
401
402 if (sc != lastScript) {
403 scriptCount++;
404 lastScript = sc;
405 }
406 }
407 if (scriptCount == 2) {
408 pos = inputIdx;
409 }
410 return scriptCount;
411 }
412
413
414 // Convert a text format hex number. Utility function used by builder code. Static.
415 // Input: UChar *string text. Output: a UChar32
416 // Input has been pre-checked, and will have no non-hex chars.
417 // The number must fall in the code point range of 0..0x10ffff
418 // Static Function.
ScanHex(const UChar * s,int32_t start,int32_t limit,UErrorCode & status)419 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
420 if (U_FAILURE(status)) {
421 return 0;
422 }
423 U_ASSERT(limit-start > 0);
424 uint32_t val = 0;
425 int i;
426 for (i=start; i<limit; i++) {
427 int digitVal = s[i] - 0x30;
428 if (digitVal>9) {
429 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
430 }
431 if (digitVal>15) {
432 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
433 }
434 U_ASSERT(digitVal <= 0xf);
435 val <<= 4;
436 val += digitVal;
437 }
438 if (val > 0x10ffff) {
439 status = U_PARSE_ERROR;
440 val = 0;
441 }
442 return (UChar32)val;
443 }
444
445
446
447 //----------------------------------------------------------------------------------------------
448 //
449 // class SpoofData Implementation
450 //
451 //----------------------------------------------------------------------------------------------
452
453
validateDataVersion(const SpoofDataHeader * rawData,UErrorCode & status)454 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
455 if (U_FAILURE(status) ||
456 rawData == NULL ||
457 rawData->fMagic != USPOOF_MAGIC ||
458 rawData->fFormatVersion[0] > 1 ||
459 rawData->fFormatVersion[1] > 0) {
460 status = U_INVALID_FORMAT_ERROR;
461 return FALSE;
462 }
463 return TRUE;
464 }
465
466 //
467 // SpoofData::getDefault() - return a wrapper around the spoof data that is
468 // baked into the default ICU data.
469 //
getDefault(UErrorCode & status)470 SpoofData *SpoofData::getDefault(UErrorCode &status) {
471 // TODO: Cache it. Lazy create, keep until cleanup.
472
473 UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status);
474 if (U_FAILURE(status)) {
475 return NULL;
476 }
477 SpoofData *This = new SpoofData(udm, status);
478 if (U_FAILURE(status)) {
479 delete This;
480 return NULL;
481 }
482 if (This == NULL) {
483 status = U_MEMORY_ALLOCATION_ERROR;
484 }
485 return This;
486 }
487
488
SpoofData(UDataMemory * udm,UErrorCode & status)489 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
490 {
491 reset();
492 if (U_FAILURE(status)) {
493 return;
494 }
495 fRawData = reinterpret_cast<SpoofDataHeader *>
496 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
497 fUDM = udm;
498 validateDataVersion(fRawData, status);
499 initPtrs(status);
500 }
501
502
SpoofData(const void * data,int32_t length,UErrorCode & status)503 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
504 {
505 reset();
506 if (U_FAILURE(status)) {
507 return;
508 }
509 if ((size_t)length < sizeof(SpoofDataHeader)) {
510 status = U_INVALID_FORMAT_ERROR;
511 return;
512 }
513 void *ncData = const_cast<void *>(data);
514 fRawData = static_cast<SpoofDataHeader *>(ncData);
515 if (length < fRawData->fLength) {
516 status = U_INVALID_FORMAT_ERROR;
517 return;
518 }
519 validateDataVersion(fRawData, status);
520 initPtrs(status);
521 }
522
523
524 // Spoof Data constructor for use from data builder.
525 // Initializes a new, empty data area that will be populated later.
SpoofData(UErrorCode & status)526 SpoofData::SpoofData(UErrorCode &status) {
527 reset();
528 if (U_FAILURE(status)) {
529 return;
530 }
531 fDataOwned = true;
532 fRefCount = 1;
533
534 // The spoof header should already be sized to be a multiple of 16 bytes.
535 // Just in case it's not, round it up.
536 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
537 U_ASSERT(initialSize == sizeof(SpoofDataHeader));
538
539 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
540 fMemLimit = initialSize;
541 if (fRawData == NULL) {
542 status = U_MEMORY_ALLOCATION_ERROR;
543 return;
544 }
545 uprv_memset(fRawData, 0, initialSize);
546
547 fRawData->fMagic = USPOOF_MAGIC;
548 fRawData->fFormatVersion[0] = 1;
549 fRawData->fFormatVersion[1] = 0;
550 fRawData->fFormatVersion[2] = 0;
551 fRawData->fFormatVersion[3] = 0;
552 initPtrs(status);
553 }
554
555 // reset() - initialize all fields.
556 // Should be updated if any new fields are added.
557 // Called by constructors to put things in a known initial state.
reset()558 void SpoofData::reset() {
559 fRawData = NULL;
560 fDataOwned = FALSE;
561 fUDM = NULL;
562 fMemLimit = 0;
563 fRefCount = 1;
564 fCFUKeys = NULL;
565 fCFUValues = NULL;
566 fCFUStringLengths = NULL;
567 fCFUStrings = NULL;
568 fAnyCaseTrie = NULL;
569 fLowerCaseTrie = NULL;
570 fScriptSets = NULL;
571 }
572
573
574 // SpoofData::initPtrs()
575 // Initialize the pointers to the various sections of the raw data.
576 //
577 // This function is used both during the Trie building process (multiple
578 // times, as the individual data sections are added), and
579 // during the opening of a Spoof Checker from prebuilt data.
580 //
581 // The pointers for non-existent data sections (identified by an offset of 0)
582 // are set to NULL.
583 //
584 // Note: During building the data, adding each new data section
585 // reallocs the raw data area, which likely relocates it, which
586 // in turn requires reinitializing all of the pointers into it, hence
587 // multiple calls to this function during building.
588 //
initPtrs(UErrorCode & status)589 void SpoofData::initPtrs(UErrorCode &status) {
590 fCFUKeys = NULL;
591 fCFUValues = NULL;
592 fCFUStringLengths = NULL;
593 fCFUStrings = NULL;
594 if (U_FAILURE(status)) {
595 return;
596 }
597 if (fRawData->fCFUKeys != 0) {
598 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
599 }
600 if (fRawData->fCFUStringIndex != 0) {
601 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
602 }
603 if (fRawData->fCFUStringLengths != 0) {
604 fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
605 }
606 if (fRawData->fCFUStringTable != 0) {
607 fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
608 }
609
610 if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
611 fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
612 (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
613 }
614 if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
615 fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
616 (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
617 }
618
619 if (fRawData->fScriptSets != 0) {
620 fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
621 }
622 }
623
624
~SpoofData()625 SpoofData::~SpoofData() {
626 utrie2_close(fAnyCaseTrie);
627 fAnyCaseTrie = NULL;
628 utrie2_close(fLowerCaseTrie);
629 fLowerCaseTrie = NULL;
630 if (fDataOwned) {
631 uprv_free(fRawData);
632 }
633 fRawData = NULL;
634 if (fUDM != NULL) {
635 udata_close(fUDM);
636 }
637 fUDM = NULL;
638 }
639
640
removeReference()641 void SpoofData::removeReference() {
642 if (umtx_atomic_dec(&fRefCount) == 0) {
643 delete this;
644 }
645 }
646
647
addReference()648 SpoofData *SpoofData::addReference() {
649 umtx_atomic_inc(&fRefCount);
650 return this;
651 }
652
653
reserveSpace(int32_t numBytes,UErrorCode & status)654 void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
655 if (U_FAILURE(status)) {
656 return NULL;
657 }
658 if (!fDataOwned) {
659 U_ASSERT(FALSE);
660 status = U_INTERNAL_PROGRAM_ERROR;
661 return NULL;
662 }
663
664 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
665 uint32_t returnOffset = fMemLimit;
666 fMemLimit += numBytes;
667 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
668 fRawData->fLength = fMemLimit;
669 uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
670 initPtrs(status);
671 return (char *)fRawData + returnOffset;
672 }
673
674
675 //----------------------------------------------------------------------------
676 //
677 // ScriptSet implementation
678 //
679 //----------------------------------------------------------------------------
ScriptSet()680 ScriptSet::ScriptSet() {
681 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
682 bits[i] = 0;
683 }
684 }
685
~ScriptSet()686 ScriptSet::~ScriptSet() {
687 }
688
operator ==(const ScriptSet & other)689 UBool ScriptSet::operator == (const ScriptSet &other) {
690 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
691 if (bits[i] != other.bits[i]) {
692 return FALSE;
693 }
694 }
695 return TRUE;
696 }
697
Union(UScriptCode script)698 void ScriptSet::Union(UScriptCode script) {
699 uint32_t index = script / 32;
700 uint32_t bit = 1 << (script & 31);
701 U_ASSERT(index < sizeof(bits)*4);
702 bits[index] |= bit;
703 }
704
705
Union(const ScriptSet & other)706 void ScriptSet::Union(const ScriptSet &other) {
707 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
708 bits[i] |= other.bits[i];
709 }
710 }
711
intersect(const ScriptSet & other)712 void ScriptSet::intersect(const ScriptSet &other) {
713 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
714 bits[i] &= other.bits[i];
715 }
716 }
717
intersect(UScriptCode script)718 void ScriptSet::intersect(UScriptCode script) {
719 uint32_t index = script / 32;
720 uint32_t bit = 1 << (script & 31);
721 U_ASSERT(index < sizeof(bits)*4);
722 uint32_t i;
723 for (i=0; i<index; i++) {
724 bits[i] = 0;
725 }
726 bits[index] &= bit;
727 for (i=index+1; i<sizeof(bits)/sizeof(uint32_t); i++) {
728 bits[i] = 0;
729 }
730 }
731
732
// Copy the contents of other into this set, one word at a time.
ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = other.bits[w];
        ++w;
    }
    return *this;
}
739
740
setAll()741 void ScriptSet::setAll() {
742 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
743 bits[i] = 0xffffffffu;
744 }
745 }
746
747
resetAll()748 void ScriptSet::resetAll() {
749 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
750 bits[i] = 0;
751 }
752 }
753
countMembers()754 int32_t ScriptSet::countMembers() {
755 // This bit counter is good for sparse numbers of '1's, which is
756 // very much the case that we will usually have.
757 int32_t count = 0;
758 for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
759 uint32_t x = bits[i];
760 while (x > 0) {
761 count++;
762 x &= (x - 1); // and off the least significant one bit.
763 }
764 }
765 return count;
766 }
767
768
769
770 //-----------------------------------------------------------------------------
771 //
772 // NFDBuffer Implementation.
773 //
774 //-----------------------------------------------------------------------------
775
NFDBuffer(const UChar * text,int32_t length,UErrorCode & status)776 NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
777 fNormalizedText = NULL;
778 fNormalizedTextLength = 0;
779 fOriginalText = text;
780 if (U_FAILURE(status)) {
781 return;
782 }
783 fNormalizedText = fSmallBuf;
784 fNormalizedTextLength = unorm_normalize(
785 text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
786 if (status == U_BUFFER_OVERFLOW_ERROR) {
787 status = U_ZERO_ERROR;
788 fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
789 if (fNormalizedText == NULL) {
790 status = U_MEMORY_ALLOCATION_ERROR;
791 } else {
792 fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
793 fNormalizedText, fNormalizedTextLength+1, &status);
794 }
795 }
796 }
797
798
~NFDBuffer()799 NFDBuffer::~NFDBuffer() {
800 if (fNormalizedText != fSmallBuf) {
801 uprv_free(fNormalizedText);
802 }
803 fNormalizedText = 0;
804 }
805
getBuffer()806 const UChar *NFDBuffer::getBuffer() {
807 return fNormalizedText;
808 }
809
getLength()810 int32_t NFDBuffer::getLength() {
811 return fNormalizedTextLength;
812 }
813
814
815
816
817
818 U_NAMESPACE_END
819
820 U_NAMESPACE_USE
821
822 //-----------------------------------------------------------------------------
823 //
824 // uspoof_swap - byte swap and char encoding swap of spoof data
825 //
826 //-----------------------------------------------------------------------------
827 U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)828 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
829 UErrorCode *status) {
830
831 if (status == NULL || U_FAILURE(*status)) {
832 return 0;
833 }
834 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
835 *status=U_ILLEGAL_ARGUMENT_ERROR;
836 return 0;
837 }
838
839 //
840 // Check that the data header is for spoof data.
841 // (Header contents are defined in gencfu.cpp)
842 //
843 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
844 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
845 pInfo->dataFormat[1]==0x66 &&
846 pInfo->dataFormat[2]==0x75 &&
847 pInfo->dataFormat[3]==0x20 &&
848 pInfo->formatVersion[0]==1 )) {
849 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
850 "(format version %02x %02x %02x %02x) is not recognized\n",
851 pInfo->dataFormat[0], pInfo->dataFormat[1],
852 pInfo->dataFormat[2], pInfo->dataFormat[3],
853 pInfo->formatVersion[0], pInfo->formatVersion[1],
854 pInfo->formatVersion[2], pInfo->formatVersion[3]);
855 *status=U_UNSUPPORTED_ERROR;
856 return 0;
857 }
858
859 //
860 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
861 // header). This swap also conveniently gets us
862 // the size of the ICU d.h., which lets us locate the start
863 // of the uspoof specific data.
864 //
865 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
866
867
868 //
869 // Get the Spoof Data Header, and check that it appears to be OK.
870 //
871 //
872 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
873 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
874 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
875 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
876 {
877 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
878 *status=U_UNSUPPORTED_ERROR;
879 return 0;
880 }
881
882 //
883 // Prefight operation? Just return the size
884 //
885 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
886 int32_t totalSize = headerSize + spoofDataLength;
887 if (length < 0) {
888 return totalSize;
889 }
890
891 //
892 // Check that length passed in is consistent with length from Spoof data header.
893 //
894 if (length < totalSize) {
895 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
896 spoofDataLength);
897 *status=U_INDEX_OUTOFBOUNDS_ERROR;
898 return 0;
899 }
900
901
902 //
903 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
904 // we need to reference the header to locate the data, and an
905 // inplace swap of the header leaves it unusable.
906 //
907 uint8_t *outBytes = (uint8_t *)outData + headerSize;
908 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
909
910 int32_t sectionStart;
911 int32_t sectionLength;
912
913 //
914 // If not swapping in place, zero out the output buffer before starting.
915 // Gaps may exist between the individual sections, and these must be zeroed in
916 // the output buffer. The simplest way to do that is to just zero the whole thing.
917 //
918 if (inBytes != outBytes) {
919 uprv_memset(outBytes, 0, spoofDataLength);
920 }
921
922 // Confusables Keys Section (fCFUKeys)
923 sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
924 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
925 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
926
927 // String Index Section
928 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
929 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
930 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
931
932 // String Table Section
933 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
934 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
935 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
936
937 // String Lengths Section
938 sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths);
939 sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
940 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
941
942 // Any Case Trie
943 sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie);
944 sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
945 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
946
947 // Lower Case Trie
948 sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie);
949 sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
950 utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
951
952 // Script Sets. The data is an array of int32_t
953 sectionStart = ds->readUInt32(spoofDH->fScriptSets);
954 sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
955 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
956
957 // And, last, swap the header itself.
958 // int32_t fMagic // swap this
959 // uint8_t fFormatVersion[4] // Do not swap this, just copy
960 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
961 //
962 uint32_t magic = ds->readUInt32(spoofDH->fMagic);
963 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
964
965 if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
966 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
967 }
968 // swap starting at fLength
969 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
970
971 return totalSize;
972 }
973
974 #endif
975
976
977