• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2008-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7 
8 #include "unicode/utypes.h"
9 #include "unicode/uspoof.h"
10 #include "unicode/unorm.h"
11 #include "unicode/uchar.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utf16.h"
14 #include "utrie2.h"
15 #include "cmemory.h"
16 #include "cstring.h"
17 #include "udatamem.h"
18 #include "umutex.h"
19 #include "udataswp.h"
20 #include "uassert.h"
21 #include "uspoof_impl.h"
22 
23 #if !UCONFIG_NO_NORMALIZATION
24 
25 
26 U_NAMESPACE_BEGIN
27 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
29 
30 SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
31     fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) , fAllowedLocales(uprv_strdup("")) {
32     if (U_FAILURE(status)) {
33         return;
34     }
35     fMagic = USPOOF_MAGIC;
36     fSpoofData = data;
37     fChecks = USPOOF_ALL_CHECKS;
38     UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
39     if (allowedCharsSet == NULL || fAllowedLocales == NULL) {
40         status = U_MEMORY_ALLOCATION_ERROR;
41         return;
42     }
43     allowedCharsSet->freeze();
44     fAllowedCharsSet = allowedCharsSet;
45 }
46 
47 
SpoofImpl()48 SpoofImpl::SpoofImpl() {
49     fMagic = USPOOF_MAGIC;
50     fSpoofData = NULL;
51     fChecks = USPOOF_ALL_CHECKS;
52     UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
53     allowedCharsSet->freeze();
54     fAllowedCharsSet = allowedCharsSet;
55     fAllowedLocales  = uprv_strdup("");
56 }
57 
58 
59 // Copy Constructor, used by the user level clone() function.
SpoofImpl(const SpoofImpl & src,UErrorCode & status)60 SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
61     fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) {
62     if (U_FAILURE(status)) {
63         return;
64     }
65     fMagic = src.fMagic;
66     fChecks = src.fChecks;
67     if (src.fSpoofData != NULL) {
68         fSpoofData = src.fSpoofData->addReference();
69     }
70     fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
71     if (fAllowedCharsSet == NULL) {
72         status = U_MEMORY_ALLOCATION_ERROR;
73     }
74     fAllowedLocales = uprv_strdup(src.fAllowedLocales);
75 }
76 
~SpoofImpl()77 SpoofImpl::~SpoofImpl() {
78     fMagic = 0;                // head off application errors by preventing use of
79                                //    of deleted objects.
80     if (fSpoofData != NULL) {
81         fSpoofData->removeReference();   // Will delete if refCount goes to zero.
82     }
83     delete fAllowedCharsSet;
84     uprv_free((void *)fAllowedLocales);
85 }
86 
87 //
88 //  Incoming parameter check on Status and the SpoofChecker object
89 //    received from the C API.
90 //
validateThis(const USpoofChecker * sc,UErrorCode & status)91 const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
92     if (U_FAILURE(status)) {
93         return NULL;
94     }
95     if (sc == NULL) {
96         status = U_ILLEGAL_ARGUMENT_ERROR;
97         return NULL;
98     };
99     SpoofImpl *This = (SpoofImpl *)sc;
100     if (This->fMagic != USPOOF_MAGIC ||
101         This->fSpoofData == NULL) {
102         status = U_INVALID_FORMAT_ERROR;
103         return NULL;
104     }
105     if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) {
106         return NULL;
107     }
108     return This;
109 }
110 
validateThis(USpoofChecker * sc,UErrorCode & status)111 SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
112     return const_cast<SpoofImpl *>
113         (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
114 }
115 
116 
117 
118 //--------------------------------------------------------------------------------------
119 //
120 //  confusableLookup()    This is the heart of the confusable skeleton generation
121 //                        implementation.
122 //
123 //                        Given a source character, produce the corresponding
124 //                        replacement character(s)
125 //
126 //---------------------------------------------------------------------------------------
confusableLookup(UChar32 inChar,int32_t tableMask,UChar * destBuf) const127 int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const {
128 
129     // Binary search the spoof data key table for the inChar
130     int32_t  *low   = fSpoofData->fCFUKeys;
131     int32_t  *mid   = NULL;
132     int32_t  *limit = low + fSpoofData->fRawData->fCFUKeysSize;
133     UChar32   midc;
134     do {
135         int32_t delta = ((int32_t)(limit-low))/2;
136         mid = low + delta;
137         midc = *mid & 0x1fffff;
138         if (inChar == midc) {
139             goto foundChar;
140         } else if (inChar < midc) {
141             limit = mid;
142         } else {
143             low = mid;
144         }
145     } while (low < limit-1);
146     mid = low;
147     midc = *mid & 0x1fffff;
148     if (inChar != midc) {
149         // Char not found.  It maps to itself.
150         int i = 0;
151         U16_APPEND_UNSAFE(destBuf, i, inChar)
152         return i;
153     }
154   foundChar:
155     int32_t keyFlags = *mid & 0xff000000;
156     if ((keyFlags & tableMask) == 0) {
157         // We found the right key char, but the entry doesn't pertain to the
158         //  table we need.  See if there is an adjacent key that does
159         if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
160             int32_t *altMid;
161             for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) {
162                 keyFlags = *altMid & 0xff000000;
163                 if (keyFlags & tableMask) {
164                     mid = altMid;
165                     goto foundKey;
166                 }
167             }
168             for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) {
169                 keyFlags = *altMid & 0xff000000;
170                 if (keyFlags & tableMask) {
171                     mid = altMid;
172                     goto foundKey;
173                 }
174             }
175         }
176         // No key entry for this char & table.
177         // The input char maps to itself.
178         int i = 0;
179         U16_APPEND_UNSAFE(destBuf, i, inChar)
180         return i;
181     }
182 
183   foundKey:
184     int32_t  stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
185     int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys);
186 
187     // Value is either a UChar  (for strings of length 1) or
188     //                 an index into the string table (for longer strings)
189     uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
190     if (stringLen == 1) {
191         destBuf[0] = value;
192         return 1;
193     }
194 
195     // String length of 4 from the above lookup is used for all strings of length >= 4.
196     // For these, get the real length from the string lengths table,
197     //   which maps string table indexes to lengths.
198     //   All strings of the same length are stored contiguously in the string table.
199     //   'value' from the lookup above is the starting index for the desired string.
200 
201     int32_t ix;
202     if (stringLen == 4) {
203         int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
204         for (ix = 0; ix < stringLengthsLimit; ix++) {
205             if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
206                 stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
207                 break;
208             }
209         }
210         U_ASSERT(ix < stringLengthsLimit);
211     }
212 
213     U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
214     UChar *src = &fSpoofData->fCFUStrings[value];
215     for (ix=0; ix<stringLen; ix++) {
216         destBuf[ix] = src[ix];
217     }
218     return stringLen;
219 }
220 
221 
222 //---------------------------------------------------------------------------------------
223 //
224 //  wholeScriptCheck()
225 //
226 //      Input text is already normalized to NFD
227 //      Return the set of scripts, each of which can represent something that is
228 //             confusable with the input text.  The script of the input text
229 //             is included; input consisting of characters from a single script will
230 //             always produce a result consisting of a set containing that script.
231 //
232 //---------------------------------------------------------------------------------------
wholeScriptCheck(const UChar * text,int32_t length,ScriptSet * result,UErrorCode & status) const233 void SpoofImpl::wholeScriptCheck(
234     const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const {
235 
236     int32_t       inputIdx = 0;
237     UChar32       c;
238 
239     UTrie2 *table =
240         (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
241     result->setAll();
242     while (inputIdx < length) {
243         U16_NEXT(text, inputIdx, length, c);
244         uint32_t index = utrie2_get32(table, c);
245         if (index == 0) {
246             // No confusables in another script for this char.
247             // TODO:  we should change the data to have sets with just the single script
248             //        bit for the script of this char.  Gets rid of this special case.
249             //        Until then, grab the script from the char and intersect it with the set.
250             UScriptCode cpScript = uscript_getScript(c, &status);
251             U_ASSERT(cpScript > USCRIPT_INHERITED);
252             result->intersect(cpScript);
253         } else if (index == 1) {
254             // Script == Common or Inherited.  Nothing to do.
255         } else {
256             result->intersect(fSpoofData->fScriptSets[index]);
257         }
258     }
259 }
260 
261 
setAllowedLocales(const char * localesList,UErrorCode & status)262 void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
263     UnicodeSet    allowedChars;
264     UnicodeSet    *tmpSet = NULL;
265     const char    *locStart = localesList;
266     const char    *locEnd = NULL;
267     const char    *localesListEnd = localesList + uprv_strlen(localesList);
268     int32_t        localeListCount = 0;   // Number of locales provided by caller.
269 
270     // Loop runs once per locale from the localesList, a comma separated list of locales.
271     do {
272         locEnd = uprv_strchr(locStart, ',');
273         if (locEnd == NULL) {
274             locEnd = localesListEnd;
275         }
276         while (*locStart == ' ') {
277             locStart++;
278         }
279         const char *trimmedEnd = locEnd-1;
280         while (trimmedEnd > locStart && *trimmedEnd == ' ') {
281             trimmedEnd--;
282         }
283         if (trimmedEnd <= locStart) {
284             break;
285         }
286         const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
287         localeListCount++;
288 
289         // We have one locale from the locales list.
290         // Add the script chars for this locale to the accumulating set of allowed chars.
291         // If the locale is no good, we will be notified back via status.
292         addScriptChars(locale, &allowedChars, status);
293         uprv_free((void *)locale);
294         if (U_FAILURE(status)) {
295             break;
296         }
297         locStart = locEnd + 1;
298     } while (locStart < localesListEnd);
299 
300     // If our caller provided an empty list of locales, we disable the allowed characters checking
301     if (localeListCount == 0) {
302         uprv_free((void *)fAllowedLocales);
303         fAllowedLocales = uprv_strdup("");
304         tmpSet = new UnicodeSet(0, 0x10ffff);
305         if (fAllowedLocales == NULL || tmpSet == NULL) {
306             status = U_MEMORY_ALLOCATION_ERROR;
307             return;
308         }
309         tmpSet->freeze();
310         delete fAllowedCharsSet;
311         fAllowedCharsSet = tmpSet;
312         fChecks &= ~USPOOF_CHAR_LIMIT;
313         return;
314     }
315 
316 
317     // Add all common and inherited characters to the set of allowed chars.
318     UnicodeSet tempSet;
319     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
320     allowedChars.addAll(tempSet);
321     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
322     allowedChars.addAll(tempSet);
323 
324     // If anything went wrong, we bail out without changing
325     // the state of the spoof checker.
326     if (U_FAILURE(status)) {
327         return;
328     }
329 
330     // Store the updated spoof checker state.
331     tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
332     const char *tmpLocalesList = uprv_strdup(localesList);
333     if (tmpSet == NULL || tmpLocalesList == NULL) {
334         status = U_MEMORY_ALLOCATION_ERROR;
335         return;
336     }
337     uprv_free((void *)fAllowedLocales);
338     fAllowedLocales = tmpLocalesList;
339     tmpSet->freeze();
340     delete fAllowedCharsSet;
341     fAllowedCharsSet = tmpSet;
342     fChecks |= USPOOF_CHAR_LIMIT;
343 }
344 
345 
getAllowedLocales(UErrorCode &)346 const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
347     return fAllowedLocales;
348 }
349 
350 
351 // Given a locale (a language), add all the characters from all of the scripts used with that language
352 // to the allowedChars UnicodeSet
353 
addScriptChars(const char * locale,UnicodeSet * allowedChars,UErrorCode & status)354 void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
355     UScriptCode scripts[30];
356 
357     int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status);
358     if (U_FAILURE(status)) {
359         return;
360     }
361     if (status == U_USING_DEFAULT_WARNING) {
362         status = U_ILLEGAL_ARGUMENT_ERROR;
363         return;
364     }
365     UnicodeSet tmpSet;
366     int32_t    i;
367     for (i=0; i<numScripts; i++) {
368         tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
369         allowedChars->addAll(tmpSet);
370     }
371 }
372 
373 
scriptScan(const UChar * text,int32_t length,int32_t & pos,UErrorCode & status) const374 int32_t SpoofImpl::scriptScan
375         (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const {
376     if (U_FAILURE(status)) {
377         return 0;
378     }
379     int32_t       inputIdx = 0;
380     UChar32       c;
381     int32_t       scriptCount = 0;
382     UScriptCode   lastScript = USCRIPT_INVALID_CODE;
383     UScriptCode   sc = USCRIPT_INVALID_CODE;
384     while ((inputIdx < length || length == -1) && scriptCount < 2) {
385         U16_NEXT(text, inputIdx, length, c);
386         if (c == 0 && length == -1) {
387             break;
388         }
389         sc = uscript_getScript(c, &status);
390         if (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN) {
391             continue;
392         }
393 
394         // Temporary fix: fold Japanese Hiragana and Katakana into Han.
395         //   Names are allowed to mix these scripts.
396         //   A more general solution will follow later for characters that are
397         //   used with multiple scripts.
398 
399         if (sc == USCRIPT_HIRAGANA || sc == USCRIPT_KATAKANA || sc == USCRIPT_HANGUL) {
400             sc = USCRIPT_HAN;
401         }
402 
403         if (sc != lastScript) {
404            scriptCount++;
405            lastScript = sc;
406         }
407     }
408     if (scriptCount == 2) {
409         pos = inputIdx;
410     }
411     return scriptCount;
412 }
413 
414 
415 // Convert a text format hex number.  Utility function used by builder code.  Static.
416 // Input: UChar *string text.  Output: a UChar32
417 // Input has been pre-checked, and will have no non-hex chars.
418 // The number must fall in the code point range of 0..0x10ffff
419 // Static Function.
ScanHex(const UChar * s,int32_t start,int32_t limit,UErrorCode & status)420 UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
421     if (U_FAILURE(status)) {
422         return 0;
423     }
424     U_ASSERT(limit-start > 0);
425     uint32_t val = 0;
426     int i;
427     for (i=start; i<limit; i++) {
428         int digitVal = s[i] - 0x30;
429         if (digitVal>9) {
430             digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
431         }
432         if (digitVal>15) {
433             digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
434         }
435         U_ASSERT(digitVal <= 0xf);
436         val <<= 4;
437         val += digitVal;
438     }
439     if (val > 0x10ffff) {
440         status = U_PARSE_ERROR;
441         val = 0;
442     }
443     return (UChar32)val;
444 }
445 
446 
447 
448 //----------------------------------------------------------------------------------------------
449 //
450 //   class SpoofData Implementation
451 //
452 //----------------------------------------------------------------------------------------------
453 
454 
validateDataVersion(const SpoofDataHeader * rawData,UErrorCode & status)455 UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
456     if (U_FAILURE(status) ||
457         rawData == NULL ||
458         rawData->fMagic != USPOOF_MAGIC ||
459         rawData->fFormatVersion[0] > 1 ||
460         rawData->fFormatVersion[1] > 0) {
461             status = U_INVALID_FORMAT_ERROR;
462             return FALSE;
463     }
464     return TRUE;
465 }
466 
467 //
468 //  SpoofData::getDefault() - return a wrapper around the spoof data that is
469 //                           baked into the default ICU data.
470 //
getDefault(UErrorCode & status)471 SpoofData *SpoofData::getDefault(UErrorCode &status) {
472     // TODO:  Cache it.  Lazy create, keep until cleanup.
473 
474     UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status);
475     if (U_FAILURE(status)) {
476         return NULL;
477     }
478     SpoofData *This = new SpoofData(udm, status);
479     if (U_FAILURE(status)) {
480         delete This;
481         return NULL;
482     }
483     if (This == NULL) {
484         status = U_MEMORY_ALLOCATION_ERROR;
485     }
486     return This;
487 }
488 
489 
SpoofData(UDataMemory * udm,UErrorCode & status)490 SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
491 {
492     reset();
493     if (U_FAILURE(status)) {
494         return;
495     }
496     fRawData = reinterpret_cast<SpoofDataHeader *>
497                    ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
498     fUDM = udm;
499     validateDataVersion(fRawData, status);
500     initPtrs(status);
501 }
502 
503 
SpoofData(const void * data,int32_t length,UErrorCode & status)504 SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
505 {
506     reset();
507     if (U_FAILURE(status)) {
508         return;
509     }
510     if ((size_t)length < sizeof(SpoofDataHeader)) {
511         status = U_INVALID_FORMAT_ERROR;
512         return;
513     }
514     void *ncData = const_cast<void *>(data);
515     fRawData = static_cast<SpoofDataHeader *>(ncData);
516     if (length < fRawData->fLength) {
517         status = U_INVALID_FORMAT_ERROR;
518         return;
519     }
520     validateDataVersion(fRawData, status);
521     initPtrs(status);
522 }
523 
524 
525 // Spoof Data constructor for use from data builder.
526 //   Initializes a new, empty data area that will be populated later.
SpoofData(UErrorCode & status)527 SpoofData::SpoofData(UErrorCode &status) {
528     reset();
529     if (U_FAILURE(status)) {
530         return;
531     }
532     fDataOwned = true;
533     fRefCount = 1;
534 
535     // The spoof header should already be sized to be a multiple of 16 bytes.
536     // Just in case it's not, round it up.
537     uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
538     U_ASSERT(initialSize == sizeof(SpoofDataHeader));
539 
540     fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
541     fMemLimit = initialSize;
542     if (fRawData == NULL) {
543         status = U_MEMORY_ALLOCATION_ERROR;
544         return;
545     }
546     uprv_memset(fRawData, 0, initialSize);
547 
548     fRawData->fMagic = USPOOF_MAGIC;
549     fRawData->fFormatVersion[0] = 1;
550     fRawData->fFormatVersion[1] = 0;
551     fRawData->fFormatVersion[2] = 0;
552     fRawData->fFormatVersion[3] = 0;
553     initPtrs(status);
554 }
555 
556 // reset() - initialize all fields.
557 //           Should be updated if any new fields are added.
558 //           Called by constructors to put things in a known initial state.
reset()559 void SpoofData::reset() {
560    fRawData = NULL;
561    fDataOwned = FALSE;
562    fUDM      = NULL;
563    fMemLimit = 0;
564    fRefCount = 1;
565    fCFUKeys = NULL;
566    fCFUValues = NULL;
567    fCFUStringLengths = NULL;
568    fCFUStrings = NULL;
569    fAnyCaseTrie = NULL;
570    fLowerCaseTrie = NULL;
571    fScriptSets = NULL;
572 }
573 
574 
575 //  SpoofData::initPtrs()
576 //            Initialize the pointers to the various sections of the raw data.
577 //
578 //            This function is used both during the Trie building process (multiple
579 //            times, as the individual data sections are added), and
580 //            during the opening of a Spoof Checker from prebuilt data.
581 //
582 //            The pointers for non-existent data sections (identified by an offset of 0)
583 //            are set to NULL.
584 //
585 //            Note:  During building the data, adding each new data section
586 //            reallocs the raw data area, which likely relocates it, which
587 //            in turn requires reinitializing all of the pointers into it, hence
588 //            multiple calls to this function during building.
589 //
initPtrs(UErrorCode & status)590 void SpoofData::initPtrs(UErrorCode &status) {
591     fCFUKeys = NULL;
592     fCFUValues = NULL;
593     fCFUStringLengths = NULL;
594     fCFUStrings = NULL;
595     if (U_FAILURE(status)) {
596         return;
597     }
598     if (fRawData->fCFUKeys != 0) {
599         fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
600     }
601     if (fRawData->fCFUStringIndex != 0) {
602         fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
603     }
604     if (fRawData->fCFUStringLengths != 0) {
605         fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths);
606     }
607     if (fRawData->fCFUStringTable != 0) {
608         fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
609     }
610 
611     if (fAnyCaseTrie ==  NULL && fRawData->fAnyCaseTrie != 0) {
612         fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
613             (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
614     }
615     if (fLowerCaseTrie ==  NULL && fRawData->fLowerCaseTrie != 0) {
616         fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
617             (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
618     }
619 
620     if (fRawData->fScriptSets != 0) {
621         fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets);
622     }
623 }
624 
625 
~SpoofData()626 SpoofData::~SpoofData() {
627     utrie2_close(fAnyCaseTrie);
628     fAnyCaseTrie = NULL;
629     utrie2_close(fLowerCaseTrie);
630     fLowerCaseTrie = NULL;
631     if (fDataOwned) {
632         uprv_free(fRawData);
633     }
634     fRawData = NULL;
635     if (fUDM != NULL) {
636         udata_close(fUDM);
637     }
638     fUDM = NULL;
639 }
640 
641 
removeReference()642 void SpoofData::removeReference() {
643     if (umtx_atomic_dec(&fRefCount) == 0) {
644         delete this;
645     }
646 }
647 
648 
addReference()649 SpoofData *SpoofData::addReference() {
650     umtx_atomic_inc(&fRefCount);
651     return this;
652 }
653 
654 
reserveSpace(int32_t numBytes,UErrorCode & status)655 void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {
656     if (U_FAILURE(status)) {
657         return NULL;
658     }
659     if (!fDataOwned) {
660         U_ASSERT(FALSE);
661         status = U_INTERNAL_PROGRAM_ERROR;
662         return NULL;
663     }
664 
665     numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
666     uint32_t returnOffset = fMemLimit;
667     fMemLimit += numBytes;
668     fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
669     fRawData->fLength = fMemLimit;
670     uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
671     initPtrs(status);
672     return (char *)fRawData + returnOffset;
673 }
674 
675 
676 //----------------------------------------------------------------------------
677 //
678 //  ScriptSet implementation
679 //
680 //----------------------------------------------------------------------------
ScriptSet()681 ScriptSet::ScriptSet() {
682     for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
683         bits[i] = 0;
684     }
685 }
686 
~ScriptSet()687 ScriptSet::~ScriptSet() {
688 }
689 
operator ==(const ScriptSet & other)690 UBool ScriptSet::operator == (const ScriptSet &other) {
691     for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
692         if (bits[i] != other.bits[i]) {
693             return FALSE;
694         }
695     }
696     return TRUE;
697 }
698 
Union(UScriptCode script)699 void ScriptSet::Union(UScriptCode script) {
700     uint32_t index = script / 32;
701     uint32_t bit   = 1 << (script & 31);
702     U_ASSERT(index < sizeof(bits)*4);
703     bits[index] |= bit;
704 }
705 
706 
Union(const ScriptSet & other)707 void ScriptSet::Union(const ScriptSet &other) {
708     for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
709         bits[i] |= other.bits[i];
710     }
711 }
712 
intersect(const ScriptSet & other)713 void ScriptSet::intersect(const ScriptSet &other) {
714     for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
715         bits[i] &= other.bits[i];
716     }
717 }
718 
intersect(UScriptCode script)719 void ScriptSet::intersect(UScriptCode script) {
720     uint32_t index = script / 32;
721     uint32_t bit   = 1 << (script & 31);
722     U_ASSERT(index < sizeof(bits)*4);
723     uint32_t i;
724     for (i=0; i<index; i++) {
725         bits[i] = 0;
726     }
727     bits[index] &= bit;
728     for (i=index+1; i<sizeof(bits)/sizeof(uint32_t); i++) {
729         bits[i] = 0;
730     }
731 }
732 
733 
//  Assignment: copy the bit array of another set, word by word.
ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = other.bits[w];
        ++w;
    }
    return *this;
}
740 
741 
setAll()742 void ScriptSet::setAll() {
743     for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
744         bits[i] = 0xffffffffu;
745     }
746 }
747 
748 
resetAll()749 void ScriptSet::resetAll() {
750     for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
751         bits[i] = 0;
752     }
753 }
754 
countMembers()755 int32_t ScriptSet::countMembers() {
756     // This bit counter is good for sparse numbers of '1's, which is
757     //  very much the case that we will usually have.
758     int32_t count = 0;
759     for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
760         uint32_t x = bits[i];
761         while (x > 0) {
762             count++;
763             x &= (x - 1);    // and off the least significant one bit.
764         }
765     }
766     return count;
767 }
768 
769 
770 
771 //-----------------------------------------------------------------------------
772 //
773 //  NFDBuffer Implementation.
774 //
775 //-----------------------------------------------------------------------------
776 
NFDBuffer(const UChar * text,int32_t length,UErrorCode & status)777 NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
778     fNormalizedText = NULL;
779     fNormalizedTextLength = 0;
780     fOriginalText = text;
781     if (U_FAILURE(status)) {
782         return;
783     }
784     fNormalizedText = fSmallBuf;
785     fNormalizedTextLength = unorm_normalize(
786         text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
787     if (status == U_BUFFER_OVERFLOW_ERROR) {
788         status = U_ZERO_ERROR;
789         fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar));
790         if (fNormalizedText == NULL) {
791             status = U_MEMORY_ALLOCATION_ERROR;
792         } else {
793             fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0,
794                                         fNormalizedText, fNormalizedTextLength+1, &status);
795         }
796     }
797 }
798 
799 
~NFDBuffer()800 NFDBuffer::~NFDBuffer() {
801     if (fNormalizedText != fSmallBuf) {
802         uprv_free(fNormalizedText);
803     }
804     fNormalizedText = 0;
805 }
806 
getBuffer()807 const UChar *NFDBuffer::getBuffer() {
808     return fNormalizedText;
809 }
810 
getLength()811 int32_t NFDBuffer::getLength() {
812     return fNormalizedTextLength;
813 }
814 
815 
816 
817 
818 
819 U_NAMESPACE_END
820 
821 U_NAMESPACE_USE
822 
823 //-----------------------------------------------------------------------------
824 //
825 //  uspoof_swap   -  byte swap and char encoding swap of spoof data
826 //
827 //-----------------------------------------------------------------------------
828 U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * status)829 uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
830            UErrorCode *status) {
831 
832     if (status == NULL || U_FAILURE(*status)) {
833         return 0;
834     }
835     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
836         *status=U_ILLEGAL_ARGUMENT_ERROR;
837         return 0;
838     }
839 
840     //
841     //  Check that the data header is for spoof data.
842     //    (Header contents are defined in gencfu.cpp)
843     //
844     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
845     if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
846            pInfo->dataFormat[1]==0x66 &&
847            pInfo->dataFormat[2]==0x75 &&
848            pInfo->dataFormat[3]==0x20 &&
849            pInfo->formatVersion[0]==1  )) {
850         udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
851                              "(format version %02x %02x %02x %02x) is not recognized\n",
852                          pInfo->dataFormat[0], pInfo->dataFormat[1],
853                          pInfo->dataFormat[2], pInfo->dataFormat[3],
854                          pInfo->formatVersion[0], pInfo->formatVersion[1],
855                          pInfo->formatVersion[2], pInfo->formatVersion[3]);
856         *status=U_UNSUPPORTED_ERROR;
857         return 0;
858     }
859 
860     //
861     // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
862     //                         header).  This swap also conveniently gets us
863     //                         the size of the ICU d.h., which lets us locate the start
864     //                         of the uspoof specific data.
865     //
866     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
867 
868 
869     //
870     // Get the Spoof Data Header, and check that it appears to be OK.
871     //
872     //
873     const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
874     SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
875     if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
876         ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader))
877     {
878         udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
879         *status=U_UNSUPPORTED_ERROR;
880         return 0;
881     }
882 
883     //
884     // Prefight operation?  Just return the size
885     //
886     int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
887     int32_t totalSize = headerSize + spoofDataLength;
888     if (length < 0) {
889         return totalSize;
890     }
891 
892     //
893     // Check that length passed in is consistent with length from Spoof data header.
894     //
895     if (length < totalSize) {
896         udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
897                             spoofDataLength);
898         *status=U_INDEX_OUTOFBOUNDS_ERROR;
899         return 0;
900         }
901 
902 
903     //
904     // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
905     //                 we need to reference the header to locate the data, and an
906     //                 inplace swap of the header leaves it unusable.
907     //
908     uint8_t          *outBytes = (uint8_t *)outData + headerSize;
909     SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;
910 
911     int32_t   sectionStart;
912     int32_t   sectionLength;
913 
914     //
915     // If not swapping in place, zero out the output buffer before starting.
916     //    Gaps may exist between the individual sections, and these must be zeroed in
917     //    the output buffer.  The simplest way to do that is to just zero the whole thing.
918     //
919     if (inBytes != outBytes) {
920         uprv_memset(outBytes, 0, spoofDataLength);
921     }
922 
923     // Confusables Keys Section   (fCFUKeys)
924     sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
925     sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
926     ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
927 
928     // String Index Section
929     sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
930     sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
931     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
932 
933     // String Table Section
934     sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
935     sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
936     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
937 
938     // String Lengths Section
939     sectionStart  = ds->readUInt32(spoofDH->fCFUStringLengths);
940     sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
941     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
942 
943     // Any Case Trie
944     sectionStart  = ds->readUInt32(spoofDH->fAnyCaseTrie);
945     sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
946     utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
947 
948     // Lower Case Trie
949     sectionStart  = ds->readUInt32(spoofDH->fLowerCaseTrie);
950     sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
951     utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
952 
953     // Script Sets.  The data is an array of int32_t
954     sectionStart  = ds->readUInt32(spoofDH->fScriptSets);
955     sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
956     ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
957 
958     // And, last, swap the header itself.
959     //   int32_t   fMagic             // swap this
960     //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
961     //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
962     //
963     uint32_t magic = ds->readUInt32(spoofDH->fMagic);
964     ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
965 
966     if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
967         uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
968     }
969     // swap starting at fLength
970     ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
971 
972     return totalSize;
973 }
974 
975 #endif
976 
977 
978