• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ***************************************************************************
3 * Copyright (C) 2008-2009, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ***************************************************************************
6 *   file name:  uspoof.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2008Feb13
12 *   created by: Andy Heninger
13 *
14 *   Unicode Spoof Detection
15 */
16 #include "unicode/utypes.h"
17 #include "unicode/uspoof.h"
18 #include "unicode/unorm.h"
19 #include "unicode/ustring.h"
20 #include "cmemory.h"
21 #include "uspoof_impl.h"
22 #include "uassert.h"
23 
24 
25 #if !UCONFIG_NO_NORMALIZATION
26 
27 
28 #include <stdio.h>      // debug
29 
30 U_NAMESPACE_USE
31 
32 
33 U_CAPI USpoofChecker * U_EXPORT2
uspoof_open(UErrorCode * status)34 uspoof_open(UErrorCode *status) {
35     if (U_FAILURE(*status)) {
36         return NULL;
37     }
38     SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
39     if (U_FAILURE(*status)) {
40         delete si;
41         si = NULL;
42     }
43     return (USpoofChecker *)si;
44 }
45 
46 
47 U_CAPI USpoofChecker * U_EXPORT2
uspoof_openFromSerialized(const void * data,int32_t length,int32_t * pActualLength,UErrorCode * status)48 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
49                           UErrorCode *status) {
50     if (U_FAILURE(*status)) {
51         return NULL;
52     }
53     SpoofData *sd = new SpoofData(data, length, *status);
54     SpoofImpl *si = new SpoofImpl(sd, *status);
55     if (U_FAILURE(*status)) {
56         delete sd;
57         delete si;
58         return NULL;
59     }
60     if (sd == NULL || si == NULL) {
61         *status = U_MEMORY_ALLOCATION_ERROR;
62         delete sd;
63         delete si;
64         return NULL;
65     }
66 
67     if (pActualLength != NULL) {
68         *pActualLength = sd->fRawData->fLength;
69     }
70     return reinterpret_cast<USpoofChecker *>(si);
71 }
72 
73 
74 U_CAPI USpoofChecker * U_EXPORT2
uspoof_clone(const USpoofChecker * sc,UErrorCode * status)75 uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
76     const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
77     if (src == NULL) {
78         return NULL;
79     }
80     SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
81     if (U_FAILURE(*status)) {
82         delete result;
83         result = NULL;
84     }
85     return (USpoofChecker *)result;
86 }
87 
88 
89 U_CAPI void U_EXPORT2
uspoof_close(USpoofChecker * sc)90 uspoof_close(USpoofChecker *sc) {
91     UErrorCode status = U_ZERO_ERROR;
92     SpoofImpl *This = SpoofImpl::validateThis(sc, status);
93     delete This;
94 }
95 
96 
97 U_CAPI void U_EXPORT2
uspoof_setChecks(USpoofChecker * sc,int32_t checks,UErrorCode * status)98 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
99     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
100     if (This == NULL) {
101         return;
102     }
103 
104     // Verify that the requested checks are all ones (bits) that
105     //   are acceptable, known values.
106     if (checks & ~USPOOF_ALL_CHECKS) {
107         *status = U_ILLEGAL_ARGUMENT_ERROR;
108         return;
109     }
110 
111     This->fChecks = checks;
112 }
113 
114 
115 U_CAPI int32_t U_EXPORT2
uspoof_getChecks(const USpoofChecker * sc,UErrorCode * status)116 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
117     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
118     if (This == NULL) {
119         return 0;
120     }
121     return This->fChecks;
122 }
123 
124 U_CAPI void U_EXPORT2
uspoof_setAllowedLocales(USpoofChecker * sc,const char * localesList,UErrorCode * status)125 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
126     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
127     if (This == NULL) {
128         return;
129     }
130     This->setAllowedLocales(localesList, *status);
131 }
132 
133 U_CAPI const char * U_EXPORT2
uspoof_getAllowedLocales(USpoofChecker * sc,UErrorCode * status)134 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
135     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
136     if (This == NULL) {
137         return NULL;
138     }
139     return This->getAllowedLocales(*status);
140 }
141 
142 
143 U_CAPI const USet * U_EXPORT2
uspoof_getAllowedChars(const USpoofChecker * sc,UErrorCode * status)144 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
145     const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
146     return reinterpret_cast<const USet *>(result);
147 }
148 
149 U_CAPI const UnicodeSet * U_EXPORT2
uspoof_getAllowedUnicodeSet(const USpoofChecker * sc,UErrorCode * status)150 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
151     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
152     if (This == NULL) {
153         return NULL;
154     }
155     return This->fAllowedCharsSet;
156 }
157 
158 
159 U_CAPI void U_EXPORT2
uspoof_setAllowedChars(USpoofChecker * sc,const USet * chars,UErrorCode * status)160 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
161     const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars);
162     uspoof_setAllowedUnicodeSet(sc, set, status);
163 }
164 
165 
166 U_CAPI void U_EXPORT2
uspoof_setAllowedUnicodeSet(USpoofChecker * sc,const UnicodeSet * chars,UErrorCode * status)167 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
168     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
169     if (This == NULL) {
170         return;
171     }
172     if (chars->isBogus()) {
173         *status = U_ILLEGAL_ARGUMENT_ERROR;
174         return;
175     }
176     UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
177     if (clonedSet == NULL || clonedSet->isBogus()) {
178         *status = U_MEMORY_ALLOCATION_ERROR;
179         return;
180     }
181     clonedSet->freeze();
182     delete This->fAllowedCharsSet;
183     This->fAllowedCharsSet = clonedSet;
184     This->fChecks |= USPOOF_CHAR_LIMIT;
185 }
186 
187 
188 U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker * sc,const UChar * text,int32_t length,int32_t * position,UErrorCode * status)189 uspoof_check(const USpoofChecker *sc,
190              const UChar *text, int32_t length,
191              int32_t *position,
192              UErrorCode *status) {
193 
194     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
195     if (This == NULL) {
196         return 0;
197     }
198     if (length < -1) {
199         *status = U_ILLEGAL_ARGUMENT_ERROR;
200         return 0;
201     }
202     if (length == -1) {
203         // It's not worth the bother to handle nul terminated strings everywhere.
204         //   Just get the length and be done with it.
205         length = u_strlen(text);
206     }
207 
208     int32_t result = 0;
209     int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?
210 
211     // A count of the number of non-Common or inherited scripts.
212     // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
213     // Share the computation when possible.  scriptCount == -1 means that we haven't
214     // done it yet.
215     int32_t scriptCount = -1;
216 
217     if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
218         scriptCount = This->scriptScan(text, length, failPos, *status);
219         // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
220         if ( scriptCount >= 2) {
221             // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
222             result |= USPOOF_SINGLE_SCRIPT;
223         }
224     }
225 
226     if (This->fChecks & USPOOF_CHAR_LIMIT) {
227         int32_t i;
228         UChar32 c;
229         for (i=0; i<length ;) {
230             U16_NEXT(text, i, length, c);
231             if (!This->fAllowedCharsSet->contains(c)) {
232                 result |= USPOOF_CHAR_LIMIT;
233                 if (i < failPos) {
234                     failPos = i;
235                 }
236                 break;
237             }
238         }
239     }
240 
241     if (This->fChecks &
242         (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
243         // These are the checks that need to be done on NFKD input
244         NFKDBuffer   normalizedInput(text, length, *status);
245         const UChar  *nfkdText = normalizedInput.getBuffer();
246         int32_t      nfkdLength = normalizedInput.getLength();
247 
248         if (This->fChecks & USPOOF_INVISIBLE) {
249 
250             // scan for more than one occurence of the same non-spacing mark
251             // in a sequence of non-spacing marks.
252             int32_t     i;
253             UChar32     c;
254             UChar32     firstNonspacingMark = 0;
255             UBool       haveMultipleMarks = FALSE;
256             UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
257 
258             for (i=0; i<length ;) {
259                 U16_NEXT(nfkdText, i, nfkdLength, c);
260                 if (u_charType(c) != U_NON_SPACING_MARK) {
261                     firstNonspacingMark = 0;
262                     if (haveMultipleMarks) {
263                         marksSeenSoFar.clear();
264                         haveMultipleMarks = FALSE;
265                     }
266                     continue;
267                 }
268                 if (firstNonspacingMark == 0) {
269                     firstNonspacingMark = c;
270                     continue;
271                 }
272                 if (!haveMultipleMarks) {
273                     marksSeenSoFar.add(firstNonspacingMark);
274                     haveMultipleMarks = TRUE;
275                 }
276                 if (marksSeenSoFar.contains(c)) {
277                     // report the error, and stop scanning.
278                     // No need to find more than the first failure.
279                     result |= USPOOF_INVISIBLE;
280                     failPos = i;
281                     break;
282                 }
283                 marksSeenSoFar.add(c);
284             }
285         }
286 
287 
288         if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
289             // The basic test is the same for both whole and mixed script confusables.
290             // Compute the set of scripts that every input character has a confusable in.
291             // For this computation an input character is always considered to be
292             //    confusable with itself in its own script.
293             // If the number of such scripts is two or more, and the input consisted of
294             //   characters all from a single script, we have a whole script confusable.
295             //   (The two scripts will be the original script and the one that is confusable)
296             // If the number of such scripts >= one, and the original input contained characters from
297             //   more than one script, we have a mixed script confusable.  (We can transform
298             //   some of the characters, and end up with a visually similar string all in
299             //   one script.)
300 
301             if (scriptCount == -1) {
302                 int32_t t;
303                 scriptCount = This->scriptScan(text, length, t, *status);
304             }
305 
306             ScriptSet scripts;
307             This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);
308             int32_t confusableScriptCount = scripts.countMembers();
309             //printf("confusableScriptCount = %d\n", confusableScriptCount);
310 
311             if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
312                 confusableScriptCount >= 2 &&
313                 scriptCount == 1) {
314                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
315             }
316 
317             if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
318                 confusableScriptCount >= 1 &&
319                 scriptCount > 1) {
320                 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
321             }
322         }
323     }
324     if (position != NULL && failPos != 0x7fffffff) {
325         *position = failPos;
326     }
327     return result;
328 }
329 
330 
331 U_CAPI int32_t U_EXPORT2
uspoof_checkUTF8(const USpoofChecker * sc,const char * text,int32_t length,int32_t * position,UErrorCode * status)332 uspoof_checkUTF8(const USpoofChecker *sc,
333                  const char *text, int32_t length,
334                  int32_t *position,
335                  UErrorCode *status) {
336 
337     if (U_FAILURE(*status)) {
338         return 0;
339     }
340     UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
341     UChar* text16 = stackBuf;
342     int32_t len16;
343 
344     u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
345     if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
346         return 0;
347     }
348     if (*status == U_BUFFER_OVERFLOW_ERROR) {
349         text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2));
350         if (text16 == NULL) {
351             *status = U_MEMORY_ALLOCATION_ERROR;
352             return 0;
353         }
354         *status = U_ZERO_ERROR;
355         u_strFromUTF8(text16, len16+1, NULL, text, length, status);
356     }
357 
358     int32_t position16 = -1;
359     int32_t result = uspoof_check(sc, text16, len16, &position16, status);
360     if (U_FAILURE(*status)) {
361         return 0;
362     }
363 
364     if (position16 > 0) {
365         // Translate a UTF-16 based error position back to a UTF-8 offset.
366         // u_strToUTF8() in preflight mode is an easy way to do it.
367         U_ASSERT(position16 <= len16);
368         u_strToUTF8(NULL, 0, position, text16, position16, status);
369         if (position > 0) {
370             // position is the required buffer length from u_strToUTF8, which includes
371             // space for a terminating NULL, which we don't want, hence the -1.
372             *position -= 1;
373         }
374         *status = U_ZERO_ERROR;   // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
375     }
376 
377     if (text16 != stackBuf) {
378         uprv_free(text16);
379     }
380     return result;
381 
382 }
383 
384 /*  A convenience wrapper around the public uspoof_getSkeleton that handles
385  *  allocating a larger buffer than provided if the original is too small.
386  */
getSkeleton(const USpoofChecker * sc,uint32_t type,const UChar * s,int32_t inputLength,UChar * dest,int32_t destCapacity,int32_t * outputLength,UErrorCode * status)387 static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength,
388                          UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) {
389     int32_t requiredCapacity = 0;
390     UChar *buf = dest;
391 
392     if (U_FAILURE(*status)) {
393         return NULL;
394     }
395     requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status);
396     if (*status == U_BUFFER_OVERFLOW_ERROR) {
397         buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar)));
398         if (buf == NULL) {
399             *status = U_MEMORY_ALLOCATION_ERROR;
400             return NULL;
401         }
402         *status = U_ZERO_ERROR;
403         uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status);
404     }
405     *outputLength = requiredCapacity;
406     return buf;
407 }
408 
409 
410 U_CAPI int32_t U_EXPORT2
uspoof_areConfusable(const USpoofChecker * sc,const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,UErrorCode * status)411 uspoof_areConfusable(const USpoofChecker *sc,
412                      const UChar *s1, int32_t length1,
413                      const UChar *s2, int32_t length2,
414                      UErrorCode *status) {
415     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
416     if (U_FAILURE(*status)) {
417         return 0;
418     }
419     //
420     // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
421     //   and for definitions of the types (single, whole, mixed-script) of confusables.
422 
423     // We only care about a few of the check flags.  Ignore the others.
424     // If no tests relavant to this function have been specified, return an error.
425     // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
426     //        but logically we would just return 0 (no error).
427     if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
428                           USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
429         *status = U_INVALID_STATE_ERROR;
430         return 0;
431     }
432     int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
433     UChar    s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
434     UChar   *s1Skeleton;
435     int32_t  s1SkeletonLength = 0;
436 
437     UChar    s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
438     UChar   *s2Skeleton;
439     int32_t  s2SkeletonLength = 0;
440 
441     int32_t  result = 0;
442     int32_t  t;
443     int32_t  s1ScriptCount = This->scriptScan(s1, length1, t, *status);
444     int32_t  s2ScriptCount = This->scriptScan(s2, length2, t, *status);
445 
446     if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
447         // Do the Single Script compare.
448         if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
449             flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
450             s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
451                                      sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
452             s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
453                                      sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
454             if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
455                 result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
456             }
457             if (s1Skeleton != s1SkeletonBuf) {
458                 uprv_free(s1Skeleton);
459             }
460             if (s2Skeleton != s2SkeletonBuf) {
461                 uprv_free(s2Skeleton);
462             }
463         }
464     }
465 
466     if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
467          // If the two inputs are single script confusable they cannot also be
468          // mixed or whole script confusable, according to the UAX39 definitions.
469          // So we can skip those tests.
470          return result;
471     }
472 
473     // Optimization for whole script confusables test:  two identifiers are whole script confusable if
474     // each is of a single script and they are mixed script confusable.
475     UBool possiblyWholeScriptConfusables =
476         s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
477 
478     //
479     // Mixed Script Check
480     //
481     if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
482         // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
483         // the mixed script table skeleton, which is what we want.
484         // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
485         flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
486         s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
487                                  sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
488         s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
489                                  sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
490         if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
491             result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
492             if (possiblyWholeScriptConfusables) {
493                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
494             }
495         }
496         if (s1Skeleton != s1SkeletonBuf) {
497             uprv_free(s1Skeleton);
498         }
499         if (s2Skeleton != s2SkeletonBuf) {
500             uprv_free(s2Skeleton);
501         }
502     }
503 
504     return result;
505 }
506 
507 
508 // Convenience function for converting a UTF-8 input to a UChar * string, including
509 //          reallocating a buffer when required.  Parameters and their interpretation mostly
510 //          match u_strFromUTF8.
511 
convertFromUTF8(UChar * outBuf,int32_t outBufCapacity,int32_t * outputLength,const char * in,int32_t inLength,UErrorCode * status)512 static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength,
513                                const char *in, int32_t inLength, UErrorCode *status) {
514     if (U_FAILURE(*status)) {
515         return NULL;
516     }
517     UChar *dest = outBuf;
518     u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);
519     if (*status == U_BUFFER_OVERFLOW_ERROR) {
520         dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar)));
521         if (dest == NULL) {
522             *status = U_MEMORY_ALLOCATION_ERROR;
523             return NULL;
524         }
525         *status = U_ZERO_ERROR;
526         u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);
527     }
528     return dest;
529 }
530 
531 
532 
533 U_CAPI int32_t U_EXPORT2
uspoof_areConfusableUTF8(const USpoofChecker * sc,const char * s1,int32_t length1,const char * s2,int32_t length2,UErrorCode * status)534 uspoof_areConfusableUTF8(const USpoofChecker *sc,
535                          const char *s1, int32_t length1,
536                          const char *s2, int32_t length2,
537                          UErrorCode *status) {
538 
539     SpoofImpl::validateThis(sc, *status);
540     if (U_FAILURE(*status)) {
541         return 0;
542     }
543 
544     UChar    s1Buf[USPOOF_STACK_BUFFER_SIZE];
545     int32_t  lengthS1U;
546     UChar   *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);
547 
548     UChar    s2Buf[USPOOF_STACK_BUFFER_SIZE];
549     int32_t  lengthS2U;
550     UChar   *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);
551 
552     int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status);
553 
554     if (s1U != s1Buf) {
555         uprv_free(s1U);
556     }
557     if (s2U != s2Buf) {
558         uprv_free(s2U);
559     }
560     return results;
561 }
562 
563 
564 U_CAPI int32_t U_EXPORT2
uspoof_areConfusableUnicodeString(const USpoofChecker * sc,const U_NAMESPACE_QUALIFIER UnicodeString & s1,const U_NAMESPACE_QUALIFIER UnicodeString & s2,UErrorCode * status)565 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
566                                   const U_NAMESPACE_QUALIFIER UnicodeString &s1,
567                                   const U_NAMESPACE_QUALIFIER UnicodeString &s2,
568                                   UErrorCode *status) {
569 
570     const UChar *u1  = s1.getBuffer();
571     int32_t  length1 = s1.length();
572     const UChar *u2  = s2.getBuffer();
573     int32_t  length2 = s2.length();
574 
575     int32_t results  = uspoof_areConfusable(sc, u1, length1, u2, length2, status);
576     return results;
577 }
578 
579 
580 
581 
582 U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker * sc,const U_NAMESPACE_QUALIFIER UnicodeString & text,int32_t * position,UErrorCode * status)583 uspoof_checkUnicodeString(const USpoofChecker *sc,
584                           const U_NAMESPACE_QUALIFIER UnicodeString &text,
585                           int32_t *position,
586                           UErrorCode *status) {
587     int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);
588     return result;
589 }
590 
591 
592 U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker * sc,uint32_t type,const UChar * s,int32_t length,UChar * dest,int32_t destCapacity,UErrorCode * status)593 uspoof_getSkeleton(const USpoofChecker *sc,
594                    uint32_t type,
595                    const UChar *s,  int32_t length,
596                    UChar *dest, int32_t destCapacity,
597                    UErrorCode *status) {
598 
599     // TODO:  this function could be sped up a bit
600     //        Skip the input normalization when not needed, work from callers data.
601     //        Put the initial skeleton straight into the caller's destination buffer.
602     //        It probably won't need normalization.
603     //        But these would make the structure more complicated.
604 
605     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
606     if (U_FAILURE(*status)) {
607         return 0;
608     }
609     if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
610         (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
611         *status = U_ILLEGAL_ARGUMENT_ERROR;
612         return 0;
613     }
614 
615    int32_t tableMask = 0;
616    switch (type) {
617       case 0:
618         tableMask = USPOOF_ML_TABLE_FLAG;
619         break;
620       case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
621         tableMask = USPOOF_SL_TABLE_FLAG;
622         break;
623       case USPOOF_ANY_CASE:
624         tableMask = USPOOF_MA_TABLE_FLAG;
625         break;
626       case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
627         tableMask = USPOOF_SA_TABLE_FLAG;
628         break;
629       default:
630         *status = U_ILLEGAL_ARGUMENT_ERROR;
631         return 0;
632     }
633 
634     // NFKD transform of the user supplied input
635 
636     UChar nfkdStackBuf[USPOOF_STACK_BUFFER_SIZE];
637     UChar *nfkdInput = nfkdStackBuf;
638     int32_t normalizedLen = unorm_normalize(
639         s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);
640     if (*status == U_BUFFER_OVERFLOW_ERROR) {
641         nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
642         if (nfkdInput == NULL) {
643             *status = U_MEMORY_ALLOCATION_ERROR;
644             return 0;
645         }
646         *status = U_ZERO_ERROR;
647         normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0,
648                                         nfkdInput, normalizedLen+1, status);
649     }
650     if (U_FAILURE(*status)) {
651         if (nfkdInput != nfkdStackBuf) {
652             uprv_free(nfkdInput);
653         }
654         return 0;
655     }
656 
657     // buffer to hold the Unicode defined skeleton mappings for a single code point
658     UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
659 
660     // Apply the skeleton mapping to the NFKD normalized input string
661     // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
662     int32_t inputIndex = 0;
663     UnicodeString skelStr;
664     while (inputIndex < normalizedLen) {
665         UChar32 c;
666         U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);
667         int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
668         skelStr.append(buf, replaceLen);
669     }
670 
671     if (nfkdInput != nfkdStackBuf) {
672         uprv_free(nfkdInput);
673     }
674 
675     const UChar *result = skelStr.getBuffer();
676     int32_t  resultLen  = skelStr.length();
677     UChar   *normedResult = NULL;
678 
679     // Check the skeleton for NFKD, normalize it if needed.
680     // Unnormalized results should be very rare.
681     if (!unorm_isNormalized(result, resultLen, UNORM_NFKD, status)) {
682         normalizedLen = unorm_normalize(result, resultLen, UNORM_NFKD, 0, NULL, 0, status);
683         normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
684         if (normedResult == NULL) {
685             *status = U_MEMORY_ALLOCATION_ERROR;
686             return 0;
687         }
688         *status = U_ZERO_ERROR;
689         unorm_normalize(result, resultLen, UNORM_NFKD, 0, normedResult, normalizedLen+1, status);
690         result = normedResult;
691         resultLen = normalizedLen;
692     }
693 
694     // Copy the skeleton to the caller's buffer
695     if (U_SUCCESS(*status)) {
696         if (destCapacity == 0 || resultLen > destCapacity) {
697             *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
698         } else {
699             u_memcpy(dest, result, resultLen);
700             if (destCapacity > resultLen) {
701                 dest[resultLen] = 0;
702             } else {
703                 *status = U_STRING_NOT_TERMINATED_WARNING;
704             }
705         }
706      }
707      uprv_free(normedResult);
708      return resultLen;
709 }
710 
711 
712 
713 U_CAPI UnicodeString &  U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker * sc,uint32_t type,const UnicodeString & s,UnicodeString & dest,UErrorCode * status)714 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
715                                 uint32_t type,
716                                 const UnicodeString &s,
717                                 UnicodeString &dest,
718                                 UErrorCode *status) {
719     if (U_FAILURE(*status)) {
720         return dest;
721     }
722     dest.remove();
723 
724     const UChar *str = s.getBuffer();
725     int32_t      strLen = s.length();
726     UChar        smallBuf[USPOOF_STACK_BUFFER_SIZE];
727     UChar       *buf = smallBuf;
728     int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status);
729     if (*status == U_BUFFER_OVERFLOW_ERROR) {
730         buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
731         if (buf == NULL) {
732             *status = U_MEMORY_ALLOCATION_ERROR;
733             return dest;
734         }
735         *status = U_ZERO_ERROR;
736         uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
737     }
738     if (U_SUCCESS(*status)) {
739         dest.setTo(buf, outputSize);
740     }
741 
742     if (buf != smallBuf) {
743         uprv_free(buf);
744     }
745     return dest;
746 }
747 
748 
749 U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker * sc,uint32_t type,const char * s,int32_t length,char * dest,int32_t destCapacity,UErrorCode * status)750 uspoof_getSkeletonUTF8(const USpoofChecker *sc,
751                        uint32_t type,
752                        const char *s,  int32_t length,
753                        char *dest, int32_t destCapacity,
754                        UErrorCode *status) {
755     // Lacking a UTF-8 normalization API, just converting the input to
756     // UTF-16 seems as good an approach as any.  In typical use, input will
757     // be an identifier, which is to say not too long for stack buffers.
758     if (U_FAILURE(*status)) {
759         return 0;
760     }
761     // Buffers for the UChar form of the input and skeleton strings.
762     UChar    smallInBuf[USPOOF_STACK_BUFFER_SIZE];
763     UChar   *inBuf = smallInBuf;
764     UChar    smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
765     UChar   *outBuf = smallOutBuf;
766 
767     int32_t  lengthInUChars = 0;
768     int32_t  skelLengthInUChars = 0;
769     int32_t  skelLengthInUTF8 = 0;
770 
771     u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
772                   s, length, status);
773     if (*status == U_BUFFER_OVERFLOW_ERROR) {
774         inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
775         if (inBuf == NULL) {
776             *status = U_MEMORY_ALLOCATION_ERROR;
777             goto cleanup;
778         }
779         *status = U_ZERO_ERROR;
780         u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
781                       s, length, status);
782     }
783 
784     skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
785                                          outBuf, USPOOF_STACK_BUFFER_SIZE, status);
786     if (*status == U_BUFFER_OVERFLOW_ERROR) {
787         outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
788         if (outBuf == NULL) {
789             *status = U_MEMORY_ALLOCATION_ERROR;
790             goto cleanup;
791         }
792         *status = U_ZERO_ERROR;
793         skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
794                                          outBuf, skelLengthInUChars+1, status);
795     }
796 
797     u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
798                 outBuf, skelLengthInUChars, status);
799 
800   cleanup:
801     if (inBuf != smallInBuf) {
802         uprv_free(inBuf);
803     }
804     if (outBuf != smallOutBuf) {
805         uprv_free(outBuf);
806     }
807     return skelLengthInUTF8;
808 }
809 
810 
811 U_CAPI int32_t U_EXPORT2
uspoof_serialize(USpoofChecker * sc,void * buf,int32_t capacity,UErrorCode * status)812 uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
813     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
814     if (This == NULL) {
815         U_ASSERT(U_FAILURE(*status));
816         return 0;
817     }
818     int32_t dataSize = This->fSpoofData->fRawData->fLength;
819     if (capacity < dataSize) {
820         *status = U_BUFFER_OVERFLOW_ERROR;
821         return dataSize;
822     }
823     uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
824     return dataSize;
825 }
826 
827 #endif
828