• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
***************************************************************************
* Copyright (C) 2008-2012, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*   file name:  uspoof.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2008Feb13
*   created by: Andy Heninger
*
*   Unicode Spoof Detection
*/
16 #include "unicode/utypes.h"
17 #include "unicode/uspoof.h"
18 #include "unicode/unorm.h"
19 #include "unicode/ustring.h"
20 #include "unicode/utf16.h"
21 #include "cmemory.h"
22 #include "uspoof_impl.h"
23 #include "uassert.h"
24 
25 
26 #if !UCONFIG_NO_NORMALIZATION
27 
28 U_NAMESPACE_USE
29 
30 
31 U_CAPI USpoofChecker * U_EXPORT2
uspoof_open(UErrorCode * status)32 uspoof_open(UErrorCode *status) {
33     if (U_FAILURE(*status)) {
34         return NULL;
35     }
36     SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
37     if (U_FAILURE(*status)) {
38         delete si;
39         si = NULL;
40     }
41     return (USpoofChecker *)si;
42 }
43 
44 
45 U_CAPI USpoofChecker * U_EXPORT2
uspoof_openFromSerialized(const void * data,int32_t length,int32_t * pActualLength,UErrorCode * status)46 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
47                           UErrorCode *status) {
48     if (U_FAILURE(*status)) {
49         return NULL;
50     }
51     SpoofData *sd = new SpoofData(data, length, *status);
52     SpoofImpl *si = new SpoofImpl(sd, *status);
53     if (U_FAILURE(*status)) {
54         delete sd;
55         delete si;
56         return NULL;
57     }
58     if (sd == NULL || si == NULL) {
59         *status = U_MEMORY_ALLOCATION_ERROR;
60         delete sd;
61         delete si;
62         return NULL;
63     }
64 
65     if (pActualLength != NULL) {
66         *pActualLength = sd->fRawData->fLength;
67     }
68     return reinterpret_cast<USpoofChecker *>(si);
69 }
70 
71 
72 U_CAPI USpoofChecker * U_EXPORT2
uspoof_clone(const USpoofChecker * sc,UErrorCode * status)73 uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
74     const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
75     if (src == NULL) {
76         return NULL;
77     }
78     SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
79     if (U_FAILURE(*status)) {
80         delete result;
81         result = NULL;
82     }
83     return (USpoofChecker *)result;
84 }
85 
86 
87 U_CAPI void U_EXPORT2
uspoof_close(USpoofChecker * sc)88 uspoof_close(USpoofChecker *sc) {
89     UErrorCode status = U_ZERO_ERROR;
90     SpoofImpl *This = SpoofImpl::validateThis(sc, status);
91     delete This;
92 }
93 
94 
95 U_CAPI void U_EXPORT2
uspoof_setChecks(USpoofChecker * sc,int32_t checks,UErrorCode * status)96 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
97     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
98     if (This == NULL) {
99         return;
100     }
101 
102     // Verify that the requested checks are all ones (bits) that
103     //   are acceptable, known values.
104     if (checks & ~USPOOF_ALL_CHECKS) {
105         *status = U_ILLEGAL_ARGUMENT_ERROR;
106         return;
107     }
108 
109     This->fChecks = checks;
110 }
111 
112 
113 U_CAPI int32_t U_EXPORT2
uspoof_getChecks(const USpoofChecker * sc,UErrorCode * status)114 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
115     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
116     if (This == NULL) {
117         return 0;
118     }
119     return This->fChecks;
120 }
121 
122 U_CAPI void U_EXPORT2
uspoof_setAllowedLocales(USpoofChecker * sc,const char * localesList,UErrorCode * status)123 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) {
124     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
125     if (This == NULL) {
126         return;
127     }
128     This->setAllowedLocales(localesList, *status);
129 }
130 
131 U_CAPI const char * U_EXPORT2
uspoof_getAllowedLocales(USpoofChecker * sc,UErrorCode * status)132 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) {
133     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
134     if (This == NULL) {
135         return NULL;
136     }
137     return This->getAllowedLocales(*status);
138 }
139 
140 
141 U_CAPI const USet * U_EXPORT2
uspoof_getAllowedChars(const USpoofChecker * sc,UErrorCode * status)142 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
143     const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status);
144     return reinterpret_cast<const USet *>(result);
145 }
146 
147 U_CAPI const UnicodeSet * U_EXPORT2
uspoof_getAllowedUnicodeSet(const USpoofChecker * sc,UErrorCode * status)148 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
149     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
150     if (This == NULL) {
151         return NULL;
152     }
153     return This->fAllowedCharsSet;
154 }
155 
156 
157 U_CAPI void U_EXPORT2
uspoof_setAllowedChars(USpoofChecker * sc,const USet * chars,UErrorCode * status)158 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
159     const UnicodeSet *set = reinterpret_cast<const UnicodeSet *>(chars);
160     uspoof_setAllowedUnicodeSet(sc, set, status);
161 }
162 
163 
164 U_CAPI void U_EXPORT2
uspoof_setAllowedUnicodeSet(USpoofChecker * sc,const UnicodeSet * chars,UErrorCode * status)165 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
166     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
167     if (This == NULL) {
168         return;
169     }
170     if (chars->isBogus()) {
171         *status = U_ILLEGAL_ARGUMENT_ERROR;
172         return;
173     }
174     UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
175     if (clonedSet == NULL || clonedSet->isBogus()) {
176         *status = U_MEMORY_ALLOCATION_ERROR;
177         return;
178     }
179     clonedSet->freeze();
180     delete This->fAllowedCharsSet;
181     This->fAllowedCharsSet = clonedSet;
182     This->fChecks |= USPOOF_CHAR_LIMIT;
183 }
184 
185 
186 U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker * sc,const UChar * text,int32_t length,int32_t * position,UErrorCode * status)187 uspoof_check(const USpoofChecker *sc,
188              const UChar *text, int32_t length,
189              int32_t *position,
190              UErrorCode *status) {
191 
192     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
193     if (This == NULL) {
194         return 0;
195     }
196     if (length < -1) {
197         *status = U_ILLEGAL_ARGUMENT_ERROR;
198         return 0;
199     }
200     if (length == -1) {
201         // It's not worth the bother to handle nul terminated strings everywhere.
202         //   Just get the length and be done with it.
203         length = u_strlen(text);
204     }
205 
206     int32_t result = 0;
207     int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?
208 
209     // A count of the number of non-Common or inherited scripts.
210     // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
211     // Share the computation when possible.  scriptCount == -1 means that we haven't
212     // done it yet.
213     int32_t scriptCount = -1;
214 
215     if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
216         scriptCount = This->scriptScan(text, length, failPos, *status);
217         // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
218         if ( scriptCount >= 2) {
219             // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
220             result |= USPOOF_SINGLE_SCRIPT;
221         }
222     }
223 
224     if (This->fChecks & USPOOF_CHAR_LIMIT) {
225         int32_t i;
226         UChar32 c;
227         for (i=0; i<length ;) {
228             U16_NEXT(text, i, length, c);
229             if (!This->fAllowedCharsSet->contains(c)) {
230                 result |= USPOOF_CHAR_LIMIT;
231                 if (i < failPos) {
232                     failPos = i;
233                 }
234                 break;
235             }
236         }
237     }
238 
239     if (This->fChecks &
240         (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
241         // These are the checks that need to be done on NFD input
242         NFDBuffer   normalizedInput(text, length, *status);
243         const UChar  *nfdText = normalizedInput.getBuffer();
244         int32_t      nfdLength = normalizedInput.getLength();
245 
246         if (This->fChecks & USPOOF_INVISIBLE) {
247 
248             // scan for more than one occurence of the same non-spacing mark
249             // in a sequence of non-spacing marks.
250             int32_t     i;
251             UChar32     c;
252             UChar32     firstNonspacingMark = 0;
253             UBool       haveMultipleMarks = FALSE;
254             UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
255 
256             for (i=0; i<nfdLength ;) {
257                 U16_NEXT(nfdText, i, nfdLength, c);
258                 if (u_charType(c) != U_NON_SPACING_MARK) {
259                     firstNonspacingMark = 0;
260                     if (haveMultipleMarks) {
261                         marksSeenSoFar.clear();
262                         haveMultipleMarks = FALSE;
263                     }
264                     continue;
265                 }
266                 if (firstNonspacingMark == 0) {
267                     firstNonspacingMark = c;
268                     continue;
269                 }
270                 if (!haveMultipleMarks) {
271                     marksSeenSoFar.add(firstNonspacingMark);
272                     haveMultipleMarks = TRUE;
273                 }
274                 if (marksSeenSoFar.contains(c)) {
275                     // report the error, and stop scanning.
276                     // No need to find more than the first failure.
277                     result |= USPOOF_INVISIBLE;
278                     failPos = i;
279                     // TODO: Bug 8655: failPos is the position in the NFD buffer, but what we want
280                     //       to give back to our caller is a position in the original input string.
281                     if (failPos > length) {
282                         failPos = length;
283                     }
284                     break;
285                 }
286                 marksSeenSoFar.add(c);
287             }
288         }
289 
290 
291         if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
292             // The basic test is the same for both whole and mixed script confusables.
293             // Compute the set of scripts that every input character has a confusable in.
294             // For this computation an input character is always considered to be
295             //    confusable with itself in its own script.
296             // If the number of such scripts is two or more, and the input consisted of
297             //   characters all from a single script, we have a whole script confusable.
298             //   (The two scripts will be the original script and the one that is confusable)
299             // If the number of such scripts >= one, and the original input contained characters from
300             //   more than one script, we have a mixed script confusable.  (We can transform
301             //   some of the characters, and end up with a visually similar string all in
302             //   one script.)
303 
304             if (scriptCount == -1) {
305                 int32_t t;
306                 scriptCount = This->scriptScan(text, length, t, *status);
307             }
308 
309             ScriptSet scripts;
310             This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
311             int32_t confusableScriptCount = scripts.countMembers();
312             //printf("confusableScriptCount = %d\n", confusableScriptCount);
313 
314             if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
315                 confusableScriptCount >= 2 &&
316                 scriptCount == 1) {
317                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
318             }
319 
320             if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
321                 confusableScriptCount >= 1 &&
322                 scriptCount > 1) {
323                 result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
324             }
325         }
326     }
327     if (position != NULL && failPos != 0x7fffffff) {
328         *position = failPos;
329     }
330     return result;
331 }
332 
333 
334 U_CAPI int32_t U_EXPORT2
uspoof_checkUTF8(const USpoofChecker * sc,const char * text,int32_t length,int32_t * position,UErrorCode * status)335 uspoof_checkUTF8(const USpoofChecker *sc,
336                  const char *text, int32_t length,
337                  int32_t *position,
338                  UErrorCode *status) {
339 
340     if (U_FAILURE(*status)) {
341         return 0;
342     }
343     UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
344     UChar* text16 = stackBuf;
345     int32_t len16;
346 
347     u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
348     if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
349         return 0;
350     }
351     if (*status == U_BUFFER_OVERFLOW_ERROR) {
352         text16 = static_cast<UChar *>(uprv_malloc(len16 * sizeof(UChar) + 2));
353         if (text16 == NULL) {
354             *status = U_MEMORY_ALLOCATION_ERROR;
355             return 0;
356         }
357         *status = U_ZERO_ERROR;
358         u_strFromUTF8(text16, len16+1, NULL, text, length, status);
359     }
360 
361     int32_t position16 = -1;
362     int32_t result = uspoof_check(sc, text16, len16, &position16, status);
363     if (U_FAILURE(*status)) {
364         return 0;
365     }
366 
367     if (position16 > 0) {
368         // Translate a UTF-16 based error position back to a UTF-8 offset.
369         // u_strToUTF8() in preflight mode is an easy way to do it.
370         U_ASSERT(position16 <= len16);
371         u_strToUTF8(NULL, 0, position, text16, position16, status);
372         if (position > 0) {
373             // position is the required buffer length from u_strToUTF8, which includes
374             // space for a terminating NULL, which we don't want, hence the -1.
375             *position -= 1;
376         }
377         *status = U_ZERO_ERROR;   // u_strToUTF8, above sets BUFFER_OVERFLOW_ERROR.
378     }
379 
380     if (text16 != stackBuf) {
381         uprv_free(text16);
382     }
383     return result;
384 
385 }
386 
387 /*  A convenience wrapper around the public uspoof_getSkeleton that handles
388  *  allocating a larger buffer than provided if the original is too small.
389  */
getSkeleton(const USpoofChecker * sc,uint32_t type,const UChar * s,int32_t inputLength,UChar * dest,int32_t destCapacity,int32_t * outputLength,UErrorCode * status)390 static UChar *getSkeleton(const USpoofChecker *sc, uint32_t type, const UChar *s, int32_t inputLength,
391                          UChar *dest, int32_t destCapacity, int32_t *outputLength, UErrorCode *status) {
392     int32_t requiredCapacity = 0;
393     UChar *buf = dest;
394 
395     if (U_FAILURE(*status)) {
396         return NULL;
397     }
398     requiredCapacity = uspoof_getSkeleton(sc, type, s, inputLength, dest, destCapacity, status);
399     if (*status == U_BUFFER_OVERFLOW_ERROR) {
400         buf = static_cast<UChar *>(uprv_malloc(requiredCapacity * sizeof(UChar)));
401         if (buf == NULL) {
402             *status = U_MEMORY_ALLOCATION_ERROR;
403             return NULL;
404         }
405         *status = U_ZERO_ERROR;
406         uspoof_getSkeleton(sc, type, s, inputLength, buf, requiredCapacity, status);
407     }
408     *outputLength = requiredCapacity;
409     return buf;
410 }
411 
412 
413 U_CAPI int32_t U_EXPORT2
uspoof_areConfusable(const USpoofChecker * sc,const UChar * s1,int32_t length1,const UChar * s2,int32_t length2,UErrorCode * status)414 uspoof_areConfusable(const USpoofChecker *sc,
415                      const UChar *s1, int32_t length1,
416                      const UChar *s2, int32_t length2,
417                      UErrorCode *status) {
418     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
419     if (U_FAILURE(*status)) {
420         return 0;
421     }
422     //
423     // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
424     //   and for definitions of the types (single, whole, mixed-script) of confusables.
425 
426     // We only care about a few of the check flags.  Ignore the others.
427     // If no tests relavant to this function have been specified, return an error.
428     // TODO:  is this really the right thing to do?  It's probably an error on the caller's part,
429     //        but logically we would just return 0 (no error).
430     if ((This->fChecks & (USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE |
431                           USPOOF_WHOLE_SCRIPT_CONFUSABLE)) == 0) {
432         *status = U_INVALID_STATE_ERROR;
433         return 0;
434     }
435     int32_t  flagsForSkeleton = This->fChecks & USPOOF_ANY_CASE;
436     UChar    s1SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
437     UChar   *s1Skeleton;
438     int32_t  s1SkeletonLength = 0;
439 
440     UChar    s2SkeletonBuf[USPOOF_STACK_BUFFER_SIZE];
441     UChar   *s2Skeleton;
442     int32_t  s2SkeletonLength = 0;
443 
444     int32_t  result = 0;
445     int32_t  t;
446     int32_t  s1ScriptCount = This->scriptScan(s1, length1, t, *status);
447     int32_t  s2ScriptCount = This->scriptScan(s2, length2, t, *status);
448 
449     if (This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
450         // Do the Single Script compare.
451         if (s1ScriptCount <= 1 && s2ScriptCount <= 1) {
452             flagsForSkeleton |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
453             s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
454                                      sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
455             s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
456                                      sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
457             if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
458                 result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE;
459             }
460             if (s1Skeleton != s1SkeletonBuf) {
461                 uprv_free(s1Skeleton);
462             }
463             if (s2Skeleton != s2SkeletonBuf) {
464                 uprv_free(s2Skeleton);
465             }
466         }
467     }
468 
469     if (result & USPOOF_SINGLE_SCRIPT_CONFUSABLE) {
470          // If the two inputs are single script confusable they cannot also be
471          // mixed or whole script confusable, according to the UAX39 definitions.
472          // So we can skip those tests.
473          return result;
474     }
475 
476     // Optimization for whole script confusables test:  two identifiers are whole script confusable if
477     // each is of a single script and they are mixed script confusable.
478     UBool possiblyWholeScriptConfusables =
479         s1ScriptCount <= 1 && s2ScriptCount <= 1 && (This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE);
480 
481     //
482     // Mixed Script Check
483     //
484     if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) || possiblyWholeScriptConfusables ) {
485         // For getSkeleton(), resetting the USPOOF_SINGLE_SCRIPT_CONFUSABLE flag will get us
486         // the mixed script table skeleton, which is what we want.
487         // The Any Case / Lower Case bit in the skelton flags was set at the top of the function.
488         flagsForSkeleton &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE;
489         s1Skeleton = getSkeleton(sc, flagsForSkeleton, s1, length1, s1SkeletonBuf,
490                                  sizeof(s1SkeletonBuf)/sizeof(UChar), &s1SkeletonLength, status);
491         s2Skeleton = getSkeleton(sc, flagsForSkeleton, s2, length2, s2SkeletonBuf,
492                                  sizeof(s2SkeletonBuf)/sizeof(UChar), &s2SkeletonLength, status);
493         if (s1SkeletonLength == s2SkeletonLength && u_strncmp(s1Skeleton, s2Skeleton, s1SkeletonLength) == 0) {
494             result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
495             if (possiblyWholeScriptConfusables) {
496                 result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
497             }
498         }
499         if (s1Skeleton != s1SkeletonBuf) {
500             uprv_free(s1Skeleton);
501         }
502         if (s2Skeleton != s2SkeletonBuf) {
503             uprv_free(s2Skeleton);
504         }
505     }
506 
507     return result;
508 }
509 
510 
511 // Convenience function for converting a UTF-8 input to a UChar * string, including
512 //          reallocating a buffer when required.  Parameters and their interpretation mostly
513 //          match u_strFromUTF8.
514 
convertFromUTF8(UChar * outBuf,int32_t outBufCapacity,int32_t * outputLength,const char * in,int32_t inLength,UErrorCode * status)515 static UChar * convertFromUTF8(UChar *outBuf, int32_t outBufCapacity, int32_t *outputLength,
516                                const char *in, int32_t inLength, UErrorCode *status) {
517     if (U_FAILURE(*status)) {
518         return NULL;
519     }
520     UChar *dest = outBuf;
521     u_strFromUTF8(dest, outBufCapacity, outputLength, in, inLength, status);
522     if (*status == U_BUFFER_OVERFLOW_ERROR) {
523         dest = static_cast<UChar *>(uprv_malloc(*outputLength * sizeof(UChar)));
524         if (dest == NULL) {
525             *status = U_MEMORY_ALLOCATION_ERROR;
526             return NULL;
527         }
528         *status = U_ZERO_ERROR;
529         u_strFromUTF8(dest, *outputLength, NULL, in, inLength, status);
530     }
531     return dest;
532 }
533 
534 
535 
536 U_CAPI int32_t U_EXPORT2
uspoof_areConfusableUTF8(const USpoofChecker * sc,const char * s1,int32_t length1,const char * s2,int32_t length2,UErrorCode * status)537 uspoof_areConfusableUTF8(const USpoofChecker *sc,
538                          const char *s1, int32_t length1,
539                          const char *s2, int32_t length2,
540                          UErrorCode *status) {
541 
542     SpoofImpl::validateThis(sc, *status);
543     if (U_FAILURE(*status)) {
544         return 0;
545     }
546 
547     UChar    s1Buf[USPOOF_STACK_BUFFER_SIZE];
548     int32_t  lengthS1U;
549     UChar   *s1U = convertFromUTF8(s1Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS1U, s1, length1, status);
550 
551     UChar    s2Buf[USPOOF_STACK_BUFFER_SIZE];
552     int32_t  lengthS2U;
553     UChar   *s2U = convertFromUTF8(s2Buf, USPOOF_STACK_BUFFER_SIZE, &lengthS2U, s2, length2, status);
554 
555     int32_t results = uspoof_areConfusable(sc, s1U, lengthS1U, s2U, lengthS2U, status);
556 
557     if (s1U != s1Buf) {
558         uprv_free(s1U);
559     }
560     if (s2U != s2Buf) {
561         uprv_free(s2U);
562     }
563     return results;
564 }
565 
566 
567 U_CAPI int32_t U_EXPORT2
uspoof_areConfusableUnicodeString(const USpoofChecker * sc,const icu::UnicodeString & s1,const icu::UnicodeString & s2,UErrorCode * status)568 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
569                                   const icu::UnicodeString &s1,
570                                   const icu::UnicodeString &s2,
571                                   UErrorCode *status) {
572 
573     const UChar *u1  = s1.getBuffer();
574     int32_t  length1 = s1.length();
575     const UChar *u2  = s2.getBuffer();
576     int32_t  length2 = s2.length();
577 
578     int32_t results  = uspoof_areConfusable(sc, u1, length1, u2, length2, status);
579     return results;
580 }
581 
582 
583 
584 
585 U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker * sc,const icu::UnicodeString & text,int32_t * position,UErrorCode * status)586 uspoof_checkUnicodeString(const USpoofChecker *sc,
587                           const icu::UnicodeString &text,
588                           int32_t *position,
589                           UErrorCode *status) {
590     int32_t result = uspoof_check(sc, text.getBuffer(), text.length(), position, status);
591     return result;
592 }
593 
594 
595 U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker * sc,uint32_t type,const UChar * s,int32_t length,UChar * dest,int32_t destCapacity,UErrorCode * status)596 uspoof_getSkeleton(const USpoofChecker *sc,
597                    uint32_t type,
598                    const UChar *s,  int32_t length,
599                    UChar *dest, int32_t destCapacity,
600                    UErrorCode *status) {
601 
602     // TODO:  this function could be sped up a bit
603     //        Skip the input normalization when not needed, work from callers data.
604     //        Put the initial skeleton straight into the caller's destination buffer.
605     //        It probably won't need normalization.
606     //        But these would make the structure more complicated.
607 
608     const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
609     if (U_FAILURE(*status)) {
610         return 0;
611     }
612     if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
613         (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
614         *status = U_ILLEGAL_ARGUMENT_ERROR;
615         return 0;
616     }
617 
618    int32_t tableMask = 0;
619    switch (type) {
620       case 0:
621         tableMask = USPOOF_ML_TABLE_FLAG;
622         break;
623       case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
624         tableMask = USPOOF_SL_TABLE_FLAG;
625         break;
626       case USPOOF_ANY_CASE:
627         tableMask = USPOOF_MA_TABLE_FLAG;
628         break;
629       case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
630         tableMask = USPOOF_SA_TABLE_FLAG;
631         break;
632       default:
633         *status = U_ILLEGAL_ARGUMENT_ERROR;
634         return 0;
635     }
636 
637     // NFD transform of the user supplied input
638 
639     UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
640     UChar *nfdInput = nfdStackBuf;
641     int32_t normalizedLen = unorm_normalize(
642         s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
643     if (*status == U_BUFFER_OVERFLOW_ERROR) {
644         nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
645         if (nfdInput == NULL) {
646             *status = U_MEMORY_ALLOCATION_ERROR;
647             return 0;
648         }
649         *status = U_ZERO_ERROR;
650         normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
651                                         nfdInput, normalizedLen+1, status);
652     }
653     if (U_FAILURE(*status)) {
654         if (nfdInput != nfdStackBuf) {
655             uprv_free(nfdInput);
656         }
657         return 0;
658     }
659 
660     // buffer to hold the Unicode defined skeleton mappings for a single code point
661     UChar buf[USPOOF_MAX_SKELETON_EXPANSION];
662 
663     // Apply the skeleton mapping to the NFD normalized input string
664     // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
665     int32_t inputIndex = 0;
666     UnicodeString skelStr;
667     while (inputIndex < normalizedLen) {
668         UChar32 c;
669         U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
670         int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
671         skelStr.append(buf, replaceLen);
672     }
673 
674     if (nfdInput != nfdStackBuf) {
675         uprv_free(nfdInput);
676     }
677 
678     const UChar *result = skelStr.getBuffer();
679     int32_t  resultLen  = skelStr.length();
680     UChar   *normedResult = NULL;
681 
682     // Check the skeleton for NFD, normalize it if needed.
683     // Unnormalized results should be very rare.
684     if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
685         normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
686         normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
687         if (normedResult == NULL) {
688             *status = U_MEMORY_ALLOCATION_ERROR;
689             return 0;
690         }
691         *status = U_ZERO_ERROR;
692         unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
693         result = normedResult;
694         resultLen = normalizedLen;
695     }
696 
697     // Copy the skeleton to the caller's buffer
698     if (U_SUCCESS(*status)) {
699         if (destCapacity == 0 || resultLen > destCapacity) {
700             *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
701         } else {
702             u_memcpy(dest, result, resultLen);
703             if (destCapacity > resultLen) {
704                 dest[resultLen] = 0;
705             } else {
706                 *status = U_STRING_NOT_TERMINATED_WARNING;
707             }
708         }
709      }
710      uprv_free(normedResult);
711      return resultLen;
712 }
713 
714 
715 
716 U_I18N_API UnicodeString &  U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker * sc,uint32_t type,const UnicodeString & s,UnicodeString & dest,UErrorCode * status)717 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
718                                 uint32_t type,
719                                 const UnicodeString &s,
720                                 UnicodeString &dest,
721                                 UErrorCode *status) {
722     if (U_FAILURE(*status)) {
723         return dest;
724     }
725     dest.remove();
726 
727     const UChar *str = s.getBuffer();
728     int32_t      strLen = s.length();
729     UChar        smallBuf[USPOOF_STACK_BUFFER_SIZE];
730     UChar       *buf = smallBuf;
731     int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, USPOOF_STACK_BUFFER_SIZE, status);
732     if (*status == U_BUFFER_OVERFLOW_ERROR) {
733         buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
734         if (buf == NULL) {
735             *status = U_MEMORY_ALLOCATION_ERROR;
736             return dest;
737         }
738         *status = U_ZERO_ERROR;
739         uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
740     }
741     if (U_SUCCESS(*status)) {
742         dest.setTo(buf, outputSize);
743     }
744 
745     if (buf != smallBuf) {
746         uprv_free(buf);
747     }
748     return dest;
749 }
750 
751 
752 U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker * sc,uint32_t type,const char * s,int32_t length,char * dest,int32_t destCapacity,UErrorCode * status)753 uspoof_getSkeletonUTF8(const USpoofChecker *sc,
754                        uint32_t type,
755                        const char *s,  int32_t length,
756                        char *dest, int32_t destCapacity,
757                        UErrorCode *status) {
758     // Lacking a UTF-8 normalization API, just converting the input to
759     // UTF-16 seems as good an approach as any.  In typical use, input will
760     // be an identifier, which is to say not too long for stack buffers.
761     if (U_FAILURE(*status)) {
762         return 0;
763     }
764     // Buffers for the UChar form of the input and skeleton strings.
765     UChar    smallInBuf[USPOOF_STACK_BUFFER_SIZE];
766     UChar   *inBuf = smallInBuf;
767     UChar    smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
768     UChar   *outBuf = smallOutBuf;
769 
770     int32_t  lengthInUChars = 0;
771     int32_t  skelLengthInUChars = 0;
772     int32_t  skelLengthInUTF8 = 0;
773 
774     u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
775                   s, length, status);
776     if (*status == U_BUFFER_OVERFLOW_ERROR) {
777         inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
778         if (inBuf == NULL) {
779             *status = U_MEMORY_ALLOCATION_ERROR;
780             goto cleanup;
781         }
782         *status = U_ZERO_ERROR;
783         u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
784                       s, length, status);
785     }
786 
787     skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
788                                          outBuf, USPOOF_STACK_BUFFER_SIZE, status);
789     if (*status == U_BUFFER_OVERFLOW_ERROR) {
790         outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
791         if (outBuf == NULL) {
792             *status = U_MEMORY_ALLOCATION_ERROR;
793             goto cleanup;
794         }
795         *status = U_ZERO_ERROR;
796         skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
797                                          outBuf, skelLengthInUChars+1, status);
798     }
799 
800     u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
801                 outBuf, skelLengthInUChars, status);
802 
803   cleanup:
804     if (inBuf != smallInBuf) {
805         uprv_free(inBuf);
806     }
807     if (outBuf != smallOutBuf) {
808         uprv_free(outBuf);
809     }
810     return skelLengthInUTF8;
811 }
812 
813 
814 U_CAPI int32_t U_EXPORT2
uspoof_serialize(USpoofChecker * sc,void * buf,int32_t capacity,UErrorCode * status)815 uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
816     SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
817     if (This == NULL) {
818         U_ASSERT(U_FAILURE(*status));
819         return 0;
820     }
821     int32_t dataSize = This->fSpoofData->fRawData->fLength;
822     if (capacity < dataSize) {
823         *status = U_BUFFER_OVERFLOW_ERROR;
824         return dataSize;
825     }
826     uprv_memcpy(buf, This->fSpoofData->fRawData, dataSize);
827     return dataSize;
828 }
829 
830 #endif
831