• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2005-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucasemap.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2005may06
16 *   created by: Markus W. Scherer
17 *
18 *   Case mapping service object and functions using it.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44 #include "ustr_imp.h"
45 
46 U_NAMESPACE_USE
47 
48 /* UCaseMap service object -------------------------------------------------- */
49 
UCaseMap(const char * localeID,uint32_t opts,UErrorCode * pErrorCode)50 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
51 #if !UCONFIG_NO_BREAK_ITERATION
52         iter(NULL),
53 #endif
54         caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55     ucasemap_setLocale(this, localeID, pErrorCode);
56 }
57 
~UCaseMap()58 UCaseMap::~UCaseMap() {
59 #if !UCONFIG_NO_BREAK_ITERATION
60     delete iter;
61 #endif
62 }
63 
64 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
66     if(U_FAILURE(*pErrorCode)) {
67         return NULL;
68     }
69     UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
70     if(csm==NULL) {
71         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72         return NULL;
73     } else if (U_FAILURE(*pErrorCode)) {
74         delete csm;
75         return NULL;
76     }
77     return csm;
78 }
79 
80 U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap * csm)81 ucasemap_close(UCaseMap *csm) {
82     delete csm;
83 }
84 
85 U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap * csm)86 ucasemap_getLocale(const UCaseMap *csm) {
87     return csm->locale;
88 }
89 
90 U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap * csm)91 ucasemap_getOptions(const UCaseMap *csm) {
92     return csm->options;
93 }
94 
95 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
97     if(U_FAILURE(*pErrorCode)) {
98         return;
99     }
100     if (locale != NULL && *locale == 0) {
101         csm->locale[0] = 0;
102         csm->caseLocale = UCASE_LOC_ROOT;
103         return;
104     }
105 
106     int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108         *pErrorCode=U_ZERO_ERROR;
109         /* we only really need the language code for case mappings */
110         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111     }
112     if(length==sizeof(csm->locale)) {
113         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114     }
115     if(U_SUCCESS(*pErrorCode)) {
116         csm->caseLocale=UCASE_LOC_UNKNOWN;
117         csm->caseLocale = ucase_getCaseLocale(csm->locale);
118     } else {
119         csm->locale[0]=0;
120         csm->caseLocale = UCASE_LOC_ROOT;
121     }
122 }
123 
124 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)125 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
126     if(U_FAILURE(*pErrorCode)) {
127         return;
128     }
129     csm->options=options;
130 }
131 
132 /* UTF-8 string case mappings ----------------------------------------------- */
133 
134 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
135 
136 namespace {
137 
138 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
139 inline UBool
appendResult(int32_t cpLength,int32_t result,const UChar * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)140 appendResult(int32_t cpLength, int32_t result, const UChar *s,
141              ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
142     U_ASSERT(U_SUCCESS(errorCode));
143 
144     /* decode the result */
145     if(result<0) {
146         /* (not) original code point */
147         if(edits!=NULL) {
148             edits->addUnchanged(cpLength);
149         }
150         if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
151             ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
152         }
153     } else {
154         if(result<=UCASE_MAX_STRING_LENGTH) {
155             // string: "result" is the UTF-16 length
156             return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
157         } else {
158             ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
159         }
160     }
161     return TRUE;
162 }
163 
164 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)165 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)166 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
167 
168 }  // namespace
169 
170 static UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)171 utf8_caseContextIterator(void *context, int8_t dir) {
172     UCaseContext *csc=(UCaseContext *)context;
173     UChar32 c;
174 
175     if(dir<0) {
176         /* reset for backward iteration */
177         csc->index=csc->cpStart;
178         csc->dir=dir;
179     } else if(dir>0) {
180         /* reset for forward iteration */
181         csc->index=csc->cpLimit;
182         csc->dir=dir;
183     } else {
184         /* continue current iteration direction */
185         dir=csc->dir;
186     }
187 
188     if(dir<0) {
189         if(csc->start<csc->index) {
190             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
191             return c;
192         }
193     } else {
194         if(csc->index<csc->limit) {
195             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
196             return c;
197         }
198     }
199     return U_SENTINEL;
200 }
201 
202 /*
203  * Case-maps [srcStart..srcLimit[ but takes
204  * context [0..srcLength[ into account.
205  */
206 static void
_caseMap(int32_t caseLocale,uint32_t options,UCaseMapFull * map,const uint8_t * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)207 _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
208          const uint8_t *src, UCaseContext *csc,
209          int32_t srcStart, int32_t srcLimit,
210          icu::ByteSink &sink, icu::Edits *edits,
211          UErrorCode &errorCode) {
212     /* case mapping loop */
213     int32_t srcIndex=srcStart;
214     while (U_SUCCESS(errorCode) && srcIndex<srcLimit) {
215         int32_t cpStart;
216         csc->cpStart=cpStart=srcIndex;
217         UChar32 c;
218         U8_NEXT(src, srcIndex, srcLimit, c);
219         csc->cpLimit=srcIndex;
220         if(c<0) {
221             // Malformed UTF-8.
222             ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
223                                           sink, options, edits, errorCode);
224         } else {
225             const UChar *s;
226             c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
227             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
228         }
229     }
230 }
231 
232 #if !UCONFIG_NO_BREAK_ITERATION
233 
234 U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,const uint8_t * src,int32_t srcLength,ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)235 ucasemap_internalUTF8ToTitle(
236         int32_t caseLocale, uint32_t options, BreakIterator *iter,
237         const uint8_t *src, int32_t srcLength,
238         ByteSink &sink, icu::Edits *edits,
239         UErrorCode &errorCode) {
240     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
241         return;
242     }
243 
244     /* set up local variables */
245     UCaseContext csc=UCASECONTEXT_INITIALIZER;
246     csc.p=(void *)src;
247     csc.limit=srcLength;
248     int32_t prev=0;
249     UBool isFirstIndex=TRUE;
250 
251     /* titlecasing loop */
252     while(prev<srcLength) {
253         /* find next index where to titlecase */
254         int32_t index;
255         if(isFirstIndex) {
256             isFirstIndex=FALSE;
257             index=iter->first();
258         } else {
259             index=iter->next();
260         }
261         if(index==UBRK_DONE || index>srcLength) {
262             index=srcLength;
263         }
264 
265         /*
266          * Segment [prev..index[ into 3 parts:
267          * a) skipped characters (copy as-is) [prev..titleStart[
268          * b) first letter (titlecase)              [titleStart..titleLimit[
269          * c) subsequent characters (lowercase)                 [titleLimit..index[
270          */
271         if(prev<index) {
272             /* find and copy skipped characters [prev..titleStart[ */
273             int32_t titleStart=prev;
274             int32_t titleLimit=prev;
275             UChar32 c;
276             U8_NEXT(src, titleLimit, index, c);
277             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
278                 // Adjust the titlecasing index to the next cased character,
279                 // or to the next letter/number/symbol/private use.
280                 // Stop with titleStart<titleLimit<=index
281                 // if there is a character to be titlecased,
282                 // or else stop with titleStart==titleLimit==index.
283                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
284                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
285                     titleStart=titleLimit;
286                     if(titleLimit==index) {
287                         break;
288                     }
289                     U8_NEXT(src, titleLimit, index, c);
290                 }
291                 if (prev < titleStart) {
292                     if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
293                                                        sink, options, edits, errorCode)) {
294                         return;
295                     }
296                 }
297             }
298 
299             if(titleStart<titleLimit) {
300                 /* titlecase c which is from [titleStart..titleLimit[ */
301                 if(c>=0) {
302                     csc.cpStart=titleStart;
303                     csc.cpLimit=titleLimit;
304                     const UChar *s;
305                     c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
306                     if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
307                         return;
308                     }
309                 } else {
310                     // Malformed UTF-8.
311                     if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
312                                                        sink, options, edits, errorCode)) {
313                         return;
314                     }
315                 }
316 
317                 /* Special case Dutch IJ titlecasing */
318                 if (titleStart+1 < index &&
319                         caseLocale == UCASE_LOC_DUTCH &&
320                         (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
321                     if (src[titleStart+1] == 0x006A) {
322                         ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
323                         titleLimit++;
324                     } else if (src[titleStart+1] == 0x004A) {
325                         // Keep the capital J from getting lowercased.
326                         if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
327                                                            sink, options, edits, errorCode)) {
328                             return;
329                         }
330                         titleLimit++;
331                     }
332                 }
333 
334                 /* lowercase [titleLimit..index[ */
335                 if(titleLimit<index) {
336                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
337                         /* Normal operation: Lowercase the rest of the word. */
338                         _caseMap(caseLocale, options, ucase_toFullLower,
339                                  src, &csc,
340                                  titleLimit, index,
341                                  sink, edits, errorCode);
342                         if(U_FAILURE(errorCode)) {
343                             return;
344                         }
345                     } else {
346                         /* Optionally just copy the rest of the word unchanged. */
347                         if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
348                                                            sink, options, edits, errorCode)) {
349                             return;
350                         }
351                     }
352                 }
353             }
354         }
355 
356         prev=index;
357     }
358 }
359 
360 #endif
361 
362 U_NAMESPACE_BEGIN
363 namespace GreekUpper {
364 
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)365 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
366     while (i < length) {
367         UChar32 c;
368         U8_NEXT(s, i, length, c);
369         int32_t type = ucase_getTypeOrIgnorable(c);
370         if ((type & UCASE_IGNORABLE) != 0) {
371             // Case-ignorable, continue with the loop.
372         } else if (type != UCASE_NONE) {
373             return TRUE;  // Followed by cased letter.
374         } else {
375             return FALSE;  // Uncased and not case-ignorable.
376         }
377     }
378     return FALSE;  // Not followed by cased letter.
379 }
380 
381 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
toUpper(uint32_t options,const uint8_t * src,int32_t srcLength,ByteSink & sink,Edits * edits,UErrorCode & errorCode)382 void toUpper(uint32_t options,
383              const uint8_t *src, int32_t srcLength,
384              ByteSink &sink, Edits *edits,
385              UErrorCode &errorCode) {
386     uint32_t state = 0;
387     for (int32_t i = 0; i < srcLength;) {
388         int32_t nextIndex = i;
389         UChar32 c;
390         U8_NEXT(src, nextIndex, srcLength, c);
391         uint32_t nextState = 0;
392         int32_t type = ucase_getTypeOrIgnorable(c);
393         if ((type & UCASE_IGNORABLE) != 0) {
394             // c is case-ignorable
395             nextState |= (state & AFTER_CASED);
396         } else if (type != UCASE_NONE) {
397             // c is cased
398             nextState |= AFTER_CASED;
399         }
400         uint32_t data = getLetterData(c);
401         if (data > 0) {
402             uint32_t upper = data & UPPER_MASK;
403             // Add a dialytika to this iota or ypsilon vowel
404             // if we removed a tonos from the previous vowel,
405             // and that previous vowel did not also have (or gain) a dialytika.
406             // Adding one only to the final vowel in a longer sequence
407             // (which does not occur in normal writing) would require lookahead.
408             // Set the same flag as for preserving an existing dialytika.
409             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
410                     (upper == 0x399 || upper == 0x3A5)) {
411                 data |= HAS_DIALYTIKA;
412             }
413             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
414             if ((data & HAS_YPOGEGRAMMENI) != 0) {
415                 numYpogegrammeni = 1;
416             }
417             // Skip combining diacritics after this Greek letter.
418             int32_t nextNextIndex = nextIndex;
419             while (nextIndex < srcLength) {
420                 UChar32 c2;
421                 U8_NEXT(src, nextNextIndex, srcLength, c2);
422                 uint32_t diacriticData = getDiacriticData(c2);
423                 if (diacriticData != 0) {
424                     data |= diacriticData;
425                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
426                         ++numYpogegrammeni;
427                     }
428                     nextIndex = nextNextIndex;
429                 } else {
430                     break;  // not a Greek diacritic
431                 }
432             }
433             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
434                 nextState |= AFTER_VOWEL_WITH_ACCENT;
435             }
436             // Map according to Greek rules.
437             UBool addTonos = FALSE;
438             if (upper == 0x397 &&
439                     (data & HAS_ACCENT) != 0 &&
440                     numYpogegrammeni == 0 &&
441                     (state & AFTER_CASED) == 0 &&
442                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
443                 // Keep disjunctive "or" with (only) a tonos.
444                 // We use the same "word boundary" conditions as for the Final_Sigma test.
445                 if (i == nextIndex) {
446                     upper = 0x389;  // Preserve the precomposed form.
447                 } else {
448                     addTonos = TRUE;
449                 }
450             } else if ((data & HAS_DIALYTIKA) != 0) {
451                 // Preserve a vowel with dialytika in precomposed form if it exists.
452                 if (upper == 0x399) {
453                     upper = 0x3AA;
454                     data &= ~HAS_EITHER_DIALYTIKA;
455                 } else if (upper == 0x3A5) {
456                     upper = 0x3AB;
457                     data &= ~HAS_EITHER_DIALYTIKA;
458                 }
459             }
460 
461             UBool change;
462             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
463                 change = TRUE;  // common, simple usage
464             } else {
465                 // Find out first whether we are changing the text.
466                 U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
467                 change = (i + 2) > nextIndex ||
468                         src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
469                         numYpogegrammeni > 0;
470                 int32_t i2 = i + 2;
471                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
472                     change |= (i2 + 2) > nextIndex ||
473                             src[i2] != (uint8_t)u8"\u0308"[0] ||
474                             src[i2 + 1] != (uint8_t)u8"\u0308"[1];
475                     i2 += 2;
476                 }
477                 if (addTonos) {
478                     change |= (i2 + 2) > nextIndex ||
479                             src[i2] != (uint8_t)u8"\u0301"[0] ||
480                             src[i2 + 1] != (uint8_t)u8"\u0301"[1];
481                     i2 += 2;
482                 }
483                 int32_t oldLength = nextIndex - i;
484                 int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
485                 change |= oldLength != newLength;
486                 if (change) {
487                     if (edits != NULL) {
488                         edits->addReplace(oldLength, newLength);
489                     }
490                 } else {
491                     if (edits != NULL) {
492                         edits->addUnchanged(oldLength);
493                     }
494                     // Write unchanged text?
495                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
496                 }
497             }
498 
499             if (change) {
500                 ByteSinkUtil::appendTwoBytes(upper, sink);
501                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
502                     sink.Append(u8"\u0308", 2);  // restore or add a dialytika
503                 }
504                 if (addTonos) {
505                     sink.Append(u8"\u0301", 2);
506                 }
507                 while (numYpogegrammeni > 0) {
508                     sink.Append(u8"\u0399", 2);
509                     --numYpogegrammeni;
510                 }
511             }
512         } else if(c>=0) {
513             const UChar *s;
514             c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
515             if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
516                 return;
517             }
518         } else {
519             // Malformed UTF-8.
520             if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
521                                                sink, options, edits, errorCode)) {
522                 return;
523             }
524         }
525         i = nextIndex;
526         state = nextState;
527     }
528 }
529 
530 }  // namespace GreekUpper
531 U_NAMESPACE_END
532 
533 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)534 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
535                              const uint8_t *src, int32_t srcLength,
536                              icu::ByteSink &sink, icu::Edits *edits,
537                              UErrorCode &errorCode) {
538     UCaseContext csc=UCASECONTEXT_INITIALIZER;
539     csc.p=(void *)src;
540     csc.limit=srcLength;
541     _caseMap(
542         caseLocale, options, ucase_toFullLower,
543         src, &csc, 0, srcLength,
544         sink, edits, errorCode);
545 }
546 
547 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)548 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
549                              const uint8_t *src, int32_t srcLength,
550                              icu::ByteSink &sink, icu::Edits *edits,
551                              UErrorCode &errorCode) {
552     if (caseLocale == UCASE_LOC_GREEK) {
553         GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
554     } else {
555         UCaseContext csc=UCASECONTEXT_INITIALIZER;
556         csc.p=(void *)src;
557         csc.limit=srcLength;
558         _caseMap(
559             caseLocale, options, ucase_toFullUpper,
560             src, &csc, 0, srcLength,
561             sink, edits, errorCode);
562     }
563 }
564 
565 static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)566 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
567                           const uint8_t *src, int32_t srcLength,
568                           icu::ByteSink &sink, icu::Edits *edits,
569                           UErrorCode &errorCode) {
570     /* case mapping loop */
571     int32_t srcIndex = 0;
572     while (U_SUCCESS(errorCode) && srcIndex < srcLength) {
573         int32_t cpStart = srcIndex;
574         UChar32 c;
575         U8_NEXT(src, srcIndex, srcLength, c);
576         if(c<0) {
577             // Malformed UTF-8.
578             ByteSinkUtil::appendUnchanged(src+cpStart, srcIndex-cpStart,
579                                           sink, options, edits, errorCode);
580         } else {
581             const UChar *s;
582             c = ucase_toFullFolding(c, &s, options);
583             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
584         }
585     }
586 }
587 
588 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)589 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
590                  const char *src, int32_t srcLength,
591                  UTF8CaseMapper *stringCaseMapper,
592                  icu::ByteSink &sink, icu::Edits *edits,
593                  UErrorCode &errorCode) {
594     /* check argument values */
595     if (U_FAILURE(errorCode)) {
596         return;
597     }
598     if ((src == nullptr && srcLength != 0) || srcLength < -1) {
599         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
600         return;
601     }
602 
603     // Get the string length.
604     if (srcLength == -1) {
605         srcLength = (int32_t)uprv_strlen((const char *)src);
606     }
607 
608     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
609         edits->reset();
610     }
611     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
612                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
613     sink.Flush();
614     if (U_SUCCESS(errorCode)) {
615         if (edits != nullptr) {
616             edits->copyErrorTo(errorCode);
617         }
618     }
619 }
620 
621 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)622 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
623                  char *dest, int32_t destCapacity,
624                  const char *src, int32_t srcLength,
625                  UTF8CaseMapper *stringCaseMapper,
626                  icu::Edits *edits,
627                  UErrorCode &errorCode) {
628     /* check argument values */
629     if(U_FAILURE(errorCode)) {
630         return 0;
631     }
632     if( destCapacity<0 ||
633         (dest==NULL && destCapacity>0) ||
634         (src==NULL && srcLength!=0) || srcLength<-1
635     ) {
636         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
637         return 0;
638     }
639 
640     /* get the string length */
641     if(srcLength==-1) {
642         srcLength=(int32_t)uprv_strlen((const char *)src);
643     }
644 
645     /* check for overlapping source and destination */
646     if( dest!=NULL &&
647         ((src>=dest && src<(dest+destCapacity)) ||
648          (dest>=src && dest<(src+srcLength)))
649     ) {
650         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
651         return 0;
652     }
653 
654     CheckedArrayByteSink sink(dest, destCapacity);
655     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
656         edits->reset();
657     }
658     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
659                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
660     sink.Flush();
661     if (U_SUCCESS(errorCode)) {
662         if (sink.Overflowed()) {
663             errorCode = U_BUFFER_OVERFLOW_ERROR;
664         } else if (edits != nullptr) {
665             edits->copyErrorTo(errorCode);
666         }
667     }
668     return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
669 }
670 
671 /* public API functions */
672 
673 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)674 ucasemap_utf8ToLower(const UCaseMap *csm,
675                      char *dest, int32_t destCapacity,
676                      const char *src, int32_t srcLength,
677                      UErrorCode *pErrorCode) {
678     return ucasemap_mapUTF8(
679         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
680         dest, destCapacity,
681         src, srcLength,
682         ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
683 }
684 
685 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)686 ucasemap_utf8ToUpper(const UCaseMap *csm,
687                      char *dest, int32_t destCapacity,
688                      const char *src, int32_t srcLength,
689                      UErrorCode *pErrorCode) {
690     return ucasemap_mapUTF8(
691         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
692         dest, destCapacity,
693         src, srcLength,
694         ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
695 }
696 
697 U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)698 ucasemap_utf8FoldCase(const UCaseMap *csm,
699                       char *dest, int32_t destCapacity,
700                       const char *src, int32_t srcLength,
701                       UErrorCode *pErrorCode) {
702     return ucasemap_mapUTF8(
703         UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
704         dest, destCapacity,
705         src, srcLength,
706         ucasemap_internalUTF8Fold, NULL, *pErrorCode);
707 }
708 
709 U_NAMESPACE_BEGIN
710 
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)711 void CaseMap::utf8ToLower(
712         const char *locale, uint32_t options,
713         StringPiece src, ByteSink &sink, Edits *edits,
714         UErrorCode &errorCode) {
715     ucasemap_mapUTF8(
716         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
717         src.data(), src.length(),
718         ucasemap_internalUTF8ToLower, sink, edits, errorCode);
719 }
720 
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)721 void CaseMap::utf8ToUpper(
722         const char *locale, uint32_t options,
723         StringPiece src, ByteSink &sink, Edits *edits,
724         UErrorCode &errorCode) {
725     ucasemap_mapUTF8(
726         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
727         src.data(), src.length(),
728         ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
729 }
730 
utf8Fold(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)731 void CaseMap::utf8Fold(
732         uint32_t options,
733         StringPiece src, ByteSink &sink, Edits *edits,
734         UErrorCode &errorCode) {
735     ucasemap_mapUTF8(
736         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
737         src.data(), src.length(),
738         ucasemap_internalUTF8Fold, sink, edits, errorCode);
739 }
740 
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)741 int32_t CaseMap::utf8ToLower(
742         const char *locale, uint32_t options,
743         const char *src, int32_t srcLength,
744         char *dest, int32_t destCapacity, Edits *edits,
745         UErrorCode &errorCode) {
746     return ucasemap_mapUTF8(
747         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
748         dest, destCapacity,
749         src, srcLength,
750         ucasemap_internalUTF8ToLower, edits, errorCode);
751 }
752 
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)753 int32_t CaseMap::utf8ToUpper(
754         const char *locale, uint32_t options,
755         const char *src, int32_t srcLength,
756         char *dest, int32_t destCapacity, Edits *edits,
757         UErrorCode &errorCode) {
758     return ucasemap_mapUTF8(
759         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
760         dest, destCapacity,
761         src, srcLength,
762         ucasemap_internalUTF8ToUpper, edits, errorCode);
763 }
764 
utf8Fold(uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)765 int32_t CaseMap::utf8Fold(
766         uint32_t options,
767         const char *src, int32_t srcLength,
768         char *dest, int32_t destCapacity, Edits *edits,
769         UErrorCode &errorCode) {
770     return ucasemap_mapUTF8(
771         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
772         dest, destCapacity,
773         src, srcLength,
774         ucasemap_internalUTF8Fold, edits, errorCode);
775 }
776 
777 U_NAMESPACE_END
778