• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2005-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucasemap.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2005may06
16 *   created by: Markus W. Scherer
17 *
18 *   Case mapping service object and functions using it.
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44 #include "ustr_imp.h"
45 
46 U_NAMESPACE_USE
47 
48 /* UCaseMap service object -------------------------------------------------- */
49 
UCaseMap(const char * localeID,uint32_t opts,UErrorCode * pErrorCode)50 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
51 #if !UCONFIG_NO_BREAK_ITERATION
52         iter(NULL),
53 #endif
54         caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
55     ucasemap_setLocale(this, localeID, pErrorCode);
56 }
57 
~UCaseMap()58 UCaseMap::~UCaseMap() {
59 #if !UCONFIG_NO_BREAK_ITERATION
60     delete iter;
61 #endif
62 }
63 
64 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)65 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
66     if(U_FAILURE(*pErrorCode)) {
67         return NULL;
68     }
69     UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
70     if(csm==NULL) {
71         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
72         return NULL;
73     } else if (U_FAILURE(*pErrorCode)) {
74         delete csm;
75         return NULL;
76     }
77     return csm;
78 }
79 
80 U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap * csm)81 ucasemap_close(UCaseMap *csm) {
82     delete csm;
83 }
84 
85 U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap * csm)86 ucasemap_getLocale(const UCaseMap *csm) {
87     return csm->locale;
88 }
89 
90 U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap * csm)91 ucasemap_getOptions(const UCaseMap *csm) {
92     return csm->options;
93 }
94 
95 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)96 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
97     if(U_FAILURE(*pErrorCode)) {
98         return;
99     }
100     if (locale != NULL && *locale == 0) {
101         csm->locale[0] = 0;
102         csm->caseLocale = UCASE_LOC_ROOT;
103         return;
104     }
105 
106     int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
107     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
108         *pErrorCode=U_ZERO_ERROR;
109         /* we only really need the language code for case mappings */
110         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
111     }
112     if(length==sizeof(csm->locale)) {
113         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
114     }
115     if(U_SUCCESS(*pErrorCode)) {
116         csm->caseLocale = ucase_getCaseLocale(csm->locale);
117     } else {
118         csm->locale[0]=0;
119         csm->caseLocale = UCASE_LOC_ROOT;
120     }
121 }
122 
123 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)124 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
125     if(U_FAILURE(*pErrorCode)) {
126         return;
127     }
128     csm->options=options;
129 }
130 
131 /* UTF-8 string case mappings ----------------------------------------------- */
132 
133 /* TODO(markus): Move to a new, separate utf8case.cpp file. */
134 
135 namespace {
136 
137 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
138 inline UBool
appendResult(int32_t cpLength,int32_t result,const UChar * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)139 appendResult(int32_t cpLength, int32_t result, const UChar *s,
140              ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
141     U_ASSERT(U_SUCCESS(errorCode));
142 
143     /* decode the result */
144     if(result<0) {
145         /* (not) original code point */
146         if(edits!=NULL) {
147             edits->addUnchanged(cpLength);
148         }
149         if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
150             ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
151         }
152     } else {
153         if(result<=UCASE_MAX_STRING_LENGTH) {
154             // string: "result" is the UTF-16 length
155             return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
156         } else {
157             ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
158         }
159     }
160     return true;
161 }
162 
163 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)164 inline uint8_t getTwoByteLead(UChar32 c) { return (uint8_t)((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)165 inline uint8_t getTwoByteTrail(UChar32 c) { return (uint8_t)((c & 0x3f) | 0x80); }
166 
167 UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)168 utf8_caseContextIterator(void *context, int8_t dir) {
169     UCaseContext *csc=(UCaseContext *)context;
170     UChar32 c;
171 
172     if(dir<0) {
173         /* reset for backward iteration */
174         csc->index=csc->cpStart;
175         csc->dir=dir;
176     } else if(dir>0) {
177         /* reset for forward iteration */
178         csc->index=csc->cpLimit;
179         csc->dir=dir;
180     } else {
181         /* continue current iteration direction */
182         dir=csc->dir;
183     }
184 
185     if(dir<0) {
186         if(csc->start<csc->index) {
187             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
188             return c;
189         }
190     } else {
191         if(csc->index<csc->limit) {
192             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
193             return c;
194         }
195     }
196     return U_SENTINEL;
197 }
198 
199 /**
200  * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
201  * caseLocale < 0: Case-folds [srcStart..srcLimit[.
202  */
toLower(int32_t caseLocale,uint32_t options,const uint8_t * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)203 void toLower(int32_t caseLocale, uint32_t options,
204              const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
205              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
206     const int8_t *latinToLower;
207     if (caseLocale == UCASE_LOC_ROOT ||
208             (caseLocale >= 0 ?
209                 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
210                 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
211         latinToLower = LatinCase::TO_LOWER_NORMAL;
212     } else {
213         latinToLower = LatinCase::TO_LOWER_TR_LT;
214     }
215     const UTrie2 *trie = ucase_getTrie();
216     int32_t prev = srcStart;
217     int32_t srcIndex = srcStart;
218     for (;;) {
219         // fast path for simple cases
220         int32_t cpStart;
221         UChar32 c;
222         for (;;) {
223             if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
224                 c = U_SENTINEL;
225                 break;
226             }
227             uint8_t lead = src[srcIndex++];
228             if (lead <= 0x7f) {
229                 int8_t d = latinToLower[lead];
230                 if (d == LatinCase::EXC) {
231                     cpStart = srcIndex - 1;
232                     c = lead;
233                     break;
234                 }
235                 if (d == 0) { continue; }
236                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
237                                               sink, options, edits, errorCode);
238                 char ascii = (char)(lead + d);
239                 sink.Append(&ascii, 1);
240                 if (edits != nullptr) {
241                     edits->addReplace(1, 1);
242                 }
243                 prev = srcIndex;
244                 continue;
245             } else if (lead < 0xe3) {
246                 uint8_t t;
247                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
248                         (t = src[srcIndex] - 0x80) <= 0x3f) {
249                     // U+0080..U+017F
250                     ++srcIndex;
251                     c = ((lead - 0xc0) << 6) | t;
252                     int8_t d = latinToLower[c];
253                     if (d == LatinCase::EXC) {
254                         cpStart = srcIndex - 2;
255                         break;
256                     }
257                     if (d == 0) { continue; }
258                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
259                                                   sink, options, edits, errorCode);
260                     ByteSinkUtil::appendTwoBytes(c + d, sink);
261                     if (edits != nullptr) {
262                         edits->addReplace(2, 2);
263                     }
264                     prev = srcIndex;
265                     continue;
266                 }
267             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
268                     (srcIndex + 2) <= srcLimit &&
269                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
270                 // most of CJK: no case mappings
271                 srcIndex += 2;
272                 continue;
273             }
274             cpStart = --srcIndex;
275             U8_NEXT(src, srcIndex, srcLimit, c);
276             if (c < 0) {
277                 // ill-formed UTF-8
278                 continue;
279             }
280             uint16_t props = UTRIE2_GET16(trie, c);
281             if (UCASE_HAS_EXCEPTION(props)) { break; }
282             int32_t delta;
283             if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
284                 continue;
285             }
286             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
287                                           sink, options, edits, errorCode);
288             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
289             prev = srcIndex;
290         }
291         if (c < 0) {
292             break;
293         }
294         // slow path
295         const UChar *s;
296         if (caseLocale >= 0) {
297             csc->cpStart = cpStart;
298             csc->cpLimit = srcIndex;
299             c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
300         } else {
301             c = ucase_toFullFolding(c, &s, options);
302         }
303         if (c >= 0) {
304             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
305                                           sink, options, edits, errorCode);
306             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
307             prev = srcIndex;
308         }
309     }
310     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
311                                   sink, options, edits, errorCode);
312 }
313 
toUpper(int32_t caseLocale,uint32_t options,const uint8_t * src,UCaseContext * csc,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)314 void toUpper(int32_t caseLocale, uint32_t options,
315              const uint8_t *src, UCaseContext *csc, int32_t srcLength,
316              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
317     const int8_t *latinToUpper;
318     if (caseLocale == UCASE_LOC_TURKISH) {
319         latinToUpper = LatinCase::TO_UPPER_TR;
320     } else {
321         latinToUpper = LatinCase::TO_UPPER_NORMAL;
322     }
323     const UTrie2 *trie = ucase_getTrie();
324     int32_t prev = 0;
325     int32_t srcIndex = 0;
326     for (;;) {
327         // fast path for simple cases
328         int32_t cpStart;
329         UChar32 c;
330         for (;;) {
331             if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
332                 c = U_SENTINEL;
333                 break;
334             }
335             uint8_t lead = src[srcIndex++];
336             if (lead <= 0x7f) {
337                 int8_t d = latinToUpper[lead];
338                 if (d == LatinCase::EXC) {
339                     cpStart = srcIndex - 1;
340                     c = lead;
341                     break;
342                 }
343                 if (d == 0) { continue; }
344                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
345                                               sink, options, edits, errorCode);
346                 char ascii = (char)(lead + d);
347                 sink.Append(&ascii, 1);
348                 if (edits != nullptr) {
349                     edits->addReplace(1, 1);
350                 }
351                 prev = srcIndex;
352                 continue;
353             } else if (lead < 0xe3) {
354                 uint8_t t;
355                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
356                         (t = src[srcIndex] - 0x80) <= 0x3f) {
357                     // U+0080..U+017F
358                     ++srcIndex;
359                     c = ((lead - 0xc0) << 6) | t;
360                     int8_t d = latinToUpper[c];
361                     if (d == LatinCase::EXC) {
362                         cpStart = srcIndex - 2;
363                         break;
364                     }
365                     if (d == 0) { continue; }
366                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
367                                                   sink, options, edits, errorCode);
368                     ByteSinkUtil::appendTwoBytes(c + d, sink);
369                     if (edits != nullptr) {
370                         edits->addReplace(2, 2);
371                     }
372                     prev = srcIndex;
373                     continue;
374                 }
375             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
376                     (srcIndex + 2) <= srcLength &&
377                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
378                 // most of CJK: no case mappings
379                 srcIndex += 2;
380                 continue;
381             }
382             cpStart = --srcIndex;
383             U8_NEXT(src, srcIndex, srcLength, c);
384             if (c < 0) {
385                 // ill-formed UTF-8
386                 continue;
387             }
388             uint16_t props = UTRIE2_GET16(trie, c);
389             if (UCASE_HAS_EXCEPTION(props)) { break; }
390             int32_t delta;
391             if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
392                 continue;
393             }
394             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
395                                           sink, options, edits, errorCode);
396             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
397             prev = srcIndex;
398         }
399         if (c < 0) {
400             break;
401         }
402         // slow path
403         csc->cpStart = cpStart;
404         csc->cpLimit = srcIndex;
405         const UChar *s;
406         c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
407         if (c >= 0) {
408             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
409                                           sink, options, edits, errorCode);
410             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
411             prev = srcIndex;
412         }
413     }
414     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
415                                   sink, options, edits, errorCode);
416 }
417 
418 }  // namespace
419 
420 #if !UCONFIG_NO_BREAK_ITERATION
421 
422 namespace {
423 
// The two UTF-8 bytes of U+0301 COMBINING ACUTE ACCENT.
constexpr uint8_t ACUTE_BYTE0 = static_cast<uint8_t>(u8"\u0301"[0]);
constexpr uint8_t ACUTE_BYTE1 = static_cast<uint8_t>(u8"\u0301"[1]);
427 
428 /**
429  * Input: c is a letter I with or without acute accent.
430  * start is the index in src after c, and is less than segmentLimit.
431  * If a plain i/I is followed by a plain j/J,
432  * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
433  * then we output accordingly.
434  *
435  * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
436  */
maybeTitleDutchIJ(const uint8_t * src,UChar32 c,int32_t start,int32_t segmentLimit,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)437 int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
438                           ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
439     U_ASSERT(start < segmentLimit);
440 
441     int32_t index = start;
442     bool withAcute = false;
443 
444     // If the conditions are met, then the following variables tell us what to output.
445     int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
446     bool doTitleJ = false;  // true if the j needs to be titlecased
447     int32_t unchanged2 = 0;  // after the j (0 or 1)
448 
449     // next character after the first letter
450     UChar32 c2;
451     c2 = src[index++];
452 
453     // Is the first letter an i/I with accent?
454     if (c == u'I') {
455         if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
456             withAcute = true;
457             unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
458             if (index == segmentLimit) { return start; }
459             c2 = src[index++];
460         }
461     } else {  // Í
462         withAcute = true;
463     }
464 
465     // Is the next character a j/J?
466     if (c2 == u'j') {
467         doTitleJ = true;
468     } else if (c2 == u'J') {
469         ++unchanged1;
470     } else {
471         return start;
472     }
473 
474     // A plain i/I must be followed by a plain j/J.
475     // An i/I with acute must be followed by a j/J with acute.
476     if (withAcute) {
477         if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
478             return start;
479         }
480         if (doTitleJ) {
481             unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
482         } else {
483             unchanged1 = unchanged1 + 2;    // ACUTE is 2 code units in UTF-8
484         }
485     }
486 
487     // There must not be another combining mark.
488     if (index < segmentLimit) {
489         int32_t cp;
490         int32_t i = index;
491         U8_NEXT(src, i, segmentLimit, cp);
492         uint32_t typeMask = U_GET_GC_MASK(cp);
493         if ((typeMask & U_GC_M_MASK) != 0) {
494             return start;
495         }
496     }
497 
498     // Output the rest of the Dutch IJ.
499     ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
500     start += unchanged1;
501     if (doTitleJ) {
502         ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
503         ++start;
504     }
505     ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
506 
507     U_ASSERT(start + unchanged2 == index);
508     return index;
509 }
510 
511 }  // namespace
512 
513 U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,const uint8_t * src,int32_t srcLength,ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)514 ucasemap_internalUTF8ToTitle(
515         int32_t caseLocale, uint32_t options, BreakIterator *iter,
516         const uint8_t *src, int32_t srcLength,
517         ByteSink &sink, icu::Edits *edits,
518         UErrorCode &errorCode) {
519     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
520         return;
521     }
522 
523     /* set up local variables */
524     UCaseContext csc=UCASECONTEXT_INITIALIZER;
525     csc.p=(void *)src;
526     csc.limit=srcLength;
527     int32_t prev=0;
528     UBool isFirstIndex=true;
529 
530     /* titlecasing loop */
531     while(prev<srcLength) {
532         /* find next index where to titlecase */
533         int32_t index;
534         if(isFirstIndex) {
535             isFirstIndex=false;
536             index=iter->first();
537         } else {
538             index=iter->next();
539         }
540         if(index==UBRK_DONE || index>srcLength) {
541             index=srcLength;
542         }
543 
544         /*
545          * Segment [prev..index[ into 3 parts:
546          * a) skipped characters (copy as-is) [prev..titleStart[
547          * b) first letter (titlecase)              [titleStart..titleLimit[
548          * c) subsequent characters (lowercase)                 [titleLimit..index[
549          */
550         if(prev<index) {
551             /* find and copy skipped characters [prev..titleStart[ */
552             int32_t titleStart=prev;
553             int32_t titleLimit=prev;
554             UChar32 c;
555             U8_NEXT(src, titleLimit, index, c);
556             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
557                 // Adjust the titlecasing index to the next cased character,
558                 // or to the next letter/number/symbol/private use.
559                 // Stop with titleStart<titleLimit<=index
560                 // if there is a character to be titlecased,
561                 // or else stop with titleStart==titleLimit==index.
562                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
563                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
564                     titleStart=titleLimit;
565                     if(titleLimit==index) {
566                         break;
567                     }
568                     U8_NEXT(src, titleLimit, index, c);
569                 }
570                 if (prev < titleStart) {
571                     if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
572                                                        sink, options, edits, errorCode)) {
573                         return;
574                     }
575                 }
576             }
577 
578             if(titleStart<titleLimit) {
579                 /* titlecase c which is from [titleStart..titleLimit[ */
580                 if(c>=0) {
581                     csc.cpStart=titleStart;
582                     csc.cpLimit=titleLimit;
583                     const UChar *s;
584                     c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
585                     if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
586                         return;
587                     }
588                 } else {
589                     // Malformed UTF-8.
590                     if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
591                                                        sink, options, edits, errorCode)) {
592                         return;
593                     }
594                 }
595 
596                 /* Special case Dutch IJ titlecasing */
597                 if (titleLimit < index &&
598                     caseLocale == UCASE_LOC_DUTCH) {
599                     if (c < 0) {
600                         c = ~c;
601                     }
602 
603                     if (c == u'I' || c == u'Í') {
604                         titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
605                     }
606                 }
607 
608                 /* lowercase [titleLimit..index[ */
609                 if(titleLimit<index) {
610                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
611                         /* Normal operation: Lowercase the rest of the word. */
612                         toLower(caseLocale, options,
613                                 src, &csc, titleLimit, index,
614                                 sink, edits, errorCode);
615                         if(U_FAILURE(errorCode)) {
616                             return;
617                         }
618                     } else {
619                         /* Optionally just copy the rest of the word unchanged. */
620                         if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
621                                                            sink, options, edits, errorCode)) {
622                             return;
623                         }
624                     }
625                 }
626             }
627         }
628 
629         prev=index;
630     }
631 }
632 
633 #endif
634 
635 U_NAMESPACE_BEGIN
636 namespace GreekUpper {
637 
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)638 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
639     while (i < length) {
640         UChar32 c;
641         U8_NEXT(s, i, length, c);
642         int32_t type = ucase_getTypeOrIgnorable(c);
643         if ((type & UCASE_IGNORABLE) != 0) {
644             // Case-ignorable, continue with the loop.
645         } else if (type != UCASE_NONE) {
646             return true;  // Followed by cased letter.
647         } else {
648             return false;  // Uncased and not case-ignorable.
649         }
650     }
651     return false;  // Not followed by cased letter.
652 }
653 
654 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
toUpper(uint32_t options,const uint8_t * src,int32_t srcLength,ByteSink & sink,Edits * edits,UErrorCode & errorCode)655 void toUpper(uint32_t options,
656              const uint8_t *src, int32_t srcLength,
657              ByteSink &sink, Edits *edits,
658              UErrorCode &errorCode) {
659     uint32_t state = 0;
660     for (int32_t i = 0; i < srcLength;) {
661         int32_t nextIndex = i;
662         UChar32 c;
663         U8_NEXT(src, nextIndex, srcLength, c);
664         uint32_t nextState = 0;
665         int32_t type = ucase_getTypeOrIgnorable(c);
666         if ((type & UCASE_IGNORABLE) != 0) {
667             // c is case-ignorable
668             nextState |= (state & AFTER_CASED);
669         } else if (type != UCASE_NONE) {
670             // c is cased
671             nextState |= AFTER_CASED;
672         }
673         uint32_t data = getLetterData(c);
674         if (data > 0) {
675             uint32_t upper = data & UPPER_MASK;
676             // Add a dialytika to this iota or ypsilon vowel
677             // if we removed a tonos from the previous vowel,
678             // and that previous vowel did not also have (or gain) a dialytika.
679             // Adding one only to the final vowel in a longer sequence
680             // (which does not occur in normal writing) would require lookahead.
681             // Set the same flag as for preserving an existing dialytika.
682             if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
683                     (upper == 0x399 || upper == 0x3A5)) {
684                 data |= HAS_DIALYTIKA;
685             }
686             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
687             if ((data & HAS_YPOGEGRAMMENI) != 0) {
688                 numYpogegrammeni = 1;
689             }
690             // Skip combining diacritics after this Greek letter.
691             int32_t nextNextIndex = nextIndex;
692             while (nextIndex < srcLength) {
693                 UChar32 c2;
694                 U8_NEXT(src, nextNextIndex, srcLength, c2);
695                 uint32_t diacriticData = getDiacriticData(c2);
696                 if (diacriticData != 0) {
697                     data |= diacriticData;
698                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
699                         ++numYpogegrammeni;
700                     }
701                     nextIndex = nextNextIndex;
702                 } else {
703                     break;  // not a Greek diacritic
704                 }
705             }
706             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
707                 nextState |= AFTER_VOWEL_WITH_ACCENT;
708             }
709             // Map according to Greek rules.
710             UBool addTonos = false;
711             if (upper == 0x397 &&
712                     (data & HAS_ACCENT) != 0 &&
713                     numYpogegrammeni == 0 &&
714                     (state & AFTER_CASED) == 0 &&
715                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
716                 // Keep disjunctive "or" with (only) a tonos.
717                 // We use the same "word boundary" conditions as for the Final_Sigma test.
718                 if (i == nextIndex) {
719                     upper = 0x389;  // Preserve the precomposed form.
720                 } else {
721                     addTonos = true;
722                 }
723             } else if ((data & HAS_DIALYTIKA) != 0) {
724                 // Preserve a vowel with dialytika in precomposed form if it exists.
725                 if (upper == 0x399) {
726                     upper = 0x3AA;
727                     data &= ~HAS_EITHER_DIALYTIKA;
728                 } else if (upper == 0x3A5) {
729                     upper = 0x3AB;
730                     data &= ~HAS_EITHER_DIALYTIKA;
731                 }
732             }
733 
734             UBool change;
735             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
736                 change = true;  // common, simple usage
737             } else {
738                 // Find out first whether we are changing the text.
739                 U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
740                 change = (i + 2) > nextIndex ||
741                         src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
742                         numYpogegrammeni > 0;
743                 int32_t i2 = i + 2;
744                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
745                     change |= (i2 + 2) > nextIndex ||
746                             src[i2] != (uint8_t)u8"\u0308"[0] ||
747                             src[i2 + 1] != (uint8_t)u8"\u0308"[1];
748                     i2 += 2;
749                 }
750                 if (addTonos) {
751                     change |= (i2 + 2) > nextIndex ||
752                             src[i2] != (uint8_t)u8"\u0301"[0] ||
753                             src[i2 + 1] != (uint8_t)u8"\u0301"[1];
754                     i2 += 2;
755                 }
756                 int32_t oldLength = nextIndex - i;
757                 int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
758                 change |= oldLength != newLength;
759                 if (change) {
760                     if (edits != NULL) {
761                         edits->addReplace(oldLength, newLength);
762                     }
763                 } else {
764                     if (edits != NULL) {
765                         edits->addUnchanged(oldLength);
766                     }
767                     // Write unchanged text?
768                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
769                 }
770             }
771 
772             if (change) {
773                 ByteSinkUtil::appendTwoBytes(upper, sink);
774                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
775                     sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
776                 }
777                 if (addTonos) {
778                     sink.AppendU8(u8"\u0301", 2);
779                 }
780                 while (numYpogegrammeni > 0) {
781                     sink.AppendU8(u8"\u0399", 2);
782                     --numYpogegrammeni;
783                 }
784             }
785         } else if(c>=0) {
786             const UChar *s;
787             c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
788             if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
789                 return;
790             }
791         } else {
792             // Malformed UTF-8.
793             if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
794                                                sink, options, edits, errorCode)) {
795                 return;
796             }
797         }
798         i = nextIndex;
799         state = nextState;
800     }
801 }
802 
803 }  // namespace GreekUpper
804 U_NAMESPACE_END
805 
806 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)807 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
808                              const uint8_t *src, int32_t srcLength,
809                              icu::ByteSink &sink, icu::Edits *edits,
810                              UErrorCode &errorCode) {
811     UCaseContext csc=UCASECONTEXT_INITIALIZER;
812     csc.p=(void *)src;
813     csc.limit=srcLength;
814     toLower(
815         caseLocale, options,
816         src, &csc, 0, srcLength,
817         sink, edits, errorCode);
818 }
819 
820 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)821 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
822                              const uint8_t *src, int32_t srcLength,
823                              icu::ByteSink &sink, icu::Edits *edits,
824                              UErrorCode &errorCode) {
825     if (caseLocale == UCASE_LOC_GREEK) {
826         GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
827     } else {
828         UCaseContext csc=UCASECONTEXT_INITIALIZER;
829         csc.p=(void *)src;
830         csc.limit=srcLength;
831         toUpper(
832             caseLocale, options,
833             src, &csc, srcLength,
834             sink, edits, errorCode);
835     }
836 }
837 
838 static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)839 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
840                           const uint8_t *src, int32_t srcLength,
841                           icu::ByteSink &sink, icu::Edits *edits,
842                           UErrorCode &errorCode) {
843     toLower(
844         -1, options,
845         src, nullptr, 0, srcLength,
846         sink, edits, errorCode);
847 }
848 
849 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)850 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
851                  const char *src, int32_t srcLength,
852                  UTF8CaseMapper *stringCaseMapper,
853                  icu::ByteSink &sink, icu::Edits *edits,
854                  UErrorCode &errorCode) {
855     /* check argument values */
856     if (U_FAILURE(errorCode)) {
857         return;
858     }
859     if ((src == nullptr && srcLength != 0) || srcLength < -1) {
860         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
861         return;
862     }
863 
864     // Get the string length.
865     if (srcLength == -1) {
866         srcLength = (int32_t)uprv_strlen((const char *)src);
867     }
868 
869     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
870         edits->reset();
871     }
872     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
873                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
874     sink.Flush();
875     if (U_SUCCESS(errorCode)) {
876         if (edits != nullptr) {
877             edits->copyErrorTo(errorCode);
878         }
879     }
880 }
881 
882 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)883 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
884                  char *dest, int32_t destCapacity,
885                  const char *src, int32_t srcLength,
886                  UTF8CaseMapper *stringCaseMapper,
887                  icu::Edits *edits,
888                  UErrorCode &errorCode) {
889     /* check argument values */
890     if(U_FAILURE(errorCode)) {
891         return 0;
892     }
893     if( destCapacity<0 ||
894         (dest==NULL && destCapacity>0) ||
895         (src==NULL && srcLength!=0) || srcLength<-1
896     ) {
897         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
898         return 0;
899     }
900 
901     /* get the string length */
902     if(srcLength==-1) {
903         srcLength=(int32_t)uprv_strlen((const char *)src);
904     }
905 
906     /* check for overlapping source and destination */
907     if( dest!=NULL &&
908         ((src>=dest && src<(dest+destCapacity)) ||
909          (dest>=src && dest<(src+srcLength)))
910     ) {
911         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
912         return 0;
913     }
914 
915     CheckedArrayByteSink sink(dest, destCapacity);
916     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
917         edits->reset();
918     }
919     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
920                      (const uint8_t *)src, srcLength, sink, edits, errorCode);
921     sink.Flush();
922     if (U_SUCCESS(errorCode)) {
923         if (sink.Overflowed()) {
924             errorCode = U_BUFFER_OVERFLOW_ERROR;
925         } else if (edits != nullptr) {
926             edits->copyErrorTo(errorCode);
927         }
928     }
929     return u_terminateChars(dest, destCapacity, sink.NumberOfBytesAppended(), &errorCode);
930 }
931 
932 /* public API functions */
933 
934 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)935 ucasemap_utf8ToLower(const UCaseMap *csm,
936                      char *dest, int32_t destCapacity,
937                      const char *src, int32_t srcLength,
938                      UErrorCode *pErrorCode) {
939     return ucasemap_mapUTF8(
940         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
941         dest, destCapacity,
942         src, srcLength,
943         ucasemap_internalUTF8ToLower, NULL, *pErrorCode);
944 }
945 
946 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)947 ucasemap_utf8ToUpper(const UCaseMap *csm,
948                      char *dest, int32_t destCapacity,
949                      const char *src, int32_t srcLength,
950                      UErrorCode *pErrorCode) {
951     return ucasemap_mapUTF8(
952         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
953         dest, destCapacity,
954         src, srcLength,
955         ucasemap_internalUTF8ToUpper, NULL, *pErrorCode);
956 }
957 
958 U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)959 ucasemap_utf8FoldCase(const UCaseMap *csm,
960                       char *dest, int32_t destCapacity,
961                       const char *src, int32_t srcLength,
962                       UErrorCode *pErrorCode) {
963     return ucasemap_mapUTF8(
964         UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
965         dest, destCapacity,
966         src, srcLength,
967         ucasemap_internalUTF8Fold, NULL, *pErrorCode);
968 }
969 
970 U_NAMESPACE_BEGIN
971 
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)972 void CaseMap::utf8ToLower(
973         const char *locale, uint32_t options,
974         StringPiece src, ByteSink &sink, Edits *edits,
975         UErrorCode &errorCode) {
976     ucasemap_mapUTF8(
977         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
978         src.data(), src.length(),
979         ucasemap_internalUTF8ToLower, sink, edits, errorCode);
980 }
981 
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)982 void CaseMap::utf8ToUpper(
983         const char *locale, uint32_t options,
984         StringPiece src, ByteSink &sink, Edits *edits,
985         UErrorCode &errorCode) {
986     ucasemap_mapUTF8(
987         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
988         src.data(), src.length(),
989         ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
990 }
991 
utf8Fold(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)992 void CaseMap::utf8Fold(
993         uint32_t options,
994         StringPiece src, ByteSink &sink, Edits *edits,
995         UErrorCode &errorCode) {
996     ucasemap_mapUTF8(
997         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
998         src.data(), src.length(),
999         ucasemap_internalUTF8Fold, sink, edits, errorCode);
1000 }
1001 
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1002 int32_t CaseMap::utf8ToLower(
1003         const char *locale, uint32_t options,
1004         const char *src, int32_t srcLength,
1005         char *dest, int32_t destCapacity, Edits *edits,
1006         UErrorCode &errorCode) {
1007     return ucasemap_mapUTF8(
1008         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1009         dest, destCapacity,
1010         src, srcLength,
1011         ucasemap_internalUTF8ToLower, edits, errorCode);
1012 }
1013 
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1014 int32_t CaseMap::utf8ToUpper(
1015         const char *locale, uint32_t options,
1016         const char *src, int32_t srcLength,
1017         char *dest, int32_t destCapacity, Edits *edits,
1018         UErrorCode &errorCode) {
1019     return ucasemap_mapUTF8(
1020         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1021         dest, destCapacity,
1022         src, srcLength,
1023         ucasemap_internalUTF8ToUpper, edits, errorCode);
1024 }
1025 
utf8Fold(uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1026 int32_t CaseMap::utf8Fold(
1027         uint32_t options,
1028         const char *src, int32_t srcLength,
1029         char *dest, int32_t destCapacity, Edits *edits,
1030         UErrorCode &errorCode) {
1031     return ucasemap_mapUTF8(
1032         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1033         dest, destCapacity,
1034         src, srcLength,
1035         ucasemap_internalUTF8Fold, edits, errorCode);
1036 }
1037 
1038 U_NAMESPACE_END
1039