• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2005-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucasemap.cpp
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2005may06
*   created by: Markus W. Scherer
*
*   Case mapping service object and functions using it.
*/
20 
21 #include "unicode/utypes.h"
22 #include "unicode/brkiter.h"
23 #include "unicode/bytestream.h"
24 #include "unicode/casemap.h"
25 #include "unicode/edits.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/ubrk.h"
29 #include "unicode/uloc.h"
30 #include "unicode/ustring.h"
31 #include "unicode/ucasemap.h"
32 #if !UCONFIG_NO_BREAK_ITERATION
33 #include "unicode/utext.h"
34 #endif
35 #include "unicode/utf.h"
36 #include "unicode/utf8.h"
37 #include "unicode/utf16.h"
38 #include "bytesinkutil.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41 #include "uassert.h"
42 #include "ucase.h"
43 #include "ucasemap_imp.h"
44 
45 U_NAMESPACE_USE
46 
/* UCaseMap service object -------------------------------------------------- */
48 
UCaseMap(const char * localeID,uint32_t opts,UErrorCode * pErrorCode)49 UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
50 #if !UCONFIG_NO_BREAK_ITERATION
51         iter(nullptr),
52 #endif
53         caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
54     ucasemap_setLocale(this, localeID, pErrorCode);
55 }
56 
~UCaseMap()57 UCaseMap::~UCaseMap() {
58 #if !UCONFIG_NO_BREAK_ITERATION
59     delete iter;
60 #endif
61 }
62 
63 U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char * locale,uint32_t options,UErrorCode * pErrorCode)64 ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
65     if(U_FAILURE(*pErrorCode)) {
66         return nullptr;
67     }
68     UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
69     if(csm==nullptr) {
70         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
71         return nullptr;
72     } else if (U_FAILURE(*pErrorCode)) {
73         delete csm;
74         return nullptr;
75     }
76     return csm;
77 }
78 
79 U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap * csm)80 ucasemap_close(UCaseMap *csm) {
81     delete csm;
82 }
83 
84 U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap * csm)85 ucasemap_getLocale(const UCaseMap *csm) {
86     return csm->locale;
87 }
88 
89 U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap * csm)90 ucasemap_getOptions(const UCaseMap *csm) {
91     return csm->options;
92 }
93 
94 U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap * csm,const char * locale,UErrorCode * pErrorCode)95 ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
96     if(U_FAILURE(*pErrorCode)) {
97         return;
98     }
99     if (locale != nullptr && *locale == 0) {
100         csm->locale[0] = 0;
101         csm->caseLocale = UCASE_LOC_ROOT;
102         return;
103     }
104 
105     int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
106     if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
107         *pErrorCode=U_ZERO_ERROR;
108         /* we only really need the language code for case mappings */
109         length=uloc_getLanguage(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
110     }
111     if(length==sizeof(csm->locale)) {
112         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
113     }
114     if(U_SUCCESS(*pErrorCode)) {
115         csm->caseLocale = ucase_getCaseLocale(csm->locale);
116     } else {
117         csm->locale[0]=0;
118         csm->caseLocale = UCASE_LOC_ROOT;
119     }
120 }
121 
122 U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap * csm,uint32_t options,UErrorCode * pErrorCode)123 ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
124     if(U_FAILURE(*pErrorCode)) {
125         return;
126     }
127     csm->options=options;
128 }
129 
/* UTF-8 string case mappings ----------------------------------------------- */

/* TODO(markus): Move to a new, separate utf8case.cpp file. */
133 
134 namespace {
135 
136 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */
137 inline UBool
appendResult(int32_t cpLength,int32_t result,const char16_t * s,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)138 appendResult(int32_t cpLength, int32_t result, const char16_t *s,
139              ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
140     U_ASSERT(U_SUCCESS(errorCode));
141 
142     /* decode the result */
143     if(result<0) {
144         /* (not) original code point */
145         if(edits!=nullptr) {
146             edits->addUnchanged(cpLength);
147         }
148         if((options & U_OMIT_UNCHANGED_TEXT) == 0) {
149             ByteSinkUtil::appendCodePoint(cpLength, ~result, sink);
150         }
151     } else {
152         if(result<=UCASE_MAX_STRING_LENGTH) {
153             // string: "result" is the UTF-16 length
154             return ByteSinkUtil::appendChange(cpLength, s, result, sink, edits, errorCode);
155         } else {
156             ByteSinkUtil::appendCodePoint(cpLength, result, sink, edits);
157         }
158     }
159     return true;
160 }
161 
162 // See unicode/utf8.h U8_APPEND_UNSAFE().
getTwoByteLead(UChar32 c)163 inline uint8_t getTwoByteLead(UChar32 c) { return static_cast<uint8_t>((c >> 6) | 0xc0); }
getTwoByteTrail(UChar32 c)164 inline uint8_t getTwoByteTrail(UChar32 c) { return static_cast<uint8_t>((c & 0x3f) | 0x80); }
165 
166 UChar32 U_CALLCONV
utf8_caseContextIterator(void * context,int8_t dir)167 utf8_caseContextIterator(void *context, int8_t dir) {
168     UCaseContext* csc = static_cast<UCaseContext*>(context);
169     UChar32 c;
170 
171     if(dir<0) {
172         /* reset for backward iteration */
173         csc->index=csc->cpStart;
174         csc->dir=dir;
175     } else if(dir>0) {
176         /* reset for forward iteration */
177         csc->index=csc->cpLimit;
178         csc->dir=dir;
179     } else {
180         /* continue current iteration direction */
181         dir=csc->dir;
182     }
183 
184     if(dir<0) {
185         if(csc->start<csc->index) {
186             U8_PREV((const uint8_t *)csc->p, csc->start, csc->index, c);
187             return c;
188         }
189     } else {
190         if(csc->index<csc->limit) {
191             U8_NEXT((const uint8_t *)csc->p, csc->index, csc->limit, c);
192             return c;
193         }
194     }
195     return U_SENTINEL;
196 }
197 
198 /**
199  * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
200  * caseLocale < 0: Case-folds [srcStart..srcLimit[.
201  */
toLower(int32_t caseLocale,uint32_t options,const uint8_t * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)202 void toLower(int32_t caseLocale, uint32_t options,
203              const uint8_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
204              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
205     const int8_t *latinToLower;
206     if (caseLocale == UCASE_LOC_ROOT ||
207             (caseLocale >= 0 ?
208                 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
209                 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
210         latinToLower = LatinCase::TO_LOWER_NORMAL;
211     } else {
212         latinToLower = LatinCase::TO_LOWER_TR_LT;
213     }
214     const UTrie2 *trie = ucase_getTrie();
215     int32_t prev = srcStart;
216     int32_t srcIndex = srcStart;
217     for (;;) {
218         // fast path for simple cases
219         int32_t cpStart;
220         UChar32 c;
221         for (;;) {
222             if (U_FAILURE(errorCode) || srcIndex >= srcLimit) {
223                 c = U_SENTINEL;
224                 break;
225             }
226             uint8_t lead = src[srcIndex++];
227             if (lead <= 0x7f) {
228                 int8_t d = latinToLower[lead];
229                 if (d == LatinCase::EXC) {
230                     cpStart = srcIndex - 1;
231                     c = lead;
232                     break;
233                 }
234                 if (d == 0) { continue; }
235                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
236                                               sink, options, edits, errorCode);
237                 char ascii = static_cast<char>(lead + d);
238                 sink.Append(&ascii, 1);
239                 if (edits != nullptr) {
240                     edits->addReplace(1, 1);
241                 }
242                 prev = srcIndex;
243                 continue;
244             } else if (lead < 0xe3) {
245                 uint8_t t;
246                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLimit &&
247                         (t = src[srcIndex] - 0x80) <= 0x3f) {
248                     // U+0080..U+017F
249                     ++srcIndex;
250                     c = ((lead - 0xc0) << 6) | t;
251                     int8_t d = latinToLower[c];
252                     if (d == LatinCase::EXC) {
253                         cpStart = srcIndex - 2;
254                         break;
255                     }
256                     if (d == 0) { continue; }
257                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
258                                                   sink, options, edits, errorCode);
259                     ByteSinkUtil::appendTwoBytes(c + d, sink);
260                     if (edits != nullptr) {
261                         edits->addReplace(2, 2);
262                     }
263                     prev = srcIndex;
264                     continue;
265                 }
266             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
267                     (srcIndex + 2) <= srcLimit &&
268                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
269                 // most of CJK: no case mappings
270                 srcIndex += 2;
271                 continue;
272             }
273             cpStart = --srcIndex;
274             U8_NEXT(src, srcIndex, srcLimit, c);
275             if (c < 0) {
276                 // ill-formed UTF-8
277                 continue;
278             }
279             uint16_t props = UTRIE2_GET16(trie, c);
280             if (UCASE_HAS_EXCEPTION(props)) { break; }
281             int32_t delta;
282             if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
283                 continue;
284             }
285             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
286                                           sink, options, edits, errorCode);
287             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
288             prev = srcIndex;
289         }
290         if (c < 0) {
291             break;
292         }
293         // slow path
294         const char16_t *s;
295         if (caseLocale >= 0) {
296             csc->cpStart = cpStart;
297             csc->cpLimit = srcIndex;
298             c = ucase_toFullLower(c, utf8_caseContextIterator, csc, &s, caseLocale);
299         } else {
300             c = ucase_toFullFolding(c, &s, options);
301         }
302         if (c >= 0) {
303             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
304                                           sink, options, edits, errorCode);
305             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
306             prev = srcIndex;
307         }
308     }
309     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
310                                   sink, options, edits, errorCode);
311 }
312 
toUpper(int32_t caseLocale,uint32_t options,const uint8_t * src,UCaseContext * csc,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)313 void toUpper(int32_t caseLocale, uint32_t options,
314              const uint8_t *src, UCaseContext *csc, int32_t srcLength,
315              icu::ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) {
316     const int8_t *latinToUpper;
317     if (caseLocale == UCASE_LOC_TURKISH) {
318         latinToUpper = LatinCase::TO_UPPER_TR;
319     } else {
320         latinToUpper = LatinCase::TO_UPPER_NORMAL;
321     }
322     const UTrie2 *trie = ucase_getTrie();
323     int32_t prev = 0;
324     int32_t srcIndex = 0;
325     for (;;) {
326         // fast path for simple cases
327         int32_t cpStart;
328         UChar32 c;
329         for (;;) {
330             if (U_FAILURE(errorCode) || srcIndex >= srcLength) {
331                 c = U_SENTINEL;
332                 break;
333             }
334             uint8_t lead = src[srcIndex++];
335             if (lead <= 0x7f) {
336                 int8_t d = latinToUpper[lead];
337                 if (d == LatinCase::EXC) {
338                     cpStart = srcIndex - 1;
339                     c = lead;
340                     break;
341                 }
342                 if (d == 0) { continue; }
343                 ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 1 - prev,
344                                               sink, options, edits, errorCode);
345                 char ascii = static_cast<char>(lead + d);
346                 sink.Append(&ascii, 1);
347                 if (edits != nullptr) {
348                     edits->addReplace(1, 1);
349                 }
350                 prev = srcIndex;
351                 continue;
352             } else if (lead < 0xe3) {
353                 uint8_t t;
354                 if (0xc2 <= lead && lead <= 0xc5 && srcIndex < srcLength &&
355                         (t = src[srcIndex] - 0x80) <= 0x3f) {
356                     // U+0080..U+017F
357                     ++srcIndex;
358                     c = ((lead - 0xc0) << 6) | t;
359                     int8_t d = latinToUpper[c];
360                     if (d == LatinCase::EXC) {
361                         cpStart = srcIndex - 2;
362                         break;
363                     }
364                     if (d == 0) { continue; }
365                     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - 2 - prev,
366                                                   sink, options, edits, errorCode);
367                     ByteSinkUtil::appendTwoBytes(c + d, sink);
368                     if (edits != nullptr) {
369                         edits->addReplace(2, 2);
370                     }
371                     prev = srcIndex;
372                     continue;
373                 }
374             } else if ((lead <= 0xe9 || lead == 0xeb || lead == 0xec) &&
375                     (srcIndex + 2) <= srcLength &&
376                     U8_IS_TRAIL(src[srcIndex]) && U8_IS_TRAIL(src[srcIndex + 1])) {
377                 // most of CJK: no case mappings
378                 srcIndex += 2;
379                 continue;
380             }
381             cpStart = --srcIndex;
382             U8_NEXT(src, srcIndex, srcLength, c);
383             if (c < 0) {
384                 // ill-formed UTF-8
385                 continue;
386             }
387             uint16_t props = UTRIE2_GET16(trie, c);
388             if (UCASE_HAS_EXCEPTION(props)) { break; }
389             int32_t delta;
390             if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
391                 continue;
392             }
393             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
394                                           sink, options, edits, errorCode);
395             ByteSinkUtil::appendCodePoint(srcIndex - cpStart, c + delta, sink, edits);
396             prev = srcIndex;
397         }
398         if (c < 0) {
399             break;
400         }
401         // slow path
402         csc->cpStart = cpStart;
403         csc->cpLimit = srcIndex;
404         const char16_t *s;
405         c = ucase_toFullUpper(c, utf8_caseContextIterator, csc, &s, caseLocale);
406         if (c >= 0) {
407             ByteSinkUtil::appendUnchanged(src + prev, cpStart - prev,
408                                           sink, options, edits, errorCode);
409             appendResult(srcIndex - cpStart, c, s, sink, options, edits, errorCode);
410             prev = srcIndex;
411         }
412     }
413     ByteSinkUtil::appendUnchanged(src + prev, srcIndex - prev,
414                                   sink, options, edits, errorCode);
415 }
416 
417 }  // namespace
418 
419 #if !UCONFIG_NO_BREAK_ITERATION
420 
421 namespace {
422 
// The two UTF-8 bytes of U+0301 (combining acute accent), used for byte-wise matching.
// The casts make the conversion from the literal's character type explicit.
constexpr uint8_t ACUTE_BYTE0 = static_cast<uint8_t>(u8"\u0301"[0]);

constexpr uint8_t ACUTE_BYTE1 = static_cast<uint8_t>(u8"\u0301"[1]);
426 
427 /**
428  * Input: c is a letter I with or without acute accent.
429  * start is the index in src after c, and is less than segmentLimit.
430  * If a plain i/I is followed by a plain j/J,
431  * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
432  * then we output accordingly.
433  *
434  * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
435  */
maybeTitleDutchIJ(const uint8_t * src,UChar32 c,int32_t start,int32_t segmentLimit,ByteSink & sink,uint32_t options,icu::Edits * edits,UErrorCode & errorCode)436 int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
437                           ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
438     U_ASSERT(start < segmentLimit);
439 
440     int32_t index = start;
441     bool withAcute = false;
442 
443     // If the conditions are met, then the following variables tell us what to output.
444     int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
445     bool doTitleJ = false;  // true if the j needs to be titlecased
446     int32_t unchanged2 = 0;  // after the j (0 or 1)
447 
448     // next character after the first letter
449     UChar32 c2;
450     c2 = src[index++];
451 
452     // Is the first letter an i/I with accent?
453     if (c == u'I') {
454         if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
455             withAcute = true;
456             unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
457             if (index == segmentLimit) { return start; }
458             c2 = src[index++];
459         }
460     } else {  // Í
461         withAcute = true;
462     }
463 
464     // Is the next character a j/J?
465     if (c2 == u'j') {
466         doTitleJ = true;
467     } else if (c2 == u'J') {
468         ++unchanged1;
469     } else {
470         return start;
471     }
472 
473     // A plain i/I must be followed by a plain j/J.
474     // An i/I with acute must be followed by a j/J with acute.
475     if (withAcute) {
476         if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
477             return start;
478         }
479         if (doTitleJ) {
480             unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
481         } else {
482             unchanged1 = unchanged1 + 2;    // ACUTE is 2 code units in UTF-8
483         }
484     }
485 
486     // There must not be another combining mark.
487     if (index < segmentLimit) {
488         int32_t cp;
489         int32_t i = index;
490         U8_NEXT(src, i, segmentLimit, cp);
491         uint32_t typeMask = U_GET_GC_MASK(cp);
492         if ((typeMask & U_GC_M_MASK) != 0) {
493             return start;
494         }
495     }
496 
497     // Output the rest of the Dutch IJ.
498     ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
499     start += unchanged1;
500     if (doTitleJ) {
501         ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
502         ++start;
503     }
504     ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
505 
506     U_ASSERT(start + unchanged2 == index);
507     return index;
508 }
509 
510 }  // namespace
511 
512 U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,const uint8_t * src,int32_t srcLength,ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)513 ucasemap_internalUTF8ToTitle(
514         int32_t caseLocale, uint32_t options, BreakIterator *iter,
515         const uint8_t *src, int32_t srcLength,
516         ByteSink &sink, icu::Edits *edits,
517         UErrorCode &errorCode) {
518     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
519         return;
520     }
521 
522     /* set up local variables */
523     UCaseContext csc=UCASECONTEXT_INITIALIZER;
524     csc.p=(void *)src;
525     csc.limit=srcLength;
526     int32_t prev=0;
527     UBool isFirstIndex=true;
528 
529     /* titlecasing loop */
530     while(prev<srcLength) {
531         /* find next index where to titlecase */
532         int32_t index;
533         if(isFirstIndex) {
534             isFirstIndex=false;
535             index=iter->first();
536         } else {
537             index=iter->next();
538         }
539         if(index==UBRK_DONE || index>srcLength) {
540             index=srcLength;
541         }
542 
543         /*
544          * Segment [prev..index[ into 3 parts:
545          * a) skipped characters (copy as-is) [prev..titleStart[
546          * b) first letter (titlecase)              [titleStart..titleLimit[
547          * c) subsequent characters (lowercase)                 [titleLimit..index[
548          */
549         if(prev<index) {
550             /* find and copy skipped characters [prev..titleStart[ */
551             int32_t titleStart=prev;
552             int32_t titleLimit=prev;
553             UChar32 c;
554             U8_NEXT(src, titleLimit, index, c);
555             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
556                 // Adjust the titlecasing index to the next cased character,
557                 // or to the next letter/number/symbol/private use.
558                 // Stop with titleStart<titleLimit<=index
559                 // if there is a character to be titlecased,
560                 // or else stop with titleStart==titleLimit==index.
561                 UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
562                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
563                     titleStart=titleLimit;
564                     if(titleLimit==index) {
565                         break;
566                     }
567                     U8_NEXT(src, titleLimit, index, c);
568                 }
569                 if (prev < titleStart) {
570                     if (!ByteSinkUtil::appendUnchanged(src+prev, titleStart-prev,
571                                                        sink, options, edits, errorCode)) {
572                         return;
573                     }
574                 }
575             }
576 
577             if(titleStart<titleLimit) {
578                 /* titlecase c which is from [titleStart..titleLimit[ */
579                 if(c>=0) {
580                     csc.cpStart=titleStart;
581                     csc.cpLimit=titleLimit;
582                     const char16_t *s;
583                     c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
584                     if (!appendResult(titleLimit-titleStart, c, s, sink, options, edits, errorCode)) {
585                         return;
586                     }
587                 } else {
588                     // Malformed UTF-8.
589                     if (!ByteSinkUtil::appendUnchanged(src+titleStart, titleLimit-titleStart,
590                                                        sink, options, edits, errorCode)) {
591                         return;
592                     }
593                 }
594 
595                 /* Special case Dutch IJ titlecasing */
596                 if (titleLimit < index &&
597                     caseLocale == UCASE_LOC_DUTCH) {
598                     if (c < 0) {
599                         c = ~c;
600                     }
601 
602                     if (c == u'I' || c == u'Í') {
603                         titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
604                     }
605                 }
606 
607                 /* lowercase [titleLimit..index[ */
608                 if(titleLimit<index) {
609                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
610                         /* Normal operation: Lowercase the rest of the word. */
611                         toLower(caseLocale, options,
612                                 src, &csc, titleLimit, index,
613                                 sink, edits, errorCode);
614                         if(U_FAILURE(errorCode)) {
615                             return;
616                         }
617                     } else {
618                         /* Optionally just copy the rest of the word unchanged. */
619                         if (!ByteSinkUtil::appendUnchanged(src+titleLimit, index-titleLimit,
620                                                            sink, options, edits, errorCode)) {
621                             return;
622                         }
623                     }
624                 }
625             }
626         }
627 
628         prev=index;
629     }
630 }
631 
632 #endif
633 
634 U_NAMESPACE_BEGIN
635 namespace GreekUpper {
636 
isFollowedByCasedLetter(const uint8_t * s,int32_t i,int32_t length)637 UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
638     while (i < length) {
639         UChar32 c;
640         U8_NEXT(s, i, length, c);
641         int32_t type = ucase_getTypeOrIgnorable(c);
642         if ((type & UCASE_IGNORABLE) != 0) {
643             // Case-ignorable, continue with the loop.
644         } else if (type != UCASE_NONE) {
645             return true;  // Followed by cased letter.
646         } else {
647             return false;  // Uncased and not case-ignorable.
648         }
649     }
650     return false;  // Not followed by cased letter.
651 }
652 
653 // Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
toUpper(uint32_t options,const uint8_t * src,int32_t srcLength,ByteSink & sink,Edits * edits,UErrorCode & errorCode)654 void toUpper(uint32_t options,
655              const uint8_t *src, int32_t srcLength,
656              ByteSink &sink, Edits *edits,
657              UErrorCode &errorCode) {
658     uint32_t state = 0;
659     for (int32_t i = 0; i < srcLength;) {
660         int32_t nextIndex = i;
661         UChar32 c;
662         U8_NEXT(src, nextIndex, srcLength, c);
663         uint32_t nextState = 0;
664         int32_t type = ucase_getTypeOrIgnorable(c);
665         if ((type & UCASE_IGNORABLE) != 0) {
666             // c is case-ignorable
667             nextState |= (state & AFTER_CASED);
668         } else if (type != UCASE_NONE) {
669             // c is cased
670             nextState |= AFTER_CASED;
671         }
672         uint32_t data = getLetterData(c);
673         if (data > 0) {
674             uint32_t upper = data & UPPER_MASK;
675             // Add a dialytika to this iota or ypsilon vowel
676             // if we removed a tonos from the previous vowel,
677             // and that previous vowel did not also have (or gain) a dialytika.
678             // Adding one only to the final vowel in a longer sequence
679             // (which does not occur in normal writing) would require lookahead.
680             // Set the same flag as for preserving an existing dialytika.
681             if ((data & HAS_VOWEL) != 0 &&
682                 (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
683                     0 &&
684                 (upper == 0x399 || upper == 0x3A5)) {
685                 data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) != 0 ? HAS_DIALYTIKA
686                                                                            : HAS_COMBINING_DIALYTIKA;
687             }
688             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
689             if ((data & HAS_YPOGEGRAMMENI) != 0) {
690                 numYpogegrammeni = 1;
691             }
692             const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
693             // Skip combining diacritics after this Greek letter.
694             int32_t nextNextIndex = nextIndex;
695             while (nextIndex < srcLength) {
696                 UChar32 c2;
697                 U8_NEXT(src, nextNextIndex, srcLength, c2);
698                 uint32_t diacriticData = getDiacriticData(c2);
699                 if (diacriticData != 0) {
700                     data |= diacriticData;
701                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
702                         ++numYpogegrammeni;
703                     }
704                     nextIndex = nextNextIndex;
705                 } else {
706                     break;  // not a Greek diacritic
707                 }
708             }
709             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
710                 nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
711                                                   : AFTER_VOWEL_WITH_COMBINING_ACCENT;
712             }
713             // Map according to Greek rules.
714             UBool addTonos = false;
715             if (upper == 0x397 &&
716                     (data & HAS_ACCENT) != 0 &&
717                     numYpogegrammeni == 0 &&
718                     (state & AFTER_CASED) == 0 &&
719                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
720                 // Keep disjunctive "or" with (only) a tonos.
721                 // We use the same "word boundary" conditions as for the Final_Sigma test.
722                 if (hasPrecomposedAccent) {
723                     upper = 0x389;  // Preserve the precomposed form.
724                 } else {
725                     addTonos = true;
726                 }
727             } else if ((data & HAS_DIALYTIKA) != 0) {
728                 // Preserve a vowel with dialytika in precomposed form if it exists.
729                 if (upper == 0x399) {
730                     upper = 0x3AA;
731                     data &= ~HAS_EITHER_DIALYTIKA;
732                 } else if (upper == 0x3A5) {
733                     upper = 0x3AB;
734                     data &= ~HAS_EITHER_DIALYTIKA;
735                 }
736             }
737 
738             UBool change;
739             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
740                 change = true;  // common, simple usage
741             } else {
742                 // Find out first whether we are changing the text.
743                 U_ASSERT(0x370 <= upper && upper <= 0x3ff);  // 2-byte UTF-8, main Greek block
744                 change = (i + 2) > nextIndex ||
745                         src[i] != getTwoByteLead(upper) || src[i + 1] != getTwoByteTrail(upper) ||
746                         numYpogegrammeni > 0;
747                 int32_t i2 = i + 2;
748                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
749                     change |= (i2 + 2) > nextIndex ||
750                             src[i2] != static_cast<uint8_t>(u8"\u0308"[0]) ||
751                             src[i2 + 1] != static_cast<uint8_t>(u8"\u0308"[1]);
752                     i2 += 2;
753                 }
754                 if (addTonos) {
755                     change |= (i2 + 2) > nextIndex ||
756                             src[i2] != static_cast<uint8_t>(u8"\u0301"[0]) ||
757                             src[i2 + 1] != static_cast<uint8_t>(u8"\u0301"[1]);
758                     i2 += 2;
759                 }
760                 int32_t oldLength = nextIndex - i;
761                 int32_t newLength = (i2 - i) + numYpogegrammeni * 2;  // 2 bytes per U+0399
762                 change |= oldLength != newLength;
763                 if (change) {
764                     if (edits != nullptr) {
765                         edits->addReplace(oldLength, newLength);
766                     }
767                 } else {
768                     if (edits != nullptr) {
769                         edits->addUnchanged(oldLength);
770                     }
771                     // Write unchanged text?
772                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
773                 }
774             }
775 
776             if (change) {
777                 ByteSinkUtil::appendTwoBytes(upper, sink);
778                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
779                     sink.AppendU8(u8"\u0308", 2);  // restore or add a dialytika
780                 }
781                 if (addTonos) {
782                     sink.AppendU8(u8"\u0301", 2);
783                 }
784                 while (numYpogegrammeni > 0) {
785                     sink.AppendU8(u8"\u0399", 2);
786                     --numYpogegrammeni;
787                 }
788             }
789         } else if(c>=0) {
790             const char16_t *s;
791             c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
792             if (!appendResult(nextIndex - i, c, s, sink, options, edits, errorCode)) {
793                 return;
794             }
795         } else {
796             // Malformed UTF-8.
797             if (!ByteSinkUtil::appendUnchanged(src+i, nextIndex-i,
798                                                sink, options, edits, errorCode)) {
799                 return;
800             }
801         }
802         i = nextIndex;
803         state = nextState;
804     }
805 }
806 
807 }  // namespace GreekUpper
808 U_NAMESPACE_END
809 
810 static void U_CALLCONV
ucasemap_internalUTF8ToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)811 ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
812                              const uint8_t *src, int32_t srcLength,
813                              icu::ByteSink &sink, icu::Edits *edits,
814                              UErrorCode &errorCode) {
815     UCaseContext csc=UCASECONTEXT_INITIALIZER;
816     csc.p=(void *)src;
817     csc.limit=srcLength;
818     toLower(
819         caseLocale, options,
820         src, &csc, 0, srcLength,
821         sink, edits, errorCode);
822 }
823 
824 static void U_CALLCONV
ucasemap_internalUTF8ToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)825 ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
826                              const uint8_t *src, int32_t srcLength,
827                              icu::ByteSink &sink, icu::Edits *edits,
828                              UErrorCode &errorCode) {
829     if (caseLocale == UCASE_LOC_GREEK) {
830         GreekUpper::toUpper(options, src, srcLength, sink, edits, errorCode);
831     } else {
832         UCaseContext csc=UCASECONTEXT_INITIALIZER;
833         csc.p=(void *)src;
834         csc.limit=srcLength;
835         toUpper(
836             caseLocale, options,
837             src, &csc, srcLength,
838             sink, edits, errorCode);
839     }
840 }
841 
842 static void U_CALLCONV
ucasemap_internalUTF8Fold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED const uint8_t * src,int32_t srcLength,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)843 ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
844                           const uint8_t *src, int32_t srcLength,
845                           icu::ByteSink &sink, icu::Edits *edits,
846                           UErrorCode &errorCode) {
847     toLower(
848         -1, options,
849         src, nullptr, 0, srcLength,
850         sink, edits, errorCode);
851 }
852 
853 void
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::ByteSink & sink,icu::Edits * edits,UErrorCode & errorCode)854 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
855                  const char *src, int32_t srcLength,
856                  UTF8CaseMapper *stringCaseMapper,
857                  icu::ByteSink &sink, icu::Edits *edits,
858                  UErrorCode &errorCode) {
859     /* check argument values */
860     if (U_FAILURE(errorCode)) {
861         return;
862     }
863     if ((src == nullptr && srcLength != 0) || srcLength < -1) {
864         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
865         return;
866     }
867 
868     // Get the string length.
869     if (srcLength == -1) {
870         srcLength = static_cast<int32_t>(uprv_strlen(src));
871     }
872 
873     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
874         edits->reset();
875     }
876     stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
877                      reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, errorCode);
878     sink.Flush();
879     if (U_SUCCESS(errorCode)) {
880         if (edits != nullptr) {
881             edits->copyErrorTo(errorCode);
882         }
883     }
884 }
885 
886 int32_t
ucasemap_mapUTF8(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UTF8CaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)887 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
888                  char *dest, int32_t destCapacity,
889                  const char *src, int32_t srcLength,
890                  UTF8CaseMapper *stringCaseMapper,
891                  icu::Edits *edits,
892                  UErrorCode &errorCode) {
893     /* check argument values */
894     if(U_FAILURE(errorCode)) {
895         return 0;
896     }
897     if( destCapacity<0 ||
898         (dest==nullptr && destCapacity>0) ||
899         (src==nullptr && srcLength!=0) || srcLength<-1
900     ) {
901         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
902         return 0;
903     }
904 
905     /* get the string length */
906     if(srcLength==-1) {
907         srcLength = static_cast<int32_t>(uprv_strlen(src));
908     }
909 
910     /* check for overlapping source and destination */
911     if( dest!=nullptr &&
912         ((src>=dest && src<(dest+destCapacity)) ||
913          (dest>=src && dest<(src+srcLength)))
914     ) {
915         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
916         return 0;
917     }
918 
919     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
920         edits->reset();
921     }
922     int32_t reslen = ByteSinkUtil::viaByteSinkToTerminatedChars(
923         dest, destCapacity,
924         [&](ByteSink& sink, UErrorCode& status) {
925             stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
926                              reinterpret_cast<const uint8_t*>(src), srcLength, sink, edits, status);
927         },
928         errorCode);
929     if (U_SUCCESS(errorCode) && edits != nullptr) {
930         edits->copyErrorTo(errorCode);
931     }
932     return reslen;
933 }
934 
935 /* public API functions */
936 
937 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)938 ucasemap_utf8ToLower(const UCaseMap *csm,
939                      char *dest, int32_t destCapacity,
940                      const char *src, int32_t srcLength,
941                      UErrorCode *pErrorCode) {
942     return ucasemap_mapUTF8(
943         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
944         dest, destCapacity,
945         src, srcLength,
946         ucasemap_internalUTF8ToLower, nullptr, *pErrorCode);
947 }
948 
949 U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)950 ucasemap_utf8ToUpper(const UCaseMap *csm,
951                      char *dest, int32_t destCapacity,
952                      const char *src, int32_t srcLength,
953                      UErrorCode *pErrorCode) {
954     return ucasemap_mapUTF8(
955         csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
956         dest, destCapacity,
957         src, srcLength,
958         ucasemap_internalUTF8ToUpper, nullptr, *pErrorCode);
959 }
960 
961 U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap * csm,char * dest,int32_t destCapacity,const char * src,int32_t srcLength,UErrorCode * pErrorCode)962 ucasemap_utf8FoldCase(const UCaseMap *csm,
963                       char *dest, int32_t destCapacity,
964                       const char *src, int32_t srcLength,
965                       UErrorCode *pErrorCode) {
966     return ucasemap_mapUTF8(
967         UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
968         dest, destCapacity,
969         src, srcLength,
970         ucasemap_internalUTF8Fold, nullptr, *pErrorCode);
971 }
972 
973 U_NAMESPACE_BEGIN
974 
utf8ToLower(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)975 void CaseMap::utf8ToLower(
976         const char *locale, uint32_t options,
977         StringPiece src, ByteSink &sink, Edits *edits,
978         UErrorCode &errorCode) {
979     ucasemap_mapUTF8(
980         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
981         src.data(), src.length(),
982         ucasemap_internalUTF8ToLower, sink, edits, errorCode);
983 }
984 
utf8ToUpper(const char * locale,uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)985 void CaseMap::utf8ToUpper(
986         const char *locale, uint32_t options,
987         StringPiece src, ByteSink &sink, Edits *edits,
988         UErrorCode &errorCode) {
989     ucasemap_mapUTF8(
990         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
991         src.data(), src.length(),
992         ucasemap_internalUTF8ToUpper, sink, edits, errorCode);
993 }
994 
utf8Fold(uint32_t options,StringPiece src,ByteSink & sink,Edits * edits,UErrorCode & errorCode)995 void CaseMap::utf8Fold(
996         uint32_t options,
997         StringPiece src, ByteSink &sink, Edits *edits,
998         UErrorCode &errorCode) {
999     ucasemap_mapUTF8(
1000         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1001         src.data(), src.length(),
1002         ucasemap_internalUTF8Fold, sink, edits, errorCode);
1003 }
1004 
utf8ToLower(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1005 int32_t CaseMap::utf8ToLower(
1006         const char *locale, uint32_t options,
1007         const char *src, int32_t srcLength,
1008         char *dest, int32_t destCapacity, Edits *edits,
1009         UErrorCode &errorCode) {
1010     return ucasemap_mapUTF8(
1011         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1012         dest, destCapacity,
1013         src, srcLength,
1014         ucasemap_internalUTF8ToLower, edits, errorCode);
1015 }
1016 
utf8ToUpper(const char * locale,uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1017 int32_t CaseMap::utf8ToUpper(
1018         const char *locale, uint32_t options,
1019         const char *src, int32_t srcLength,
1020         char *dest, int32_t destCapacity, Edits *edits,
1021         UErrorCode &errorCode) {
1022     return ucasemap_mapUTF8(
1023         ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
1024         dest, destCapacity,
1025         src, srcLength,
1026         ucasemap_internalUTF8ToUpper, edits, errorCode);
1027 }
1028 
utf8Fold(uint32_t options,const char * src,int32_t srcLength,char * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1029 int32_t CaseMap::utf8Fold(
1030         uint32_t options,
1031         const char *src, int32_t srcLength,
1032         char *dest, int32_t destCapacity, Edits *edits,
1033         UErrorCode &errorCode) {
1034     return ucasemap_mapUTF8(
1035         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1036         dest, destCapacity,
1037         src, srcLength,
1038         ucasemap_internalUTF8Fold, edits, errorCode);
1039 }
1040 
1041 U_NAMESPACE_END
1042