• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2001-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ustrcase.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002feb20
16 *   created by: Markus W. Scherer
17 *
18 *   Implementation file for string casing C API functions.
19 *   Uses functions from uchar.c for basic functionality that requires access
20 *   to the Unicode Character Database (uprops.dat).
21 */
22 
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/casemap.h"
26 #include "unicode/edits.h"
27 #include "unicode/stringoptions.h"
28 #include "unicode/ustring.h"
29 #include "unicode/ucasemap.h"
30 #include "unicode/ubrk.h"
31 #include "unicode/utf.h"
32 #include "unicode/utf16.h"
33 #include "cmemory.h"
34 #include "ucase.h"
35 #include "ucasemap_imp.h"
36 #include "ustr_imp.h"
37 #include "uassert.h"
38 
39 /**
40  * Code point for COMBINING ACUTE ACCENT
41  * @internal
42  */
43 #define ACUTE u'\u0301'
44 
45 U_NAMESPACE_BEGIN
46 
47 namespace {
48 
checkOverflowAndEditsError(int32_t destIndex,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)49 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
50                                    Edits *edits, UErrorCode &errorCode) {
51     if (U_SUCCESS(errorCode)) {
52         if (destIndex > destCapacity) {
53             errorCode = U_BUFFER_OVERFLOW_ERROR;
54         } else if (edits != nullptr) {
55             edits->copyErrorTo(errorCode);
56         }
57     }
58     return destIndex;
59 }
60 
61 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
62 inline int32_t
appendResult(char16_t * dest,int32_t destIndex,int32_t destCapacity,int32_t result,const char16_t * s,int32_t cpLength,uint32_t options,icu::Edits * edits)63 appendResult(char16_t *dest, int32_t destIndex, int32_t destCapacity,
64              int32_t result, const char16_t *s,
65              int32_t cpLength, uint32_t options, icu::Edits *edits) {
66     UChar32 c;
67     int32_t length;
68 
69     /* decode the result */
70     if(result<0) {
71         /* (not) original code point */
72         if(edits!=nullptr) {
73             edits->addUnchanged(cpLength);
74         }
75         if(options & U_OMIT_UNCHANGED_TEXT) {
76             return destIndex;
77         }
78         c=~result;
79         if(destIndex<destCapacity && c<=0xffff) {  // BMP slightly-fastpath
80             dest[destIndex++]=(char16_t)c;
81             return destIndex;
82         }
83         length=cpLength;
84     } else {
85         if(result<=UCASE_MAX_STRING_LENGTH) {
86             c=U_SENTINEL;
87             length=result;
88         } else if(destIndex<destCapacity && result<=0xffff) {  // BMP slightly-fastpath
89             dest[destIndex++]=(char16_t)result;
90             if(edits!=nullptr) {
91                 edits->addReplace(cpLength, 1);
92             }
93             return destIndex;
94         } else {
95             c=result;
96             length=U16_LENGTH(c);
97         }
98         if(edits!=nullptr) {
99             edits->addReplace(cpLength, length);
100         }
101     }
102     if(length>(INT32_MAX-destIndex)) {
103         return -1;  // integer overflow
104     }
105 
106     if(destIndex<destCapacity) {
107         /* append the result */
108         if(c>=0) {
109             /* code point */
110             UBool isError=false;
111             U16_APPEND(dest, destIndex, destCapacity, c, isError);
112             if(isError) {
113                 /* overflow, nothing written */
114                 destIndex+=length;
115             }
116         } else {
117             /* string */
118             if((destIndex+length)<=destCapacity) {
119                 while(length>0) {
120                     dest[destIndex++]=*s++;
121                     --length;
122                 }
123             } else {
124                 /* overflow */
125                 destIndex+=length;
126             }
127         }
128     } else {
129         /* preflight */
130         destIndex+=length;
131     }
132     return destIndex;
133 }
134 
/**
 * Appends one UTF-16 code unit to the output, or only counts it when the
 * output position is at or beyond the capacity (preflighting).
 *
 * @param dest         output buffer; written only if 0<=destIndex<destCapacity
 * @param destIndex    current output length; a negative value is treated as
 *                     an overflow marker from an earlier append and is
 *                     propagated instead of being used as an array index
 * @param destCapacity capacity of dest in code units
 * @param c            the code unit to append
 * @return destIndex+1, or -1 if destIndex is negative or if the new length
 *         would not fit into an int32_t
 */
inline int32_t
appendUChar(char16_t *dest, int32_t destIndex, int32_t destCapacity, char16_t c) {
    if(destIndex<0) {
        return -1;  // earlier overflow: never write before the start of dest
    }
    if(destIndex<destCapacity) {
        dest[destIndex]=c;
    } else if(destIndex==INT32_MAX) {
        return -1;  // integer overflow
    }
    return destIndex+1;
}
144 
145 int32_t
appendNonEmptyUnchanged(char16_t * dest,int32_t destIndex,int32_t destCapacity,const char16_t * s,int32_t length,uint32_t options,icu::Edits * edits)146 appendNonEmptyUnchanged(char16_t *dest, int32_t destIndex, int32_t destCapacity,
147                         const char16_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
148     if(edits!=nullptr) {
149         edits->addUnchanged(length);
150     }
151     if(options & U_OMIT_UNCHANGED_TEXT) {
152         return destIndex;
153     }
154     if(length>(INT32_MAX-destIndex)) {
155         return -1;  // integer overflow
156     }
157     if((destIndex+length)<=destCapacity) {
158         u_memcpy(dest+destIndex, s, length);
159     }
160     return destIndex + length;
161 }
162 
163 inline int32_t
appendUnchanged(char16_t * dest,int32_t destIndex,int32_t destCapacity,const char16_t * s,int32_t length,uint32_t options,icu::Edits * edits)164 appendUnchanged(char16_t *dest, int32_t destIndex, int32_t destCapacity,
165                 const char16_t *s, int32_t length, uint32_t options, icu::Edits *edits) {
166     if (length <= 0) {
167         return destIndex;
168     }
169     return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits);
170 }
171 
172 UChar32 U_CALLCONV
utf16_caseContextIterator(void * context,int8_t dir)173 utf16_caseContextIterator(void *context, int8_t dir) {
174     UCaseContext *csc=(UCaseContext *)context;
175     UChar32 c;
176 
177     if(dir<0) {
178         /* reset for backward iteration */
179         csc->index=csc->cpStart;
180         csc->dir=dir;
181     } else if(dir>0) {
182         /* reset for forward iteration */
183         csc->index=csc->cpLimit;
184         csc->dir=dir;
185     } else {
186         /* continue current iteration direction */
187         dir=csc->dir;
188     }
189 
190     if(dir<0) {
191         if(csc->start<csc->index) {
192             U16_PREV((const char16_t *)csc->p, csc->start, csc->index, c);
193             return c;
194         }
195     } else {
196         if(csc->index<csc->limit) {
197             U16_NEXT((const char16_t *)csc->p, csc->index, csc->limit, c);
198             return c;
199         }
200     }
201     return U_SENTINEL;
202 }
203 
204 /**
205  * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account.
206  * caseLocale < 0: Case-folds [srcStart..srcLimit[.
207  */
toLower(int32_t caseLocale,uint32_t options,char16_t * dest,int32_t destCapacity,const char16_t * src,UCaseContext * csc,int32_t srcStart,int32_t srcLimit,icu::Edits * edits,UErrorCode & errorCode)208 int32_t toLower(int32_t caseLocale, uint32_t options,
209                 char16_t *dest, int32_t destCapacity,
210                 const char16_t *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit,
211                 icu::Edits *edits, UErrorCode &errorCode) {
212     const int8_t *latinToLower;
213     if (caseLocale == UCASE_LOC_ROOT ||
214             (caseLocale >= 0 ?
215                 !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) :
216                 (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) {
217         latinToLower = LatinCase::TO_LOWER_NORMAL;
218     } else {
219         latinToLower = LatinCase::TO_LOWER_TR_LT;
220     }
221     const UTrie2 *trie = ucase_getTrie();
222     int32_t destIndex = 0;
223     int32_t prev = srcStart;
224     int32_t srcIndex = srcStart;
225     for (;;) {
226         // fast path for simple cases
227         char16_t lead = 0;
228         while (srcIndex < srcLimit) {
229             lead = src[srcIndex];
230             int32_t delta;
231             if (lead < LatinCase::LONG_S) {
232                 int8_t d = latinToLower[lead];
233                 if (d == LatinCase::EXC) { break; }
234                 ++srcIndex;
235                 if (d == 0) { continue; }
236                 delta = d;
237             } else if (lead >= 0xd800) {
238                 break;  // surrogate or higher
239             } else {
240                 uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
241                 if (UCASE_HAS_EXCEPTION(props)) { break; }
242                 ++srcIndex;
243                 if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) {
244                     continue;
245                 }
246             }
247             lead += static_cast<char16_t>(delta);
248             destIndex = appendUnchanged(dest, destIndex, destCapacity,
249                                         src + prev, srcIndex - 1 - prev, options, edits);
250             if (destIndex >= 0) {
251                 destIndex = appendUChar(dest, destIndex, destCapacity, lead);
252                 if (edits != nullptr) {
253                     edits->addReplace(1, 1);
254                 }
255             }
256             if (destIndex < 0) {
257                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
258                 return 0;
259             }
260             prev = srcIndex;
261         }
262         if (srcIndex >= srcLimit) {
263             break;
264         }
265         // slow path
266         int32_t cpStart = srcIndex++;
267         char16_t trail;
268         UChar32 c;
269         if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) {
270             c = U16_GET_SUPPLEMENTARY(lead, trail);
271             ++srcIndex;
272         } else {
273             c = lead;
274         }
275         const char16_t *s;
276         if (caseLocale >= 0) {
277             csc->cpStart = cpStart;
278             csc->cpLimit = srcIndex;
279             c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale);
280         } else {
281             c = ucase_toFullFolding(c, &s, options);
282         }
283         if (c >= 0) {
284             destIndex = appendUnchanged(dest, destIndex, destCapacity,
285                                         src + prev, cpStart - prev, options, edits);
286             if (destIndex >= 0) {
287                 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
288                                          srcIndex - cpStart, options, edits);
289             }
290             if (destIndex < 0) {
291                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
292                 return 0;
293             }
294             prev = srcIndex;
295         }
296     }
297     destIndex = appendUnchanged(dest, destIndex, destCapacity,
298                                 src + prev, srcIndex - prev, options, edits);
299     if (destIndex < 0) {
300         errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
301         return 0;
302     }
303     return destIndex;
304 }
305 
toUpper(int32_t caseLocale,uint32_t options,char16_t * dest,int32_t destCapacity,const char16_t * src,UCaseContext * csc,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)306 int32_t toUpper(int32_t caseLocale, uint32_t options,
307                 char16_t *dest, int32_t destCapacity,
308                 const char16_t *src, UCaseContext *csc, int32_t srcLength,
309                 icu::Edits *edits, UErrorCode &errorCode) {
310     const int8_t *latinToUpper;
311     if (caseLocale == UCASE_LOC_TURKISH) {
312         latinToUpper = LatinCase::TO_UPPER_TR;
313     } else {
314         latinToUpper = LatinCase::TO_UPPER_NORMAL;
315     }
316     const UTrie2 *trie = ucase_getTrie();
317     int32_t destIndex = 0;
318     int32_t prev = 0;
319     int32_t srcIndex = 0;
320     for (;;) {
321         // fast path for simple cases
322         char16_t lead = 0;
323         while (srcIndex < srcLength) {
324             lead = src[srcIndex];
325             int32_t delta;
326             if (lead < LatinCase::LONG_S) {
327                 int8_t d = latinToUpper[lead];
328                 if (d == LatinCase::EXC) { break; }
329                 ++srcIndex;
330                 if (d == 0) { continue; }
331                 delta = d;
332             } else if (lead >= 0xd800) {
333                 break;  // surrogate or higher
334             } else {
335                 uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead);
336                 if (UCASE_HAS_EXCEPTION(props)) { break; }
337                 ++srcIndex;
338                 if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) {
339                     continue;
340                 }
341             }
342             lead += static_cast<char16_t>(delta);
343             destIndex = appendUnchanged(dest, destIndex, destCapacity,
344                                         src + prev, srcIndex - 1 - prev, options, edits);
345             if (destIndex >= 0) {
346                 destIndex = appendUChar(dest, destIndex, destCapacity, lead);
347                 if (edits != nullptr) {
348                     edits->addReplace(1, 1);
349                 }
350             }
351             if (destIndex < 0) {
352                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
353                 return 0;
354             }
355             prev = srcIndex;
356         }
357         if (srcIndex >= srcLength) {
358             break;
359         }
360         // slow path
361         int32_t cpStart;
362         csc->cpStart = cpStart = srcIndex++;
363         char16_t trail;
364         UChar32 c;
365         if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) {
366             c = U16_GET_SUPPLEMENTARY(lead, trail);
367             ++srcIndex;
368         } else {
369             c = lead;
370         }
371         csc->cpLimit = srcIndex;
372         const char16_t *s;
373         c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale);
374         if (c >= 0) {
375             destIndex = appendUnchanged(dest, destIndex, destCapacity,
376                                         src + prev, cpStart - prev, options, edits);
377             if (destIndex >= 0) {
378                 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
379                                          srcIndex - cpStart, options, edits);
380             }
381             if (destIndex < 0) {
382                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
383                 return 0;
384             }
385             prev = srcIndex;
386         }
387     }
388     destIndex = appendUnchanged(dest, destIndex, destCapacity,
389                                 src + prev, srcIndex - prev, options, edits);
390     if (destIndex < 0) {
391         errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
392         return 0;
393     }
394     return destIndex;
395 }
396 
397 }  // namespace
398 
399 U_NAMESPACE_END
400 
401 U_NAMESPACE_USE
402 
403 #if !UCONFIG_NO_BREAK_ITERATION
404 
405 namespace {
406 
407 /**
408  * Input: c is a letter I with or without acute accent.
409  * start is the index in src after c, and is less than segmentLimit.
410  * If a plain i/I is followed by a plain j/J,
411  * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
412  * then we output accordingly.
413  *
414  * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
415  */
maybeTitleDutchIJ(const char16_t * src,UChar32 c,int32_t start,int32_t segmentLimit,char16_t * dest,int32_t & destIndex,int32_t destCapacity,uint32_t options,icu::Edits * edits)416 int32_t maybeTitleDutchIJ(const char16_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
417                           char16_t *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
418                           icu::Edits *edits) {
419     U_ASSERT(start < segmentLimit);
420 
421     int32_t index = start;
422     bool withAcute = false;
423 
424     // If the conditions are met, then the following variables tell us what to output.
425     int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
426     bool doTitleJ = false;  // true if the j needs to be titlecased
427     int32_t unchanged2 = 0;  // after the j (0 or 1)
428 
429     // next character after the first letter
430     char16_t c2 = src[index++];
431 
432     // Is the first letter an i/I with accent?
433     if (c == u'I') {
434         if (c2 == ACUTE) {
435             withAcute = true;
436             unchanged1 = 1;
437             if (index == segmentLimit) { return start; }
438             c2 = src[index++];
439         }
440     } else {  // Í
441         withAcute = true;
442     }
443 
444     // Is the next character a j/J?
445     if (c2 == u'j') {
446         doTitleJ = true;
447     } else if (c2 == u'J') {
448         ++unchanged1;
449     } else {
450         return start;
451     }
452 
453     // A plain i/I must be followed by a plain j/J.
454     // An i/I with acute must be followed by a j/J with acute.
455     if (withAcute) {
456         if (index == segmentLimit || src[index++] != ACUTE) { return start; }
457         if (doTitleJ) {
458             unchanged2 = 1;
459         } else {
460             ++unchanged1;
461         }
462     }
463 
464     // There must not be another combining mark.
465     if (index < segmentLimit) {
466         int32_t cp;
467         int32_t i = index;
468         U16_NEXT(src, i, segmentLimit, cp);
469         uint32_t typeMask = U_GET_GC_MASK(cp);
470         if ((typeMask & U_GC_M_MASK) != 0) {
471             return start;
472         }
473     }
474 
475     // Output the rest of the Dutch IJ.
476     destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
477     start += unchanged1;
478     if (doTitleJ) {
479         destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
480         if (edits != nullptr) {
481             edits->addReplace(1, 1);
482         }
483         ++start;
484     }
485     destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
486 
487     U_ASSERT(start + unchanged2 == index);
488     return index;
489 }
490 
491 }  // namespace
492 
493 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(int32_t caseLocale,uint32_t options,BreakIterator * iter,char16_t * dest,int32_t destCapacity,const char16_t * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)494 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
495                          char16_t *dest, int32_t destCapacity,
496                          const char16_t *src, int32_t srcLength,
497                          icu::Edits *edits,
498                          UErrorCode &errorCode) {
499     if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) {
500         return 0;
501     }
502 
503     /* set up local variables */
504     UCaseContext csc=UCASECONTEXT_INITIALIZER;
505     csc.p=(void *)src;
506     csc.limit=srcLength;
507     int32_t destIndex=0;
508     int32_t prev=0;
509     bool isFirstIndex=true;
510 
511     /* titlecasing loop */
512     while(prev<srcLength) {
513         /* find next index where to titlecase */
514         int32_t index;
515         if(isFirstIndex) {
516             isFirstIndex=false;
517             index=iter->first();
518         } else {
519             index=iter->next();
520         }
521         if(index==UBRK_DONE || index>srcLength) {
522             index=srcLength;
523         }
524 
525         /*
526          * Segment [prev..index[ into 3 parts:
527          * a) skipped characters (copy as-is) [prev..titleStart[
528          * b) first letter (titlecase)              [titleStart..titleLimit[
529          * c) subsequent characters (lowercase)                 [titleLimit..index[
530          */
531         if(prev<index) {
532             // Find and copy skipped characters [prev..titleStart[
533             int32_t titleStart=prev;
534             int32_t titleLimit=prev;
535             UChar32 c;
536             U16_NEXT(src, titleLimit, index, c);
537             if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) {
538                 // Adjust the titlecasing index to the next cased character,
539                 // or to the next letter/number/symbol/private use.
540                 // Stop with titleStart<titleLimit<=index
541                 // if there is a character to be titlecased,
542                 // or else stop with titleStart==titleLimit==index.
543                 bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
544                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
545                     titleStart=titleLimit;
546                     if(titleLimit==index) {
547                         break;
548                     }
549                     U16_NEXT(src, titleLimit, index, c);
550                 }
551                 if (prev < titleStart) {
552                     destIndex=appendUnchanged(dest, destIndex, destCapacity,
553                                               src+prev, titleStart-prev, options, edits);
554                     if(destIndex<0) {
555                         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
556                         return 0;
557                     }
558                 }
559             }
560 
561             if(titleStart<titleLimit) {
562                 /* titlecase c which is from [titleStart..titleLimit[ */
563                 csc.cpStart=titleStart;
564                 csc.cpLimit=titleLimit;
565                 const char16_t *s;
566                 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
567                 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
568                                        titleLimit-titleStart, options, edits);
569                 if(destIndex<0) {
570                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
571                     return 0;
572                 }
573 
574                 /* Special case Dutch IJ titlecasing */
575                 if (titleStart+1 < index &&
576                         caseLocale == UCASE_LOC_DUTCH) {
577                     if (c < 0) {
578                         c = ~c;
579                     }
580 
581                     if (c == u'I' || c == u'Í') {
582                         titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index,
583                                                        dest, destIndex, destCapacity, options,
584                                                        edits);
585                     }
586                 }
587 
588                 /* lowercase [titleLimit..index[ */
589                 if(titleLimit<index) {
590                     if((options&U_TITLECASE_NO_LOWERCASE)==0) {
591                         /* Normal operation: Lowercase the rest of the word. */
592                         destIndex+=
593                             toLower(
594                                 caseLocale, options,
595                                 (dest==nullptr) ? nullptr: dest+destIndex, destCapacity-destIndex,
596                                 src, &csc, titleLimit, index,
597                                 edits, errorCode);
598                         if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
599                             errorCode=U_ZERO_ERROR;
600                         }
601                         if(U_FAILURE(errorCode)) {
602                             return destIndex;
603                         }
604                     } else {
605                         /* Optionally just copy the rest of the word unchanged. */
606                         destIndex=appendUnchanged(dest, destIndex, destCapacity,
607                                                   src+titleLimit, index-titleLimit, options, edits);
608                         if(destIndex<0) {
609                             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
610                             return 0;
611                         }
612                     }
613                 }
614             }
615         }
616 
617         prev=index;
618     }
619 
620     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
621 }
622 
623 #endif  // !UCONFIG_NO_BREAK_ITERATION
624 
625 U_NAMESPACE_BEGIN
626 namespace GreekUpper {
627 
628 // Data generated by prototype code, see
629 // https://icu.unicode.org/design/case/greek-upper
630 // TODO: Move this data into ucase.icu.
631 static const uint16_t data0370[] = {
632     // U+0370..03FF
633     0x0370,
634     0x0370,
635     0x0372,
636     0x0372,
637     0,
638     0,
639     0x0376,
640     0x0376,
641     0,
642     0,
643     0x037A,
644     0x03FD,
645     0x03FE,
646     0x03FF,
647     0,
648     0x037F,
649     0,
650     0,
651     0,
652     0,
653     0,
654     0,
655     0x0391 | HAS_VOWEL | HAS_ACCENT,
656     0,
657     0x0395 | HAS_VOWEL | HAS_ACCENT,
658     0x0397 | HAS_VOWEL | HAS_ACCENT,
659     0x0399 | HAS_VOWEL | HAS_ACCENT,
660     0,
661     0x039F | HAS_VOWEL | HAS_ACCENT,
662     0,
663     0x03A5 | HAS_VOWEL | HAS_ACCENT,
664     0x03A9 | HAS_VOWEL | HAS_ACCENT,
665     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
666     0x0391 | HAS_VOWEL,
667     0x0392,
668     0x0393,
669     0x0394,
670     0x0395 | HAS_VOWEL,
671     0x0396,
672     0x0397 | HAS_VOWEL,
673     0x0398,
674     0x0399 | HAS_VOWEL,
675     0x039A,
676     0x039B,
677     0x039C,
678     0x039D,
679     0x039E,
680     0x039F | HAS_VOWEL,
681     0x03A0,
682     0x03A1,
683     0,
684     0x03A3,
685     0x03A4,
686     0x03A5 | HAS_VOWEL,
687     0x03A6,
688     0x03A7,
689     0x03A8,
690     0x03A9 | HAS_VOWEL,
691     0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
692     0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
693     0x0391 | HAS_VOWEL | HAS_ACCENT,
694     0x0395 | HAS_VOWEL | HAS_ACCENT,
695     0x0397 | HAS_VOWEL | HAS_ACCENT,
696     0x0399 | HAS_VOWEL | HAS_ACCENT,
697     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
698     0x0391 | HAS_VOWEL,
699     0x0392,
700     0x0393,
701     0x0394,
702     0x0395 | HAS_VOWEL,
703     0x0396,
704     0x0397 | HAS_VOWEL,
705     0x0398,
706     0x0399 | HAS_VOWEL,
707     0x039A,
708     0x039B,
709     0x039C,
710     0x039D,
711     0x039E,
712     0x039F | HAS_VOWEL,
713     0x03A0,
714     0x03A1,
715     0x03A3,
716     0x03A3,
717     0x03A4,
718     0x03A5 | HAS_VOWEL,
719     0x03A6,
720     0x03A7,
721     0x03A8,
722     0x03A9 | HAS_VOWEL,
723     0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
724     0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
725     0x039F | HAS_VOWEL | HAS_ACCENT,
726     0x03A5 | HAS_VOWEL | HAS_ACCENT,
727     0x03A9 | HAS_VOWEL | HAS_ACCENT,
728     0x03CF,
729     0x0392,
730     0x0398,
731     0x03D2,
732     0x03D2 | HAS_ACCENT,
733     0x03D2 | HAS_DIALYTIKA,
734     0x03A6,
735     0x03A0,
736     0x03CF,
737     0x03D8,
738     0x03D8,
739     0x03DA,
740     0x03DA,
741     0x03DC,
742     0x03DC,
743     0x03DE,
744     0x03DE,
745     0x03E0,
746     0x03E0,
747     0,
748     0,
749     0,
750     0,
751     0,
752     0,
753     0,
754     0,
755     0,
756     0,
757     0,
758     0,
759     0,
760     0,
761     0x039A,
762     0x03A1,
763     0x03F9,
764     0x037F,
765     0x03F4,
766     0x0395 | HAS_VOWEL,
767     0,
768     0x03F7,
769     0x03F7,
770     0x03F9,
771     0x03FA,
772     0x03FA,
773     0x03FC,
774     0x03FD,
775     0x03FE,
776     0x03FF,
777 };
778 
779 static const uint16_t data1F00[] = {
780     // U+1F00..1FFF
781     0x0391 | HAS_VOWEL,
782     0x0391 | HAS_VOWEL,
783     0x0391 | HAS_VOWEL | HAS_ACCENT,
784     0x0391 | HAS_VOWEL | HAS_ACCENT,
785     0x0391 | HAS_VOWEL | HAS_ACCENT,
786     0x0391 | HAS_VOWEL | HAS_ACCENT,
787     0x0391 | HAS_VOWEL | HAS_ACCENT,
788     0x0391 | HAS_VOWEL | HAS_ACCENT,
789     0x0391 | HAS_VOWEL,
790     0x0391 | HAS_VOWEL,
791     0x0391 | HAS_VOWEL | HAS_ACCENT,
792     0x0391 | HAS_VOWEL | HAS_ACCENT,
793     0x0391 | HAS_VOWEL | HAS_ACCENT,
794     0x0391 | HAS_VOWEL | HAS_ACCENT,
795     0x0391 | HAS_VOWEL | HAS_ACCENT,
796     0x0391 | HAS_VOWEL | HAS_ACCENT,
797     0x0395 | HAS_VOWEL,
798     0x0395 | HAS_VOWEL,
799     0x0395 | HAS_VOWEL | HAS_ACCENT,
800     0x0395 | HAS_VOWEL | HAS_ACCENT,
801     0x0395 | HAS_VOWEL | HAS_ACCENT,
802     0x0395 | HAS_VOWEL | HAS_ACCENT,
803     0,
804     0,
805     0x0395 | HAS_VOWEL,
806     0x0395 | HAS_VOWEL,
807     0x0395 | HAS_VOWEL | HAS_ACCENT,
808     0x0395 | HAS_VOWEL | HAS_ACCENT,
809     0x0395 | HAS_VOWEL | HAS_ACCENT,
810     0x0395 | HAS_VOWEL | HAS_ACCENT,
811     0,
812     0,
813     0x0397 | HAS_VOWEL,
814     0x0397 | HAS_VOWEL,
815     0x0397 | HAS_VOWEL | HAS_ACCENT,
816     0x0397 | HAS_VOWEL | HAS_ACCENT,
817     0x0397 | HAS_VOWEL | HAS_ACCENT,
818     0x0397 | HAS_VOWEL | HAS_ACCENT,
819     0x0397 | HAS_VOWEL | HAS_ACCENT,
820     0x0397 | HAS_VOWEL | HAS_ACCENT,
821     0x0397 | HAS_VOWEL,
822     0x0397 | HAS_VOWEL,
823     0x0397 | HAS_VOWEL | HAS_ACCENT,
824     0x0397 | HAS_VOWEL | HAS_ACCENT,
825     0x0397 | HAS_VOWEL | HAS_ACCENT,
826     0x0397 | HAS_VOWEL | HAS_ACCENT,
827     0x0397 | HAS_VOWEL | HAS_ACCENT,
828     0x0397 | HAS_VOWEL | HAS_ACCENT,
829     0x0399 | HAS_VOWEL,
830     0x0399 | HAS_VOWEL,
831     0x0399 | HAS_VOWEL | HAS_ACCENT,
832     0x0399 | HAS_VOWEL | HAS_ACCENT,
833     0x0399 | HAS_VOWEL | HAS_ACCENT,
834     0x0399 | HAS_VOWEL | HAS_ACCENT,
835     0x0399 | HAS_VOWEL | HAS_ACCENT,
836     0x0399 | HAS_VOWEL | HAS_ACCENT,
837     0x0399 | HAS_VOWEL,
838     0x0399 | HAS_VOWEL,
839     0x0399 | HAS_VOWEL | HAS_ACCENT,
840     0x0399 | HAS_VOWEL | HAS_ACCENT,
841     0x0399 | HAS_VOWEL | HAS_ACCENT,
842     0x0399 | HAS_VOWEL | HAS_ACCENT,
843     0x0399 | HAS_VOWEL | HAS_ACCENT,
844     0x0399 | HAS_VOWEL | HAS_ACCENT,
845     0x039F | HAS_VOWEL,
846     0x039F | HAS_VOWEL,
847     0x039F | HAS_VOWEL | HAS_ACCENT,
848     0x039F | HAS_VOWEL | HAS_ACCENT,
849     0x039F | HAS_VOWEL | HAS_ACCENT,
850     0x039F | HAS_VOWEL | HAS_ACCENT,
851     0,
852     0,
853     0x039F | HAS_VOWEL,
854     0x039F | HAS_VOWEL,
855     0x039F | HAS_VOWEL | HAS_ACCENT,
856     0x039F | HAS_VOWEL | HAS_ACCENT,
857     0x039F | HAS_VOWEL | HAS_ACCENT,
858     0x039F | HAS_VOWEL | HAS_ACCENT,
859     0,
860     0,
861     0x03A5 | HAS_VOWEL,
862     0x03A5 | HAS_VOWEL,
863     0x03A5 | HAS_VOWEL | HAS_ACCENT,
864     0x03A5 | HAS_VOWEL | HAS_ACCENT,
865     0x03A5 | HAS_VOWEL | HAS_ACCENT,
866     0x03A5 | HAS_VOWEL | HAS_ACCENT,
867     0x03A5 | HAS_VOWEL | HAS_ACCENT,
868     0x03A5 | HAS_VOWEL | HAS_ACCENT,
869     0,
870     0x03A5 | HAS_VOWEL,
871     0,
872     0x03A5 | HAS_VOWEL | HAS_ACCENT,
873     0,
874     0x03A5 | HAS_VOWEL | HAS_ACCENT,
875     0,
876     0x03A5 | HAS_VOWEL | HAS_ACCENT,
877     0x03A9 | HAS_VOWEL,
878     0x03A9 | HAS_VOWEL,
879     0x03A9 | HAS_VOWEL | HAS_ACCENT,
880     0x03A9 | HAS_VOWEL | HAS_ACCENT,
881     0x03A9 | HAS_VOWEL | HAS_ACCENT,
882     0x03A9 | HAS_VOWEL | HAS_ACCENT,
883     0x03A9 | HAS_VOWEL | HAS_ACCENT,
884     0x03A9 | HAS_VOWEL | HAS_ACCENT,
885     0x03A9 | HAS_VOWEL,
886     0x03A9 | HAS_VOWEL,
887     0x03A9 | HAS_VOWEL | HAS_ACCENT,
888     0x03A9 | HAS_VOWEL | HAS_ACCENT,
889     0x03A9 | HAS_VOWEL | HAS_ACCENT,
890     0x03A9 | HAS_VOWEL | HAS_ACCENT,
891     0x03A9 | HAS_VOWEL | HAS_ACCENT,
892     0x03A9 | HAS_VOWEL | HAS_ACCENT,
893     0x0391 | HAS_VOWEL | HAS_ACCENT,
894     0x0391 | HAS_VOWEL | HAS_ACCENT,
895     0x0395 | HAS_VOWEL | HAS_ACCENT,
896     0x0395 | HAS_VOWEL | HAS_ACCENT,
897     0x0397 | HAS_VOWEL | HAS_ACCENT,
898     0x0397 | HAS_VOWEL | HAS_ACCENT,
899     0x0399 | HAS_VOWEL | HAS_ACCENT,
900     0x0399 | HAS_VOWEL | HAS_ACCENT,
901     0x039F | HAS_VOWEL | HAS_ACCENT,
902     0x039F | HAS_VOWEL | HAS_ACCENT,
903     0x03A5 | HAS_VOWEL | HAS_ACCENT,
904     0x03A5 | HAS_VOWEL | HAS_ACCENT,
905     0x03A9 | HAS_VOWEL | HAS_ACCENT,
906     0x03A9 | HAS_VOWEL | HAS_ACCENT,
907     0,
908     0,
909     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
910     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
911     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
912     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
913     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
914     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
915     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
916     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
917     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
918     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
919     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
920     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
921     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
922     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
923     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
924     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
925     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
926     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
927     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
928     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
929     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
930     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
931     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
932     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
933     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
934     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
935     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
936     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
937     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
938     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
939     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
940     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
941     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
942     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
943     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
944     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
945     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
946     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
947     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
948     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
949     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
950     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
951     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
952     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
953     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
954     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
955     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
956     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
957     0x0391 | HAS_VOWEL,
958     0x0391 | HAS_VOWEL,
959     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
960     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
961     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
962     0,
963     0x0391 | HAS_VOWEL | HAS_ACCENT,
964     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
965     0x0391 | HAS_VOWEL,
966     0x0391 | HAS_VOWEL,
967     0x0391 | HAS_VOWEL | HAS_ACCENT,
968     0x0391 | HAS_VOWEL | HAS_ACCENT,
969     0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
970     0,
971     0x0399 | HAS_VOWEL,
972     0,
973     0,
974     0,
975     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
976     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
977     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
978     0,
979     0x0397 | HAS_VOWEL | HAS_ACCENT,
980     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
981     0x0395 | HAS_VOWEL | HAS_ACCENT,
982     0x0395 | HAS_VOWEL | HAS_ACCENT,
983     0x0397 | HAS_VOWEL | HAS_ACCENT,
984     0x0397 | HAS_VOWEL | HAS_ACCENT,
985     0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
986     0,
987     0,
988     0,
989     0x0399 | HAS_VOWEL,
990     0x0399 | HAS_VOWEL,
991     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
992     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
993     0,
994     0,
995     0x0399 | HAS_VOWEL | HAS_ACCENT,
996     0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
997     0x0399 | HAS_VOWEL,
998     0x0399 | HAS_VOWEL,
999     0x0399 | HAS_VOWEL | HAS_ACCENT,
1000     0x0399 | HAS_VOWEL | HAS_ACCENT,
1001     0,
1002     0,
1003     0,
1004     0,
1005     0x03A5 | HAS_VOWEL,
1006     0x03A5 | HAS_VOWEL,
1007     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
1008     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
1009     0x03A1,
1010     0x03A1,
1011     0x03A5 | HAS_VOWEL | HAS_ACCENT,
1012     0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
1013     0x03A5 | HAS_VOWEL,
1014     0x03A5 | HAS_VOWEL,
1015     0x03A5 | HAS_VOWEL | HAS_ACCENT,
1016     0x03A5 | HAS_VOWEL | HAS_ACCENT,
1017     0x03A1,
1018     0,
1019     0,
1020     0,
1021     0,
1022     0,
1023     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
1024     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
1025     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
1026     0,
1027     0x03A9 | HAS_VOWEL | HAS_ACCENT,
1028     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
1029     0x039F | HAS_VOWEL | HAS_ACCENT,
1030     0x039F | HAS_VOWEL | HAS_ACCENT,
1031     0x03A9 | HAS_VOWEL | HAS_ACCENT,
1032     0x03A9 | HAS_VOWEL | HAS_ACCENT,
1033     0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
1034     0,
1035     0,
1036     0,
1037 };
1038 
1039 // U+2126 Ohm sign
1040 static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
1041 
getLetterData(UChar32 c)1042 uint32_t getLetterData(UChar32 c) {
1043     if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
1044         return 0;
1045     } else if (c <= 0x3ff) {
1046         return data0370[c - 0x370];
1047     } else if (c <= 0x1fff) {
1048         return data1F00[c - 0x1f00];
1049     } else if (c == 0x2126) {
1050         return data2126;
1051     } else {
1052         return 0;
1053     }
1054 }
1055 
getDiacriticData(UChar32 c)1056 uint32_t getDiacriticData(UChar32 c) {
1057     switch (c) {
1058     case 0x0300:  // varia
1059     case 0x0301:  // tonos = oxia
1060     case 0x0342:  // perispomeni
1061     case 0x0302:  // circumflex can look like perispomeni
1062     case 0x0303:  // tilde can look like perispomeni
1063     case 0x0311:  // inverted breve can look like perispomeni
1064         return HAS_ACCENT;
1065     case 0x0308:  // dialytika = diaeresis
1066         return HAS_COMBINING_DIALYTIKA;
1067     case 0x0344:  // dialytika tonos
1068         return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
1069     case 0x0345:  // ypogegrammeni = iota subscript
1070         return HAS_YPOGEGRAMMENI;
1071     case 0x0304:  // macron
1072     case 0x0306:  // breve
1073     case 0x0313:  // comma above
1074     case 0x0314:  // reversed comma above
1075     case 0x0343:  // koronis
1076         return HAS_OTHER_GREEK_DIACRITIC;
1077     default:
1078         return 0;
1079     }
1080 }
1081 
isFollowedByCasedLetter(const char16_t * s,int32_t i,int32_t length)1082 UBool isFollowedByCasedLetter(const char16_t *s, int32_t i, int32_t length) {
1083     while (i < length) {
1084         UChar32 c;
1085         U16_NEXT(s, i, length, c);
1086         int32_t type = ucase_getTypeOrIgnorable(c);
1087         if ((type & UCASE_IGNORABLE) != 0) {
1088             // Case-ignorable, continue with the loop.
1089         } else if (type != UCASE_NONE) {
1090             return true;  // Followed by cased letter.
1091         } else {
1092             return false;  // Uncased and not case-ignorable.
1093         }
1094     }
1095     return false;  // Not followed by cased letter.
1096 }
1097 
1098 /**
1099  * Greek string uppercasing with a state machine.
1100  * Probably simpler than a stateless function that has to figure out complex context-before
1101  * for each character.
1102  * TODO: Try to re-consolidate one way or another with the non-Greek function.
1103  */
toUpper(uint32_t options,char16_t * dest,int32_t destCapacity,const char16_t * src,int32_t srcLength,Edits * edits,UErrorCode & errorCode)1104 int32_t toUpper(uint32_t options,
1105                 char16_t *dest, int32_t destCapacity,
1106                 const char16_t *src, int32_t srcLength,
1107                 Edits *edits,
1108                 UErrorCode &errorCode) {
1109     int32_t destIndex=0;
1110     uint32_t state = 0;
1111     for (int32_t i = 0; i < srcLength;) {
1112         int32_t nextIndex = i;
1113         UChar32 c;
1114         U16_NEXT(src, nextIndex, srcLength, c);
1115         uint32_t nextState = 0;
1116         int32_t type = ucase_getTypeOrIgnorable(c);
1117         if ((type & UCASE_IGNORABLE) != 0) {
1118             // c is case-ignorable
1119             nextState |= (state & AFTER_CASED);
1120         } else if (type != UCASE_NONE) {
1121             // c is cased
1122             nextState |= AFTER_CASED;
1123         }
1124         uint32_t data = getLetterData(c);
1125         if (data > 0) {
1126             uint32_t upper = data & UPPER_MASK;
1127             // Add a dialytika to this iota or ypsilon vowel
1128             // if we removed a tonos from the previous vowel,
1129             // and that previous vowel did not also have (or gain) a dialytika.
1130             // Adding one only to the final vowel in a longer sequence
1131             // (which does not occur in normal writing) would require lookahead.
1132             // Set the same flag as for preserving an existing dialytika.
1133             if ((data & HAS_VOWEL) != 0 &&
1134                 (state & (AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT | AFTER_VOWEL_WITH_COMBINING_ACCENT)) !=
1135                     0 &&
1136                 (upper == 0x399 || upper == 0x3A5)) {
1137                 data |= (state & AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT) ? HAS_DIALYTIKA
1138                                                                       : HAS_COMBINING_DIALYTIKA;
1139             }
1140             int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
1141             if ((data & HAS_YPOGEGRAMMENI) != 0) {
1142                 numYpogegrammeni = 1;
1143             }
1144             const UBool hasPrecomposedAccent = (data & HAS_ACCENT) != 0;
1145             // Skip combining diacritics after this Greek letter.
1146             while (nextIndex < srcLength) {
1147                 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
1148                 if (diacriticData != 0) {
1149                     data |= diacriticData;
1150                     if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
1151                         ++numYpogegrammeni;
1152                     }
1153                     ++nextIndex;
1154                 } else {
1155                     break;  // not a Greek diacritic
1156                 }
1157             }
1158             if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
1159                 nextState |= hasPrecomposedAccent ? AFTER_VOWEL_WITH_PRECOMPOSED_ACCENT
1160                                                   : AFTER_VOWEL_WITH_COMBINING_ACCENT;
1161             }
1162             // Map according to Greek rules.
1163             UBool addTonos = false;
1164             if (upper == 0x397 &&
1165                     (data & HAS_ACCENT) != 0 &&
1166                     numYpogegrammeni == 0 &&
1167                     (state & AFTER_CASED) == 0 &&
1168                     !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
1169                 // Keep disjunctive "or" with (only) a tonos.
1170                 // We use the same "word boundary" conditions as for the Final_Sigma test.
1171                 if (hasPrecomposedAccent) {
1172                     upper = 0x389;  // Preserve the precomposed form.
1173                 } else {
1174                     addTonos = true;
1175                 }
1176             } else if ((data & HAS_DIALYTIKA) != 0) {
1177                 // Preserve a vowel with dialytika in precomposed form if it exists.
1178                 if (upper == 0x399) {
1179                     upper = 0x3AA;
1180                     data &= ~HAS_EITHER_DIALYTIKA;
1181                 } else if (upper == 0x3A5) {
1182                     upper = 0x3AB;
1183                     data &= ~HAS_EITHER_DIALYTIKA;
1184                 }
1185             }
1186 
1187             UBool change;
1188             if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) {
1189                 change = true;  // common, simple usage
1190             } else {
1191                 // Find out first whether we are changing the text.
1192                 change = src[i] != upper || numYpogegrammeni > 0;
1193                 int32_t i2 = i + 1;
1194                 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
1195                     change |= i2 >= nextIndex || src[i2] != 0x308;
1196                     ++i2;
1197                 }
1198                 if (addTonos) {
1199                     change |= i2 >= nextIndex || src[i2] != 0x301;
1200                     ++i2;
1201                 }
1202                 int32_t oldLength = nextIndex - i;
1203                 int32_t newLength = (i2 - i) + numYpogegrammeni;
1204                 change |= oldLength != newLength;
1205                 if (change) {
1206                     if (edits != nullptr) {
1207                         edits->addReplace(oldLength, newLength);
1208                     }
1209                 } else {
1210                     if (edits != nullptr) {
1211                         edits->addUnchanged(oldLength);
1212                     }
1213                     // Write unchanged text?
1214                     change = (options & U_OMIT_UNCHANGED_TEXT) == 0;
1215                 }
1216             }
1217 
1218             if (change) {
1219                 destIndex=appendUChar(dest, destIndex, destCapacity, (char16_t)upper);
1220                 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
1221                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
1222                 }
1223                 if (destIndex >= 0 && addTonos) {
1224                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
1225                 }
1226                 while (destIndex >= 0 && numYpogegrammeni > 0) {
1227                     destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
1228                     --numYpogegrammeni;
1229                 }
1230                 if(destIndex<0) {
1231                     errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1232                     return 0;
1233                 }
1234             }
1235         } else {
1236             const char16_t *s;
1237             c=ucase_toFullUpper(c, nullptr, nullptr, &s, UCASE_LOC_GREEK);
1238             destIndex = appendResult(dest, destIndex, destCapacity, c, s,
1239                                      nextIndex - i, options, edits);
1240             if (destIndex < 0) {
1241                 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1242                 return 0;
1243             }
1244         }
1245         i = nextIndex;
1246         state = nextState;
1247     }
1248 
1249     return destIndex;
1250 }
1251 
1252 }  // namespace GreekUpper
1253 U_NAMESPACE_END
1254 
1255 /* functions available in the common library (for unistr_case.cpp) */
1256 
1257 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToLower(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED char16_t * dest,int32_t destCapacity,const char16_t * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1258 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1259                          char16_t *dest, int32_t destCapacity,
1260                          const char16_t *src, int32_t srcLength,
1261                          icu::Edits *edits,
1262                          UErrorCode &errorCode) {
1263     UCaseContext csc=UCASECONTEXT_INITIALIZER;
1264     csc.p=(void *)src;
1265     csc.limit=srcLength;
1266     int32_t destIndex = toLower(
1267         caseLocale, options,
1268         dest, destCapacity,
1269         src, &csc, 0, srcLength,
1270         edits, errorCode);
1271     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1272 }
1273 
1274 U_CFUNC int32_t U_CALLCONV
ustrcase_internalToUpper(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED char16_t * dest,int32_t destCapacity,const char16_t * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1275 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1276                          char16_t *dest, int32_t destCapacity,
1277                          const char16_t *src, int32_t srcLength,
1278                          icu::Edits *edits,
1279                          UErrorCode &errorCode) {
1280     int32_t destIndex;
1281     if (caseLocale == UCASE_LOC_GREEK) {
1282         destIndex = GreekUpper::toUpper(options, dest, destCapacity,
1283                                         src, srcLength, edits, errorCode);
1284     } else {
1285         UCaseContext csc=UCASECONTEXT_INITIALIZER;
1286         csc.p=(void *)src;
1287         csc.limit=srcLength;
1288         destIndex = toUpper(
1289             caseLocale, options,
1290             dest, destCapacity,
1291             src, &csc, srcLength,
1292             edits, errorCode);
1293     }
1294     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1295 }
1296 
1297 U_CFUNC int32_t U_CALLCONV
ustrcase_internalFold(int32_t,uint32_t options,UCASEMAP_BREAK_ITERATOR_UNUSED char16_t * dest,int32_t destCapacity,const char16_t * src,int32_t srcLength,icu::Edits * edits,UErrorCode & errorCode)1298 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1299                       char16_t *dest, int32_t destCapacity,
1300                       const char16_t *src, int32_t srcLength,
1301                       icu::Edits *edits,
1302                       UErrorCode &errorCode) {
1303     int32_t destIndex = toLower(
1304         -1, options,
1305         dest, destCapacity,
1306         src, nullptr, 0, srcLength,
1307         edits, errorCode);
1308     return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1309 }
1310 
1311 U_CFUNC int32_t
ustrcase_map(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char16_t * dest,int32_t destCapacity,const char16_t * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,icu::Edits * edits,UErrorCode & errorCode)1312 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1313              char16_t *dest, int32_t destCapacity,
1314              const char16_t *src, int32_t srcLength,
1315              UStringCaseMapper *stringCaseMapper,
1316              icu::Edits *edits,
1317              UErrorCode &errorCode) {
1318     int32_t destLength;
1319 
1320     /* check argument values */
1321     if(U_FAILURE(errorCode)) {
1322         return 0;
1323     }
1324     if( destCapacity<0 ||
1325         (dest==nullptr && destCapacity>0) ||
1326         src==nullptr ||
1327         srcLength<-1
1328     ) {
1329         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1330         return 0;
1331     }
1332 
1333     /* get the string length */
1334     if(srcLength==-1) {
1335         srcLength=u_strlen(src);
1336     }
1337 
1338     /* check for overlapping source and destination */
1339     if( dest!=nullptr &&
1340         ((src>=dest && src<(dest+destCapacity)) ||
1341          (dest>=src && dest<(src+srcLength)))
1342     ) {
1343         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1344         return 0;
1345     }
1346 
1347     if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
1348         edits->reset();
1349     }
1350     destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1351                                 dest, destCapacity, src, srcLength, edits, errorCode);
1352     return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1353 }
1354 
1355 U_CFUNC int32_t
ustrcase_mapWithOverlap(int32_t caseLocale,uint32_t options,UCASEMAP_BREAK_ITERATOR_PARAM char16_t * dest,int32_t destCapacity,const char16_t * src,int32_t srcLength,UStringCaseMapper * stringCaseMapper,UErrorCode & errorCode)1356 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1357                         char16_t *dest, int32_t destCapacity,
1358                         const char16_t *src, int32_t srcLength,
1359                         UStringCaseMapper *stringCaseMapper,
1360                         UErrorCode &errorCode) {
1361     char16_t buffer[300];
1362     char16_t *temp;
1363 
1364     int32_t destLength;
1365 
1366     /* check argument values */
1367     if(U_FAILURE(errorCode)) {
1368         return 0;
1369     }
1370     if( destCapacity<0 ||
1371         (dest==nullptr && destCapacity>0) ||
1372         src==nullptr ||
1373         srcLength<-1
1374     ) {
1375         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1376         return 0;
1377     }
1378 
1379     /* get the string length */
1380     if(srcLength==-1) {
1381         srcLength=u_strlen(src);
1382     }
1383 
1384     /* check for overlapping source and destination */
1385     if( dest!=nullptr &&
1386         ((src>=dest && src<(dest+destCapacity)) ||
1387          (dest>=src && dest<(src+srcLength)))
1388     ) {
1389         /* overlap: provide a temporary destination buffer and later copy the result */
1390         if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1391             /* the stack buffer is large enough */
1392             temp=buffer;
1393         } else {
1394             /* allocate a buffer */
1395             temp=(char16_t *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1396             if(temp==nullptr) {
1397                 errorCode=U_MEMORY_ALLOCATION_ERROR;
1398                 return 0;
1399             }
1400         }
1401     } else {
1402         temp=dest;
1403     }
1404 
1405     destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1406                                 temp, destCapacity, src, srcLength, nullptr, errorCode);
1407     if(temp!=dest) {
1408         /* copy the result string to the destination buffer */
1409         if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
1410             u_memmove(dest, temp, destLength);
1411         }
1412         if(temp!=buffer) {
1413             uprv_free(temp);
1414         }
1415     }
1416 
1417     return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1418 }
1419 
1420 /* public API functions */
1421 
1422 U_CAPI int32_t U_EXPORT2
u_strFoldCase(char16_t * dest,int32_t destCapacity,const char16_t * src,int32_t srcLength,uint32_t options,UErrorCode * pErrorCode)1423 u_strFoldCase(char16_t *dest, int32_t destCapacity,
1424               const char16_t *src, int32_t srcLength,
1425               uint32_t options,
1426               UErrorCode *pErrorCode) {
1427     return ustrcase_mapWithOverlap(
1428         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1429         dest, destCapacity,
1430         src, srcLength,
1431         ustrcase_internalFold, *pErrorCode);
1432 }
1433 
1434 U_NAMESPACE_BEGIN
1435 
fold(uint32_t options,const char16_t * src,int32_t srcLength,char16_t * dest,int32_t destCapacity,Edits * edits,UErrorCode & errorCode)1436 int32_t CaseMap::fold(
1437         uint32_t options,
1438         const char16_t *src, int32_t srcLength,
1439         char16_t *dest, int32_t destCapacity, Edits *edits,
1440         UErrorCode &errorCode) {
1441     return ustrcase_map(
1442         UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1443         dest, destCapacity,
1444         src, srcLength,
1445         ustrcase_internalFold, edits, errorCode);
1446 }
1447 
1448 U_NAMESPACE_END
1449 
1450 /* case-insensitive string comparisons -------------------------------------- */
1451 
1452 /*
1453  * This function is a copy of unorm_cmpEquivFold() minus the parts for
1454  * canonical equivalence.
1455  * Keep the functions in sync, and see there for how this works.
1456  * The duplication is for modularization:
1457  * It makes caseless (but not canonical caseless) matches independent of
1458  * the normalization code.
1459  */
1460 
/*
 * Stack element that saves the source/decomposition pointers
 * of the previous level.
 */
struct CmpEquivLevel {
    const char16_t *start;  /* start of the saved level */
    const char16_t *s;      /* saved current position */
    const char16_t *limit;  /* limit of the saved level */
};
typedef struct CmpEquivLevel CmpEquivLevel;
1466 
1467 /**
1468  * Internal implementation code comparing string with case fold.
1469  * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
1470  *
1471  * @param s1            input string 1
1472  * @param length1       length of string 1, or -1 (NUL terminated)
1473  * @param s2            input string 2
1474  * @param length2       length of string 2, or -1 (NUL terminated)
1475  * @param options       compare options
1476  * @param matchLen1     (output) length of partial prefix match in s1
1477  * @param matchLen2     (output) length of partial prefix match in s2
1478  * @param pErrorCode    receives error status
1479  * @return The result of comparison
1480  */
_cmpFold(const char16_t * s1,int32_t length1,const char16_t * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1481 static int32_t _cmpFold(
1482             const char16_t *s1, int32_t length1,
1483             const char16_t *s2, int32_t length2,
1484             uint32_t options,
1485             int32_t *matchLen1, int32_t *matchLen2,
1486             UErrorCode *pErrorCode) {
1487     int32_t cmpRes = 0;
1488 
1489     /* current-level start/limit - s1/s2 as current */
1490     const char16_t *start1, *start2, *limit1, *limit2;
1491 
1492     /* points to the original start address */
1493     const char16_t *org1, *org2;
1494 
1495     /* points to the end of match + 1 */
1496     const char16_t *m1, *m2;
1497 
1498     /* case folding variables */
1499     const char16_t *p;
1500     int32_t length;
1501 
1502     /* stacks of previous-level start/current/limit */
1503     CmpEquivLevel stack1[2], stack2[2];
1504 
1505     /* case folding buffers, only use current-level start/limit */
1506     char16_t fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1507 
1508     /* track which is the current level per string */
1509     int32_t level1, level2;
1510 
1511     /* current code units, and code points for lookups */
1512     UChar32 c1, c2, cp1, cp2;
1513 
1514     /* no argument error checking because this itself is not an API */
1515 
1516     /*
1517      * assume that at least the option U_COMPARE_IGNORE_CASE is set
1518      * otherwise this function would have to behave exactly as uprv_strCompare()
1519      */
1520     if(U_FAILURE(*pErrorCode)) {
1521         return 0;
1522     }
1523 
1524     /* initialize */
1525     if(matchLen1) {
1526         U_ASSERT(matchLen2 !=nullptr);
1527         *matchLen1=0;
1528         *matchLen2=0;
1529     }
1530 
1531     start1=m1=org1=s1;
1532     if(length1==-1) {
1533         limit1=nullptr;
1534     } else {
1535         limit1=s1+length1;
1536     }
1537 
1538     start2=m2=org2=s2;
1539     if(length2==-1) {
1540         limit2=nullptr;
1541     } else {
1542         limit2=s2+length2;
1543     }
1544 
1545     level1=level2=0;
1546     c1=c2=-1;
1547 
1548     /* comparison loop */
1549     for(;;) {
1550         /*
1551          * here a code unit value of -1 means "get another code unit"
1552          * below it will mean "this source is finished"
1553          */
1554 
1555         if(c1<0) {
1556             /* get next code unit from string 1, post-increment */
1557             for(;;) {
1558                 if(s1==limit1 || ((c1=*s1)==0 && (limit1==nullptr || (options&_STRNCMP_STYLE)))) {
1559                     if(level1==0) {
1560                         c1=-1;
1561                         break;
1562                     }
1563                 } else {
1564                     ++s1;
1565                     break;
1566                 }
1567 
1568                 /* reached end of level buffer, pop one level */
1569                 do {
1570                     --level1;
1571                     start1=stack1[level1].start;    /*Not uninitialized*/
1572                 } while(start1==nullptr);
1573                 s1=stack1[level1].s;                /*Not uninitialized*/
1574                 limit1=stack1[level1].limit;        /*Not uninitialized*/
1575             }
1576         }
1577 
1578         if(c2<0) {
1579             /* get next code unit from string 2, post-increment */
1580             for(;;) {
1581                 if(s2==limit2 || ((c2=*s2)==0 && (limit2==nullptr || (options&_STRNCMP_STYLE)))) {
1582                     if(level2==0) {
1583                         c2=-1;
1584                         break;
1585                     }
1586                 } else {
1587                     ++s2;
1588                     break;
1589                 }
1590 
1591                 /* reached end of level buffer, pop one level */
1592                 do {
1593                     --level2;
1594                     start2=stack2[level2].start;    /*Not uninitialized*/
1595                 } while(start2==nullptr);
1596                 s2=stack2[level2].s;                /*Not uninitialized*/
1597                 limit2=stack2[level2].limit;        /*Not uninitialized*/
1598             }
1599         }
1600 
1601         /*
1602          * compare c1 and c2
1603          * either variable c1, c2 is -1 only if the corresponding string is finished
1604          */
1605         if(c1==c2) {
1606             const char16_t *next1, *next2;
1607 
1608             if(c1<0) {
1609                 cmpRes=0;   /* c1==c2==-1 indicating end of strings */
1610                 break;
1611             }
1612 
1613             /*
1614              * Note: Move the match positions in both strings at the same time
1615              *      only when corresponding code point(s) in the original strings
1616              *      are fully consumed. For example, when comparing s1="Fust" and
1617              *      s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1618              *      the first code point in the case-folded data. But the second "s"
1619              *      has no matching code point in s1, so this implementation returns
1620              *      2 as the prefix match length ("Fu").
1621              */
1622             next1=next2=nullptr;
1623             if(level1==0) {
1624                 next1=s1;
1625             } else if(s1==limit1) {
1626                 /* Note: This implementation only use a single level of stack.
1627                  *      If this code needs to be changed to use multiple levels
1628                  *      of stacks, the code above should check if the current
1629                  *      code is at the end of all stacks.
1630                  */
1631                 U_ASSERT(level1==1);
1632 
1633                 /* is s1 at the end of the current stack? */
1634                 next1=stack1[0].s;
1635             }
1636 
1637             if (next1!=nullptr) {
1638                 if(level2==0) {
1639                     next2=s2;
1640                 } else if(s2==limit2) {
1641                     U_ASSERT(level2==1);
1642 
1643                     /* is s2 at the end of the current stack? */
1644                     next2=stack2[0].s;
1645                 }
1646                 if(next2!=nullptr) {
1647                     m1=next1;
1648                     m2=next2;
1649                 }
1650             }
1651             c1=c2=-1;       /* make us fetch new code units */
1652             continue;
1653         } else if(c1<0) {
1654             cmpRes=-1;      /* string 1 ends before string 2 */
1655             break;
1656         } else if(c2<0) {
1657             cmpRes=1;       /* string 2 ends before string 1 */
1658             break;
1659         }
1660         /* c1!=c2 && c1>=0 && c2>=0 */
1661 
1662         /* get complete code points for c1, c2 for lookups if either is a surrogate */
1663         cp1=c1;
1664         if(U_IS_SURROGATE(c1)) {
1665             char16_t c;
1666 
1667             if(U_IS_SURROGATE_LEAD(c1)) {
1668                 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1669                     /* advance ++s1; only below if cp1 decomposes/case-folds */
1670                     cp1=U16_GET_SUPPLEMENTARY(c1, c);
1671                 }
1672             } else /* isTrail(c1) */ {
1673                 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1674                     cp1=U16_GET_SUPPLEMENTARY(c, c1);
1675                 }
1676             }
1677         }
1678 
1679         cp2=c2;
1680         if(U_IS_SURROGATE(c2)) {
1681             char16_t c;
1682 
1683             if(U_IS_SURROGATE_LEAD(c2)) {
1684                 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1685                     /* advance ++s2; only below if cp2 decomposes/case-folds */
1686                     cp2=U16_GET_SUPPLEMENTARY(c2, c);
1687                 }
1688             } else /* isTrail(c2) */ {
1689                 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1690                     cp2=U16_GET_SUPPLEMENTARY(c, c2);
1691                 }
1692             }
1693         }
1694 
1695         /*
1696          * go down one level for each string
1697          * continue with the main loop as soon as there is a real change
1698          */
1699 
1700         if( level1==0 &&
1701             (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
1702         ) {
1703             /* cp1 case-folds to the code point "length" or to p[length] */
1704             if(U_IS_SURROGATE(c1)) {
1705                 if(U_IS_SURROGATE_LEAD(c1)) {
1706                     /* advance beyond source surrogate pair if it case-folds */
1707                     ++s1;
1708                 } else /* isTrail(c1) */ {
1709                     /*
1710                      * we got a supplementary code point when hitting its trail surrogate,
1711                      * therefore the lead surrogate must have been the same as in the other string;
1712                      * compare this decomposition with the lead surrogate in the other string
1713                      * remember that this simulates bulk text replacement:
1714                      * the decomposition would replace the entire code point
1715                      */
1716                     --s2;
1717                     --m2;
1718                     c2=*(s2-1);
1719                 }
1720             }
1721 
1722             /* push current level pointers */
1723             stack1[0].start=start1;
1724             stack1[0].s=s1;
1725             stack1[0].limit=limit1;
1726             ++level1;
1727 
1728             /* copy the folding result to fold1[] */
1729             if(length<=UCASE_MAX_STRING_LENGTH) {
1730                 u_memcpy(fold1, p, length);
1731             } else {
1732                 int32_t i=0;
1733                 U16_APPEND_UNSAFE(fold1, i, length);
1734                 length=i;
1735             }
1736 
1737             /* set next level pointers to case folding */
1738             start1=s1=fold1;
1739             limit1=fold1+length;
1740 
1741             /* get ready to read from decomposition, continue with loop */
1742             c1=-1;
1743             continue;
1744         }
1745 
1746         if( level2==0 &&
1747             (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
1748         ) {
1749             /* cp2 case-folds to the code point "length" or to p[length] */
1750             if(U_IS_SURROGATE(c2)) {
1751                 if(U_IS_SURROGATE_LEAD(c2)) {
1752                     /* advance beyond source surrogate pair if it case-folds */
1753                     ++s2;
1754                 } else /* isTrail(c2) */ {
1755                     /*
1756                      * we got a supplementary code point when hitting its trail surrogate,
1757                      * therefore the lead surrogate must have been the same as in the other string;
1758                      * compare this decomposition with the lead surrogate in the other string
1759                      * remember that this simulates bulk text replacement:
1760                      * the decomposition would replace the entire code point
1761                      */
1762                     --s1;
1763                     --m2;
1764                     c1=*(s1-1);
1765                 }
1766             }
1767 
1768             /* push current level pointers */
1769             stack2[0].start=start2;
1770             stack2[0].s=s2;
1771             stack2[0].limit=limit2;
1772             ++level2;
1773 
1774             /* copy the folding result to fold2[] */
1775             if(length<=UCASE_MAX_STRING_LENGTH) {
1776                 u_memcpy(fold2, p, length);
1777             } else {
1778                 int32_t i=0;
1779                 U16_APPEND_UNSAFE(fold2, i, length);
1780                 length=i;
1781             }
1782 
1783             /* set next level pointers to case folding */
1784             start2=s2=fold2;
1785             limit2=fold2+length;
1786 
1787             /* get ready to read from decomposition, continue with loop */
1788             c2=-1;
1789             continue;
1790         }
1791 
1792         /*
1793          * no decomposition/case folding, max level for both sides:
1794          * return difference result
1795          *
1796          * code point order comparison must not just return cp1-cp2
1797          * because when single surrogates are present then the surrogate pairs
1798          * that formed cp1 and cp2 may be from different string indexes
1799          *
1800          * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1801          * c1=d800 cp1=10001 c2=dc00 cp2=10000
1802          * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1803          *
1804          * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1805          * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
1806          * so we have slightly different pointer/start/limit comparisons here
1807          */
1808 
1809         if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1810             /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1811             if(
1812                 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1813                 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1814             ) {
1815                 /* part of a surrogate pair, leave >=d800 */
1816             } else {
1817                 /* BMP code point - may be surrogate code point - make <d800 */
1818                 c1-=0x2800;
1819             }
1820 
1821             if(
1822                 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1823                 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1824             ) {
1825                 /* part of a surrogate pair, leave >=d800 */
1826             } else {
1827                 /* BMP code point - may be surrogate code point - make <d800 */
1828                 c2-=0x2800;
1829             }
1830         }
1831 
1832         cmpRes=c1-c2;
1833         break;
1834     }
1835 
1836     if(matchLen1) {
1837         *matchLen1=static_cast<int32_t>(m1-org1);
1838         *matchLen2=static_cast<int32_t>(m2-org2);
1839     }
1840     return cmpRes;
1841 }
1842 
1843 /* internal function */
1844 U_CFUNC int32_t
u_strcmpFold(const char16_t * s1,int32_t length1,const char16_t * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1845 u_strcmpFold(const char16_t *s1, int32_t length1,
1846              const char16_t *s2, int32_t length2,
1847              uint32_t options,
1848              UErrorCode *pErrorCode) {
1849     return _cmpFold(s1, length1, s2, length2, options, nullptr, nullptr, pErrorCode);
1850 }
1851 
1852 /* public API functions */
1853 
1854 U_CAPI int32_t U_EXPORT2
u_strCaseCompare(const char16_t * s1,int32_t length1,const char16_t * s2,int32_t length2,uint32_t options,UErrorCode * pErrorCode)1855 u_strCaseCompare(const char16_t *s1, int32_t length1,
1856                  const char16_t *s2, int32_t length2,
1857                  uint32_t options,
1858                  UErrorCode *pErrorCode) {
1859     /* argument checking */
1860     if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1861         return 0;
1862     }
1863     if(s1==nullptr || length1<-1 || s2==nullptr || length2<-1) {
1864         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1865         return 0;
1866     }
1867     return u_strcmpFold(s1, length1, s2, length2,
1868                         options|U_COMPARE_IGNORE_CASE,
1869                         pErrorCode);
1870 }
1871 
1872 U_CAPI int32_t U_EXPORT2
u_strcasecmp(const char16_t * s1,const char16_t * s2,uint32_t options)1873 u_strcasecmp(const char16_t *s1, const char16_t *s2, uint32_t options) {
1874     UErrorCode errorCode=U_ZERO_ERROR;
1875     return u_strcmpFold(s1, -1, s2, -1,
1876                         options|U_COMPARE_IGNORE_CASE,
1877                         &errorCode);
1878 }
1879 
1880 U_CAPI int32_t U_EXPORT2
u_memcasecmp(const char16_t * s1,const char16_t * s2,int32_t length,uint32_t options)1881 u_memcasecmp(const char16_t *s1, const char16_t *s2, int32_t length, uint32_t options) {
1882     UErrorCode errorCode=U_ZERO_ERROR;
1883     return u_strcmpFold(s1, length, s2, length,
1884                         options|U_COMPARE_IGNORE_CASE,
1885                         &errorCode);
1886 }
1887 
1888 U_CAPI int32_t U_EXPORT2
u_strncasecmp(const char16_t * s1,const char16_t * s2,int32_t n,uint32_t options)1889 u_strncasecmp(const char16_t *s1, const char16_t *s2, int32_t n, uint32_t options) {
1890     UErrorCode errorCode=U_ZERO_ERROR;
1891     return u_strcmpFold(s1, n, s2, n,
1892                         options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1893                         &errorCode);
1894 }
1895 
1896 /* internal API - detect length of shared prefix */
1897 U_CAPI void
u_caseInsensitivePrefixMatch(const char16_t * s1,int32_t length1,const char16_t * s2,int32_t length2,uint32_t options,int32_t * matchLen1,int32_t * matchLen2,UErrorCode * pErrorCode)1898 u_caseInsensitivePrefixMatch(const char16_t *s1, int32_t length1,
1899                              const char16_t *s2, int32_t length2,
1900                              uint32_t options,
1901                              int32_t *matchLen1, int32_t *matchLen2,
1902                              UErrorCode *pErrorCode) {
1903     _cmpFold(s1, length1, s2, length2, options,
1904         matchLen1, matchLen2, pErrorCode);
1905 }
1906