• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucase.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug30
16 *   created by: Markus W. Scherer
17 *
18 *   Low-level Unicode character/string case mapping code.
19 *   Much code moved here (and modified) from uchar.c.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/utf16.h"
26 #include "cmemory.h"
27 #include "uassert.h"
28 #include "ucase.h"
29 #include "umutex.h"
30 #include "utrie2.h"
31 
32 /* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
33 #define INCLUDED_FROM_UCASE_CPP
34 #include "ucase_props_data.h"
35 
36 /* set of property starts for UnicodeSet ------------------------------------ */
37 
38 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)39 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
40     /* add the start code point to the USet */
41     const USetAdder *sa=(const USetAdder *)context;
42     sa->add(sa->set, start);
43     return true;
44 }
45 
46 U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder * sa,UErrorCode * pErrorCode)47 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
48     if(U_FAILURE(*pErrorCode)) {
49         return;
50     }
51 
52     /* add the start code point of each same-value range of the trie */
53     utrie2_enum(&ucase_props_singleton.trie, nullptr, _enumPropertyStartsRange, sa);
54 
55     /* add code points with hardcoded properties, plus the ones following them */
56 
57     /* (none right now, see comment below) */
58 
59     /*
60      * Omit code points with hardcoded specialcasing properties
61      * because we do not build property UnicodeSets for them right now.
62      */
63 }
64 
65 /* data access primitives --------------------------------------------------- */
66 
67 U_CAPI const struct UCaseProps * U_EXPORT2
ucase_getSingleton(int32_t * pExceptionsLength,int32_t * pUnfoldLength)68 ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
69     *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
70     *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
71     return &ucase_props_singleton;
72 }
73 
74 U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie()75 ucase_getTrie() {
76     return &ucase_props_singleton.trie;
77 }
78 
/*
 * Pointer into (csp)->exceptions for a props word:
 * the element offset is the props value shifted right by UCASE_EXC_SHIFT.
 * Callers in this file read the exceptions word there (excWord=*pe++).
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
80 
/*
 * number of bits in an 8-bit integer value:
 * flagsOffset[b] is the count of 1-bits in the byte value b (0..255).
 * SLOT_OFFSET() indexes this table with the slot-flag bits below a slot
 * index, which yields the number of present slots preceding that slot.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
100 
/* Nonzero if bit idx is set in flags, i.e. the optional slot idx is present. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/*
 * Number of set bits in flags below bit idx (looked up in flagsOffset[]),
 * i.e. how many present slots precede slot idx.
 */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
103 
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * If UCASE_EXC_DOUBLE_SLOTS is not set in excWord, each slot is one uint16_t
 * and the value is read from the slot directly.
 * Otherwise each slot is two uint16_t units, combined as (first<<16)|second.
 *
 * Every use of the pExc16 parameter is parenthesized so that an expression
 * argument (for example *pp) is advanced and dereferenced as a whole.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16); \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*(pExc16)++; \
        (value)=((value)<<16)|*(pExc16); \
    } \
} UPRV_BLOCK_MACRO_END
123 
124 /* simple case mappings ----------------------------------------------------- */
125 
126 U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c)127 ucase_tolower(UChar32 c) {
128     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
129     if(!UCASE_HAS_EXCEPTION(props)) {
130         if(UCASE_IS_UPPER_OR_TITLE(props)) {
131             c+=UCASE_GET_DELTA(props);
132         }
133     } else {
134         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
135         uint16_t excWord=*pe++;
136         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
137             int32_t delta;
138             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
139             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
140         }
141         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
142             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
143         }
144     }
145     return c;
146 }
147 
148 U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c)149 ucase_toupper(UChar32 c) {
150     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
151     if(!UCASE_HAS_EXCEPTION(props)) {
152         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
153             c+=UCASE_GET_DELTA(props);
154         }
155     } else {
156         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
157         uint16_t excWord=*pe++;
158         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
159             int32_t delta;
160             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
161             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
162         }
163         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
164             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
165         }
166     }
167     return c;
168 }
169 
170 U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c)171 ucase_totitle(UChar32 c) {
172     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
173     if(!UCASE_HAS_EXCEPTION(props)) {
174         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
175             c+=UCASE_GET_DELTA(props);
176         }
177     } else {
178         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
179         uint16_t excWord=*pe++;
180         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
181             int32_t delta;
182             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
183             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
184         }
185         int32_t idx;
186         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
187             idx=UCASE_EXC_TITLE;
188         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
189             idx=UCASE_EXC_UPPER;
190         } else {
191             return c;
192         }
193         GET_SLOT_VALUE(excWord, idx, pe, c);
194     }
195     return c;
196 }
197 
/*
 * Fixed UTF-16 sequences built from i/j/i-ogonek (U+0069, U+006A, U+012F)
 * followed by combining dot above (U+0307) and, for the last three,
 * a further combining accent (U+0300, U+0301, U+0303).
 * iDot is added as a string by ucase_addCaseClosure() for U+0130.
 * Each array is sized to exactly the code units it holds.
 */
static const char16_t iDot[2] = { 0x69, 0x307 };
static const char16_t jDot[2] = { 0x6a, 0x307 };
static const char16_t iOgonekDot[2] = { 0x12f, 0x307 };
static const char16_t iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const char16_t iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const char16_t iDotTilde[3] = { 0x69, 0x307, 0x303 };
204 
205 
206 U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c,const USetAdder * sa)207 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
208     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
209     if(!UCASE_HAS_EXCEPTION(props)) {
210         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
211             /* add the one simple case mapping, no matter what type it is */
212             int32_t delta=UCASE_GET_DELTA(props);
213             if(delta!=0) {
214                 sa->add(sa->set, c+delta);
215             }
216         }
217     } else {
218         /*
219          * c has exceptions, so there may be multiple simple and/or
220          * full case mappings. Add them all.
221          */
222         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
223         uint16_t excWord=*pe++;
224         const uint16_t *pe0=pe;
225 
226         // Hardcode the case closure of i and its relatives and ignore the
227         // data file data for these characters.
228         // The Turkic dotless i and dotted I with their case mapping conditions
229         // and case folding option make the related characters behave specially.
230         // This code matches their closure behavior to their case folding behavior.
231         if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
232             // These characters have Turkic case foldings. Hardcode their closure.
233             if (c == 0x49) {
234                 // Regular i and I are in one equivalence class.
235                 sa->add(sa->set, 0x69);
236                 return;
237             } else if (c == 0x130) {
238                 // Dotted I is in a class with <0069 0307>
239                 // (for canonical equivalence with <0049 0307>).
240                 sa->addString(sa->set, iDot, 2);
241                 return;
242             }
243         } else if (c == 0x69) {
244             sa->add(sa->set, 0x49);
245             return;
246         } else if (c == 0x131) {
247             // Dotless i is in a class by itself.
248             return;
249         }
250 
251         /* add all simple case mappings */
252         for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
253             if(HAS_SLOT(excWord, idx)) {
254                 pe=pe0;
255                 UChar32 mapping;
256                 GET_SLOT_VALUE(excWord, idx, pe, mapping);
257                 sa->add(sa->set, mapping);
258             }
259         }
260         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
261             pe=pe0;
262             int32_t delta;
263             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
264             sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
265         }
266 
267         /* get the closure string pointer & length */
268         const char16_t *closure;
269         int32_t closureLength;
270         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
271             pe=pe0;
272             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
273             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
274             closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
275         } else {
276             closureLength=0;
277             closure=nullptr;
278         }
279 
280         /* add the full case folding */
281         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
282             pe=pe0;
283             int32_t fullLength;
284             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
285 
286             /* start of full case mapping strings */
287             ++pe;
288 
289             fullLength&=0xffff; /* bits 16 and higher are reserved */
290 
291             /* skip the lowercase result string */
292             pe+=fullLength&UCASE_FULL_LOWER;
293             fullLength>>=4;
294 
295             /* add the full case folding string */
296             int32_t length=fullLength&0xf;
297             if(length!=0) {
298                 sa->addString(sa->set, (const char16_t *)pe, length);
299                 pe+=length;
300             }
301 
302             /* skip the uppercase and titlecase strings */
303             fullLength>>=4;
304             pe+=fullLength&0xf;
305             fullLength>>=4;
306             pe+=fullLength;
307 
308             closure=(const char16_t *)pe; /* behind full case mappings */
309         }
310 
311         /* add each code point in the closure string */
312         for(int32_t idx=0; idx<closureLength;) {
313             UChar32 mapping;
314             U16_NEXT_UNSAFE(closure, idx, mapping);
315             sa->add(sa->set, mapping);
316         }
317     }
318 }
319 
320 namespace {
321 
322 /**
323  * Add the simple case closure mapping,
324  * except if there is not actually an scf relationship between the two characters.
325  * TODO: Unicode should probably add the corresponding scf mappings.
326  * See https://crbug.com/v8/13377 and Unicode-internal PAG issue #23.
327  * If & when those scf mappings are added, we should be able to remove all of these exceptions.
328  */
addOneSimpleCaseClosure(UChar32 c,UChar32 t,const USetAdder * sa)329 void addOneSimpleCaseClosure(UChar32 c, UChar32 t, const USetAdder *sa) {
330     switch (c) {
331     case 0x0390:
332         if (t == 0x1FD3) { return; }
333         break;
334     case 0x03B0:
335         if (t == 0x1FE3) { return; }
336         break;
337     case 0x1FD3:
338         if (t == 0x0390) { return; }
339         break;
340     case 0x1FE3:
341         if (t == 0x03B0) { return; }
342         break;
343     case 0xFB05:
344         if (t == 0xFB06) { return; }
345         break;
346     case 0xFB06:
347         if (t == 0xFB05) { return; }
348         break;
349     default:
350         break;
351     }
352     sa->add(sa->set, t);
353 }
354 
355 }  // namespace
356 
357 U_CFUNC void U_EXPORT2
ucase_addSimpleCaseClosure(UChar32 c,const USetAdder * sa)358 ucase_addSimpleCaseClosure(UChar32 c, const USetAdder *sa) {
359     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
360     if(!UCASE_HAS_EXCEPTION(props)) {
361         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
362             /* add the one simple case mapping, no matter what type it is */
363             int32_t delta=UCASE_GET_DELTA(props);
364             if(delta!=0) {
365                 sa->add(sa->set, c+delta);
366             }
367         }
368     } else {
369         // c has exceptions. Add the mappings relevant for scf=Simple_Case_Folding.
370         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
371         uint16_t excWord=*pe++;
372         const uint16_t *pe0=pe;
373 
374         // Hardcode the case closure of i and its relatives and ignore the
375         // data file data for these characters, like in ucase_addCaseClosure().
376         if (excWord&UCASE_EXC_CONDITIONAL_FOLD) {
377             // These characters have Turkic case foldings. Hardcode their closure.
378             if (c == 0x49) {
379                 // Regular i and I are in one equivalence class.
380                 sa->add(sa->set, 0x69);
381                 return;
382             } else if (c == 0x130) {
383                 // For scf=Simple_Case_Folding, dotted I is in a class by itself.
384                 return;
385             }
386         } else if (c == 0x69) {
387             sa->add(sa->set, 0x49);
388             return;
389         } else if (c == 0x131) {
390             // Dotless i is in a class by itself.
391             return;
392         }
393 
394         // Add all simple case mappings.
395         for(int32_t idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
396             if(HAS_SLOT(excWord, idx)) {
397                 pe=pe0;
398                 UChar32 mapping;
399                 GET_SLOT_VALUE(excWord, idx, pe, mapping);
400                 addOneSimpleCaseClosure(c, mapping, sa);
401             }
402         }
403         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
404             pe=pe0;
405             int32_t delta;
406             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
407             UChar32 mapping = (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
408             addOneSimpleCaseClosure(c, mapping, sa);
409         }
410 
411         /* get the closure string pointer & length */
412         const char16_t *closure;
413         int32_t closureLength;
414         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
415             pe=pe0;
416             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
417             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
418             closure=(const char16_t *)pe+1; /* behind this slot, unless there are full case mappings */
419         } else {
420             closureLength=0;
421             closure=nullptr;
422         }
423 
424         // Skip the full case mappings.
425         if(closureLength > 0 && HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
426             pe=pe0;
427             int32_t fullLength;
428             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
429 
430             /* start of full case mapping strings */
431             ++pe;
432 
433             fullLength&=0xffff; /* bits 16 and higher are reserved */
434 
435             // Skip all 4 full case mappings.
436             pe+=fullLength&UCASE_FULL_LOWER;
437             fullLength>>=4;
438             pe+=fullLength&0xf;
439             fullLength>>=4;
440             pe+=fullLength&0xf;
441             fullLength>>=4;
442             pe+=fullLength;
443 
444             closure=(const char16_t *)pe; /* behind full case mappings */
445         }
446 
447         // Add each code point in the closure string whose scf maps back to c.
448         for(int32_t idx=0; idx<closureLength;) {
449             UChar32 mapping;
450             U16_NEXT_UNSAFE(closure, idx, mapping);
451             addOneSimpleCaseClosure(c, mapping, sa);
452         }
453     }
454 }
455 
/*
 * Compare s, which has an explicit length, with t, which is either
 * NUL-terminated or exactly max units long.
 * Requires length>0 and max>0 and length<=max.
 *
 * Returns 0 if t ends exactly where s ends, a positive value if t ends
 * earlier or s is greater at the first differing unit, and a negative
 * value if s is smaller at the first differing unit or t continues past
 * the end of s (then the result is -(max-length)).
 */
static inline int32_t
strcmpMax(const char16_t *s, int32_t length, const char16_t *t, int32_t max) {
    /* we require length<=max, so the surplus of t's field over s is fixed up front */
    const int32_t surplus=max-length;
    int32_t pos=0;

    do {
        const int32_t unitT=t[pos];
        if(unitT==0) {
            return 1; /* reached the end of t but not of s */
        }
        const int32_t diff=s[pos]-unitT;
        if(diff!=0) {
            return diff; /* return difference result */
        }
    } while(++pos<length);
    /* all of s matched the beginning of t */

    if(surplus!=0 && t[pos]!=0) {
        return -surplus; /* t is longer: return length difference */
    }
    return 0; /* equal to length of both strings */
}
484 
485 U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const char16_t * s,int32_t length,const USetAdder * sa)486 ucase_addStringCaseClosure(const char16_t *s, int32_t length, const USetAdder *sa) {
487     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
488 
489     if(ucase_props_singleton.unfold==nullptr || s==nullptr) {
490         return false; /* no reverse case folding data, or no string */
491     }
492     if(length<=1) {
493         /* the string is too short to find any match */
494         /*
495          * more precise would be:
496          * if(!u_strHasMoreChar32Than(s, length, 1))
497          * but this does not make much practical difference because
498          * a single supplementary code point would just not be found
499          */
500         return false;
501     }
502 
503     const uint16_t *unfold=ucase_props_singleton.unfold;
504     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
505     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
506     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
507     unfold+=unfoldRowWidth;
508 
509     if(length>unfoldStringWidth) {
510         /* the string is too long to find any match */
511         return false;
512     }
513 
514     /* do a binary search for the string */
515     start=0;
516     limit=unfoldRows;
517     while(start<limit) {
518         i=(start+limit)/2;
519         const char16_t *p=reinterpret_cast<const char16_t *>(unfold+(i*unfoldRowWidth));
520         result=strcmpMax(s, length, p, unfoldStringWidth);
521 
522         if(result==0) {
523             /* found the string: add each code point, and its case closure */
524             UChar32 c;
525 
526             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
527                 U16_NEXT_UNSAFE(p, i, c);
528                 sa->add(sa->set, c);
529                 ucase_addCaseClosure(c, sa);
530             }
531             return true;
532         } else if(result<0) {
533             limit=i;
534         } else /* result>0 */ {
535             start=i+1;
536         }
537     }
538 
539     return false; /* string not found */
540 }
541 
542 U_NAMESPACE_BEGIN
543 
/**
 * Sets up iteration over the unfold table of ucase_props_singleton.
 * The row count, row width and string width are read from the start of the
 * table; the constructor body then advances unfold by one row width.
 * Iteration starts at row 0 with rowCpIndex at the string width, i.e. just
 * behind the string column.
 *
 * NOTE(review): the initializers read unfold[...], so they rely on the
 * unfold member being initialized before the others (declared first in the
 * class) -- confirm against the class declaration.
 * NOTE(review): ucase_props_singleton.unfold is dereferenced without a
 * nullptr check here -- confirm it is always set for this singleton.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const char16_t *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    unfold+=unfoldRowWidth;
}
553 
554 UChar32
next(UnicodeString & full)555 FullCaseFoldingIterator::next(UnicodeString &full) {
556     // Advance past the last-delivered code point.
557     const char16_t *p=unfold+(currentRow*unfoldRowWidth);
558     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
559         ++currentRow;
560         p+=unfoldRowWidth;
561         rowCpIndex=unfoldStringWidth;
562     }
563     if(currentRow>=unfoldRows) { return U_SENTINEL; }
564     // Set "full" to the NUL-terminated string in the first unfold column.
565     int32_t length=unfoldStringWidth;
566     while(length>0 && p[length-1]==0) { --length; }
567     full.setTo(false, p, length);
568     // Return the code point.
569     UChar32 c;
570     U16_NEXT_UNSAFE(p, rowCpIndex, c);
571     return c;
572 }
573 
574 namespace LatinCase {
575 
576 const int8_t TO_LOWER_NORMAL[LIMIT] = {
577     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
578     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
579     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
580     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
581 
582     0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
583     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
584     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
585     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
586 
587     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
588     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
589     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
590     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
591 
592     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
593     32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
594     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
595     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
596 
597     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
598     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
599     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
600     EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
601 
602     0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
603     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
604     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
605     1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
606 };
607 
608 const int8_t TO_LOWER_TR_LT[LIMIT] = {
609     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
610     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
611     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
612     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
613 
614     0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
615     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
616     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
617     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
618 
619     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
620     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
621     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
622     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623 
624     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
625     32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
626     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
627     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
628 
629     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
630     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
631     1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
632     EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
633 
634     0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
635     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
636     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
637     1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
638 };
639 
640 const int8_t TO_UPPER_NORMAL[LIMIT] = {
641     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
642     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
643     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
644     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
645 
646     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
647     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
648     0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
649     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
650 
651     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
652     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
653     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
654     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
655 
656     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
657     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
658     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
659     -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
660 
661     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
662     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
663     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
664     0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
665 
666     -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
667     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
668     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
669     0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
670 };
671 
672 const int8_t TO_UPPER_TR[LIMIT] = {
673     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
674     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
675     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
676     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
677 
678     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
679     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
680     0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
681     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
682 
683     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
684     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
685     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
686     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
687 
688     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
689     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
690     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
691     -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
692 
693     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
694     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
695     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
696     0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
697 
698     -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
699     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
700     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
701     0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
702 };
703 
704 }  // namespace LatinCase
705 
706 U_NAMESPACE_END
707 
708 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
709 U_CAPI int32_t U_EXPORT2
ucase_getType(UChar32 c)710 ucase_getType(UChar32 c) {
711     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
712     return UCASE_GET_TYPE(props);
713 }
714 
715 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
716 U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(UChar32 c)717 ucase_getTypeOrIgnorable(UChar32 c) {
718     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
719     return UCASE_GET_TYPE_AND_IGNORABLE(props);
720 }
721 
722 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
723 static inline int32_t
getDotType(UChar32 c)724 getDotType(UChar32 c) {
725     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
726     if(!UCASE_HAS_EXCEPTION(props)) {
727         return props&UCASE_DOT_MASK;
728     } else {
729         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
730         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
731     }
732 }
733 
734 U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c)735 ucase_isSoftDotted(UChar32 c) {
736     return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
737 }
738 
739 U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c)740 ucase_isCaseSensitive(UChar32 c) {
741     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
742     if(!UCASE_HAS_EXCEPTION(props)) {
743         return (UBool)((props&UCASE_SENSITIVE)!=0);
744     } else {
745         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
746         return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
747     }
748 }
749 
750 /* string casing ------------------------------------------------------------ */
751 
752 /*
753  * These internal functions form the core of string case mappings.
754  * They map single code points to result code points or strings and take
755  * all necessary conditions (context, locale ID, options) into account.
756  *
757  * They do not iterate over the source or write to the destination
758  * so that the same functions are useful for non-standard string storage,
759  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
760  * For the same reason, the "surrounding text" context is passed in as a
761  * UCaseContextIterator which does not make any assumptions about
762  * the underlying storage.
763  *
764  * This section contains helper functions that check for conditions
765  * in the input text surrounding the current code point
766  * according to SpecialCasing.txt.
767  *
768  * Each helper function gets the index
769  * - after the current code point if it looks at following text
770  * - before the current code point if it looks at preceding text
771  *
772  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
773  *
774  * Final_Sigma
775  *   C is preceded by a sequence consisting of
776  *     a cased letter and a case-ignorable sequence,
777  *   and C is not followed by a sequence consisting of
778  *     an ignorable sequence and then a cased letter.
779  *
780  * More_Above
781  *   C is followed by one or more characters of combining class 230 (ABOVE)
782  *   in the combining character sequence.
783  *
784  * After_Soft_Dotted
785  *   The last preceding character with combining class of zero before C
786  *   was Soft_Dotted,
787  *   and there is no intervening combining character class 230 (ABOVE).
788  *
789  * Before_Dot
790  *   C is followed by combining dot above (U+0307).
791  *   Any sequence of characters with a combining class that is neither 0 nor 230
792  *   may intervene between the current character and the combining dot above.
793  *
794  * The erratum from 2002-10-31 adds the condition
795  *
796  * After_I
797  *   The last preceding base character was an uppercase I, and there is no
798  *   intervening combining character class 230 (ABOVE).
799  *
800  *   (See Jitterbug 2344 and the comments on After_I below.)
801  *
802  * Helper definitions in Unicode 3.2 UAX 21:
803  *
804  * D1. A character C is defined to be cased
805  *     if it meets any of the following criteria:
806  *
807  *   - The general category of C is Titlecase Letter (Lt)
808  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
809  *   - Given D = NFD(C), then it is not the case that:
810  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
811  *     (This third criterion does not add any characters to the list
812  *      for Unicode 3.2. Ignored.)
813  *
814  * D2. A character C is defined to be case-ignorable
815  *     if it meets either of the following criteria:
816  *
817  *   - The general category of C is
818  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
819  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
820  *   - C is one of the following characters
821  *     U+0027 APOSTROPHE
822  *     U+00AD SOFT HYPHEN (SHY)
823  *     U+2019 RIGHT SINGLE QUOTATION MARK
824  *            (the preferred character for apostrophe)
825  *
826  * D3. A case-ignorable sequence is a sequence of
827  *     zero or more case-ignorable characters.
828  */
829 
/*
 * Case-insensitive tests for single letters of a language code.
 * Each macro evaluates its argument more than once,
 * so the argument must not have side effects.
 */
#define is_either(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_d(c) is_either(c, 'd', 'D')
#define is_e(c) is_either(c, 'e', 'E')
#define is_i(c) is_either(c, 'i', 'I')
#define is_l(c) is_either(c, 'l', 'L')
#define is_r(c) is_either(c, 'r', 'R')
#define is_t(c) is_either(c, 't', 'T')
#define is_u(c) is_either(c, 'u', 'U')
#define is_y(c) is_either(c, 'y', 'Y')
#define is_z(c) is_either(c, 'z', 'Z')

/* Does c end the language code? (underscore, hyphen, or the string terminator) */
#define is_sep(c) ((c)=='-' || (c)=='_' || (c)==0)
842 
843 /**
844  * Requires non-nullptr locale ID but otherwise does the equivalent of
845  * checking for language codes as if uloc_getLanguage() were called:
846  * Accepts both 2- and 3-letter codes and accepts case variants.
847  */
848 U_CFUNC int32_t
ucase_getCaseLocale(const char * locale)849 ucase_getCaseLocale(const char *locale) {
850     /*
851      * This function used to use uloc_getLanguage(), but the current code
852      * removes the dependency of this low-level code on uloc implementation code
853      * and is faster because not the whole locale ID has to be
854      * examined and copied/transformed.
855      *
856      * Because this code does not want to depend on uloc, the caller must
857      * pass in a non-nullptr locale, i.e., may need to call uloc_getDefault().
858      */
859     char c=*locale++;
860     // Fastpath for English "en" which is often used for default (=root locale) case mappings,
861     // and for Chinese "zh": Very common but no special case mapping behavior.
862     // Then check lowercase vs. uppercase to reduce the number of comparisons
863     // for other locales without special behavior.
864     if(c=='e') {
865         /* el or ell? */
866         c=*locale++;
867         if(is_l(c)) {
868             c=*locale++;
869             if(is_l(c)) {
870                 c=*locale;
871             }
872             if(is_sep(c)) {
873                 return UCASE_LOC_GREEK;
874             }
875         }
876         // en, es, ... -> root
877     } else if(c=='z') {
878         return UCASE_LOC_ROOT;
879 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
880     } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
881 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
882     } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
883 #else
884 #   error Unknown charset family!
885 #endif
886         // lowercase c
887         if(c=='t') {
888             /* tr or tur? */
889             c=*locale++;
890             if(is_u(c)) {
891                 c=*locale++;
892             }
893             if(is_r(c)) {
894                 c=*locale;
895                 if(is_sep(c)) {
896                     return UCASE_LOC_TURKISH;
897                 }
898             }
899         } else if(c=='a') {
900             /* az or aze? */
901             c=*locale++;
902             if(is_z(c)) {
903                 c=*locale++;
904                 if(is_e(c)) {
905                     c=*locale;
906                 }
907                 if(is_sep(c)) {
908                     return UCASE_LOC_TURKISH;
909                 }
910             }
911         } else if(c=='l') {
912             /* lt or lit? */
913             c=*locale++;
914             if(is_i(c)) {
915                 c=*locale++;
916             }
917             if(is_t(c)) {
918                 c=*locale;
919                 if(is_sep(c)) {
920                     return UCASE_LOC_LITHUANIAN;
921                 }
922             }
923         } else if(c=='n') {
924             /* nl or nld? */
925             c=*locale++;
926             if(is_l(c)) {
927                 c=*locale++;
928                 if(is_d(c)) {
929                     c=*locale;
930                 }
931                 if(is_sep(c)) {
932                     return UCASE_LOC_DUTCH;
933                 }
934             }
935         } else if(c=='h') {
936             /* hy or hye? *not* hyw */
937             c=*locale++;
938             if(is_y(c)) {
939                 c=*locale++;
940                 if(is_e(c)) {
941                     c=*locale;
942                 }
943                 if(is_sep(c)) {
944                     return UCASE_LOC_ARMENIAN;
945                 }
946             }
947         }
948     } else {
949         // uppercase c
950         // Same code as for lowercase c but also check for 'E'.
951         if(c=='T') {
952             /* tr or tur? */
953             c=*locale++;
954             if(is_u(c)) {
955                 c=*locale++;
956             }
957             if(is_r(c)) {
958                 c=*locale;
959                 if(is_sep(c)) {
960                     return UCASE_LOC_TURKISH;
961                 }
962             }
963         } else if(c=='A') {
964             /* az or aze? */
965             c=*locale++;
966             if(is_z(c)) {
967                 c=*locale++;
968                 if(is_e(c)) {
969                     c=*locale;
970                 }
971                 if(is_sep(c)) {
972                     return UCASE_LOC_TURKISH;
973                 }
974             }
975         } else if(c=='L') {
976             /* lt or lit? */
977             c=*locale++;
978             if(is_i(c)) {
979                 c=*locale++;
980             }
981             if(is_t(c)) {
982                 c=*locale;
983                 if(is_sep(c)) {
984                     return UCASE_LOC_LITHUANIAN;
985                 }
986             }
987         } else if(c=='E') {
988             /* el or ell? */
989             c=*locale++;
990             if(is_l(c)) {
991                 c=*locale++;
992                 if(is_l(c)) {
993                     c=*locale;
994                 }
995                 if(is_sep(c)) {
996                     return UCASE_LOC_GREEK;
997                 }
998             }
999         } else if(c=='N') {
1000             /* nl or nld? */
1001             c=*locale++;
1002             if(is_l(c)) {
1003                 c=*locale++;
1004                 if(is_d(c)) {
1005                     c=*locale;
1006                 }
1007                 if(is_sep(c)) {
1008                     return UCASE_LOC_DUTCH;
1009                 }
1010             }
1011         } else if(c=='H') {
1012             /* hy or hye? *not* hyw */
1013             c=*locale++;
1014             if(is_y(c)) {
1015                 c=*locale++;
1016                 if(is_e(c)) {
1017                     c=*locale;
1018                 }
1019                 if(is_sep(c)) {
1020                     return UCASE_LOC_ARMENIAN;
1021                 }
1022             }
1023         }
1024     }
1025     return UCASE_LOC_ROOT;
1026 }
1027 
1028 /*
1029  * Is followed by
1030  *   {case-ignorable}* cased
1031  * ?
1032  * (dir determines looking forward/backward)
1033  * If a character is case-ignorable, it is skipped regardless of whether
1034  * it is also cased or not.
1035  */
1036 static UBool
isFollowedByCasedLetter(UCaseContextIterator * iter,void * context,int8_t dir)1037 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
1038     UChar32 c;
1039 
1040     if(iter==nullptr) {
1041         return false;
1042     }
1043 
1044     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
1045         int32_t type=ucase_getTypeOrIgnorable(c);
1046         if(type&4) {
1047             /* case-ignorable, continue with the loop */
1048         } else if(type!=UCASE_NONE) {
1049             return true; /* followed by cased letter */
1050         } else {
1051             return false; /* uncased and not case-ignorable */
1052         }
1053     }
1054 
1055     return false; /* not followed by cased letter */
1056 }
1057 
1058 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
1059 static UBool
isPrecededBySoftDotted(UCaseContextIterator * iter,void * context)1060 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
1061     UChar32 c;
1062     int32_t dotType;
1063     int8_t dir;
1064 
1065     if(iter==nullptr) {
1066         return false;
1067     }
1068 
1069     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1070         dotType=getDotType(c);
1071         if(dotType==UCASE_SOFT_DOTTED) {
1072             return true; /* preceded by TYPE_i */
1073         } else if(dotType!=UCASE_OTHER_ACCENT) {
1074             return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
1075         }
1076     }
1077 
1078     return false; /* not preceded by TYPE_i */
1079 }
1080 
1081 /*
1082  * See Jitterbug 2344:
1083  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
1084  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
1085  * we made those releases compatible with Unicode 3.2 which had not fixed
1086  * a related bug in SpecialCasing.txt.
1087  *
1088  * From the Jitterbug 2344 text:
1089  * ... this bug is listed as a Unicode erratum
1090  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
1091  * <quote>
1092  * There are two errors in SpecialCasing.txt.
1093  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
1094  * 2. An incorrect context definition. Correct as follows:
1095  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
1096  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
1097  * ---
1098  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1099  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1100  * where the context After_I is defined as:
1101  * The last preceding base character was an uppercase I, and there is no
1102  * intervening combining character class 230 (ABOVE).
1103  * </quote>
1104  *
1105  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
1106  *
1107  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1108  * # This matches the behavior of the canonically equivalent I-dot_above
1109  *
1110  * See also the description in this place in older versions of uchar.c (revision 1.100).
1111  *
1112  * Markus W. Scherer 2003-feb-15
1113  */
1114 
1115 /* Is preceded by base character 'I' with no intervening cc=230 ? */
1116 static UBool
isPrecededBy_I(UCaseContextIterator * iter,void * context)1117 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
1118     UChar32 c;
1119     int32_t dotType;
1120     int8_t dir;
1121 
1122     if(iter==nullptr) {
1123         return false;
1124     }
1125 
1126     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1127         if(c==0x49) {
1128             return true; /* preceded by I */
1129         }
1130         dotType=getDotType(c);
1131         if(dotType!=UCASE_OTHER_ACCENT) {
1132             return false; /* preceded by different base character (not I), or intervening cc==230 */
1133         }
1134     }
1135 
1136     return false; /* not preceded by I */
1137 }
1138 
1139 /* Is followed by one or more cc==230 ? */
1140 static UBool
isFollowedByMoreAbove(UCaseContextIterator * iter,void * context)1141 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1142     UChar32 c;
1143     int32_t dotType;
1144     int8_t dir;
1145 
1146     if(iter==nullptr) {
1147         return false;
1148     }
1149 
1150     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1151         dotType=getDotType(c);
1152         if(dotType==UCASE_ABOVE) {
1153             return true; /* at least one cc==230 following */
1154         } else if(dotType!=UCASE_OTHER_ACCENT) {
1155             return false; /* next base character, no more cc==230 following */
1156         }
1157     }
1158 
1159     return false; /* no more cc==230 following */
1160 }
1161 
1162 /* Is followed by a dot above (without cc==230 in between) ? */
1163 static UBool
isFollowedByDotAbove(UCaseContextIterator * iter,void * context)1164 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1165     UChar32 c;
1166     int32_t dotType;
1167     int8_t dir;
1168 
1169     if(iter==nullptr) {
1170         return false;
1171     }
1172 
1173     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1174         if(c==0x307) {
1175             return true;
1176         }
1177         dotType=getDotType(c);
1178         if(dotType!=UCASE_OTHER_ACCENT) {
1179             return false; /* next base character or cc==230 in between */
1180         }
1181     }
1182 
1183     return false; /* no dot above following */
1184 }
1185 
1186 U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t loc)1187 ucase_toFullLower(UChar32 c,
1188                   UCaseContextIterator *iter, void *context,
1189                   const char16_t **pString,
1190                   int32_t loc) {
1191     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1192     U_ASSERT(c >= 0);
1193     UChar32 result=c;
1194     // Reset the output pointer in case it was uninitialized.
1195     *pString=nullptr;
1196     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1197     if(!UCASE_HAS_EXCEPTION(props)) {
1198         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1199             result=c+UCASE_GET_DELTA(props);
1200         }
1201     } else {
1202         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1203         uint16_t excWord=*pe++;
1204         int32_t full;
1205 
1206         pe2=pe;
1207 
1208         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1209             /* use hardcoded conditions and mappings */
1210 
1211             /*
1212              * Test for conditional mappings first
1213              *   (otherwise the unconditional default mappings are always taken),
1214              * then test for characters that have unconditional mappings in SpecialCasing.txt,
1215              * then get the UnicodeData.txt mappings.
1216              */
1217             if( loc==UCASE_LOC_LITHUANIAN &&
1218                     /* base characters, find accents above */
1219                     (((c==0x49 || c==0x4a || c==0x12e) &&
1220                         isFollowedByMoreAbove(iter, context)) ||
1221                     /* precomposed with accent above, no need to find one */
1222                     (c==0xcc || c==0xcd || c==0x128))
1223             ) {
1224                 /*
1225                     # Lithuanian
1226 
1227                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1228 
1229                     # Introduce an explicit dot above when lowercasing capital I's and J's
1230                     # whenever there are more accents above.
1231                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1232 
1233                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1234                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1235                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1236                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1237                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1238                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1239                  */
1240                 switch(c) {
1241                 case 0x49:  /* LATIN CAPITAL LETTER I */
1242                     *pString=iDot;
1243                     return 2;
1244                 case 0x4a:  /* LATIN CAPITAL LETTER J */
1245                     *pString=jDot;
1246                     return 2;
1247                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1248                     *pString=iOgonekDot;
1249                     return 2;
1250                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
1251                     *pString=iDotGrave;
1252                     return 3;
1253                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
1254                     *pString=iDotAcute;
1255                     return 3;
1256                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1257                     *pString=iDotTilde;
1258                     return 3;
1259                 default:
1260                     return 0; /* will not occur */
1261                 }
1262             /* # Turkish and Azeri */
1263             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1264                 /*
1265                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1266                     # The following rules handle those cases.
1267 
1268                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1269                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1270                  */
1271                 return 0x69;
1272             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
1273                 /*
1274                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1275                     # This matches the behavior of the canonically equivalent I-dot_above
1276 
1277                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1278                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1279                  */
1280                 return 0; /* remove the dot (continue without output) */
1281             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
1282                 /*
1283                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1284 
1285                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1286                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1287                  */
1288                 return 0x131;
1289             } else if(c==0x130) {
1290                 /*
1291                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
1292 
1293                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1294                  */
1295                 *pString=iDot;
1296                 return 2;
1297             } else if(  c==0x3a3 &&
1298                         !isFollowedByCasedLetter(iter, context, 1) &&
1299                         isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
1300             ) {
1301                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1302                 /*
1303                     # Special case for final form of sigma
1304 
1305                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1306                  */
1307                 return 0x3c2; /* greek small final sigma */
1308             } else {
1309                 /* no known conditional special case mapping, use a normal mapping */
1310             }
1311         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1312             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1313             full&=UCASE_FULL_LOWER;
1314             if(full!=0) {
1315                 /* set the output pointer to the lowercase mapping */
1316                 *pString=reinterpret_cast<const char16_t *>(pe+1);
1317 
1318                 /* return the string length */
1319                 return full;
1320             }
1321         }
1322 
1323         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1324             int32_t delta;
1325             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1326             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1327         }
1328         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1329             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1330         }
1331     }
1332 
1333     return (result==c) ? ~result : result;
1334 }
1335 
1336 /* internal */
1337 static int32_t
toUpperOrTitle(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t loc,UBool upperNotTitle)1338 toUpperOrTitle(UChar32 c,
1339                UCaseContextIterator *iter, void *context,
1340                const char16_t **pString,
1341                int32_t loc,
1342                UBool upperNotTitle) {
1343     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1344     U_ASSERT(c >= 0);
1345     UChar32 result=c;
1346     // Reset the output pointer in case it was uninitialized.
1347     *pString=nullptr;
1348     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1349     if(!UCASE_HAS_EXCEPTION(props)) {
1350         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1351             result=c+UCASE_GET_DELTA(props);
1352         }
1353     } else {
1354         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1355         uint16_t excWord=*pe++;
1356         int32_t full, idx;
1357 
1358         pe2=pe;
1359 
1360         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1361             /* use hardcoded conditions and mappings */
1362             if(loc==UCASE_LOC_TURKISH && c==0x69) {
1363                 /*
1364                     # Turkish and Azeri
1365 
1366                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1367                     # The following rules handle those cases.
1368 
1369                     # When uppercasing, i turns into a dotted capital I
1370 
1371                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1372                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1373                 */
1374                 return 0x130;
1375             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1376                 /*
1377                     # Lithuanian
1378 
1379                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1380 
1381                     # Remove DOT ABOVE after "i" with upper or titlecase
1382 
1383                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1384                  */
1385                 return 0; /* remove the dot (continue without output) */
1386             } else if(c==0x0587) {
1387                 // See ICU-13416:
1388                 // և ligature ech-yiwn
1389                 // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1390                 // but to ԵՎ=ech+vew in Eastern Armenian.
1391                 if(loc==UCASE_LOC_ARMENIAN) {
1392                     *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1393                 } else {
1394                     *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1395                 }
1396                 return 2;
1397             } else {
1398                 /* no known conditional special case mapping, use a normal mapping */
1399             }
1400         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1401             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1402 
1403             /* start of full case mapping strings */
1404             ++pe;
1405 
1406             /* skip the lowercase and case-folding result strings */
1407             pe+=full&UCASE_FULL_LOWER;
1408             full>>=4;
1409             pe+=full&0xf;
1410             full>>=4;
1411 
1412             if(upperNotTitle) {
1413                 full&=0xf;
1414             } else {
1415                 /* skip the uppercase result string */
1416                 pe+=full&0xf;
1417                 full=(full>>4)&0xf;
1418             }
1419 
1420             if(full!=0) {
1421                 /* set the output pointer to the result string */
1422                 *pString=reinterpret_cast<const char16_t *>(pe);
1423 
1424                 /* return the string length */
1425                 return full;
1426             }
1427         }
1428 
1429         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1430             int32_t delta;
1431             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1432             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1433         }
1434         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1435             idx=UCASE_EXC_TITLE;
1436         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1437             /* here, titlecase is same as uppercase */
1438             idx=UCASE_EXC_UPPER;
1439         } else {
1440             return ~c;
1441         }
1442         GET_SLOT_VALUE(excWord, idx, pe2, result);
1443     }
1444 
1445     return (result==c) ? ~result : result;
1446 }
1447 
1448 U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t caseLocale)1449 ucase_toFullUpper(UChar32 c,
1450                   UCaseContextIterator *iter, void *context,
1451                   const char16_t **pString,
1452                   int32_t caseLocale) {
1453     return toUpperOrTitle(c, iter, context, pString, caseLocale, true);
1454 }
1455 
1456 U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,UCaseContextIterator * iter,void * context,const char16_t ** pString,int32_t caseLocale)1457 ucase_toFullTitle(UChar32 c,
1458                   UCaseContextIterator *iter, void *context,
1459                   const char16_t **pString,
1460                   int32_t caseLocale) {
1461     return toUpperOrTitle(c, iter, context, pString, caseLocale, false);
1462 }
1463 
1464 /* case folding ------------------------------------------------------------- */
1465 
1466 /*
1467  * Case folding is similar to lowercasing.
1468  * The result may be a simple mapping, i.e., a single code point, or
1469  * a full mapping, i.e., a string.
1470  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1471  * then only the lowercase mapping is stored.
1472  *
1473  * Some special cases are hardcoded because their conditions cannot be
1474  * parsed and processed from CaseFolding.txt.
1475  *
1476  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1477 
1478 # C: common case folding, common mappings shared by both simple and full mappings.
1479 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1480 # S: simple case folding, mappings to single characters where different from F.
1481 # T: special case for uppercase I and dotted uppercase I
1482 #    - For non-Turkic languages, this mapping is normally not used.
1483 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1484 #
1485 # Usage:
1486 #  A. To do a simple case folding, use the mappings with status C + S.
1487 #  B. To do a full case folding, use the mappings with status C + F.
1488 #
1489 #    The mappings with status T can be used or omitted depending on the desired case-folding
1490 #    behavior. (The default option is to exclude them.)
1491 
1492  * Unicode 3.2 has 'T' mappings as follows:
1493 
1494 0049; T; 0131; # LATIN CAPITAL LETTER I
1495 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1496 
1497  * while the default mappings for these code points are:
1498 
1499 0049; C; 0069; # LATIN CAPITAL LETTER I
1500 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1501 
1502  * U+0130 has no simple case folding (simple-case-folds to itself).
1503  */
1504 
1505 /* return the simple case folding mapping for c */
1506 U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c,uint32_t options)1507 ucase_fold(UChar32 c, uint32_t options) {
1508     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1509     if(!UCASE_HAS_EXCEPTION(props)) {
1510         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1511             c+=UCASE_GET_DELTA(props);
1512         }
1513     } else {
1514         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1515         uint16_t excWord=*pe++;
1516         int32_t idx;
1517         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1518             /* special case folding mappings, hardcoded */
1519             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1520                 /* default mappings */
1521                 if(c==0x49) {
1522                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1523                     return 0x69;
1524                 } else if(c==0x130) {
1525                     /* no simple case folding for U+0130 */
1526                     return c;
1527                 }
1528             } else {
1529                 /* Turkic mappings */
1530                 if(c==0x49) {
1531                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1532                     return 0x131;
1533                 } else if(c==0x130) {
1534                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1535                     return 0x69;
1536                 }
1537             }
1538         }
1539         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1540             return c;
1541         }
1542         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1543             int32_t delta;
1544             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1545             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1546         }
1547         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1548             idx=UCASE_EXC_FOLD;
1549         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1550             idx=UCASE_EXC_LOWER;
1551         } else {
1552             return c;
1553         }
1554         GET_SLOT_VALUE(excWord, idx, pe, c);
1555     }
1556     return c;
1557 }
1558 
1559 /*
1560  * Issue for canonical caseless match (UAX #21):
1561  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1562  * canonical equivalence, unlike default-option casefolding.
1563  * For example, I-grave and I + grave fold to strings that are not canonically
1564  * equivalent.
1565  * For more details, see the comment in unorm_compare() in unorm.cpp
1566  * and the intermediate prototype changes for Jitterbug 2021.
1567  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1568  *
1569  * This did not get fixed because it appears that it is not possible to fix
1570  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1571  * together in a way that they still fold to common result strings.
1572  */
1573 
1574 U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,const char16_t ** pString,uint32_t options)1575 ucase_toFullFolding(UChar32 c,
1576                     const char16_t **pString,
1577                     uint32_t options) {
1578     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1579     U_ASSERT(c >= 0);
1580     UChar32 result=c;
1581     // Reset the output pointer in case it was uninitialized.
1582     *pString=nullptr;
1583     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1584     if(!UCASE_HAS_EXCEPTION(props)) {
1585         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1586             result=c+UCASE_GET_DELTA(props);
1587         }
1588     } else {
1589         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1590         uint16_t excWord=*pe++;
1591         int32_t full, idx;
1592 
1593         pe2=pe;
1594 
1595         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1596             /* use hardcoded conditions and mappings */
1597             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1598                 /* default mappings */
1599                 if(c==0x49) {
1600                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1601                     return 0x69;
1602                 } else if(c==0x130) {
1603                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1604                     *pString=iDot;
1605                     return 2;
1606                 }
1607             } else {
1608                 /* Turkic mappings */
1609                 if(c==0x49) {
1610                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1611                     return 0x131;
1612                 } else if(c==0x130) {
1613                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1614                     return 0x69;
1615                 }
1616             }
1617         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1618             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1619 
1620             /* start of full case mapping strings */
1621             ++pe;
1622 
1623             /* skip the lowercase result string */
1624             pe+=full&UCASE_FULL_LOWER;
1625             full=(full>>4)&0xf;
1626 
1627             if(full!=0) {
1628                 /* set the output pointer to the result string */
1629                 *pString=reinterpret_cast<const char16_t *>(pe);
1630 
1631                 /* return the string length */
1632                 return full;
1633             }
1634         }
1635 
1636         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1637             return ~c;
1638         }
1639         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1640             int32_t delta;
1641             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1642             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1643         }
1644         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1645             idx=UCASE_EXC_FOLD;
1646         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1647             idx=UCASE_EXC_LOWER;
1648         } else {
1649             return ~c;
1650         }
1651         GET_SLOT_VALUE(excWord, idx, pe2, result);
1652     }
1653 
1654     return (result==c) ? ~result : result;
1655 }
1656 
1657 /* case mapping properties API ---------------------------------------------- */
1658 
1659 /* public API (see uchar.h) */
1660 
1661 U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c)1662 u_isULowercase(UChar32 c) {
1663     return (UBool)(UCASE_LOWER==ucase_getType(c));
1664 }
1665 
1666 U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c)1667 u_isUUppercase(UChar32 c) {
1668     return (UBool)(UCASE_UPPER==ucase_getType(c));
1669 }
1670 
1671 /* Transforms the Unicode character to its lower case equivalent.*/
1672 U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c)1673 u_tolower(UChar32 c) {
1674     return ucase_tolower(c);
1675 }
1676 
1677 /* Transforms the Unicode character to its upper case equivalent.*/
1678 U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c)1679 u_toupper(UChar32 c) {
1680     return ucase_toupper(c);
1681 }
1682 
1683 /* Transforms the Unicode character to its title case equivalent.*/
1684 U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c)1685 u_totitle(UChar32 c) {
1686     return ucase_totitle(c);
1687 }
1688 
1689 /* return the simple case folding mapping for c */
1690 U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c,uint32_t options)1691 u_foldCase(UChar32 c, uint32_t options) {
1692     return ucase_fold(c, options);
1693 }
1694 
1695 U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c,UProperty which)1696 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1697     /* case mapping properties */
1698     const char16_t *resultString;
1699     switch(which) {
1700     case UCHAR_LOWERCASE:
1701         return (UBool)(UCASE_LOWER==ucase_getType(c));
1702     case UCHAR_UPPERCASE:
1703         return (UBool)(UCASE_UPPER==ucase_getType(c));
1704     case UCHAR_SOFT_DOTTED:
1705         return ucase_isSoftDotted(c);
1706     case UCHAR_CASE_SENSITIVE:
1707         return ucase_isCaseSensitive(c);
1708     case UCHAR_CASED:
1709         return (UBool)(UCASE_NONE!=ucase_getType(c));
1710     case UCHAR_CASE_IGNORABLE:
1711         return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
1712     /*
1713      * Note: The following Changes_When_Xyz are defined as testing whether
1714      * the NFD form of the input changes when Xyz-case-mapped.
1715      * However, this simpler implementation of these properties,
1716      * ignoring NFD, passes the tests.
1717      * The implementation needs to be changed if the tests start failing.
1718      * When that happens, optimizations should be used to work with the
1719      * per-single-code point ucase_toFullXyz() functions unless
1720      * the NFD form has more than one code point,
1721      * and the property starts set needs to be the union of the
1722      * start sets for normalization and case mappings.
1723      */
1724     case UCHAR_CHANGES_WHEN_LOWERCASED:
1725         return (UBool)(ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1726     case UCHAR_CHANGES_WHEN_UPPERCASED:
1727         return (UBool)(ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1728     case UCHAR_CHANGES_WHEN_TITLECASED:
1729         return (UBool)(ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1730     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1731     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1732         return (UBool)(
1733             ucase_toFullLower(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
1734             ucase_toFullUpper(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0 ||
1735             ucase_toFullTitle(c, nullptr, nullptr, &resultString, UCASE_LOC_ROOT)>=0);
1736     default:
1737         return false;
1738     }
1739 }
1740