• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucase.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug30
16 *   created by: Markus W. Scherer
17 *
18 *   Low-level Unicode character/string case mapping code.
19 *   Much code moved here (and modified) from uchar.c.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/utf16.h"
26 #include "cmemory.h"
27 #include "uassert.h"
28 #include "ucase.h"
29 #include "umutex.h"
30 #include "utrie2.h"
31 
32 /* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
33 #define INCLUDED_FROM_UCASE_CPP
34 #include "ucase_props_data.h"
35 
36 /* set of property starts for UnicodeSet ------------------------------------ */
37 
38 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)39 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
40     /* add the start code point to the USet */
41     const USetAdder *sa=(const USetAdder *)context;
42     sa->add(sa->set, start);
43     return true;
44 }
45 
46 U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder * sa,UErrorCode * pErrorCode)47 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
48     if(U_FAILURE(*pErrorCode)) {
49         return;
50     }
51 
52     /* add the start code point of each same-value range of the trie */
53     utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
54 
55     /* add code points with hardcoded properties, plus the ones following them */
56 
57     /* (none right now, see comment below) */
58 
59     /*
60      * Omit code points with hardcoded specialcasing properties
61      * because we do not build property UnicodeSets for them right now.
62      */
63 }
64 
65 /* data access primitives --------------------------------------------------- */
66 
67 U_CAPI const struct UCaseProps * U_EXPORT2
ucase_getSingleton(int32_t * pExceptionsLength,int32_t * pUnfoldLength)68 ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
69     *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
70     *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
71     return &ucase_props_singleton;
72 }
73 
74 U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie()75 ucase_getTrie() {
76     return &ucase_props_singleton.trie;
77 }
78 
/* pointer to the exceptions word for props: indexes csp->exceptions with the props bits above UCASE_EXC_SHIFT */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])
80 
/*
 * flagsOffset[b] is the number of 1 bits in the 8-bit value b.
 * The table is generated from the recurrence
 * bits(4*k+j) = bits(k) + bits(j) for j in 0..3, applied at three levels.
 */
#define UCASE_FLAGS_OFFSET_4(n)  (n), (n)+1, (n)+1, (n)+2
#define UCASE_FLAGS_OFFSET_16(n) UCASE_FLAGS_OFFSET_4(n), UCASE_FLAGS_OFFSET_4((n)+1), \
                                 UCASE_FLAGS_OFFSET_4((n)+1), UCASE_FLAGS_OFFSET_4((n)+2)
#define UCASE_FLAGS_OFFSET_64(n) UCASE_FLAGS_OFFSET_16(n), UCASE_FLAGS_OFFSET_16((n)+1), \
                                 UCASE_FLAGS_OFFSET_16((n)+1), UCASE_FLAGS_OFFSET_16((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_FLAGS_OFFSET_64(0), UCASE_FLAGS_OFFSET_64(1),
    UCASE_FLAGS_OFFSET_64(1), UCASE_FLAGS_OFFSET_64(2)
};
#undef UCASE_FLAGS_OFFSET_64
#undef UCASE_FLAGS_OFFSET_16
#undef UCASE_FLAGS_OFFSET_4
100 
/* nonzero (the masked bit itself) if bit idx is set in flags */
#define HAS_SLOT(flags, idx) ((1<<(idx))&(flags))
/* bit count of the flags bits below idx, looked up in flagsOffset[] */
#define SLOT_OFFSET(flags, idx) (flagsOffset[((1<<(idx))-1)&(flags)])
103 
/*
 * Read the value of optional slot idx; only meaningful where HAS_SLOT(excWord, idx).
 *
 * If UCASE_EXC_DOUBLE_SLOTS is set in excWord, every slot occupies two
 * uint16_t units which are combined as (first<<16)|second;
 * otherwise every slot is a single uint16_t.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) index of the slot to read
 * @param pExc16 (in/out) const uint16_t * as left by excWord=*pExc16++;
 *               on exit it points to the last uint16_t of the value,
 *               so +1 is the beginning of the next slot
 * @param value (out) int32_t or uint32_t lvalue that receives the slot value
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)!=0) { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    } else { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16; \
    } \
} UPRV_BLOCK_MACRO_END
123 
124 /* simple case mappings ----------------------------------------------------- */
125 
126 U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c)127 ucase_tolower(UChar32 c) {
128     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
129     if(!UCASE_HAS_EXCEPTION(props)) {
130         if(UCASE_IS_UPPER_OR_TITLE(props)) {
131             c+=UCASE_GET_DELTA(props);
132         }
133     } else {
134         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
135         uint16_t excWord=*pe++;
136         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
137             int32_t delta;
138             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
139             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
140         }
141         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
142             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
143         }
144     }
145     return c;
146 }
147 
148 U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c)149 ucase_toupper(UChar32 c) {
150     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
151     if(!UCASE_HAS_EXCEPTION(props)) {
152         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
153             c+=UCASE_GET_DELTA(props);
154         }
155     } else {
156         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
157         uint16_t excWord=*pe++;
158         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
159             int32_t delta;
160             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
161             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
162         }
163         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
164             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
165         }
166     }
167     return c;
168 }
169 
170 U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c)171 ucase_totitle(UChar32 c) {
172     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
173     if(!UCASE_HAS_EXCEPTION(props)) {
174         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
175             c+=UCASE_GET_DELTA(props);
176         }
177     } else {
178         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
179         uint16_t excWord=*pe++;
180         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
181             int32_t delta;
182             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
183             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
184         }
185         int32_t idx;
186         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
187             idx=UCASE_EXC_TITLE;
188         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
189             idx=UCASE_EXC_UPPER;
190         } else {
191             return c;
192         }
193         GET_SLOT_VALUE(excWord, idx, pe, c);
194     }
195     return c;
196 }
197 
198 static const UChar iDot[2] = { 0x69, 0x307 };
199 static const UChar jDot[2] = { 0x6a, 0x307 };
200 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
201 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
202 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
203 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
204 
205 
206 U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c,const USetAdder * sa)207 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
208     uint16_t props;
209 
210     /*
211      * Hardcode the case closure of i and its relatives and ignore the
212      * data file data for these characters.
213      * The Turkic dotless i and dotted I with their case mapping conditions
214      * and case folding option make the related characters behave specially.
215      * This code matches their closure behavior to their case folding behavior.
216      */
217 
218     switch(c) {
219     case 0x49:
220         /* regular i and I are in one equivalence class */
221         sa->add(sa->set, 0x69);
222         return;
223     case 0x69:
224         sa->add(sa->set, 0x49);
225         return;
226     case 0x130:
227         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
228         sa->addString(sa->set, iDot, 2);
229         return;
230     case 0x131:
231         /* dotless i is in a class by itself */
232         return;
233     default:
234         /* otherwise use the data file data */
235         break;
236     }
237 
238     props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
239     if(!UCASE_HAS_EXCEPTION(props)) {
240         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
241             /* add the one simple case mapping, no matter what type it is */
242             int32_t delta=UCASE_GET_DELTA(props);
243             if(delta!=0) {
244                 sa->add(sa->set, c+delta);
245             }
246         }
247     } else {
248         /*
249          * c has exceptions, so there may be multiple simple and/or
250          * full case mappings. Add them all.
251          */
252         const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
253         const UChar *closure;
254         uint16_t excWord=*pe++;
255         int32_t idx, closureLength, fullLength, length;
256 
257         pe0=pe;
258 
259         /* add all simple case mappings */
260         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
261             if(HAS_SLOT(excWord, idx)) {
262                 pe=pe0;
263                 GET_SLOT_VALUE(excWord, idx, pe, c);
264                 sa->add(sa->set, c);
265             }
266         }
267         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
268             pe=pe0;
269             int32_t delta;
270             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
271             sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
272         }
273 
274         /* get the closure string pointer & length */
275         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
276             pe=pe0;
277             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
278             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
279             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
280         } else {
281             closureLength=0;
282             closure=NULL;
283         }
284 
285         /* add the full case folding */
286         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
287             pe=pe0;
288             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
289 
290             /* start of full case mapping strings */
291             ++pe;
292 
293             fullLength&=0xffff; /* bits 16 and higher are reserved */
294 
295             /* skip the lowercase result string */
296             pe+=fullLength&UCASE_FULL_LOWER;
297             fullLength>>=4;
298 
299             /* add the full case folding string */
300             length=fullLength&0xf;
301             if(length!=0) {
302                 sa->addString(sa->set, (const UChar *)pe, length);
303                 pe+=length;
304             }
305 
306             /* skip the uppercase and titlecase strings */
307             fullLength>>=4;
308             pe+=fullLength&0xf;
309             fullLength>>=4;
310             pe+=fullLength;
311 
312             closure=(const UChar *)pe; /* behind full case mappings */
313         }
314 
315         /* add each code point in the closure string */
316         for(idx=0; idx<closureLength;) {
317             U16_NEXT_UNSAFE(closure, idx, c);
318             sa->add(sa->set, c);
319         }
320     }
321 }
322 
323 /*
324  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
325  * must be length>0 and max>0 and length<=max
326  */
327 static inline int32_t
strcmpMax(const UChar * s,int32_t length,const UChar * t,int32_t max)328 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
329     int32_t c1, c2;
330 
331     max-=length; /* we require length<=max, so no need to decrement max in the loop */
332     do {
333         c1=*s++;
334         c2=*t++;
335         if(c2==0) {
336             return 1; /* reached the end of t but not of s */
337         }
338         c1-=c2;
339         if(c1!=0) {
340             return c1; /* return difference result */
341         }
342     } while(--length>0);
343     /* ends with length==0 */
344 
345     if(max==0 || *t==0) {
346         return 0; /* equal to length of both strings */
347     } else {
348         return -max; /* return length difference */
349     }
350 }
351 
352 U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UChar * s,int32_t length,const USetAdder * sa)353 ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
354     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
355 
356     if(ucase_props_singleton.unfold==NULL || s==NULL) {
357         return false; /* no reverse case folding data, or no string */
358     }
359     if(length<=1) {
360         /* the string is too short to find any match */
361         /*
362          * more precise would be:
363          * if(!u_strHasMoreChar32Than(s, length, 1))
364          * but this does not make much practical difference because
365          * a single supplementary code point would just not be found
366          */
367         return false;
368     }
369 
370     const uint16_t *unfold=ucase_props_singleton.unfold;
371     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
372     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
373     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
374     unfold+=unfoldRowWidth;
375 
376     if(length>unfoldStringWidth) {
377         /* the string is too long to find any match */
378         return false;
379     }
380 
381     /* do a binary search for the string */
382     start=0;
383     limit=unfoldRows;
384     while(start<limit) {
385         i=(start+limit)/2;
386         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
387         result=strcmpMax(s, length, p, unfoldStringWidth);
388 
389         if(result==0) {
390             /* found the string: add each code point, and its case closure */
391             UChar32 c;
392 
393             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
394                 U16_NEXT_UNSAFE(p, i, c);
395                 sa->add(sa->set, c);
396                 ucase_addCaseClosure(c, sa);
397             }
398             return true;
399         } else if(result<0) {
400             limit=i;
401         } else /* result>0 */ {
402             start=i+1;
403         }
404     }
405 
406     return false; /* string not found */
407 }
408 
409 U_NAMESPACE_BEGIN
410 
FullCaseFoldingIterator()411 FullCaseFoldingIterator::FullCaseFoldingIterator()
412         : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
413           unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
414           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
415           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
416           currentRow(0),
417           rowCpIndex(unfoldStringWidth) {
418     unfold+=unfoldRowWidth;
419 }
420 
421 UChar32
next(UnicodeString & full)422 FullCaseFoldingIterator::next(UnicodeString &full) {
423     // Advance past the last-delivered code point.
424     const UChar *p=unfold+(currentRow*unfoldRowWidth);
425     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
426         ++currentRow;
427         p+=unfoldRowWidth;
428         rowCpIndex=unfoldStringWidth;
429     }
430     if(currentRow>=unfoldRows) { return U_SENTINEL; }
431     // Set "full" to the NUL-terminated string in the first unfold column.
432     int32_t length=unfoldStringWidth;
433     while(length>0 && p[length-1]==0) { --length; }
434     full.setTo(false, p, length);
435     // Return the code point.
436     UChar32 c;
437     U16_NEXT_UNSAFE(p, rowCpIndex, c);
438     return c;
439 }
440 
441 namespace LatinCase {
442 
// Lowercasing table with LIMIT entries, 16 per line.
// Judging by the data, the index is the code point and the value is the delta to add:
// entries 0x41..0x5A hold 32, the distance from 'A'..'Z' to 'a'..'z'.
// NOTE(review): EXC presumably marks code points that cannot use a plain delta
// and need the general case mapping code -- confirm against the LatinCase declarations.
443 const int8_t TO_LOWER_NORMAL[LIMIT] = {
444     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
445     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
446     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
447     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
448 
449     0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
450     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
451     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
452     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
453 
454     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
455     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
456     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
457     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
458 
459     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
460     32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
461     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
462     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
463 
464     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
465     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
466     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
467     EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
468 
469     0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
470     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
471     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
472     1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
473 };
474 
// Lowercasing table with LIMIT entries, 16 per line.
// Same layout as TO_LOWER_NORMAL; the data differs only in additional EXC entries
// at 0x49, 0x4A, 0xCC, 0xCD, 0x128 and 0x12E.
// NOTE(review): the name suggests this variant is for Turkic and Lithuanian
// lowercasing -- confirm against the callers.
475 const int8_t TO_LOWER_TR_LT[LIMIT] = {
476     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
477     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
478     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
479     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
480 
481     0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
482     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
483     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
484     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
485 
486     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
487     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
488     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
489     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
490 
491     32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
492     32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
493     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
495 
496     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
497     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
498     1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
499     EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
500 
501     0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
502     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
503     1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
504     1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
505 };
506 
// Uppercasing table with LIMIT entries, 16 per line.
// Judging by the data, the index is the code point and the value is the delta to add:
// entries 0x61..0x7A hold -32, the distance from 'a'..'z' to 'A'..'Z'.
// NOTE(review): EXC presumably marks code points that cannot use a plain delta
// and need the general case mapping code -- confirm against the LatinCase declarations.
507 const int8_t TO_UPPER_NORMAL[LIMIT] = {
508     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
509     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
510     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
511     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
512 
513     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
514     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
515     0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
516     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
517 
518     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
519     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
520     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
521     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
522 
523     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
524     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
525     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
526     -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
527 
528     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
529     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
530     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
531     0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
532 
533     -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
534     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
535     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
536     0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
537 };
538 
// Uppercasing table with LIMIT entries, 16 per line.
// Same layout as TO_UPPER_NORMAL; the data differs only in the entry for 0x69 ('i'),
// which is EXC here instead of -32.
// NOTE(review): the name suggests this variant is for Turkic uppercasing --
// confirm against the callers.
539 const int8_t TO_UPPER_TR[LIMIT] = {
540     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
541     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
542     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
543     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
544 
545     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
546     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547     0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
548     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,
549 
550     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
551     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
552     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
553     0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
554 
555     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
556     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
557     -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
558     -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,
559 
560     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
561     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
562     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
563     0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,
564 
565     -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
566     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
567     0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
568     0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
569 };
570 
571 }  // namespace LatinCase
572 
573 U_NAMESPACE_END
574 
575 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
576 U_CAPI int32_t U_EXPORT2
ucase_getType(UChar32 c)577 ucase_getType(UChar32 c) {
578     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
579     return UCASE_GET_TYPE(props);
580 }
581 
582 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
583 U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(UChar32 c)584 ucase_getTypeOrIgnorable(UChar32 c) {
585     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
586     return UCASE_GET_TYPE_AND_IGNORABLE(props);
587 }
588 
589 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
590 static inline int32_t
getDotType(UChar32 c)591 getDotType(UChar32 c) {
592     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
593     if(!UCASE_HAS_EXCEPTION(props)) {
594         return props&UCASE_DOT_MASK;
595     } else {
596         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
597         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
598     }
599 }
600 
601 U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c)602 ucase_isSoftDotted(UChar32 c) {
603     return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
604 }
605 
606 U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c)607 ucase_isCaseSensitive(UChar32 c) {
608     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
609     if(!UCASE_HAS_EXCEPTION(props)) {
610         return (UBool)((props&UCASE_SENSITIVE)!=0);
611     } else {
612         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
613         return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
614     }
615 }
616 
617 /* string casing ------------------------------------------------------------ */
618 
619 /*
620  * These internal functions form the core of string case mappings.
621  * They map single code points to result code points or strings and take
622  * all necessary conditions (context, locale ID, options) into account.
623  *
624  * They do not iterate over the source or write to the destination
625  * so that the same functions are useful for non-standard string storage,
626  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
627  * For the same reason, the "surrounding text" context is passed in as a
628  * UCaseContextIterator which does not make any assumptions about
629  * the underlying storage.
630  *
631  * This section contains helper functions that check for conditions
632  * in the input text surrounding the current code point
633  * according to SpecialCasing.txt.
634  *
635  * Each helper function gets the index
636  * - after the current code point if it looks at following text
637  * - before the current code point if it looks at preceding text
638  *
639  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
640  *
641  * Final_Sigma
642  *   C is preceded by a sequence consisting of
643  *     a cased letter and a case-ignorable sequence,
644  *   and C is not followed by a sequence consisting of
645  *     an ignorable sequence and then a cased letter.
646  *
647  * More_Above
648  *   C is followed by one or more characters of combining class 230 (ABOVE)
649  *   in the combining character sequence.
650  *
651  * After_Soft_Dotted
652  *   The last preceding character with combining class of zero before C
653  *   was Soft_Dotted,
654  *   and there is no intervening combining character class 230 (ABOVE).
655  *
656  * Before_Dot
657  *   C is followed by combining dot above (U+0307).
658  *   Any sequence of characters with a combining class that is neither 0 nor 230
659  *   may intervene between the current character and the combining dot above.
660  *
661  * The erratum from 2002-10-31 adds the condition
662  *
663  * After_I
664  *   The last preceding base character was an uppercase I, and there is no
665  *   intervening combining character class 230 (ABOVE).
666  *
667  *   (See Jitterbug 2344 and the comments on After_I below.)
668  *
669  * Helper definitions in Unicode 3.2 UAX 21:
670  *
671  * D1. A character C is defined to be cased
672  *     if it meets any of the following criteria:
673  *
674  *   - The general category of C is Titlecase Letter (Lt)
675  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
676  *   - Given D = NFD(C), then it is not the case that:
677  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
678  *     (This third criterion does not add any characters to the list
679  *      for Unicode 3.2. Ignored.)
680  *
681  * D2. A character C is defined to be case-ignorable
682  *     if it meets either of the following criteria:
683  *
684  *   - The general category of C is
685  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
686  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
687  *   - C is one of the following characters
688  *     U+0027 APOSTROPHE
689  *     U+00AD SOFT HYPHEN (SHY)
690  *     U+2019 RIGHT SINGLE QUOTATION MARK
691  *            (the preferred character for apostrophe)
692  *
693  * D3. A case-ignorable sequence is a sequence of
694  *     zero or more case-ignorable characters.
695  */
696 
/* case-insensitive tests for single letters; each evaluates its argument twice */
#define is_d(c) ((c)=='D' || (c)=='d')
#define is_e(c) ((c)=='E' || (c)=='e')
#define is_i(c) ((c)=='I' || (c)=='i')
#define is_l(c) ((c)=='L' || (c)=='l')
#define is_r(c) ((c)=='R' || (c)=='r')
#define is_t(c) ((c)=='T' || (c)=='t')
#define is_u(c) ((c)=='U' || (c)=='u')
#define is_y(c) ((c)=='Y' || (c)=='y')
#define is_z(c) ((c)=='Z' || (c)=='z')

/* separator? true for NUL, hyphen and underscore */
#define is_sep(c) ((c)==0 || (c)=='-' || (c)=='_')
709 
710 /**
711  * Requires non-NULL locale ID but otherwise does the equivalent of
712  * checking for language codes as if uloc_getLanguage() were called:
713  * Accepts both 2- and 3-letter codes and accepts case variants.
714  */
715 U_CFUNC int32_t
ucase_getCaseLocale(const char * locale)716 ucase_getCaseLocale(const char *locale) {
717     /*
718      * This function used to use uloc_getLanguage(), but the current code
719      * removes the dependency of this low-level code on uloc implementation code
720      * and is faster because not the whole locale ID has to be
721      * examined and copied/transformed.
722      *
723      * Because this code does not want to depend on uloc, the caller must
724      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
725      */
726     char c=*locale++;
727     // Fastpath for English "en" which is often used for default (=root locale) case mappings,
728     // and for Chinese "zh": Very common but no special case mapping behavior.
729     // Then check lowercase vs. uppercase to reduce the number of comparisons
730     // for other locales without special behavior.
731     if(c=='e') {
732         /* el or ell? */
733         c=*locale++;
734         if(is_l(c)) {
735             c=*locale++;
736             if(is_l(c)) {
737                 c=*locale;
738             }
739             if(is_sep(c)) {
740                 return UCASE_LOC_GREEK;
741             }
742         }
743         // en, es, ... -> root
744     } else if(c=='z') {
745         return UCASE_LOC_ROOT;
746 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
747     } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
748 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
749     } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
750 #else
751 #   error Unknown charset family!
752 #endif
753         // lowercase c
754         if(c=='t') {
755             /* tr or tur? */
756             c=*locale++;
757             if(is_u(c)) {
758                 c=*locale++;
759             }
760             if(is_r(c)) {
761                 c=*locale;
762                 if(is_sep(c)) {
763                     return UCASE_LOC_TURKISH;
764                 }
765             }
766         } else if(c=='a') {
767             /* az or aze? */
768             c=*locale++;
769             if(is_z(c)) {
770                 c=*locale++;
771                 if(is_e(c)) {
772                     c=*locale;
773                 }
774                 if(is_sep(c)) {
775                     return UCASE_LOC_TURKISH;
776                 }
777             }
778         } else if(c=='l') {
779             /* lt or lit? */
780             c=*locale++;
781             if(is_i(c)) {
782                 c=*locale++;
783             }
784             if(is_t(c)) {
785                 c=*locale;
786                 if(is_sep(c)) {
787                     return UCASE_LOC_LITHUANIAN;
788                 }
789             }
790         } else if(c=='n') {
791             /* nl or nld? */
792             c=*locale++;
793             if(is_l(c)) {
794                 c=*locale++;
795                 if(is_d(c)) {
796                     c=*locale;
797                 }
798                 if(is_sep(c)) {
799                     return UCASE_LOC_DUTCH;
800                 }
801             }
802         } else if(c=='h') {
803             /* hy or hye? *not* hyw */
804             c=*locale++;
805             if(is_y(c)) {
806                 c=*locale++;
807                 if(is_e(c)) {
808                     c=*locale;
809                 }
810                 if(is_sep(c)) {
811                     return UCASE_LOC_ARMENIAN;
812                 }
813             }
814         }
815     } else {
816         // uppercase c
817         // Same code as for lowercase c but also check for 'E'.
818         if(c=='T') {
819             /* tr or tur? */
820             c=*locale++;
821             if(is_u(c)) {
822                 c=*locale++;
823             }
824             if(is_r(c)) {
825                 c=*locale;
826                 if(is_sep(c)) {
827                     return UCASE_LOC_TURKISH;
828                 }
829             }
830         } else if(c=='A') {
831             /* az or aze? */
832             c=*locale++;
833             if(is_z(c)) {
834                 c=*locale++;
835                 if(is_e(c)) {
836                     c=*locale;
837                 }
838                 if(is_sep(c)) {
839                     return UCASE_LOC_TURKISH;
840                 }
841             }
842         } else if(c=='L') {
843             /* lt or lit? */
844             c=*locale++;
845             if(is_i(c)) {
846                 c=*locale++;
847             }
848             if(is_t(c)) {
849                 c=*locale;
850                 if(is_sep(c)) {
851                     return UCASE_LOC_LITHUANIAN;
852                 }
853             }
854         } else if(c=='E') {
855             /* el or ell? */
856             c=*locale++;
857             if(is_l(c)) {
858                 c=*locale++;
859                 if(is_l(c)) {
860                     c=*locale;
861                 }
862                 if(is_sep(c)) {
863                     return UCASE_LOC_GREEK;
864                 }
865             }
866         } else if(c=='N') {
867             /* nl or nld? */
868             c=*locale++;
869             if(is_l(c)) {
870                 c=*locale++;
871                 if(is_d(c)) {
872                     c=*locale;
873                 }
874                 if(is_sep(c)) {
875                     return UCASE_LOC_DUTCH;
876                 }
877             }
878         } else if(c=='H') {
879             /* hy or hye? *not* hyw */
880             c=*locale++;
881             if(is_y(c)) {
882                 c=*locale++;
883                 if(is_e(c)) {
884                     c=*locale;
885                 }
886                 if(is_sep(c)) {
887                     return UCASE_LOC_ARMENIAN;
888                 }
889             }
890         }
891     }
892     return UCASE_LOC_ROOT;
893 }
894 
895 /*
896  * Is followed by
897  *   {case-ignorable}* cased
898  * ?
899  * (dir determines looking forward/backward)
900  * If a character is case-ignorable, it is skipped regardless of whether
901  * it is also cased or not.
902  */
903 static UBool
isFollowedByCasedLetter(UCaseContextIterator * iter,void * context,int8_t dir)904 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
905     UChar32 c;
906 
907     if(iter==NULL) {
908         return false;
909     }
910 
911     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
912         int32_t type=ucase_getTypeOrIgnorable(c);
913         if(type&4) {
914             /* case-ignorable, continue with the loop */
915         } else if(type!=UCASE_NONE) {
916             return true; /* followed by cased letter */
917         } else {
918             return false; /* uncased and not case-ignorable */
919         }
920     }
921 
922     return false; /* not followed by cased letter */
923 }
924 
925 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
926 static UBool
isPrecededBySoftDotted(UCaseContextIterator * iter,void * context)927 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
928     UChar32 c;
929     int32_t dotType;
930     int8_t dir;
931 
932     if(iter==NULL) {
933         return false;
934     }
935 
936     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
937         dotType=getDotType(c);
938         if(dotType==UCASE_SOFT_DOTTED) {
939             return true; /* preceded by TYPE_i */
940         } else if(dotType!=UCASE_OTHER_ACCENT) {
941             return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
942         }
943     }
944 
945     return false; /* not preceded by TYPE_i */
946 }
947 
948 /*
949  * See Jitterbug 2344:
950  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
951  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
952  * we made those releases compatible with Unicode 3.2 which had not fixed
953  * a related bug in SpecialCasing.txt.
954  *
955  * From the Jitterbug 2344 text:
956  * ... this bug is listed as a Unicode erratum
957  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
958  * <quote>
959  * There are two errors in SpecialCasing.txt.
960  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
961  * 2. An incorrect context definition. Correct as follows:
962  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
963  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
964  * ---
965  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
966  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
967  * where the context After_I is defined as:
968  * The last preceding base character was an uppercase I, and there is no
969  * intervening combining character class 230 (ABOVE).
970  * </quote>
971  *
972  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
973  *
974  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
975  * # This matches the behavior of the canonically equivalent I-dot_above
976  *
977  * See also the description in this place in older versions of uchar.c (revision 1.100).
978  *
979  * Markus W. Scherer 2003-feb-15
980  */
981 
982 /* Is preceded by base character 'I' with no intervening cc=230 ? */
983 static UBool
isPrecededBy_I(UCaseContextIterator * iter,void * context)984 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
985     UChar32 c;
986     int32_t dotType;
987     int8_t dir;
988 
989     if(iter==NULL) {
990         return false;
991     }
992 
993     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
994         if(c==0x49) {
995             return true; /* preceded by I */
996         }
997         dotType=getDotType(c);
998         if(dotType!=UCASE_OTHER_ACCENT) {
999             return false; /* preceded by different base character (not I), or intervening cc==230 */
1000         }
1001     }
1002 
1003     return false; /* not preceded by I */
1004 }
1005 
1006 /* Is followed by one or more cc==230 ? */
1007 static UBool
isFollowedByMoreAbove(UCaseContextIterator * iter,void * context)1008 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1009     UChar32 c;
1010     int32_t dotType;
1011     int8_t dir;
1012 
1013     if(iter==NULL) {
1014         return false;
1015     }
1016 
1017     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1018         dotType=getDotType(c);
1019         if(dotType==UCASE_ABOVE) {
1020             return true; /* at least one cc==230 following */
1021         } else if(dotType!=UCASE_OTHER_ACCENT) {
1022             return false; /* next base character, no more cc==230 following */
1023         }
1024     }
1025 
1026     return false; /* no more cc==230 following */
1027 }
1028 
1029 /* Is followed by a dot above (without cc==230 in between) ? */
1030 static UBool
isFollowedByDotAbove(UCaseContextIterator * iter,void * context)1031 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1032     UChar32 c;
1033     int32_t dotType;
1034     int8_t dir;
1035 
1036     if(iter==NULL) {
1037         return false;
1038     }
1039 
1040     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1041         if(c==0x307) {
1042             return true;
1043         }
1044         dotType=getDotType(c);
1045         if(dotType!=UCASE_OTHER_ACCENT) {
1046             return false; /* next base character or cc==230 in between */
1047         }
1048     }
1049 
1050     return false; /* no dot above following */
1051 }
1052 
1053 U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t loc)1054 ucase_toFullLower(UChar32 c,
1055                   UCaseContextIterator *iter, void *context,
1056                   const UChar **pString,
1057                   int32_t loc) {
1058     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1059     U_ASSERT(c >= 0);
1060     UChar32 result=c;
1061     // Reset the output pointer in case it was uninitialized.
1062     *pString=nullptr;
1063     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1064     if(!UCASE_HAS_EXCEPTION(props)) {
1065         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1066             result=c+UCASE_GET_DELTA(props);
1067         }
1068     } else {
1069         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1070         uint16_t excWord=*pe++;
1071         int32_t full;
1072 
1073         pe2=pe;
1074 
1075         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1076             /* use hardcoded conditions and mappings */
1077 
1078             /*
1079              * Test for conditional mappings first
1080              *   (otherwise the unconditional default mappings are always taken),
1081              * then test for characters that have unconditional mappings in SpecialCasing.txt,
1082              * then get the UnicodeData.txt mappings.
1083              */
1084             if( loc==UCASE_LOC_LITHUANIAN &&
1085                     /* base characters, find accents above */
1086                     (((c==0x49 || c==0x4a || c==0x12e) &&
1087                         isFollowedByMoreAbove(iter, context)) ||
1088                     /* precomposed with accent above, no need to find one */
1089                     (c==0xcc || c==0xcd || c==0x128))
1090             ) {
1091                 /*
1092                     # Lithuanian
1093 
1094                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1095 
1096                     # Introduce an explicit dot above when lowercasing capital I's and J's
1097                     # whenever there are more accents above.
1098                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1099 
1100                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1101                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1102                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1103                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1104                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1105                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1106                  */
1107                 switch(c) {
1108                 case 0x49:  /* LATIN CAPITAL LETTER I */
1109                     *pString=iDot;
1110                     return 2;
1111                 case 0x4a:  /* LATIN CAPITAL LETTER J */
1112                     *pString=jDot;
1113                     return 2;
1114                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1115                     *pString=iOgonekDot;
1116                     return 2;
1117                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
1118                     *pString=iDotGrave;
1119                     return 3;
1120                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
1121                     *pString=iDotAcute;
1122                     return 3;
1123                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1124                     *pString=iDotTilde;
1125                     return 3;
1126                 default:
1127                     return 0; /* will not occur */
1128                 }
1129             /* # Turkish and Azeri */
1130             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1131                 /*
1132                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1133                     # The following rules handle those cases.
1134 
1135                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1136                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1137                  */
1138                 return 0x69;
1139             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
1140                 /*
1141                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1142                     # This matches the behavior of the canonically equivalent I-dot_above
1143 
1144                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1145                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1146                  */
1147                 return 0; /* remove the dot (continue without output) */
1148             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
1149                 /*
1150                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1151 
1152                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1153                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1154                  */
1155                 return 0x131;
1156             } else if(c==0x130) {
1157                 /*
1158                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
1159 
1160                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1161                  */
1162                 *pString=iDot;
1163                 return 2;
1164             } else if(  c==0x3a3 &&
1165                         !isFollowedByCasedLetter(iter, context, 1) &&
1166                         isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
1167             ) {
1168                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1169                 /*
1170                     # Special case for final form of sigma
1171 
1172                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1173                  */
1174                 return 0x3c2; /* greek small final sigma */
1175             } else {
1176                 /* no known conditional special case mapping, use a normal mapping */
1177             }
1178         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1179             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1180             full&=UCASE_FULL_LOWER;
1181             if(full!=0) {
1182                 /* set the output pointer to the lowercase mapping */
1183                 *pString=reinterpret_cast<const UChar *>(pe+1);
1184 
1185                 /* return the string length */
1186                 return full;
1187             }
1188         }
1189 
1190         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1191             int32_t delta;
1192             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1193             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1194         }
1195         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1196             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1197         }
1198     }
1199 
1200     return (result==c) ? ~result : result;
1201 }
1202 
1203 /* internal */
1204 static int32_t
toUpperOrTitle(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t loc,UBool upperNotTitle)1205 toUpperOrTitle(UChar32 c,
1206                UCaseContextIterator *iter, void *context,
1207                const UChar **pString,
1208                int32_t loc,
1209                UBool upperNotTitle) {
1210     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1211     U_ASSERT(c >= 0);
1212     UChar32 result=c;
1213     // Reset the output pointer in case it was uninitialized.
1214     *pString=nullptr;
1215     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1216     if(!UCASE_HAS_EXCEPTION(props)) {
1217         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1218             result=c+UCASE_GET_DELTA(props);
1219         }
1220     } else {
1221         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1222         uint16_t excWord=*pe++;
1223         int32_t full, idx;
1224 
1225         pe2=pe;
1226 
1227         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1228             /* use hardcoded conditions and mappings */
1229             if(loc==UCASE_LOC_TURKISH && c==0x69) {
1230                 /*
1231                     # Turkish and Azeri
1232 
1233                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1234                     # The following rules handle those cases.
1235 
1236                     # When uppercasing, i turns into a dotted capital I
1237 
1238                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1239                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1240                 */
1241                 return 0x130;
1242             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1243                 /*
1244                     # Lithuanian
1245 
1246                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1247 
1248                     # Remove DOT ABOVE after "i" with upper or titlecase
1249 
1250                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1251                  */
1252                 return 0; /* remove the dot (continue without output) */
1253             } else if(c==0x0587) {
1254                 // See ICU-13416:
1255                 // և ligature ech-yiwn
1256                 // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1257                 // but to ԵՎ=ech+vew in Eastern Armenian.
1258                 if(loc==UCASE_LOC_ARMENIAN) {
1259                     *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1260                 } else {
1261                     *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1262                 }
1263                 return 2;
1264             } else {
1265                 /* no known conditional special case mapping, use a normal mapping */
1266             }
1267         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1268             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1269 
1270             /* start of full case mapping strings */
1271             ++pe;
1272 
1273             /* skip the lowercase and case-folding result strings */
1274             pe+=full&UCASE_FULL_LOWER;
1275             full>>=4;
1276             pe+=full&0xf;
1277             full>>=4;
1278 
1279             if(upperNotTitle) {
1280                 full&=0xf;
1281             } else {
1282                 /* skip the uppercase result string */
1283                 pe+=full&0xf;
1284                 full=(full>>4)&0xf;
1285             }
1286 
1287             if(full!=0) {
1288                 /* set the output pointer to the result string */
1289                 *pString=reinterpret_cast<const UChar *>(pe);
1290 
1291                 /* return the string length */
1292                 return full;
1293             }
1294         }
1295 
1296         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1297             int32_t delta;
1298             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1299             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1300         }
1301         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1302             idx=UCASE_EXC_TITLE;
1303         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1304             /* here, titlecase is same as uppercase */
1305             idx=UCASE_EXC_UPPER;
1306         } else {
1307             return ~c;
1308         }
1309         GET_SLOT_VALUE(excWord, idx, pe2, result);
1310     }
1311 
1312     return (result==c) ? ~result : result;
1313 }
1314 
1315 U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t caseLocale)1316 ucase_toFullUpper(UChar32 c,
1317                   UCaseContextIterator *iter, void *context,
1318                   const UChar **pString,
1319                   int32_t caseLocale) {
1320     return toUpperOrTitle(c, iter, context, pString, caseLocale, true);
1321 }
1322 
1323 U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t caseLocale)1324 ucase_toFullTitle(UChar32 c,
1325                   UCaseContextIterator *iter, void *context,
1326                   const UChar **pString,
1327                   int32_t caseLocale) {
1328     return toUpperOrTitle(c, iter, context, pString, caseLocale, false);
1329 }
1330 
1331 /* case folding ------------------------------------------------------------- */
1332 
1333 /*
1334  * Case folding is similar to lowercasing.
1335  * The result may be a simple mapping, i.e., a single code point, or
1336  * a full mapping, i.e., a string.
1337  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1338  * then only the lowercase mapping is stored.
1339  *
1340  * Some special cases are hardcoded because their conditions cannot be
1341  * parsed and processed from CaseFolding.txt.
1342  *
1343  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1344 
1345 # C: common case folding, common mappings shared by both simple and full mappings.
1346 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1347 # S: simple case folding, mappings to single characters where different from F.
1348 # T: special case for uppercase I and dotted uppercase I
1349 #    - For non-Turkic languages, this mapping is normally not used.
1350 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1351 #
1352 # Usage:
1353 #  A. To do a simple case folding, use the mappings with status C + S.
1354 #  B. To do a full case folding, use the mappings with status C + F.
1355 #
1356 #    The mappings with status T can be used or omitted depending on the desired case-folding
1357 #    behavior. (The default option is to exclude them.)
1358 
1359  * Unicode 3.2 has 'T' mappings as follows:
1360 
1361 0049; T; 0131; # LATIN CAPITAL LETTER I
1362 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1363 
1364  * while the default mappings for these code points are:
1365 
1366 0049; C; 0069; # LATIN CAPITAL LETTER I
1367 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1368 
1369  * U+0130 has no simple case folding (simple-case-folds to itself).
1370  */
1371 
1372 /* return the simple case folding mapping for c */
1373 U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c,uint32_t options)1374 ucase_fold(UChar32 c, uint32_t options) {
1375     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1376     if(!UCASE_HAS_EXCEPTION(props)) {
1377         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1378             c+=UCASE_GET_DELTA(props);
1379         }
1380     } else {
1381         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1382         uint16_t excWord=*pe++;
1383         int32_t idx;
1384         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1385             /* special case folding mappings, hardcoded */
1386             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1387                 /* default mappings */
1388                 if(c==0x49) {
1389                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1390                     return 0x69;
1391                 } else if(c==0x130) {
1392                     /* no simple case folding for U+0130 */
1393                     return c;
1394                 }
1395             } else {
1396                 /* Turkic mappings */
1397                 if(c==0x49) {
1398                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1399                     return 0x131;
1400                 } else if(c==0x130) {
1401                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1402                     return 0x69;
1403                 }
1404             }
1405         }
1406         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1407             return c;
1408         }
1409         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1410             int32_t delta;
1411             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1412             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1413         }
1414         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1415             idx=UCASE_EXC_FOLD;
1416         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1417             idx=UCASE_EXC_LOWER;
1418         } else {
1419             return c;
1420         }
1421         GET_SLOT_VALUE(excWord, idx, pe, c);
1422     }
1423     return c;
1424 }
1425 
1426 /*
1427  * Issue for canonical caseless match (UAX #21):
1428  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1429  * canonical equivalence, unlike default-option casefolding.
1430  * For example, I-grave and I + grave fold to strings that are not canonically
1431  * equivalent.
1432  * For more details, see the comment in unorm_compare() in unorm.cpp
1433  * and the intermediate prototype changes for Jitterbug 2021.
1434  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1435  *
1436  * This did not get fixed because it appears that it is not possible to fix
1437  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1438  * together in a way that they still fold to common result strings.
1439  */
1440 
1441 U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,const UChar ** pString,uint32_t options)1442 ucase_toFullFolding(UChar32 c,
1443                     const UChar **pString,
1444                     uint32_t options) {
1445     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1446     U_ASSERT(c >= 0);
1447     UChar32 result=c;
1448     // Reset the output pointer in case it was uninitialized.
1449     *pString=nullptr;
1450     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1451     if(!UCASE_HAS_EXCEPTION(props)) {
1452         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1453             result=c+UCASE_GET_DELTA(props);
1454         }
1455     } else {
1456         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1457         uint16_t excWord=*pe++;
1458         int32_t full, idx;
1459 
1460         pe2=pe;
1461 
1462         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1463             /* use hardcoded conditions and mappings */
1464             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1465                 /* default mappings */
1466                 if(c==0x49) {
1467                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1468                     return 0x69;
1469                 } else if(c==0x130) {
1470                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1471                     *pString=iDot;
1472                     return 2;
1473                 }
1474             } else {
1475                 /* Turkic mappings */
1476                 if(c==0x49) {
1477                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1478                     return 0x131;
1479                 } else if(c==0x130) {
1480                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1481                     return 0x69;
1482                 }
1483             }
1484         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1485             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1486 
1487             /* start of full case mapping strings */
1488             ++pe;
1489 
1490             /* skip the lowercase result string */
1491             pe+=full&UCASE_FULL_LOWER;
1492             full=(full>>4)&0xf;
1493 
1494             if(full!=0) {
1495                 /* set the output pointer to the result string */
1496                 *pString=reinterpret_cast<const UChar *>(pe);
1497 
1498                 /* return the string length */
1499                 return full;
1500             }
1501         }
1502 
1503         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1504             return ~c;
1505         }
1506         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1507             int32_t delta;
1508             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1509             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1510         }
1511         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1512             idx=UCASE_EXC_FOLD;
1513         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1514             idx=UCASE_EXC_LOWER;
1515         } else {
1516             return ~c;
1517         }
1518         GET_SLOT_VALUE(excWord, idx, pe2, result);
1519     }
1520 
1521     return (result==c) ? ~result : result;
1522 }
1523 
1524 /* case mapping properties API ---------------------------------------------- */
1525 
1526 /* public API (see uchar.h) */
1527 
1528 U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c)1529 u_isULowercase(UChar32 c) {
1530     return (UBool)(UCASE_LOWER==ucase_getType(c));
1531 }
1532 
1533 U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c)1534 u_isUUppercase(UChar32 c) {
1535     return (UBool)(UCASE_UPPER==ucase_getType(c));
1536 }
1537 
1538 /* Transforms the Unicode character to its lower case equivalent.*/
1539 U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c)1540 u_tolower(UChar32 c) {
1541     return ucase_tolower(c);
1542 }
1543 
1544 /* Transforms the Unicode character to its upper case equivalent.*/
1545 U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c)1546 u_toupper(UChar32 c) {
1547     return ucase_toupper(c);
1548 }
1549 
1550 /* Transforms the Unicode character to its title case equivalent.*/
1551 U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c)1552 u_totitle(UChar32 c) {
1553     return ucase_totitle(c);
1554 }
1555 
1556 /* return the simple case folding mapping for c */
1557 U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c,uint32_t options)1558 u_foldCase(UChar32 c, uint32_t options) {
1559     return ucase_fold(c, options);
1560 }
1561 
1562 U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c,UProperty which)1563 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1564     /* case mapping properties */
1565     const UChar *resultString;
1566     switch(which) {
1567     case UCHAR_LOWERCASE:
1568         return (UBool)(UCASE_LOWER==ucase_getType(c));
1569     case UCHAR_UPPERCASE:
1570         return (UBool)(UCASE_UPPER==ucase_getType(c));
1571     case UCHAR_SOFT_DOTTED:
1572         return ucase_isSoftDotted(c);
1573     case UCHAR_CASE_SENSITIVE:
1574         return ucase_isCaseSensitive(c);
1575     case UCHAR_CASED:
1576         return (UBool)(UCASE_NONE!=ucase_getType(c));
1577     case UCHAR_CASE_IGNORABLE:
1578         return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
1579     /*
1580      * Note: The following Changes_When_Xyz are defined as testing whether
1581      * the NFD form of the input changes when Xyz-case-mapped.
1582      * However, this simpler implementation of these properties,
1583      * ignoring NFD, passes the tests.
1584      * The implementation needs to be changed if the tests start failing.
1585      * When that happens, optimizations should be used to work with the
1586      * per-single-code point ucase_toFullXyz() functions unless
1587      * the NFD form has more than one code point,
1588      * and the property starts set needs to be the union of the
1589      * start sets for normalization and case mappings.
1590      */
1591     case UCHAR_CHANGES_WHEN_LOWERCASED:
1592         return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1593     case UCHAR_CHANGES_WHEN_UPPERCASED:
1594         return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1595     case UCHAR_CHANGES_WHEN_TITLECASED:
1596         return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1597     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1598     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1599         return (UBool)(
1600             ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1601             ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1602             ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1603     default:
1604         return false;
1605     }
1606 }
1607