• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucase.cpp
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug30
16 *   created by: Markus W. Scherer
17 *
18 *   Low-level Unicode character/string case mapping code.
19 *   Much code moved here (and modified) from uchar.c.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/udata.h" /* UDataInfo */
26 #include "unicode/utf16.h"
27 #include "ucmndata.h" /* DataHeader */
28 #include "udatamem.h"
29 #include "umutex.h"
30 #include "uassert.h"
31 #include "cmemory.h"
32 #include "utrie2.h"
33 #include "ucase.h"
34 
35 struct UCaseProps {
36     UDataMemory *mem;
37     const int32_t *indexes;
38     const uint16_t *exceptions;
39     const uint16_t *unfold;
40 
41     UTrie2 trie;
42     uint8_t formatVersion[4];
43 };
44 
45 /* ucase_props_data.h is machine-generated by gencase --csource */
46 #define INCLUDED_FROM_UCASE_CPP
47 #include "ucase_props_data.h"
48 
49 /* UCaseProps singleton ----------------------------------------------------- */
50 
51 U_CAPI const UCaseProps * U_EXPORT2
ucase_getSingleton()52 ucase_getSingleton() {
53     return &ucase_props_singleton;
54 }
55 
56 /* set of property starts for UnicodeSet ------------------------------------ */
57 
58 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)59 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
60     /* add the start code point to the USet */
61     const USetAdder *sa=(const USetAdder *)context;
62     sa->add(sa->set, start);
63     return TRUE;
64 }
65 
66 U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const UCaseProps * csp,const USetAdder * sa,UErrorCode * pErrorCode)67 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
68     if(U_FAILURE(*pErrorCode)) {
69         return;
70     }
71 
72     /* add the start code point of each same-value range of the trie */
73     utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
74 
75     /* add code points with hardcoded properties, plus the ones following them */
76 
77     /* (none right now, see comment below) */
78 
79     /*
80      * Omit code points with hardcoded specialcasing properties
81      * because we do not build property UnicodeSets for them right now.
82      */
83 }
84 
85 /* data access primitives --------------------------------------------------- */
86 
/* Address of the exceptions word selected by the bits of props above UCASE_EXC_SHIFT. */
#define GET_EXCEPTIONS(csp, props) (&(csp)->exceptions[(props)>>UCASE_EXC_SHIFT])

/* Non-zero if props has the UCASE_EXCEPTION bit set. */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
90 
/*
 * flagsOffset[b] is the number of 1 bits in the 8-bit value b.
 * The table is generated by the helper macros below: each level prepends
 * two more high bits (00, 01, 10, 11) to the values of the level before.
 */
#define UCASE_BITS_2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS_4(n) UCASE_BITS_2(n), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+2)
#define UCASE_BITS_6(n) UCASE_BITS_4(n), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_BITS_6(0), UCASE_BITS_6(1), UCASE_BITS_6(1), UCASE_BITS_6(2)
};
#undef UCASE_BITS_6
#undef UCASE_BITS_4
#undef UCASE_BITS_2

/* Non-zero if bit idx is set in flags. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* Number of bits set in flags below bit idx. */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
113 
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * The macro expands to a single statement (do { ... } while(0)), so it must be
 * followed by a semicolon and is safe inside an unbraced if/else.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            /* single-width slots: one uint16_t per slot */ \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            /* double-width slots: two uint16_t per slot, high half first */ \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
132 
133 /* simple case mappings ----------------------------------------------------- */
134 
135 U_CAPI UChar32 U_EXPORT2
ucase_tolower(const UCaseProps * csp,UChar32 c)136 ucase_tolower(const UCaseProps *csp, UChar32 c) {
137     uint16_t props=UTRIE2_GET16(&csp->trie, c);
138     if(!PROPS_HAS_EXCEPTION(props)) {
139         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
140             c+=UCASE_GET_DELTA(props);
141         }
142     } else {
143         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
144         uint16_t excWord=*pe++;
145         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
146             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
147         }
148     }
149     return c;
150 }
151 
152 U_CAPI UChar32 U_EXPORT2
ucase_toupper(const UCaseProps * csp,UChar32 c)153 ucase_toupper(const UCaseProps *csp, UChar32 c) {
154     uint16_t props=UTRIE2_GET16(&csp->trie, c);
155     if(!PROPS_HAS_EXCEPTION(props)) {
156         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
157             c+=UCASE_GET_DELTA(props);
158         }
159     } else {
160         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
161         uint16_t excWord=*pe++;
162         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
163             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
164         }
165     }
166     return c;
167 }
168 
169 U_CAPI UChar32 U_EXPORT2
ucase_totitle(const UCaseProps * csp,UChar32 c)170 ucase_totitle(const UCaseProps *csp, UChar32 c) {
171     uint16_t props=UTRIE2_GET16(&csp->trie, c);
172     if(!PROPS_HAS_EXCEPTION(props)) {
173         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
174             c+=UCASE_GET_DELTA(props);
175         }
176     } else {
177         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
178         uint16_t excWord=*pe++;
179         int32_t idx;
180         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
181             idx=UCASE_EXC_TITLE;
182         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
183             idx=UCASE_EXC_UPPER;
184         } else {
185             return c;
186         }
187         GET_SLOT_VALUE(excWord, idx, pe, c);
188     }
189     return c;
190 }
191 
192 static const UChar iDot[2] = { 0x69, 0x307 };
193 static const UChar jDot[2] = { 0x6a, 0x307 };
194 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
195 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
196 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
197 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
198 
199 
200 U_CFUNC void U_EXPORT2
ucase_addCaseClosure(const UCaseProps * csp,UChar32 c,const USetAdder * sa)201 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
202     uint16_t props;
203 
204     /*
205      * Hardcode the case closure of i and its relatives and ignore the
206      * data file data for these characters.
207      * The Turkic dotless i and dotted I with their case mapping conditions
208      * and case folding option make the related characters behave specially.
209      * This code matches their closure behavior to their case folding behavior.
210      */
211 
212     switch(c) {
213     case 0x49:
214         /* regular i and I are in one equivalence class */
215         sa->add(sa->set, 0x69);
216         return;
217     case 0x69:
218         sa->add(sa->set, 0x49);
219         return;
220     case 0x130:
221         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
222         sa->addString(sa->set, iDot, 2);
223         return;
224     case 0x131:
225         /* dotless i is in a class by itself */
226         return;
227     default:
228         /* otherwise use the data file data */
229         break;
230     }
231 
232     props=UTRIE2_GET16(&csp->trie, c);
233     if(!PROPS_HAS_EXCEPTION(props)) {
234         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
235             /* add the one simple case mapping, no matter what type it is */
236             int32_t delta=UCASE_GET_DELTA(props);
237             if(delta!=0) {
238                 sa->add(sa->set, c+delta);
239             }
240         }
241     } else {
242         /*
243          * c has exceptions, so there may be multiple simple and/or
244          * full case mappings. Add them all.
245          */
246         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
247         const UChar *closure;
248         uint16_t excWord=*pe++;
249         int32_t idx, closureLength, fullLength, length;
250 
251         pe0=pe;
252 
253         /* add all simple case mappings */
254         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
255             if(HAS_SLOT(excWord, idx)) {
256                 pe=pe0;
257                 GET_SLOT_VALUE(excWord, idx, pe, c);
258                 sa->add(sa->set, c);
259             }
260         }
261 
262         /* get the closure string pointer & length */
263         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
264             pe=pe0;
265             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
266             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
267             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
268         } else {
269             closureLength=0;
270             closure=NULL;
271         }
272 
273         /* add the full case folding */
274         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
275             pe=pe0;
276             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
277 
278             /* start of full case mapping strings */
279             ++pe;
280 
281             fullLength&=0xffff; /* bits 16 and higher are reserved */
282 
283             /* skip the lowercase result string */
284             pe+=fullLength&UCASE_FULL_LOWER;
285             fullLength>>=4;
286 
287             /* add the full case folding string */
288             length=fullLength&0xf;
289             if(length!=0) {
290                 sa->addString(sa->set, (const UChar *)pe, length);
291                 pe+=length;
292             }
293 
294             /* skip the uppercase and titlecase strings */
295             fullLength>>=4;
296             pe+=fullLength&0xf;
297             fullLength>>=4;
298             pe+=fullLength;
299 
300             closure=(const UChar *)pe; /* behind full case mappings */
301         }
302 
303         /* add each code point in the closure string */
304         for(idx=0; idx<closureLength;) {
305             U16_NEXT_UNSAFE(closure, idx, c);
306             sa->add(sa->set, c);
307         }
308     }
309 }
310 
311 /*
312  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
313  * must be length>0 and max>0 and length<=max
314  */
315 static inline int32_t
strcmpMax(const UChar * s,int32_t length,const UChar * t,int32_t max)316 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
317     int32_t c1, c2;
318 
319     max-=length; /* we require length<=max, so no need to decrement max in the loop */
320     do {
321         c1=*s++;
322         c2=*t++;
323         if(c2==0) {
324             return 1; /* reached the end of t but not of s */
325         }
326         c1-=c2;
327         if(c1!=0) {
328             return c1; /* return difference result */
329         }
330     } while(--length>0);
331     /* ends with length==0 */
332 
333     if(max==0 || *t==0) {
334         return 0; /* equal to length of both strings */
335     } else {
336         return -max; /* return lengh difference */
337     }
338 }
339 
340 U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UCaseProps * csp,const UChar * s,int32_t length,const USetAdder * sa)341 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
342     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
343 
344     if(csp->unfold==NULL || s==NULL) {
345         return FALSE; /* no reverse case folding data, or no string */
346     }
347     if(length<=1) {
348         /* the string is too short to find any match */
349         /*
350          * more precise would be:
351          * if(!u_strHasMoreChar32Than(s, length, 1))
352          * but this does not make much practical difference because
353          * a single supplementary code point would just not be found
354          */
355         return FALSE;
356     }
357 
358     const uint16_t *unfold=csp->unfold;
359     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
360     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
361     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
362     unfold+=unfoldRowWidth;
363 
364     if(length>unfoldStringWidth) {
365         /* the string is too long to find any match */
366         return FALSE;
367     }
368 
369     /* do a binary search for the string */
370     start=0;
371     limit=unfoldRows;
372     while(start<limit) {
373         i=(start+limit)/2;
374         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
375         result=strcmpMax(s, length, p, unfoldStringWidth);
376 
377         if(result==0) {
378             /* found the string: add each code point, and its case closure */
379             UChar32 c;
380 
381             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
382                 U16_NEXT_UNSAFE(p, i, c);
383                 sa->add(sa->set, c);
384                 ucase_addCaseClosure(csp, c, sa);
385             }
386             return TRUE;
387         } else if(result<0) {
388             limit=i;
389         } else /* result>0 */ {
390             start=i+1;
391         }
392     }
393 
394     return FALSE; /* string not found */
395 }
396 
397 U_NAMESPACE_BEGIN
398 
FullCaseFoldingIterator()399 FullCaseFoldingIterator::FullCaseFoldingIterator()
400         : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
401           unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
402           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
403           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
404           currentRow(0),
405           rowCpIndex(unfoldStringWidth) {
406     unfold+=unfoldRowWidth;
407 }
408 
409 UChar32
next(UnicodeString & full)410 FullCaseFoldingIterator::next(UnicodeString &full) {
411     // Advance past the last-delivered code point.
412     const UChar *p=unfold+(currentRow*unfoldRowWidth);
413     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
414         ++currentRow;
415         p+=unfoldRowWidth;
416         rowCpIndex=unfoldStringWidth;
417     }
418     if(currentRow>=unfoldRows) { return U_SENTINEL; }
419     // Set "full" to the NUL-terminated string in the first unfold column.
420     int32_t length=unfoldStringWidth;
421     while(length>0 && p[length-1]==0) { --length; }
422     full.setTo(FALSE, p, length);
423     // Return the code point.
424     UChar32 c;
425     U16_NEXT_UNSAFE(p, rowCpIndex, c);
426     return c;
427 }
428 
429 U_NAMESPACE_END
430 
431 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
432 U_CAPI int32_t U_EXPORT2
ucase_getType(const UCaseProps * csp,UChar32 c)433 ucase_getType(const UCaseProps *csp, UChar32 c) {
434     uint16_t props=UTRIE2_GET16(&csp->trie, c);
435     return UCASE_GET_TYPE(props);
436 }
437 
438 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
439 U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(const UCaseProps * csp,UChar32 c)440 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
441     uint16_t props=UTRIE2_GET16(&csp->trie, c);
442     return UCASE_GET_TYPE_AND_IGNORABLE(props);
443 }
444 
445 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
446 static inline int32_t
getDotType(const UCaseProps * csp,UChar32 c)447 getDotType(const UCaseProps *csp, UChar32 c) {
448     uint16_t props=UTRIE2_GET16(&csp->trie, c);
449     if(!PROPS_HAS_EXCEPTION(props)) {
450         return props&UCASE_DOT_MASK;
451     } else {
452         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
453         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
454     }
455 }
456 
457 U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(const UCaseProps * csp,UChar32 c)458 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
459     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
460 }
461 
462 U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(const UCaseProps * csp,UChar32 c)463 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
464     uint16_t props=UTRIE2_GET16(&csp->trie, c);
465     return (UBool)((props&UCASE_SENSITIVE)!=0);
466 }
467 
468 /* string casing ------------------------------------------------------------ */
469 
470 /*
471  * These internal functions form the core of string case mappings.
472  * They map single code points to result code points or strings and take
473  * all necessary conditions (context, locale ID, options) into account.
474  *
475  * They do not iterate over the source or write to the destination
476  * so that the same functions are useful for non-standard string storage,
477  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
478  * For the same reason, the "surrounding text" context is passed in as a
479  * UCaseContextIterator which does not make any assumptions about
480  * the underlying storage.
481  *
482  * This section contains helper functions that check for conditions
483  * in the input text surrounding the current code point
484  * according to SpecialCasing.txt.
485  *
486  * Each helper function gets the index
487  * - after the current code point if it looks at following text
488  * - before the current code point if it looks at preceding text
489  *
490  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
491  *
492  * Final_Sigma
493  *   C is preceded by a sequence consisting of
494  *     a cased letter and a case-ignorable sequence,
495  *   and C is not followed by a sequence consisting of
496  *     an ignorable sequence and then a cased letter.
497  *
498  * More_Above
499  *   C is followed by one or more characters of combining class 230 (ABOVE)
500  *   in the combining character sequence.
501  *
502  * After_Soft_Dotted
503  *   The last preceding character with combining class of zero before C
504  *   was Soft_Dotted,
505  *   and there is no intervening combining character class 230 (ABOVE).
506  *
507  * Before_Dot
508  *   C is followed by combining dot above (U+0307).
509  *   Any sequence of characters with a combining class that is neither 0 nor 230
510  *   may intervene between the current character and the combining dot above.
511  *
512  * The erratum from 2002-10-31 adds the condition
513  *
514  * After_I
515  *   The last preceding base character was an uppercase I, and there is no
516  *   intervening combining character class 230 (ABOVE).
517  *
518  *   (See Jitterbug 2344 and the comments on After_I below.)
519  *
520  * Helper definitions in Unicode 3.2 UAX 21:
521  *
522  * D1. A character C is defined to be cased
523  *     if it meets any of the following criteria:
524  *
525  *   - The general category of C is Titlecase Letter (Lt)
526  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
527  *   - Given D = NFD(C), then it is not the case that:
528  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
529  *     (This third criterium does not add any characters to the list
530  *      for Unicode 3.2. Ignored.)
531  *
532  * D2. A character C is defined to be case-ignorable
533  *     if it meets either of the following criteria:
534  *
535  *   - The general category of C is
536  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
537  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
538  *   - C is one of the following characters
539  *     U+0027 APOSTROPHE
540  *     U+00AD SOFT HYPHEN (SHY)
541  *     U+2019 RIGHT SINGLE QUOTATION MARK
542  *            (the preferred character for apostrophe)
543  *
544  * D3. A case-ignorable sequence is a sequence of
545  *     zero or more case-ignorable characters.
546  */
547 
/* Does c equal one ASCII letter, given in lowercase and in uppercase? */
#define is_ascii_letter(c, lower, upper) ((c)==(lower) || (c)==(upper))

#define is_a(c) is_ascii_letter(c, 'a', 'A')
#define is_d(c) is_ascii_letter(c, 'd', 'D')
#define is_e(c) is_ascii_letter(c, 'e', 'E')
#define is_i(c) is_ascii_letter(c, 'i', 'I')
#define is_l(c) is_ascii_letter(c, 'l', 'L')
#define is_n(c) is_ascii_letter(c, 'n', 'N')
#define is_r(c) is_ascii_letter(c, 'r', 'R')
#define is_t(c) is_ascii_letter(c, 't', 'T')
#define is_u(c) is_ascii_letter(c, 'u', 'U')
#define is_z(c) is_ascii_letter(c, 'z', 'Z')

/* separator? (underscore, hyphen, or the terminating NUL) */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
561 
562 /**
563  * Requires non-NULL locale ID but otherwise does the equivalent of
564  * checking for language codes as if uloc_getLanguage() were called:
565  * Accepts both 2- and 3-letter codes and accepts case variants.
566  */
567 U_CFUNC int32_t
ucase_getCaseLocale(const char * locale,int32_t * locCache)568 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
569     int32_t result;
570     char c;
571 
572     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
573         return result;
574     }
575 
576     result=UCASE_LOC_ROOT;
577 
578     /*
579      * This function used to use uloc_getLanguage(), but the current code
580      * removes the dependency of this low-level code on uloc implementation code
581      * and is faster because not the whole locale ID has to be
582      * examined and copied/transformed.
583      *
584      * Because this code does not want to depend on uloc, the caller must
585      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
586      */
587     c=*locale++;
588     if(is_t(c)) {
589         /* tr or tur? */
590         c=*locale++;
591         if(is_u(c)) {
592             c=*locale++;
593         }
594         if(is_r(c)) {
595             c=*locale;
596             if(is_sep(c)) {
597                 result=UCASE_LOC_TURKISH;
598             }
599         }
600     } else if(is_a(c)) {
601         /* az or aze? */
602         c=*locale++;
603         if(is_z(c)) {
604             c=*locale++;
605             if(is_e(c)) {
606                 c=*locale;
607             }
608             if(is_sep(c)) {
609                 result=UCASE_LOC_TURKISH;
610             }
611         }
612     } else if(is_l(c)) {
613         /* lt or lit? */
614         c=*locale++;
615         if(is_i(c)) {
616             c=*locale++;
617         }
618         if(is_t(c)) {
619             c=*locale;
620             if(is_sep(c)) {
621                 result=UCASE_LOC_LITHUANIAN;
622             }
623         }
624     } else if(is_e(c)) {
625         /* el or ell? */
626         c=*locale++;
627         if(is_l(c)) {
628             c=*locale++;
629             if(is_l(c)) {
630                 c=*locale;
631             }
632             if(is_sep(c)) {
633                 result=UCASE_LOC_GREEK;
634             }
635         }
636     } else if(is_n(c)) {
637         /* nl or nld? */
638         c=*locale++;
639         if(is_l(c)) {
640             c=*locale++;
641             if(is_d(c)) {
642                 c=*locale;
643             }
644             if(is_sep(c)) {
645                 result=UCASE_LOC_DUTCH;
646             }
647         }
648     }
649 
650     if(locCache!=NULL) {
651         *locCache=result;
652     }
653     return result;
654 }
655 
656 /*
657  * Is followed by
658  *   {case-ignorable}* cased
659  * ?
660  * (dir determines looking forward/backward)
661  * If a character is case-ignorable, it is skipped regardless of whether
662  * it is also cased or not.
663  */
664 static UBool
isFollowedByCasedLetter(const UCaseProps * csp,UCaseContextIterator * iter,void * context,int8_t dir)665 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
666     UChar32 c;
667 
668     if(iter==NULL) {
669         return FALSE;
670     }
671 
672     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
673         int32_t type=ucase_getTypeOrIgnorable(csp, c);
674         if(type&4) {
675             /* case-ignorable, continue with the loop */
676         } else if(type!=UCASE_NONE) {
677             return TRUE; /* followed by cased letter */
678         } else {
679             return FALSE; /* uncased and not case-ignorable */
680         }
681     }
682 
683     return FALSE; /* not followed by cased letter */
684 }
685 
686 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
687 static UBool
isPrecededBySoftDotted(const UCaseProps * csp,UCaseContextIterator * iter,void * context)688 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
689     UChar32 c;
690     int32_t dotType;
691     int8_t dir;
692 
693     if(iter==NULL) {
694         return FALSE;
695     }
696 
697     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
698         dotType=getDotType(csp, c);
699         if(dotType==UCASE_SOFT_DOTTED) {
700             return TRUE; /* preceded by TYPE_i */
701         } else if(dotType!=UCASE_OTHER_ACCENT) {
702             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
703         }
704     }
705 
706     return FALSE; /* not preceded by TYPE_i */
707 }
708 
709 /*
710  * See Jitterbug 2344:
711  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
712  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
713  * we made those releases compatible with Unicode 3.2 which had not fixed
714  * a related bug in SpecialCasing.txt.
715  *
716  * From the Jitterbug 2344 text:
717  * ... this bug is listed as a Unicode erratum
718  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
719  * <quote>
720  * There are two errors in SpecialCasing.txt.
721  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
722  * 2. An incorrect context definition. Correct as follows:
723  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
724  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
725  * ---
726  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
727  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
728  * where the context After_I is defined as:
729  * The last preceding base character was an uppercase I, and there is no
730  * intervening combining character class 230 (ABOVE).
731  * </quote>
732  *
733  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
734  *
735  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
736  * # This matches the behavior of the canonically equivalent I-dot_above
737  *
738  * See also the description in this place in older versions of uchar.c (revision 1.100).
739  *
740  * Markus W. Scherer 2003-feb-15
741  */
742 
743 /* Is preceded by base character 'I' with no intervening cc=230 ? */
744 static UBool
isPrecededBy_I(const UCaseProps * csp,UCaseContextIterator * iter,void * context)745 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
746     UChar32 c;
747     int32_t dotType;
748     int8_t dir;
749 
750     if(iter==NULL) {
751         return FALSE;
752     }
753 
754     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
755         if(c==0x49) {
756             return TRUE; /* preceded by I */
757         }
758         dotType=getDotType(csp, c);
759         if(dotType!=UCASE_OTHER_ACCENT) {
760             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
761         }
762     }
763 
764     return FALSE; /* not preceded by I */
765 }
766 
767 /* Is followed by one or more cc==230 ? */
768 static UBool
isFollowedByMoreAbove(const UCaseProps * csp,UCaseContextIterator * iter,void * context)769 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
770     UChar32 c;
771     int32_t dotType;
772     int8_t dir;
773 
774     if(iter==NULL) {
775         return FALSE;
776     }
777 
778     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
779         dotType=getDotType(csp, c);
780         if(dotType==UCASE_ABOVE) {
781             return TRUE; /* at least one cc==230 following */
782         } else if(dotType!=UCASE_OTHER_ACCENT) {
783             return FALSE; /* next base character, no more cc==230 following */
784         }
785     }
786 
787     return FALSE; /* no more cc==230 following */
788 }
789 
790 /* Is followed by a dot above (without cc==230 in between) ? */
791 static UBool
isFollowedByDotAbove(const UCaseProps * csp,UCaseContextIterator * iter,void * context)792 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
793     UChar32 c;
794     int32_t dotType;
795     int8_t dir;
796 
797     if(iter==NULL) {
798         return FALSE;
799     }
800 
801     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
802         if(c==0x307) {
803             return TRUE;
804         }
805         dotType=getDotType(csp, c);
806         if(dotType!=UCASE_OTHER_ACCENT) {
807             return FALSE; /* next base character or cc==230 in between */
808         }
809     }
810 
811     return FALSE; /* no dot above following */
812 }
813 
814 U_CAPI int32_t U_EXPORT2
ucase_toFullLower(const UCaseProps * csp,UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,const char * locale,int32_t * locCache)815 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
816                   UCaseContextIterator *iter, void *context,
817                   const UChar **pString,
818                   const char *locale, int32_t *locCache) {
819     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
820     U_ASSERT(c >= 0);
821     UChar32 result=c;
822     uint16_t props=UTRIE2_GET16(&csp->trie, c);
823     if(!PROPS_HAS_EXCEPTION(props)) {
824         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
825             result=c+UCASE_GET_DELTA(props);
826         }
827     } else {
828         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
829         uint16_t excWord=*pe++;
830         int32_t full;
831 
832         pe2=pe;
833 
834         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
835             /* use hardcoded conditions and mappings */
836             int32_t loc=ucase_getCaseLocale(locale, locCache);
837 
838             /*
839              * Test for conditional mappings first
840              *   (otherwise the unconditional default mappings are always taken),
841              * then test for characters that have unconditional mappings in SpecialCasing.txt,
842              * then get the UnicodeData.txt mappings.
843              */
844             if( loc==UCASE_LOC_LITHUANIAN &&
845                     /* base characters, find accents above */
846                     (((c==0x49 || c==0x4a || c==0x12e) &&
847                         isFollowedByMoreAbove(csp, iter, context)) ||
848                     /* precomposed with accent above, no need to find one */
849                     (c==0xcc || c==0xcd || c==0x128))
850             ) {
851                 /*
852                     # Lithuanian
853 
854                     # Lithuanian retains the dot in a lowercase i when followed by accents.
855 
856                     # Introduce an explicit dot above when lowercasing capital I's and J's
857                     # whenever there are more accents above.
858                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
859 
860                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
861                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
862                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
863                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
864                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
865                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
866                  */
867                 switch(c) {
868                 case 0x49:  /* LATIN CAPITAL LETTER I */
869                     *pString=iDot;
870                     return 2;
871                 case 0x4a:  /* LATIN CAPITAL LETTER J */
872                     *pString=jDot;
873                     return 2;
874                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
875                     *pString=iOgonekDot;
876                     return 2;
877                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
878                     *pString=iDotGrave;
879                     return 3;
880                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
881                     *pString=iDotAcute;
882                     return 3;
883                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
884                     *pString=iDotTilde;
885                     return 3;
886                 default:
887                     return 0; /* will not occur */
888                 }
889             /* # Turkish and Azeri */
890             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
891                 /*
892                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
893                     # The following rules handle those cases.
894 
895                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
896                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
897                  */
898                 return 0x69;
899             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
900                 /*
901                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
902                     # This matches the behavior of the canonically equivalent I-dot_above
903 
904                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
905                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
906                  */
907                 return 0; /* remove the dot (continue without output) */
908             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
909                 /*
910                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
911 
912                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
913                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
914                  */
915                 return 0x131;
916             } else if(c==0x130) {
917                 /*
918                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
919 
920                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
921                  */
922                 *pString=iDot;
923                 return 2;
924             } else if(  c==0x3a3 &&
925                         !isFollowedByCasedLetter(csp, iter, context, 1) &&
926                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
927             ) {
928                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
929                 /*
930                     # Special case for final form of sigma
931 
932                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
933                  */
934                 return 0x3c2; /* greek small final sigma */
935             } else {
936                 /* no known conditional special case mapping, use a normal mapping */
937             }
938         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
939             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
940             full&=UCASE_FULL_LOWER;
941             if(full!=0) {
942                 /* set the output pointer to the lowercase mapping */
943                 *pString=reinterpret_cast<const UChar *>(pe+1);
944 
945                 /* return the string length */
946                 return full;
947             }
948         }
949 
950         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
951             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
952         }
953     }
954 
955     return (result==c) ? ~result : result;
956 }
957 
958 /* internal */
959 static int32_t
toUpperOrTitle(const UCaseProps * csp,UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,const char * locale,int32_t * locCache,UBool upperNotTitle)960 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
961                UCaseContextIterator *iter, void *context,
962                const UChar **pString,
963                const char *locale, int32_t *locCache,
964                UBool upperNotTitle) {
965     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
966     U_ASSERT(c >= 0);
967     UChar32 result=c;
968     uint16_t props=UTRIE2_GET16(&csp->trie, c);
969     if(!PROPS_HAS_EXCEPTION(props)) {
970         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
971             result=c+UCASE_GET_DELTA(props);
972         }
973     } else {
974         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
975         uint16_t excWord=*pe++;
976         int32_t full, idx;
977 
978         pe2=pe;
979 
980         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
981             /* use hardcoded conditions and mappings */
982             int32_t loc=ucase_getCaseLocale(locale, locCache);
983 
984             if(loc==UCASE_LOC_TURKISH && c==0x69) {
985                 /*
986                     # Turkish and Azeri
987 
988                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
989                     # The following rules handle those cases.
990 
991                     # When uppercasing, i turns into a dotted capital I
992 
993                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
994                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
995                 */
996                 return 0x130;
997             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
998                 /*
999                     # Lithuanian
1000 
1001                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1002 
1003                     # Remove DOT ABOVE after "i" with upper or titlecase
1004 
1005                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1006                  */
1007                 return 0; /* remove the dot (continue without output) */
1008             } else {
1009                 /* no known conditional special case mapping, use a normal mapping */
1010             }
1011         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1012             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1013 
1014             /* start of full case mapping strings */
1015             ++pe;
1016 
1017             /* skip the lowercase and case-folding result strings */
1018             pe+=full&UCASE_FULL_LOWER;
1019             full>>=4;
1020             pe+=full&0xf;
1021             full>>=4;
1022 
1023             if(upperNotTitle) {
1024                 full&=0xf;
1025             } else {
1026                 /* skip the uppercase result string */
1027                 pe+=full&0xf;
1028                 full=(full>>4)&0xf;
1029             }
1030 
1031             if(full!=0) {
1032                 /* set the output pointer to the result string */
1033                 *pString=reinterpret_cast<const UChar *>(pe);
1034 
1035                 /* return the string length */
1036                 return full;
1037             }
1038         }
1039 
1040         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1041             idx=UCASE_EXC_TITLE;
1042         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1043             /* here, titlecase is same as uppercase */
1044             idx=UCASE_EXC_UPPER;
1045         } else {
1046             return ~c;
1047         }
1048         GET_SLOT_VALUE(excWord, idx, pe2, result);
1049     }
1050 
1051     return (result==c) ? ~result : result;
1052 }
1053 
1054 U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(const UCaseProps * csp,UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,const char * locale,int32_t * locCache)1055 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1056                   UCaseContextIterator *iter, void *context,
1057                   const UChar **pString,
1058                   const char *locale, int32_t *locCache) {
1059     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1060 }
1061 
1062 U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(const UCaseProps * csp,UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,const char * locale,int32_t * locCache)1063 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1064                   UCaseContextIterator *iter, void *context,
1065                   const UChar **pString,
1066                   const char *locale, int32_t *locCache) {
1067     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1068 }
1069 
1070 /* case folding ------------------------------------------------------------- */
1071 
1072 /*
1073  * Case folding is similar to lowercasing.
1074  * The result may be a simple mapping, i.e., a single code point, or
1075  * a full mapping, i.e., a string.
1076  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1077  * then only the lowercase mapping is stored.
1078  *
1079  * Some special cases are hardcoded because their conditions cannot be
1080  * parsed and processed from CaseFolding.txt.
1081  *
1082  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1083 
1084 # C: common case folding, common mappings shared by both simple and full mappings.
1085 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1086 # S: simple case folding, mappings to single characters where different from F.
1087 # T: special case for uppercase I and dotted uppercase I
1088 #    - For non-Turkic languages, this mapping is normally not used.
1089 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1090 #
1091 # Usage:
1092 #  A. To do a simple case folding, use the mappings with status C + S.
1093 #  B. To do a full case folding, use the mappings with status C + F.
1094 #
1095 #    The mappings with status T can be used or omitted depending on the desired case-folding
1096 #    behavior. (The default option is to exclude them.)
1097 
1098  * Unicode 3.2 has 'T' mappings as follows:
1099 
1100 0049; T; 0131; # LATIN CAPITAL LETTER I
1101 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1102 
1103  * while the default mappings for these code points are:
1104 
1105 0049; C; 0069; # LATIN CAPITAL LETTER I
1106 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1107 
1108  * U+0130 has no simple case folding (simple-case-folds to itself).
1109  */
1110 
1111 /* return the simple case folding mapping for c */
1112 U_CAPI UChar32 U_EXPORT2
ucase_fold(const UCaseProps * csp,UChar32 c,uint32_t options)1113 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1114     uint16_t props=UTRIE2_GET16(&csp->trie, c);
1115     if(!PROPS_HAS_EXCEPTION(props)) {
1116         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1117             c+=UCASE_GET_DELTA(props);
1118         }
1119     } else {
1120         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1121         uint16_t excWord=*pe++;
1122         int32_t idx;
1123         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1124             /* special case folding mappings, hardcoded */
1125             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1126                 /* default mappings */
1127                 if(c==0x49) {
1128                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1129                     return 0x69;
1130                 } else if(c==0x130) {
1131                     /* no simple case folding for U+0130 */
1132                     return c;
1133                 }
1134             } else {
1135                 /* Turkic mappings */
1136                 if(c==0x49) {
1137                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1138                     return 0x131;
1139                 } else if(c==0x130) {
1140                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1141                     return 0x69;
1142                 }
1143             }
1144         }
1145         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1146             idx=UCASE_EXC_FOLD;
1147         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1148             idx=UCASE_EXC_LOWER;
1149         } else {
1150             return c;
1151         }
1152         GET_SLOT_VALUE(excWord, idx, pe, c);
1153     }
1154     return c;
1155 }
1156 
1157 /*
1158  * Issue for canonical caseless match (UAX #21):
1159  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1160  * canonical equivalence, unlike default-option casefolding.
1161  * For example, I-grave and I + grave fold to strings that are not canonically
1162  * equivalent.
1163  * For more details, see the comment in unorm_compare() in unorm.cpp
1164  * and the intermediate prototype changes for Jitterbug 2021.
1165  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1166  *
1167  * This did not get fixed because it appears that it is not possible to fix
1168  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1169  * together in a way that they still fold to common result strings.
1170  */
1171 
1172 U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(const UCaseProps * csp,UChar32 c,const UChar ** pString,uint32_t options)1173 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1174                     const UChar **pString,
1175                     uint32_t options) {
1176     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1177     U_ASSERT(c >= 0);
1178     UChar32 result=c;
1179     uint16_t props=UTRIE2_GET16(&csp->trie, c);
1180     if(!PROPS_HAS_EXCEPTION(props)) {
1181         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1182             result=c+UCASE_GET_DELTA(props);
1183         }
1184     } else {
1185         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1186         uint16_t excWord=*pe++;
1187         int32_t full, idx;
1188 
1189         pe2=pe;
1190 
1191         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1192             /* use hardcoded conditions and mappings */
1193             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1194                 /* default mappings */
1195                 if(c==0x49) {
1196                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1197                     return 0x69;
1198                 } else if(c==0x130) {
1199                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1200                     *pString=iDot;
1201                     return 2;
1202                 }
1203             } else {
1204                 /* Turkic mappings */
1205                 if(c==0x49) {
1206                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1207                     return 0x131;
1208                 } else if(c==0x130) {
1209                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1210                     return 0x69;
1211                 }
1212             }
1213         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1214             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1215 
1216             /* start of full case mapping strings */
1217             ++pe;
1218 
1219             /* skip the lowercase result string */
1220             pe+=full&UCASE_FULL_LOWER;
1221             full=(full>>4)&0xf;
1222 
1223             if(full!=0) {
1224                 /* set the output pointer to the result string */
1225                 *pString=reinterpret_cast<const UChar *>(pe);
1226 
1227                 /* return the string length */
1228                 return full;
1229             }
1230         }
1231 
1232         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1233             idx=UCASE_EXC_FOLD;
1234         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1235             idx=UCASE_EXC_LOWER;
1236         } else {
1237             return ~c;
1238         }
1239         GET_SLOT_VALUE(excWord, idx, pe2, result);
1240     }
1241 
1242     return (result==c) ? ~result : result;
1243 }
1244 
1245 /* case mapping properties API ---------------------------------------------- */
1246 
/*
 * Pointer to the built-in case properties singleton.
 * The expansion is parenthesized so that postfix uses such as
 * GET_CASE_PROPS()->member bind to the pointer, not to the object name.
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
1248 
1249 /* public API (see uchar.h) */
1250 
1251 U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c)1252 u_isULowercase(UChar32 c) {
1253     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1254 }
1255 
1256 U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c)1257 u_isUUppercase(UChar32 c) {
1258     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1259 }
1260 
1261 /* Transforms the Unicode character to its lower case equivalent.*/
1262 U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c)1263 u_tolower(UChar32 c) {
1264     return ucase_tolower(GET_CASE_PROPS(), c);
1265 }
1266 
1267 /* Transforms the Unicode character to its upper case equivalent.*/
1268 U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c)1269 u_toupper(UChar32 c) {
1270     return ucase_toupper(GET_CASE_PROPS(), c);
1271 }
1272 
1273 /* Transforms the Unicode character to its title case equivalent.*/
1274 U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c)1275 u_totitle(UChar32 c) {
1276     return ucase_totitle(GET_CASE_PROPS(), c);
1277 }
1278 
1279 /* return the simple case folding mapping for c */
1280 U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c,uint32_t options)1281 u_foldCase(UChar32 c, uint32_t options) {
1282     return ucase_fold(GET_CASE_PROPS(), c, options);
1283 }
1284 
1285 U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c,UProperty which)1286 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1287     /* case mapping properties */
1288     const UChar *resultString;
1289     int32_t locCache;
1290     const UCaseProps *csp=GET_CASE_PROPS();
1291     if(csp==NULL) {
1292         return FALSE;
1293     }
1294     switch(which) {
1295     case UCHAR_LOWERCASE:
1296         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1297     case UCHAR_UPPERCASE:
1298         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1299     case UCHAR_SOFT_DOTTED:
1300         return ucase_isSoftDotted(csp, c);
1301     case UCHAR_CASE_SENSITIVE:
1302         return ucase_isCaseSensitive(csp, c);
1303     case UCHAR_CASED:
1304         return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
1305     case UCHAR_CASE_IGNORABLE:
1306         return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
1307     /*
1308      * Note: The following Changes_When_Xyz are defined as testing whether
1309      * the NFD form of the input changes when Xyz-case-mapped.
1310      * However, this simpler implementation of these properties,
1311      * ignoring NFD, passes the tests.
1312      * The implementation needs to be changed if the tests start failing.
1313      * When that happens, optimizations should be used to work with the
1314      * per-single-code point ucase_toFullXyz() functions unless
1315      * the NFD form has more than one code point,
1316      * and the property starts set needs to be the union of the
1317      * start sets for normalization and case mappings.
1318      */
1319     case UCHAR_CHANGES_WHEN_LOWERCASED:
1320         locCache=UCASE_LOC_ROOT;
1321         return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1322     case UCHAR_CHANGES_WHEN_UPPERCASED:
1323         locCache=UCASE_LOC_ROOT;
1324         return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1325     case UCHAR_CHANGES_WHEN_TITLECASED:
1326         locCache=UCASE_LOC_ROOT;
1327         return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1328     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1329     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1330         locCache=UCASE_LOC_ROOT;
1331         return (UBool)(
1332             ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1333             ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1334             ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1335     default:
1336         return FALSE;
1337     }
1338 }
1339