• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2004-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ucase.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2004aug30
14 *   created by: Markus W. Scherer
15 *
16 *   Low-level Unicode character/string case mapping code.
17 *   Much code moved here (and modified) from uchar.c.
18 */
19 
20 #include "unicode/utypes.h"
21 #include "unicode/unistr.h"
22 #include "unicode/uset.h"
23 #include "unicode/udata.h" /* UDataInfo */
24 #include "unicode/utf16.h"
25 #include "ucmndata.h" /* DataHeader */
26 #include "udatamem.h"
27 #include "umutex.h"
28 #include "uassert.h"
29 #include "cmemory.h"
30 #include "utrie2.h"
31 #include "ucase.h"
32 #include "ucln_cmn.h"
33 
/*
 * Runtime view of the case-mapping property data.
 * The code in this file reads the members as described on each field;
 * members without a visible reader are only named, not explained.
 */
struct UCaseProps {
    UDataMemory *mem;               /* not referenced by the code visible in this part of the file */
    const int32_t *indexes;         /* not referenced by the code visible in this part of the file */
    const uint16_t *exceptions;     /* base of the exception words; GET_EXCEPTIONS() adds props>>UCASE_EXC_SHIFT */
    const uint16_t *unfold;         /* reverse-folding table (header row, then data rows); may be NULL */

    UTrie2 trie;                    /* code point -> 16-bit properties word, read with UTRIE2_GET16() */
    uint8_t formatVersion[4];       /* not referenced by the code visible in this part of the file */
};
43 
44 /* ucase_props_data.h is machine-generated by gencase --csource */
45 #define INCLUDED_FROM_UCASE_CPP
46 #include "ucase_props_data.h"
47 
48 /* UCaseProps singleton ----------------------------------------------------- */
49 
50 U_CAPI const UCaseProps * U_EXPORT2
ucase_getSingleton()51 ucase_getSingleton() {
52     return &ucase_props_singleton;
53 }
54 
55 /* set of property starts for UnicodeSet ------------------------------------ */
56 
57 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)58 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
59     /* add the start code point to the USet */
60     const USetAdder *sa=(const USetAdder *)context;
61     sa->add(sa->set, start);
62     return TRUE;
63 }
64 
65 U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const UCaseProps * csp,const USetAdder * sa,UErrorCode * pErrorCode)66 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
67     if(U_FAILURE(*pErrorCode)) {
68         return;
69     }
70 
71     /* add the start code point of each same-value range of the trie */
72     utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
73 
74     /* add code points with hardcoded properties, plus the ones following them */
75 
76     /* (none right now, see comment below) */
77 
78     /*
79      * Omit code points with hardcoded specialcasing properties
80      * because we do not build property UnicodeSets for them right now.
81      */
82 }
83 
84 /* data access primitives --------------------------------------------------- */
85 
/* Pointer to the exception words for a properties word: exceptions base plus (props>>UCASE_EXC_SHIFT). */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* Non-zero if the UCASE_EXCEPTION bit is set in the properties word. */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
89 
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 *
 * The 256 entries are generated two index bits at a time:
 * for an index 4*k+j (j in 0..3) the count is count(k) plus 0, 1, 1 or 2.
 */
#define UCASE_BIT_COUNT_2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BIT_COUNT_4(n) \
    UCASE_BIT_COUNT_2(n), UCASE_BIT_COUNT_2((n)+1), UCASE_BIT_COUNT_2((n)+1), UCASE_BIT_COUNT_2((n)+2)
#define UCASE_BIT_COUNT_6(n) \
    UCASE_BIT_COUNT_4(n), UCASE_BIT_COUNT_4((n)+1), UCASE_BIT_COUNT_4((n)+1), UCASE_BIT_COUNT_4((n)+2)

static const uint8_t flagsOffset[256]={
    UCASE_BIT_COUNT_6(0),   /* indexes 0x00..0x3f */
    UCASE_BIT_COUNT_6(1),   /* indexes 0x40..0x7f */
    UCASE_BIT_COUNT_6(1),   /* indexes 0x80..0xbf */
    UCASE_BIT_COUNT_6(2)    /* indexes 0xc0..0xff */
};

#undef UCASE_BIT_COUNT_6
#undef UCASE_BIT_COUNT_4
#undef UCASE_BIT_COUNT_2
109 
/* Non-zero if the optional slot with index idx is present according to the exception flags. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))

/* Number of present slots with an index lower than idx (bit count of the lower flag bits). */
#define SLOT_OFFSET(flags, idx) (flagsOffset[(flags)&((1<<(idx))-1)])

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * Slots are one uint16_t each, or two (high half first) when excWord has
 * UCASE_EXC_DOUBLE_SLOTS set.
 *
 * The expansion is a single statement (do ... while(0)), so the macro must be
 * followed by a semicolon and is safe inside an unbraced if/else.
 * pExc16 and value are evaluated more than once; pass plain lvalues.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
131 
132 /* simple case mappings ----------------------------------------------------- */
133 
134 U_CAPI UChar32 U_EXPORT2
ucase_tolower(const UCaseProps * csp,UChar32 c)135 ucase_tolower(const UCaseProps *csp, UChar32 c) {
136     uint16_t props=UTRIE2_GET16(&csp->trie, c);
137     if(!PROPS_HAS_EXCEPTION(props)) {
138         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
139             c+=UCASE_GET_DELTA(props);
140         }
141     } else {
142         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
143         uint16_t excWord=*pe++;
144         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
145             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
146         }
147     }
148     return c;
149 }
150 
151 U_CAPI UChar32 U_EXPORT2
ucase_toupper(const UCaseProps * csp,UChar32 c)152 ucase_toupper(const UCaseProps *csp, UChar32 c) {
153     uint16_t props=UTRIE2_GET16(&csp->trie, c);
154     if(!PROPS_HAS_EXCEPTION(props)) {
155         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
156             c+=UCASE_GET_DELTA(props);
157         }
158     } else {
159         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
160         uint16_t excWord=*pe++;
161         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
162             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
163         }
164     }
165     return c;
166 }
167 
168 U_CAPI UChar32 U_EXPORT2
ucase_totitle(const UCaseProps * csp,UChar32 c)169 ucase_totitle(const UCaseProps *csp, UChar32 c) {
170     uint16_t props=UTRIE2_GET16(&csp->trie, c);
171     if(!PROPS_HAS_EXCEPTION(props)) {
172         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
173             c+=UCASE_GET_DELTA(props);
174         }
175     } else {
176         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
177         uint16_t excWord=*pe++;
178         int32_t idx;
179         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
180             idx=UCASE_EXC_TITLE;
181         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
182             idx=UCASE_EXC_UPPER;
183         } else {
184             return c;
185         }
186         GET_SLOT_VALUE(excWord, idx, pe, c);
187     }
188     return c;
189 }
190 
191 static const UChar iDot[2] = { 0x69, 0x307 };
192 static const UChar jDot[2] = { 0x6a, 0x307 };
193 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
194 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
195 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
196 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
197 
198 
199 U_CFUNC void U_EXPORT2
ucase_addCaseClosure(const UCaseProps * csp,UChar32 c,const USetAdder * sa)200 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
201     uint16_t props;
202 
203     /*
204      * Hardcode the case closure of i and its relatives and ignore the
205      * data file data for these characters.
206      * The Turkic dotless i and dotted I with their case mapping conditions
207      * and case folding option make the related characters behave specially.
208      * This code matches their closure behavior to their case folding behavior.
209      */
210 
211     switch(c) {
212     case 0x49:
213         /* regular i and I are in one equivalence class */
214         sa->add(sa->set, 0x69);
215         return;
216     case 0x69:
217         sa->add(sa->set, 0x49);
218         return;
219     case 0x130:
220         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
221         sa->addString(sa->set, iDot, 2);
222         return;
223     case 0x131:
224         /* dotless i is in a class by itself */
225         return;
226     default:
227         /* otherwise use the data file data */
228         break;
229     }
230 
231     props=UTRIE2_GET16(&csp->trie, c);
232     if(!PROPS_HAS_EXCEPTION(props)) {
233         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
234             /* add the one simple case mapping, no matter what type it is */
235             int32_t delta=UCASE_GET_DELTA(props);
236             if(delta!=0) {
237                 sa->add(sa->set, c+delta);
238             }
239         }
240     } else {
241         /*
242          * c has exceptions, so there may be multiple simple and/or
243          * full case mappings. Add them all.
244          */
245         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
246         const UChar *closure;
247         uint16_t excWord=*pe++;
248         int32_t idx, closureLength, fullLength, length;
249 
250         pe0=pe;
251 
252         /* add all simple case mappings */
253         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
254             if(HAS_SLOT(excWord, idx)) {
255                 pe=pe0;
256                 GET_SLOT_VALUE(excWord, idx, pe, c);
257                 sa->add(sa->set, c);
258             }
259         }
260 
261         /* get the closure string pointer & length */
262         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
263             pe=pe0;
264             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
265             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
266             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
267         } else {
268             closureLength=0;
269             closure=NULL;
270         }
271 
272         /* add the full case folding */
273         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
274             pe=pe0;
275             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
276 
277             /* start of full case mapping strings */
278             ++pe;
279 
280             fullLength&=0xffff; /* bits 16 and higher are reserved */
281 
282             /* skip the lowercase result string */
283             pe+=fullLength&UCASE_FULL_LOWER;
284             fullLength>>=4;
285 
286             /* add the full case folding string */
287             length=fullLength&0xf;
288             if(length!=0) {
289                 sa->addString(sa->set, (const UChar *)pe, length);
290                 pe+=length;
291             }
292 
293             /* skip the uppercase and titlecase strings */
294             fullLength>>=4;
295             pe+=fullLength&0xf;
296             fullLength>>=4;
297             pe+=fullLength;
298 
299             closure=(const UChar *)pe; /* behind full case mappings */
300         }
301 
302         /* add each code point in the closure string */
303         for(idx=0; idx<closureLength;) {
304             U16_NEXT_UNSAFE(closure, idx, c);
305             sa->add(sa->set, c);
306         }
307     }
308 }
309 
310 /*
311  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
312  * must be length>0 and max>0 and length<=max
313  */
314 static inline int32_t
strcmpMax(const UChar * s,int32_t length,const UChar * t,int32_t max)315 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
316     int32_t c1, c2;
317 
318     max-=length; /* we require length<=max, so no need to decrement max in the loop */
319     do {
320         c1=*s++;
321         c2=*t++;
322         if(c2==0) {
323             return 1; /* reached the end of t but not of s */
324         }
325         c1-=c2;
326         if(c1!=0) {
327             return c1; /* return difference result */
328         }
329     } while(--length>0);
330     /* ends with length==0 */
331 
332     if(max==0 || *t==0) {
333         return 0; /* equal to length of both strings */
334     } else {
335         return -max; /* return lengh difference */
336     }
337 }
338 
339 U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UCaseProps * csp,const UChar * s,int32_t length,const USetAdder * sa)340 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
341     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
342 
343     if(csp->unfold==NULL || s==NULL) {
344         return FALSE; /* no reverse case folding data, or no string */
345     }
346     if(length<=1) {
347         /* the string is too short to find any match */
348         /*
349          * more precise would be:
350          * if(!u_strHasMoreChar32Than(s, length, 1))
351          * but this does not make much practical difference because
352          * a single supplementary code point would just not be found
353          */
354         return FALSE;
355     }
356 
357     const uint16_t *unfold=csp->unfold;
358     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
359     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
360     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
361     unfold+=unfoldRowWidth;
362 
363     if(length>unfoldStringWidth) {
364         /* the string is too long to find any match */
365         return FALSE;
366     }
367 
368     /* do a binary search for the string */
369     start=0;
370     limit=unfoldRows;
371     while(start<limit) {
372         i=(start+limit)/2;
373         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
374         result=strcmpMax(s, length, p, unfoldStringWidth);
375 
376         if(result==0) {
377             /* found the string: add each code point, and its case closure */
378             UChar32 c;
379 
380             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
381                 U16_NEXT_UNSAFE(p, i, c);
382                 sa->add(sa->set, c);
383                 ucase_addCaseClosure(csp, c, sa);
384             }
385             return TRUE;
386         } else if(result<0) {
387             limit=i;
388         } else /* result>0 */ {
389             start=i+1;
390         }
391     }
392 
393     return FALSE; /* string not found */
394 }
395 
396 U_NAMESPACE_BEGIN
397 
/*
 * Sets up iteration over the unfold table of ucase_props_singleton.
 * The table dimensions (row count, row width, string width) are read from the
 * header row, iteration starts at row 0 with the code point index at the
 * first column after the string, and finally "unfold" is moved past the
 * header so that it points to the first data row.
 *
 * NOTE(review): the initializers read the member "unfold", so they rely on it
 * being declared before the other members in the class definition, which is
 * not visible here.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    /* skip the header row */
    unfold+=unfoldRowWidth;
}
407 
408 UChar32
next(UnicodeString & full)409 FullCaseFoldingIterator::next(UnicodeString &full) {
410     // Advance past the last-delivered code point.
411     const UChar *p=unfold+(currentRow*unfoldRowWidth);
412     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
413         ++currentRow;
414         p+=unfoldRowWidth;
415         rowCpIndex=unfoldStringWidth;
416     }
417     if(currentRow>=unfoldRows) { return U_SENTINEL; }
418     // Set "full" to the NUL-terminated string in the first unfold column.
419     int32_t length=unfoldStringWidth;
420     while(length>0 && p[length-1]==0) { --length; }
421     full.setTo(FALSE, p, length);
422     // Return the code point.
423     UChar32 c;
424     U16_NEXT_UNSAFE(p, rowCpIndex, c);
425     return c;
426 }
427 
428 U_NAMESPACE_END
429 
430 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
431 U_CAPI int32_t U_EXPORT2
ucase_getType(const UCaseProps * csp,UChar32 c)432 ucase_getType(const UCaseProps *csp, UChar32 c) {
433     uint16_t props=UTRIE2_GET16(&csp->trie, c);
434     return UCASE_GET_TYPE(props);
435 }
436 
437 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
438 U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(const UCaseProps * csp,UChar32 c)439 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
440     uint16_t props=UTRIE2_GET16(&csp->trie, c);
441     return UCASE_GET_TYPE_AND_IGNORABLE(props);
442 }
443 
444 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
445 static inline int32_t
getDotType(const UCaseProps * csp,UChar32 c)446 getDotType(const UCaseProps *csp, UChar32 c) {
447     uint16_t props=UTRIE2_GET16(&csp->trie, c);
448     if(!PROPS_HAS_EXCEPTION(props)) {
449         return props&UCASE_DOT_MASK;
450     } else {
451         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
452         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
453     }
454 }
455 
456 U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(const UCaseProps * csp,UChar32 c)457 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
458     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
459 }
460 
461 U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(const UCaseProps * csp,UChar32 c)462 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
463     uint16_t props=UTRIE2_GET16(&csp->trie, c);
464     return (UBool)((props&UCASE_SENSITIVE)!=0);
465 }
466 
467 /* string casing ------------------------------------------------------------ */
468 
469 /*
470  * These internal functions form the core of string case mappings.
471  * They map single code points to result code points or strings and take
472  * all necessary conditions (context, locale ID, options) into account.
473  *
474  * They do not iterate over the source or write to the destination
475  * so that the same functions are useful for non-standard string storage,
476  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
477  * For the same reason, the "surrounding text" context is passed in as a
478  * UCaseContextIterator which does not make any assumptions about
479  * the underlying storage.
480  *
481  * This section contains helper functions that check for conditions
482  * in the input text surrounding the current code point
483  * according to SpecialCasing.txt.
484  *
485  * Each helper function gets the index
486  * - after the current code point if it looks at following text
487  * - before the current code point if it looks at preceding text
488  *
489  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
490  *
491  * Final_Sigma
492  *   C is preceded by a sequence consisting of
493  *     a cased letter and a case-ignorable sequence,
494  *   and C is not followed by a sequence consisting of
495  *     an ignorable sequence and then a cased letter.
496  *
497  * More_Above
498  *   C is followed by one or more characters of combining class 230 (ABOVE)
499  *   in the combining character sequence.
500  *
501  * After_Soft_Dotted
502  *   The last preceding character with combining class of zero before C
503  *   was Soft_Dotted,
504  *   and there is no intervening combining character class 230 (ABOVE).
505  *
506  * Before_Dot
507  *   C is followed by combining dot above (U+0307).
508  *   Any sequence of characters with a combining class that is neither 0 nor 230
509  *   may intervene between the current character and the combining dot above.
510  *
511  * The erratum from 2002-10-31 adds the condition
512  *
513  * After_I
514  *   The last preceding base character was an uppercase I, and there is no
515  *   intervening combining character class 230 (ABOVE).
516  *
517  *   (See Jitterbug 2344 and the comments on After_I below.)
518  *
519  * Helper definitions in Unicode 3.2 UAX 21:
520  *
521  * D1. A character C is defined to be cased
522  *     if it meets any of the following criteria:
523  *
524  *   - The general category of C is Titlecase Letter (Lt)
525  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
526  *   - Given D = NFD(C), then it is not the case that:
527  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
528  *     (This third criterium does not add any characters to the list
529  *      for Unicode 3.2. Ignored.)
530  *
531  * D2. A character C is defined to be case-ignorable
532  *     if it meets either of the following criteria:
533  *
534  *   - The general category of C is
535  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
536  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
537  *   - C is one of the following characters
538  *     U+0027 APOSTROPHE
539  *     U+00AD SOFT HYPHEN (SHY)
540  *     U+2019 RIGHT SINGLE QUOTATION MARK
541  *            (the preferred character for apostrophe)
542  *
543  * D3. A case-ignorable sequence is a sequence of
544  *     zero or more case-ignorable characters.
545  */
546 
/*
 * Case-insensitive tests for single letters of a locale ID.
 * Each macro evaluates its argument twice; pass expressions without side effects.
 */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_n(c) ((c)=='n' || (c)=='N')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? True for '_', '-' and the terminating NUL; evaluates its argument up to three times. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
560 
561 /**
562  * Requires non-NULL locale ID but otherwise does the equivalent of
563  * checking for language codes as if uloc_getLanguage() were called:
564  * Accepts both 2- and 3-letter codes and accepts case variants.
565  */
566 U_CFUNC int32_t
ucase_getCaseLocale(const char * locale,int32_t * locCache)567 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
568     int32_t result;
569     char c;
570 
571     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
572         return result;
573     }
574 
575     result=UCASE_LOC_ROOT;
576 
577     /*
578      * This function used to use uloc_getLanguage(), but the current code
579      * removes the dependency of this low-level code on uloc implementation code
580      * and is faster because not the whole locale ID has to be
581      * examined and copied/transformed.
582      *
583      * Because this code does not want to depend on uloc, the caller must
584      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
585      */
586     c=*locale++;
587     if(is_t(c)) {
588         /* tr or tur? */
589         c=*locale++;
590         if(is_u(c)) {
591             c=*locale++;
592         }
593         if(is_r(c)) {
594             c=*locale;
595             if(is_sep(c)) {
596                 result=UCASE_LOC_TURKISH;
597             }
598         }
599     } else if(is_a(c)) {
600         /* az or aze? */
601         c=*locale++;
602         if(is_z(c)) {
603             c=*locale++;
604             if(is_e(c)) {
605                 c=*locale;
606             }
607             if(is_sep(c)) {
608                 result=UCASE_LOC_TURKISH;
609             }
610         }
611     } else if(is_l(c)) {
612         /* lt or lit? */
613         c=*locale++;
614         if(is_i(c)) {
615             c=*locale++;
616         }
617         if(is_t(c)) {
618             c=*locale;
619             if(is_sep(c)) {
620                 result=UCASE_LOC_LITHUANIAN;
621             }
622         }
623     } else if(is_n(c)) {
624         /* nl or nld? */
625         c=*locale++;
626         if(is_l(c)) {
627             c=*locale++;
628             if(is_d(c)) {
629                 c=*locale;
630             }
631             if(is_sep(c)) {
632                 result=UCASE_LOC_DUTCH;
633             }
634         }
635     }
636 
637     if(locCache!=NULL) {
638         *locCache=result;
639     }
640     return result;
641 }
642 
643 /*
644  * Is followed by
645  *   {case-ignorable}* cased
646  * ?
647  * (dir determines looking forward/backward)
648  * If a character is case-ignorable, it is skipped regardless of whether
649  * it is also cased or not.
650  */
651 static UBool
isFollowedByCasedLetter(const UCaseProps * csp,UCaseContextIterator * iter,void * context,int8_t dir)652 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
653     UChar32 c;
654 
655     if(iter==NULL) {
656         return FALSE;
657     }
658 
659     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
660         int32_t type=ucase_getTypeOrIgnorable(csp, c);
661         if(type&4) {
662             /* case-ignorable, continue with the loop */
663         } else if(type!=UCASE_NONE) {
664             return TRUE; /* followed by cased letter */
665         } else {
666             return FALSE; /* uncased and not case-ignorable */
667         }
668     }
669 
670     return FALSE; /* not followed by cased letter */
671 }
672 
673 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
674 static UBool
isPrecededBySoftDotted(const UCaseProps * csp,UCaseContextIterator * iter,void * context)675 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
676     UChar32 c;
677     int32_t dotType;
678     int8_t dir;
679 
680     if(iter==NULL) {
681         return FALSE;
682     }
683 
684     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
685         dotType=getDotType(csp, c);
686         if(dotType==UCASE_SOFT_DOTTED) {
687             return TRUE; /* preceded by TYPE_i */
688         } else if(dotType!=UCASE_OTHER_ACCENT) {
689             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
690         }
691     }
692 
693     return FALSE; /* not preceded by TYPE_i */
694 }
695 
696 /*
697  * See Jitterbug 2344:
698  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
699  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
700  * we made those releases compatible with Unicode 3.2 which had not fixed
701  * a related bug in SpecialCasing.txt.
702  *
703  * From the Jitterbug 2344 text:
704  * ... this bug is listed as a Unicode erratum
705  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
706  * <quote>
707  * There are two errors in SpecialCasing.txt.
708  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
709  * 2. An incorrect context definition. Correct as follows:
710  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
711  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
712  * ---
713  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
714  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
715  * where the context After_I is defined as:
716  * The last preceding base character was an uppercase I, and there is no
717  * intervening combining character class 230 (ABOVE).
718  * </quote>
719  *
720  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
721  *
722  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
723  * # This matches the behavior of the canonically equivalent I-dot_above
724  *
725  * See also the description in this place in older versions of uchar.c (revision 1.100).
726  *
727  * Markus W. Scherer 2003-feb-15
728  */
729 
730 /* Is preceded by base character 'I' with no intervening cc=230 ? */
731 static UBool
isPrecededBy_I(const UCaseProps * csp,UCaseContextIterator * iter,void * context)732 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
733     UChar32 c;
734     int32_t dotType;
735     int8_t dir;
736 
737     if(iter==NULL) {
738         return FALSE;
739     }
740 
741     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
742         if(c==0x49) {
743             return TRUE; /* preceded by I */
744         }
745         dotType=getDotType(csp, c);
746         if(dotType!=UCASE_OTHER_ACCENT) {
747             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
748         }
749     }
750 
751     return FALSE; /* not preceded by I */
752 }
753 
754 /* Is followed by one or more cc==230 ? */
755 static UBool
isFollowedByMoreAbove(const UCaseProps * csp,UCaseContextIterator * iter,void * context)756 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
757     UChar32 c;
758     int32_t dotType;
759     int8_t dir;
760 
761     if(iter==NULL) {
762         return FALSE;
763     }
764 
765     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
766         dotType=getDotType(csp, c);
767         if(dotType==UCASE_ABOVE) {
768             return TRUE; /* at least one cc==230 following */
769         } else if(dotType!=UCASE_OTHER_ACCENT) {
770             return FALSE; /* next base character, no more cc==230 following */
771         }
772     }
773 
774     return FALSE; /* no more cc==230 following */
775 }
776 
777 /* Is followed by a dot above (without cc==230 in between) ? */
778 static UBool
isFollowedByDotAbove(const UCaseProps * csp,UCaseContextIterator * iter,void * context)779 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
780     UChar32 c;
781     int32_t dotType;
782     int8_t dir;
783 
784     if(iter==NULL) {
785         return FALSE;
786     }
787 
788     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
789         if(c==0x307) {
790             return TRUE;
791         }
792         dotType=getDotType(csp, c);
793         if(dotType!=UCASE_OTHER_ACCENT) {
794             return FALSE; /* next base character or cc==230 in between */
795         }
796     }
797 
798     return FALSE; /* no dot above following */
799 }
800 
801 U_CAPI int32_t U_EXPORT2
ucase_toFullLower(const UCaseProps * csp,UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,const char * locale,int32_t * locCache)802 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
803                   UCaseContextIterator *iter, void *context,
804                   const UChar **pString,
805                   const char *locale, int32_t *locCache)
806 {
807     UChar32 result=c;
808     uint16_t props=UTRIE2_GET16(&csp->trie, c);
809     if(!PROPS_HAS_EXCEPTION(props)) {
810         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
811             result=c+UCASE_GET_DELTA(props);
812         }
813     } else {
814         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
815         uint16_t excWord=*pe++;
816         int32_t full;
817 
818         pe2=pe;
819 
820         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
821             /* use hardcoded conditions and mappings */
822             int32_t loc=ucase_getCaseLocale(locale, locCache);
823 
824             /*
825              * Test for conditional mappings first
826              *   (otherwise the unconditional default mappings are always taken),
827              * then test for characters that have unconditional mappings in SpecialCasing.txt,
828              * then get the UnicodeData.txt mappings.
829              */
830             if( loc==UCASE_LOC_LITHUANIAN &&
831                     /* base characters, find accents above */
832                     (((c==0x49 || c==0x4a || c==0x12e) &&
833                         isFollowedByMoreAbove(csp, iter, context)) ||
834                     /* precomposed with accent above, no need to find one */
835                     (c==0xcc || c==0xcd || c==0x128))
836             ) {
837                 /*
838                     # Lithuanian
839 
840                     # Lithuanian retains the dot in a lowercase i when followed by accents.
841 
842                     # Introduce an explicit dot above when lowercasing capital I's and J's
843                     # whenever there are more accents above.
844                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
845 
846                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
847                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
848                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
849                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
850                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
851                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
852                  */
853                 switch(c) {
854                 case 0x49:  /* LATIN CAPITAL LETTER I */
855                     *pString=iDot;
856                     return 2;
857                 case 0x4a:  /* LATIN CAPITAL LETTER J */
858                     *pString=jDot;
859                     return 2;
860                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
861                     *pString=iOgonekDot;
862                     return 2;
863                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
864                     *pString=iDotGrave;
865                     return 3;
866                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
867                     *pString=iDotAcute;
868                     return 3;
869                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
870                     *pString=iDotTilde;
871                     return 3;
872                 default:
873                     return 0; /* will not occur */
874                 }
875             /* # Turkish and Azeri */
876             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
877                 /*
878                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
879                     # The following rules handle those cases.
880 
881                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
882                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
883                  */
884                 return 0x69;
885             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
886                 /*
887                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
888                     # This matches the behavior of the canonically equivalent I-dot_above
889 
890                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
891                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
892                  */
893                 return 0; /* remove the dot (continue without output) */
894             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
895                 /*
896                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
897 
898                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
899                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
900                  */
901                 return 0x131;
902             } else if(c==0x130) {
903                 /*
904                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
905 
906                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
907                  */
908                 *pString=iDot;
909                 return 2;
910             } else if(  c==0x3a3 &&
911                         !isFollowedByCasedLetter(csp, iter, context, 1) &&
912                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
913             ) {
914                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
915                 /*
916                     # Special case for final form of sigma
917 
918                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
919                  */
920                 return 0x3c2; /* greek small final sigma */
921             } else {
922                 /* no known conditional special case mapping, use a normal mapping */
923             }
924         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
925             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
926             full&=UCASE_FULL_LOWER;
927             if(full!=0) {
928                 /* set the output pointer to the lowercase mapping */
929                 *pString=reinterpret_cast<const UChar *>(pe+1);
930 
931                 /* return the string length */
932                 return full;
933             }
934         }
935 
936         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
937             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
938         }
939     }
940 
941     return (result==c) ? ~result : result;
942 }
943 
944 /* internal */
945 static int32_t
toUpperOrTitle(const UCaseProps * csp,UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,const char * locale,int32_t * locCache,UBool upperNotTitle)946 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
947                UCaseContextIterator *iter, void *context,
948                const UChar **pString,
949                const char *locale, int32_t *locCache,
950                UBool upperNotTitle) {
951     UChar32 result=c;
952     uint16_t props=UTRIE2_GET16(&csp->trie, c);
953     if(!PROPS_HAS_EXCEPTION(props)) {
954         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
955             result=c+UCASE_GET_DELTA(props);
956         }
957     } else {
958         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
959         uint16_t excWord=*pe++;
960         int32_t full, idx;
961 
962         pe2=pe;
963 
964         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
965             /* use hardcoded conditions and mappings */
966             int32_t loc=ucase_getCaseLocale(locale, locCache);
967 
968             if(loc==UCASE_LOC_TURKISH && c==0x69) {
969                 /*
970                     # Turkish and Azeri
971 
972                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
973                     # The following rules handle those cases.
974 
975                     # When uppercasing, i turns into a dotted capital I
976 
977                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
978                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
979                 */
980                 return 0x130;
981             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
982                 /*
983                     # Lithuanian
984 
985                     # Lithuanian retains the dot in a lowercase i when followed by accents.
986 
987                     # Remove DOT ABOVE after "i" with upper or titlecase
988 
989                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
990                  */
991                 return 0; /* remove the dot (continue without output) */
992             } else {
993                 /* no known conditional special case mapping, use a normal mapping */
994             }
995         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
996             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
997 
998             /* start of full case mapping strings */
999             ++pe;
1000 
1001             /* skip the lowercase and case-folding result strings */
1002             pe+=full&UCASE_FULL_LOWER;
1003             full>>=4;
1004             pe+=full&0xf;
1005             full>>=4;
1006 
1007             if(upperNotTitle) {
1008                 full&=0xf;
1009             } else {
1010                 /* skip the uppercase result string */
1011                 pe+=full&0xf;
1012                 full=(full>>4)&0xf;
1013             }
1014 
1015             if(full!=0) {
1016                 /* set the output pointer to the result string */
1017                 *pString=reinterpret_cast<const UChar *>(pe);
1018 
1019                 /* return the string length */
1020                 return full;
1021             }
1022         }
1023 
1024         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1025             idx=UCASE_EXC_TITLE;
1026         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1027             /* here, titlecase is same as uppercase */
1028             idx=UCASE_EXC_UPPER;
1029         } else {
1030             return ~c;
1031         }
1032         GET_SLOT_VALUE(excWord, idx, pe2, result);
1033     }
1034 
1035     return (result==c) ? ~result : result;
1036 }
1037 
1038 U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(const UCaseProps * csp,UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,const char * locale,int32_t * locCache)1039 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
1040                   UCaseContextIterator *iter, void *context,
1041                   const UChar **pString,
1042                   const char *locale, int32_t *locCache) {
1043     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
1044 }
1045 
1046 U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(const UCaseProps * csp,UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,const char * locale,int32_t * locCache)1047 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
1048                   UCaseContextIterator *iter, void *context,
1049                   const UChar **pString,
1050                   const char *locale, int32_t *locCache) {
1051     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
1052 }
1053 
1054 /* case folding ------------------------------------------------------------- */
1055 
1056 /*
1057  * Case folding is similar to lowercasing.
1058  * The result may be a simple mapping, i.e., a single code point, or
1059  * a full mapping, i.e., a string.
1060  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1061  * then only the lowercase mapping is stored.
1062  *
1063  * Some special cases are hardcoded because their conditions cannot be
1064  * parsed and processed from CaseFolding.txt.
1065  *
1066  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1067 
1068 # C: common case folding, common mappings shared by both simple and full mappings.
1069 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1070 # S: simple case folding, mappings to single characters where different from F.
1071 # T: special case for uppercase I and dotted uppercase I
1072 #    - For non-Turkic languages, this mapping is normally not used.
1073 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1074 #
1075 # Usage:
1076 #  A. To do a simple case folding, use the mappings with status C + S.
1077 #  B. To do a full case folding, use the mappings with status C + F.
1078 #
1079 #    The mappings with status T can be used or omitted depending on the desired case-folding
1080 #    behavior. (The default option is to exclude them.)
1081 
1082  * Unicode 3.2 has 'T' mappings as follows:
1083 
1084 0049; T; 0131; # LATIN CAPITAL LETTER I
1085 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1086 
1087  * while the default mappings for these code points are:
1088 
1089 0049; C; 0069; # LATIN CAPITAL LETTER I
1090 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1091 
1092  * U+0130 has no simple case folding (simple-case-folds to itself).
1093  */
1094 
1095 /* return the simple case folding mapping for c */
1096 U_CAPI UChar32 U_EXPORT2
ucase_fold(const UCaseProps * csp,UChar32 c,uint32_t options)1097 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
1098     uint16_t props=UTRIE2_GET16(&csp->trie, c);
1099     if(!PROPS_HAS_EXCEPTION(props)) {
1100         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1101             c+=UCASE_GET_DELTA(props);
1102         }
1103     } else {
1104         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
1105         uint16_t excWord=*pe++;
1106         int32_t idx;
1107         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1108             /* special case folding mappings, hardcoded */
1109             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1110                 /* default mappings */
1111                 if(c==0x49) {
1112                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1113                     return 0x69;
1114                 } else if(c==0x130) {
1115                     /* no simple case folding for U+0130 */
1116                     return c;
1117                 }
1118             } else {
1119                 /* Turkic mappings */
1120                 if(c==0x49) {
1121                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1122                     return 0x131;
1123                 } else if(c==0x130) {
1124                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1125                     return 0x69;
1126                 }
1127             }
1128         }
1129         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1130             idx=UCASE_EXC_FOLD;
1131         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1132             idx=UCASE_EXC_LOWER;
1133         } else {
1134             return c;
1135         }
1136         GET_SLOT_VALUE(excWord, idx, pe, c);
1137     }
1138     return c;
1139 }
1140 
1141 /*
1142  * Issue for canonical caseless match (UAX #21):
1143  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1144  * canonical equivalence, unlike default-option casefolding.
1145  * For example, I-grave and I + grave fold to strings that are not canonically
1146  * equivalent.
1147  * For more details, see the comment in unorm_compare() in unorm.cpp
1148  * and the intermediate prototype changes for Jitterbug 2021.
1149  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1150  *
1151  * This did not get fixed because it appears that it is not possible to fix
1152  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1153  * together in a way that they still fold to common result strings.
1154  */
1155 
1156 U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(const UCaseProps * csp,UChar32 c,const UChar ** pString,uint32_t options)1157 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
1158                     const UChar **pString,
1159                     uint32_t options)
1160 {
1161     UChar32 result=c;
1162     uint16_t props=UTRIE2_GET16(&csp->trie, c);
1163     if(!PROPS_HAS_EXCEPTION(props)) {
1164         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
1165             result=c+UCASE_GET_DELTA(props);
1166         }
1167     } else {
1168         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
1169         uint16_t excWord=*pe++;
1170         int32_t full, idx;
1171 
1172         pe2=pe;
1173 
1174         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1175             /* use hardcoded conditions and mappings */
1176             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1177                 /* default mappings */
1178                 if(c==0x49) {
1179                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1180                     return 0x69;
1181                 } else if(c==0x130) {
1182                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1183                     *pString=iDot;
1184                     return 2;
1185                 }
1186             } else {
1187                 /* Turkic mappings */
1188                 if(c==0x49) {
1189                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1190                     return 0x131;
1191                 } else if(c==0x130) {
1192                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1193                     return 0x69;
1194                 }
1195             }
1196         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1197             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1198 
1199             /* start of full case mapping strings */
1200             ++pe;
1201 
1202             /* skip the lowercase result string */
1203             pe+=full&UCASE_FULL_LOWER;
1204             full=(full>>4)&0xf;
1205 
1206             if(full!=0) {
1207                 /* set the output pointer to the result string */
1208                 *pString=reinterpret_cast<const UChar *>(pe);
1209 
1210                 /* return the string length */
1211                 return full;
1212             }
1213         }
1214 
1215         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1216             idx=UCASE_EXC_FOLD;
1217         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1218             idx=UCASE_EXC_LOWER;
1219         } else {
1220             return ~c;
1221         }
1222         GET_SLOT_VALUE(excWord, idx, pe2, result);
1223     }
1224 
1225     return (result==c) ? ~result : result;
1226 }
1227 
1228 /* case mapping properties API ---------------------------------------------- */
1229 
/*
 * Address of the built-in case properties singleton.
 * The replacement list is parenthesized so that the macro stays a single
 * operand when a postfix operator follows it, e.g. GET_CASE_PROPS()->trie.
 */
#define GET_CASE_PROPS() (&ucase_props_singleton)
1231 
1232 /* public API (see uchar.h) */
1233 
1234 U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c)1235 u_isULowercase(UChar32 c) {
1236     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
1237 }
1238 
1239 U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c)1240 u_isUUppercase(UChar32 c) {
1241     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
1242 }
1243 
1244 /* Transforms the Unicode character to its lower case equivalent.*/
1245 U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c)1246 u_tolower(UChar32 c) {
1247     return ucase_tolower(GET_CASE_PROPS(), c);
1248 }
1249 
1250 /* Transforms the Unicode character to its upper case equivalent.*/
1251 U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c)1252 u_toupper(UChar32 c) {
1253     return ucase_toupper(GET_CASE_PROPS(), c);
1254 }
1255 
1256 /* Transforms the Unicode character to its title case equivalent.*/
1257 U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c)1258 u_totitle(UChar32 c) {
1259     return ucase_totitle(GET_CASE_PROPS(), c);
1260 }
1261 
1262 /* return the simple case folding mapping for c */
1263 U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c,uint32_t options)1264 u_foldCase(UChar32 c, uint32_t options) {
1265     return ucase_fold(GET_CASE_PROPS(), c, options);
1266 }
1267 
1268 U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c,UProperty which)1269 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1270     /* case mapping properties */
1271     const UChar *resultString;
1272     int32_t locCache;
1273     const UCaseProps *csp=GET_CASE_PROPS();
1274     if(csp==NULL) {
1275         return FALSE;
1276     }
1277     switch(which) {
1278     case UCHAR_LOWERCASE:
1279         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
1280     case UCHAR_UPPERCASE:
1281         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
1282     case UCHAR_SOFT_DOTTED:
1283         return ucase_isSoftDotted(csp, c);
1284     case UCHAR_CASE_SENSITIVE:
1285         return ucase_isCaseSensitive(csp, c);
1286     case UCHAR_CASED:
1287         return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
1288     case UCHAR_CASE_IGNORABLE:
1289         return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
1290     /*
1291      * Note: The following Changes_When_Xyz are defined as testing whether
1292      * the NFD form of the input changes when Xyz-case-mapped.
1293      * However, this simpler implementation of these properties,
1294      * ignoring NFD, passes the tests.
1295      * The implementation needs to be changed if the tests start failing.
1296      * When that happens, optimizations should be used to work with the
1297      * per-single-code point ucase_toFullXyz() functions unless
1298      * the NFD form has more than one code point,
1299      * and the property starts set needs to be the union of the
1300      * start sets for normalization and case mappings.
1301      */
1302     case UCHAR_CHANGES_WHEN_LOWERCASED:
1303         locCache=UCASE_LOC_ROOT;
1304         return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1305     case UCHAR_CHANGES_WHEN_UPPERCASED:
1306         locCache=UCASE_LOC_ROOT;
1307         return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1308     case UCHAR_CHANGES_WHEN_TITLECASED:
1309         locCache=UCASE_LOC_ROOT;
1310         return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1311     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1312     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1313         locCache=UCASE_LOC_ROOT;
1314         return (UBool)(
1315             ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1316             ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
1317             ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
1318     default:
1319         return FALSE;
1320     }
1321 }
1322