• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  *
7  *   Copyright (C) 2004-2015, International Business Machines
8  *   Corporation and others.  All Rights Reserved.
9  *
10  *******************************************************************************
11  *   file name:  UCaseProps.java
12  *   encoding:   US-ASCII
13  *   tab size:   8 (not used)
14  *   indentation:4
15  *
16  *   created on: 2005jan29
17  *   created by: Markus W. Scherer
18  *
19  *   Low-level Unicode character/string case mapping code.
20  *   Java port of ucase.h/.c.
21  */
22 
23 package android.icu.impl;
24 
25 import java.io.IOException;
26 import java.nio.ByteBuffer;
27 import java.util.Iterator;
28 import java.util.Locale;
29 
30 import android.icu.lang.UCharacter;
31 import android.icu.lang.UProperty;
32 import android.icu.text.UTF16;
33 import android.icu.text.UnicodeSet;
34 import android.icu.util.ICUUncheckedIOException;
35 import android.icu.util.ULocale;
36 
37 /**
38  * @hide Only a subset of ICU is exposed in Android
39  */
40 public final class UCaseProps {
41 
42     // constructors etc. --------------------------------------------------- ***
43 
/**
 * Loads the data identified by DATA_FILE_NAME and parses it into this object.
 * Port of ucase_openProps().
 *
 * @throws IOException if readData() rejects the data
 */
private UCaseProps() throws IOException {
    readData(ICUBinary.getRequiredData(DATA_FILE_NAME));
}
49 
readData(ByteBuffer bytes)50     private final void readData(ByteBuffer bytes) throws IOException {
51         // read the header
52         ICUBinary.readHeader(bytes, FMT, new IsAcceptable());
53 
54         // read indexes[]
55         int count=bytes.getInt();
56         if(count<IX_TOP) {
57             throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
58         }
59         indexes=new int[count];
60 
61         indexes[0]=count;
62         for(int i=1; i<count; ++i) {
63             indexes[i]=bytes.getInt();
64         }
65 
66         // read the trie
67         trie=Trie2_16.createFromSerialized(bytes);
68         int expectedTrieLength=indexes[IX_TRIE_SIZE];
69         int trieLength=trie.getSerializedLength();
70         if(trieLength>expectedTrieLength) {
71             throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie");
72         }
73         // skip padding after trie bytes
74         ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength);
75 
76         // read exceptions[]
77         count=indexes[IX_EXC_LENGTH];
78         if(count>0) {
79             exceptions=ICUBinary.getString(bytes, count, 0);
80         }
81 
82         // read unfold[]
83         count=indexes[IX_UNFOLD_LENGTH];
84         if(count>0) {
85             unfold=ICUBinary.getChars(bytes, count, 0);
86         }
87     }
88 
89     // implement ICUBinary.Authenticate
90     private final static class IsAcceptable implements ICUBinary.Authenticate {
91         @Override
isDataVersionAcceptable(byte version[])92         public boolean isDataVersionAcceptable(byte version[]) {
93             return version[0]==3;
94         }
95     }
96 
97     // set of property starts for UnicodeSet ------------------------------- ***
98 
addPropertyStarts(UnicodeSet set)99     public final void addPropertyStarts(UnicodeSet set) {
100         /* add the start code point of each same-value range of the trie */
101         Iterator<Trie2.Range> trieIterator=trie.iterator();
102         Trie2.Range range;
103         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
104             set.add(range.startCodePoint);
105         }
106 
107         /* add code points with hardcoded properties, plus the ones following them */
108 
109         /* (none right now, see comment below) */
110 
111         /*
112          * Omit code points with hardcoded specialcasing properties
113          * because we do not build property UnicodeSets for them right now.
114          */
115     }
116 
117     // data access primitives ---------------------------------------------- ***
getExceptionsOffset(int props)118     private static final int getExceptionsOffset(int props) {
119         return props>>EXC_SHIFT;
120     }
121 
propsHasException(int props)122     private static final boolean propsHasException(int props) {
123         return (props&EXCEPTION)!=0;
124     }
125 
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit integer value b (0..255).
 * The 256 entries are generated once during class initialization.
 */
private static final byte flagsOffset[/*256*/]=new byte[256];
static {
    for(int b=0; b<flagsOffset.length; ++b) {
        flagsOffset[b]=(byte)Integer.bitCount(b);
    }
}
145 
hasSlot(int flags, int index)146     private static final boolean hasSlot(int flags, int index) {
147         return (flags&(1<<index))!=0;
148     }
slotOffset(int flags, int index)149     private static final byte slotOffset(int flags, int index) {
150         return flagsOffset[flags&((1<<index)-1)];
151     }
152 
153     /*
154      * Get the value of an optional-value slot where hasSlot(excWord, index).
155      *
156      * @param excWord (in) initial exceptions word
157      * @param index (in) desired slot index
158      * @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++);
159      * @return bits 31..0: slot value
160      *             63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot
161      */
getSlotValueAndOffset(int excWord, int index, int excOffset)162     private final long getSlotValueAndOffset(int excWord, int index, int excOffset) {
163         long value;
164         if((excWord&EXC_DOUBLE_SLOTS)==0) {
165             excOffset+=slotOffset(excWord, index);
166             value=exceptions.charAt(excOffset);
167         } else {
168             excOffset+=2*slotOffset(excWord, index);
169             value=exceptions.charAt(excOffset++);
170             value=(value<<16)|exceptions.charAt(excOffset);
171         }
172         return value |((long)excOffset<<32);
173     }
174 
175     /* same as getSlotValueAndOffset() but does not return the slot offset */
getSlotValue(int excWord, int index, int excOffset)176     private final int getSlotValue(int excWord, int index, int excOffset) {
177         int value;
178         if((excWord&EXC_DOUBLE_SLOTS)==0) {
179             excOffset+=slotOffset(excWord, index);
180             value=exceptions.charAt(excOffset);
181         } else {
182             excOffset+=2*slotOffset(excWord, index);
183             value=exceptions.charAt(excOffset++);
184             value=(value<<16)|exceptions.charAt(excOffset);
185         }
186         return value;
187     }
188 
189     // simple case mappings ------------------------------------------------ ***
190 
tolower(int c)191     public final int tolower(int c) {
192         int props=trie.get(c);
193         if(!propsHasException(props)) {
194             if(getTypeFromProps(props)>=UPPER) {
195                 c+=getDelta(props);
196             }
197         } else {
198             int excOffset=getExceptionsOffset(props);
199             int excWord=exceptions.charAt(excOffset++);
200             if(hasSlot(excWord, EXC_LOWER)) {
201                 c=getSlotValue(excWord, EXC_LOWER, excOffset);
202             }
203         }
204         return c;
205     }
206 
toupper(int c)207     public final int toupper(int c) {
208         int props=trie.get(c);
209         if(!propsHasException(props)) {
210             if(getTypeFromProps(props)==LOWER) {
211                 c+=getDelta(props);
212             }
213         } else {
214             int excOffset=getExceptionsOffset(props);
215             int excWord=exceptions.charAt(excOffset++);
216             if(hasSlot(excWord, EXC_UPPER)) {
217                 c=getSlotValue(excWord, EXC_UPPER, excOffset);
218             }
219         }
220         return c;
221     }
222 
totitle(int c)223     public final int totitle(int c) {
224         int props=trie.get(c);
225         if(!propsHasException(props)) {
226             if(getTypeFromProps(props)==LOWER) {
227                 c+=getDelta(props);
228             }
229         } else {
230             int excOffset=getExceptionsOffset(props);
231             int excWord=exceptions.charAt(excOffset++);
232             int index;
233             if(hasSlot(excWord, EXC_TITLE)) {
234                 index=EXC_TITLE;
235             } else if(hasSlot(excWord, EXC_UPPER)) {
236                 index=EXC_UPPER;
237             } else {
238                 return c;
239             }
240             c=getSlotValue(excWord, index, excOffset);
241         }
242         return c;
243     }
244 
245     /**
246      * Adds all simple case mappings and the full case folding for c to sa,
247      * and also adds special case closure mappings.
248      * c itself is not added.
249      * For example, the mappings
250      * - for s include long s
251      * - for sharp s include ss
252      * - for k include the Kelvin sign
253      */
addCaseClosure(int c, UnicodeSet set)254     public final void addCaseClosure(int c, UnicodeSet set) {
255         /*
256          * Hardcode the case closure of i and its relatives and ignore the
257          * data file data for these characters.
258          * The Turkic dotless i and dotted I with their case mapping conditions
259          * and case folding option make the related characters behave specially.
260          * This code matches their closure behavior to their case folding behavior.
261          */
262 
263         switch(c) {
264         case 0x49:
265             /* regular i and I are in one equivalence class */
266             set.add(0x69);
267             return;
268         case 0x69:
269             set.add(0x49);
270             return;
271         case 0x130:
272             /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
273             set.add(iDot);
274             return;
275         case 0x131:
276             /* dotless i is in a class by itself */
277             return;
278         default:
279             /* otherwise use the data file data */
280             break;
281         }
282 
283         int props=trie.get(c);
284         if(!propsHasException(props)) {
285             if(getTypeFromProps(props)!=NONE) {
286                 /* add the one simple case mapping, no matter what type it is */
287                 int delta=getDelta(props);
288                 if(delta!=0) {
289                     set.add(c+delta);
290                 }
291             }
292         } else {
293             /*
294              * c has exceptions, so there may be multiple simple and/or
295              * full case mappings. Add them all.
296              */
297             int excOffset0, excOffset=getExceptionsOffset(props);
298             int closureOffset;
299             int excWord=exceptions.charAt(excOffset++);
300             int index, closureLength, fullLength, length;
301 
302             excOffset0=excOffset;
303 
304             /* add all simple case mappings */
305             for(index=EXC_LOWER; index<=EXC_TITLE; ++index) {
306                 if(hasSlot(excWord, index)) {
307                     excOffset=excOffset0;
308                     c=getSlotValue(excWord, index, excOffset);
309                     set.add(c);
310                 }
311             }
312 
313             /* get the closure string pointer & length */
314             if(hasSlot(excWord, EXC_CLOSURE)) {
315                 excOffset=excOffset0;
316                 long value=getSlotValueAndOffset(excWord, EXC_CLOSURE, excOffset);
317                 closureLength=(int)value&CLOSURE_MAX_LENGTH; /* higher bits are reserved */
318                 closureOffset=(int)(value>>32)+1; /* behind this slot, unless there are full case mappings */
319             } else {
320                 closureLength=0;
321                 closureOffset=0;
322             }
323 
324             /* add the full case folding */
325             if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
326                 excOffset=excOffset0;
327                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
328                 fullLength=(int)value;
329 
330                 /* start of full case mapping strings */
331                 excOffset=(int)(value>>32)+1;
332 
333                 fullLength&=0xffff; /* bits 16 and higher are reserved */
334 
335                 /* skip the lowercase result string */
336                 excOffset+=fullLength&FULL_LOWER;
337                 fullLength>>=4;
338 
339                 /* add the full case folding string */
340                 length=fullLength&0xf;
341                 if(length!=0) {
342                     set.add(exceptions.substring(excOffset, excOffset+length));
343                     excOffset+=length;
344                 }
345 
346                 /* skip the uppercase and titlecase strings */
347                 fullLength>>=4;
348                 excOffset+=fullLength&0xf;
349                 fullLength>>=4;
350                 excOffset+=fullLength;
351 
352                 closureOffset=excOffset; /* behind full case mappings */
353             }
354 
355             /* add each code point in the closure string */
356             int limit=closureOffset+closureLength;
357             for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
358                 c=exceptions.codePointAt(index);
359                 set.add(c);
360             }
361         }
362     }
363 
364     /*
365      * compare s, which has a length, with t=unfold[unfoldOffset..], which has a maximum length or is NUL-terminated
366      * must be s.length()>0 and max>0 and s.length()<=max
367      */
strcmpMax(String s, int unfoldOffset, int max)368     private final int strcmpMax(String s, int unfoldOffset, int max) {
369         int i1, length, c1, c2;
370 
371         length=s.length();
372         max-=length; /* we require length<=max, so no need to decrement max in the loop */
373         i1=0;
374         do {
375             c1=s.charAt(i1++);
376             c2=unfold[unfoldOffset++];
377             if(c2==0) {
378                 return 1; /* reached the end of t but not of s */
379             }
380             c1-=c2;
381             if(c1!=0) {
382                 return c1; /* return difference result */
383             }
384         } while(--length>0);
385         /* ends with length==0 */
386 
387         if(max==0 || unfold[unfoldOffset]==0) {
388             return 0; /* equal to length of both strings */
389         } else {
390             return -max; /* return lengh difference */
391         }
392     }
393 
394     /**
395      * Maps the string to single code points and adds the associated case closure
396      * mappings.
397      * The string is mapped to code points if it is their full case folding string.
398      * In other words, this performs a reverse full case folding and then
399      * adds the case closure items of the resulting code points.
400      * If the string is found and its closure applied, then
401      * the string itself is added as well as part of its code points' closure.
402      *
403      * @return true if the string was found
404      */
addStringCaseClosure(String s, UnicodeSet set)405     public final boolean addStringCaseClosure(String s, UnicodeSet set) {
406         int i, length, start, limit, result, unfoldOffset, unfoldRows, unfoldRowWidth, unfoldStringWidth;
407 
408         if(unfold==null || s==null) {
409             return false; /* no reverse case folding data, or no string */
410         }
411         length=s.length();
412         if(length<=1) {
413             /* the string is too short to find any match */
414             /*
415              * more precise would be:
416              * if(!u_strHasMoreChar32Than(s, length, 1))
417              * but this does not make much practical difference because
418              * a single supplementary code point would just not be found
419              */
420             return false;
421         }
422 
423         unfoldRows=unfold[UNFOLD_ROWS];
424         unfoldRowWidth=unfold[UNFOLD_ROW_WIDTH];
425         unfoldStringWidth=unfold[UNFOLD_STRING_WIDTH];
426         //unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
427 
428         if(length>unfoldStringWidth) {
429             /* the string is too long to find any match */
430             return false;
431         }
432 
433         /* do a binary search for the string */
434         start=0;
435         limit=unfoldRows;
436         while(start<limit) {
437             i=(start+limit)/2;
438             unfoldOffset=((i+1)*unfoldRowWidth); // +1 to skip the header values above
439             result=strcmpMax(s, unfoldOffset, unfoldStringWidth);
440 
441             if(result==0) {
442                 /* found the string: add each code point, and its case closure */
443                 int c;
444 
445                 for(i=unfoldStringWidth; i<unfoldRowWidth && unfold[unfoldOffset+i]!=0; i+=UTF16.getCharCount(c)) {
446                     c=UTF16.charAt(unfold, unfoldOffset, unfold.length, i);
447                     set.add(c);
448                     addCaseClosure(c, set);
449                 }
450                 return true;
451             } else if(result<0) {
452                 limit=i;
453             } else /* result>0 */ {
454                 start=i+1;
455             }
456         }
457 
458         return false; /* string not found */
459     }
460 
461     /** @return NONE, LOWER, UPPER, TITLE */
getType(int c)462     public final int getType(int c) {
463         return getTypeFromProps(trie.get(c));
464     }
465 
466     /** @return like getType() but also sets IGNORABLE if c is case-ignorable */
getTypeOrIgnorable(int c)467     public final int getTypeOrIgnorable(int c) {
468         return getTypeAndIgnorableFromProps(trie.get(c));
469     }
470 
471     /** @return NO_DOT, SOFT_DOTTED, ABOVE, OTHER_ACCENT */
getDotType(int c)472     public final int getDotType(int c) {
473         int props=trie.get(c);
474         if(!propsHasException(props)) {
475             return props&DOT_MASK;
476         } else {
477             return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK;
478         }
479     }
480 
isSoftDotted(int c)481     public final boolean isSoftDotted(int c) {
482         return getDotType(c)==SOFT_DOTTED;
483     }
484 
isCaseSensitive(int c)485     public final boolean isCaseSensitive(int c) {
486         return (trie.get(c)&SENSITIVE)!=0;
487     }
488 
489     // string casing ------------------------------------------------------- ***
490 
491     /*
492      * These internal functions form the core of string case mappings.
493      * They map single code points to result code points or strings and take
494      * all necessary conditions (context, locale ID, options) into account.
495      *
496      * They do not iterate over the source or write to the destination
497      * so that the same functions are useful for non-standard string storage,
498      * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
499      * For the same reason, the "surrounding text" context is passed in as a
500      * ContextIterator which does not make any assumptions about
501      * the underlying storage.
502      *
503      * This section contains helper functions that check for conditions
504      * in the input text surrounding the current code point
505      * according to SpecialCasing.txt.
506      *
507      * Each helper function gets the index
508      * - after the current code point if it looks at following text
509      * - before the current code point if it looks at preceding text
510      *
511      * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
512      *
513      * Final_Sigma
514      *   C is preceded by a sequence consisting of
515      *     a cased letter and a case-ignorable sequence,
516      *   and C is not followed by a sequence consisting of
517      *     an ignorable sequence and then a cased letter.
518      *
519      * More_Above
520      *   C is followed by one or more characters of combining class 230 (ABOVE)
521      *   in the combining character sequence.
522      *
523      * After_Soft_Dotted
524      *   The last preceding character with combining class of zero before C
525      *   was Soft_Dotted,
526      *   and there is no intervening combining character class 230 (ABOVE).
527      *
528      * Before_Dot
529      *   C is followed by combining dot above (U+0307).
530      *   Any sequence of characters with a combining class that is neither 0 nor 230
531      *   may intervene between the current character and the combining dot above.
532      *
533      * The erratum from 2002-10-31 adds the condition
534      *
535      * After_I
536      *   The last preceding base character was an uppercase I, and there is no
537      *   intervening combining character class 230 (ABOVE).
538      *
539      *   (See Jitterbug 2344 and the comments on After_I below.)
540      *
541      * Helper definitions in Unicode 3.2 UAX 21:
542      *
543      * D1. A character C is defined to be cased
544      *     if it meets any of the following criteria:
545      *
546      *   - The general category of C is Titlecase Letter (Lt)
547      *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
548      *   - Given D = NFD(C), then it is not the case that:
549      *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
     *     (This third criterion does not add any characters to the list
551      *      for Unicode 3.2. Ignored.)
552      *
553      * D2. A character C is defined to be case-ignorable
554      *     if it meets either of the following criteria:
555      *
556      *   - The general category of C is
557      *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
558      *     Letter Modifier (Lm), or Symbol Modifier (Sk)
559      *   - C is one of the following characters
560      *     U+0027 APOSTROPHE
561      *     U+00AD SOFT HYPHEN (SHY)
562      *     U+2019 RIGHT SINGLE QUOTATION MARK
563      *            (the preferred character for apostrophe)
564      *
565      * D3. A case-ignorable sequence is a sequence of
566      *     zero or more case-ignorable characters.
567      */
568 
/**
 * Gives string case mappings access to the context (surrounding text)
 * of the character being mapped, which conditional mappings need.
 *
 * Iteration only moves away from the character in question, either
 * backward or forward. No indexes are used on this interface, and neither
 * random access nor an arbitrary change of direction is supported.
 *
 * The code point being case-mapped itself is never returned by
 * this iterator.
 */
public interface ContextIterator {
    /**
     * Restarts the iteration in the given direction.
     * @param dir >0: Begin iterating forward from the first code point
     * after the one that is being case-mapped.
     *            <0: Begin iterating backward from the first code point
     * before the one that is being case-mapped.
     */
    void reset(int dir);

    /**
     * Returns the next code point in the direction selected by the
     * most recent reset() call.
     * @return Next code point, or <0 when the iteration is done.
     */
    int next();
}
597 
598     /**
599      * For string case mappings, a single character (a code point) is mapped
600      * either to itself (in which case in-place mapping functions do nothing),
601      * or to another single code point, or to a string.
602      * Aside from the string contents, these are indicated with a single int
603      * value as follows:
604      *
605      * Mapping to self: Negative values (~self instead of -self to support U+0000)
606      *
607      * Mapping to another code point: Positive values >MAX_STRING_LENGTH
608      *
609      * Mapping to a string: The string length (0..MAX_STRING_LENGTH) is
610      * returned. Note that the string result may indeed have zero length.
611      */
612     public static final int MAX_STRING_LENGTH=0x1f;
613 
614     //ivate static final int LOC_UNKNOWN=0;
615     public static final int LOC_ROOT=1;
616     private static final int LOC_TURKISH=2;
617     private static final int LOC_LITHUANIAN=3;
618     static final int LOC_GREEK=4;
619     public static final int LOC_DUTCH=5;
620 
getCaseLocale(Locale locale)621     public static final int getCaseLocale(Locale locale) {
622         return getCaseLocale(locale.getLanguage());
623     }
getCaseLocale(ULocale locale)624     public static final int getCaseLocale(ULocale locale) {
625         return getCaseLocale(locale.getLanguage());
626     }
627     /** Accepts both 2- and 3-letter language subtags. */
getCaseLocale(String language)628     private static final int getCaseLocale(String language) {
629         // Check the subtag length to reduce the number of comparisons
630         // for locales without special behavior.
631         // Fastpath for English "en" which is often used for default (=root locale) case mappings,
632         // and for Chinese "zh": Very common but no special case mapping behavior.
633         if(language.length()==2) {
634             if(language.equals("en") || language.charAt(0)>'t') {
635                 return LOC_ROOT;
636             } else if(language.equals("tr") || language.equals("az")) {
637                 return LOC_TURKISH;
638             } else if(language.equals("el")) {
639                 return LOC_GREEK;
640             } else if(language.equals("lt")) {
641                 return LOC_LITHUANIAN;
642             } else if(language.equals("nl")) {
643                 return LOC_DUTCH;
644             }
645         } else if(language.length()==3) {
646             if(language.equals("tur") || language.equals("aze")) {
647                 return LOC_TURKISH;
648             } else if(language.equals("ell")) {
649                 return LOC_GREEK;
650             } else if(language.equals("lit")) {
651                 return LOC_LITHUANIAN;
652             } else if(language.equals("nld")) {
653                 return LOC_DUTCH;
654             }
655         }
656         return LOC_ROOT;
657     }
658 
659     /* Is followed by {case-ignorable}* cased  ? (dir determines looking forward/backward) */
isFollowedByCasedLetter(ContextIterator iter, int dir)660     private final boolean isFollowedByCasedLetter(ContextIterator iter, int dir) {
661         int c;
662 
663         if(iter==null) {
664             return false;
665         }
666 
667         for(iter.reset(dir); (c=iter.next())>=0;) {
668             int type=getTypeOrIgnorable(c);
669             if((type&4)!=0) {
670                 /* case-ignorable, continue with the loop */
671             } else if(type!=NONE) {
672                 return true; /* followed by cased letter */
673             } else {
674                 return false; /* uncased and not case-ignorable */
675             }
676         }
677 
678         return false; /* not followed by cased letter */
679     }
680 
681     /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
isPrecededBySoftDotted(ContextIterator iter)682     private final boolean isPrecededBySoftDotted(ContextIterator iter) {
683         int c;
684         int dotType;
685 
686         if(iter==null) {
687             return false;
688         }
689 
690         for(iter.reset(-1); (c=iter.next())>=0;) {
691             dotType=getDotType(c);
692             if(dotType==SOFT_DOTTED) {
693                 return true; /* preceded by TYPE_i */
694             } else if(dotType!=OTHER_ACCENT) {
695                 return false; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
696             }
697         }
698 
699         return false; /* not preceded by TYPE_i */
700     }
701 
702     /*
703      * See Jitterbug 2344:
704      * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
705      * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
706      * we made those releases compatible with Unicode 3.2 which had not fixed
707      * a related bug in SpecialCasing.txt.
708      *
709      * From the Jitterbug 2344 text:
710      * ... this bug is listed as a Unicode erratum
711      * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
712      * <quote>
713      * There are two errors in SpecialCasing.txt.
714      * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
715      * 2. An incorrect context definition. Correct as follows:
716      * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
717      * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
718      * ---
719      * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
720      * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
721      * where the context After_I is defined as:
722      * The last preceding base character was an uppercase I, and there is no
723      * intervening combining character class 230 (ABOVE).
724      * </quote>
725      *
726      * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
727      *
728      * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
729      * # This matches the behavior of the canonically equivalent I-dot_above
730      *
731      * See also the description in this place in older versions of uchar.c (revision 1.100).
732      *
733      * Markus W. Scherer 2003-feb-15
734      */
735 
736     /* Is preceded by base character 'I' with no intervening cc=230 ? */
isPrecededBy_I(ContextIterator iter)737     private final boolean isPrecededBy_I(ContextIterator iter) {
738         int c;
739         int dotType;
740 
741         if(iter==null) {
742             return false;
743         }
744 
745         for(iter.reset(-1); (c=iter.next())>=0;) {
746             if(c==0x49) {
747                 return true; /* preceded by I */
748             }
749             dotType=getDotType(c);
750             if(dotType!=OTHER_ACCENT) {
751                 return false; /* preceded by different base character (not I), or intervening cc==230 */
752             }
753         }
754 
755         return false; /* not preceded by I */
756     }
757 
758     /* Is followed by one or more cc==230 ? */
isFollowedByMoreAbove(ContextIterator iter)759     private final boolean isFollowedByMoreAbove(ContextIterator iter) {
760         int c;
761         int dotType;
762 
763         if(iter==null) {
764             return false;
765         }
766 
767         for(iter.reset(1); (c=iter.next())>=0;) {
768             dotType=getDotType(c);
769             if(dotType==ABOVE) {
770                 return true; /* at least one cc==230 following */
771             } else if(dotType!=OTHER_ACCENT) {
772                 return false; /* next base character, no more cc==230 following */
773             }
774         }
775 
776         return false; /* no more cc==230 following */
777     }
778 
779     /* Is followed by a dot above (without cc==230 in between) ? */
isFollowedByDotAbove(ContextIterator iter)780     private final boolean isFollowedByDotAbove(ContextIterator iter) {
781         int c;
782         int dotType;
783 
784         if(iter==null) {
785             return false;
786         }
787 
788         for(iter.reset(1); (c=iter.next())>=0; ) {
789             if(c==0x307) {
790                 return true;
791             }
792             dotType=getDotType(c);
793             if(dotType!=OTHER_ACCENT) {
794                 return false; /* next base character or cc==230 in between */
795             }
796         }
797 
798         return false; /* no dot above following */
799     }
800 
/*
 * Hardcoded lowercase result strings that carry an explicit U+0307 COMBINING DOT ABOVE.
 * They are appended by the conditional special-casing code in this class
 * (Lithuanian lowercasing and the I-with-dot-above mappings).
 */
private static final String
    iDot=       "i\u0307",          /* i + dot above */
    jDot=       "j\u0307",          /* j + dot above */
    iOgonekDot= "\u012f\u0307",     /* i with ogonek + dot above */
    iDotGrave=  "i\u0307\u0300",    /* i + dot above + grave */
    iDotAcute=  "i\u0307\u0301",    /* i + dot above + acute */
    iDotTilde=  "i\u0307\u0303";    /* i + dot above + tilde */
808 
809     /**
810      * Get the full lowercase mapping for c.
811      *
812      * @param c Character to be mapped.
813      * @param iter Character iterator, used for context-sensitive mappings.
814      *             See ContextIterator for details.
815      *             If iter==null then a context-independent result is returned.
816      * @param out If the mapping result is a string, then it is appended to out.
817      * @param caseLocale Case locale value from ucase_getCaseLocale().
818      * @return Output code point or string length, see MAX_STRING_LENGTH.
819      *
820      * @see ContextIterator
821      * @see #MAX_STRING_LENGTH
822      * @hide draft / provisional / internal are hidden on Android
823      */
toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale)824     public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) {
825         int result, props;
826 
827         result=c;
828         props=trie.get(c);
829         if(!propsHasException(props)) {
830             if(getTypeFromProps(props)>=UPPER) {
831                 result=c+getDelta(props);
832             }
833         } else {
834             int excOffset=getExceptionsOffset(props), excOffset2;
835             int excWord=exceptions.charAt(excOffset++);
836             int full;
837 
838             excOffset2=excOffset;
839 
840             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
841                 /* use hardcoded conditions and mappings */
842                 /*
843                  * Test for conditional mappings first
844                  *   (otherwise the unconditional default mappings are always taken),
845                  * then test for characters that have unconditional mappings in SpecialCasing.txt,
846                  * then get the UnicodeData.txt mappings.
847                  */
848                 if( caseLocale==LOC_LITHUANIAN &&
849                         /* base characters, find accents above */
850                         (((c==0x49 || c==0x4a || c==0x12e) &&
851                             isFollowedByMoreAbove(iter)) ||
852                         /* precomposed with accent above, no need to find one */
853                         (c==0xcc || c==0xcd || c==0x128))
854                 ) {
855                     /*
856                         # Lithuanian
857 
858                         # Lithuanian retains the dot in a lowercase i when followed by accents.
859 
860                         # Introduce an explicit dot above when lowercasing capital I's and J's
861                         # whenever there are more accents above.
862                         # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
863 
864                         0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
865                         004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
866                         012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
867                         00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
868                         00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
869                         0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
870                      */
871                     try {
872                         switch(c) {
873                         case 0x49:  /* LATIN CAPITAL LETTER I */
874                             out.append(iDot);
875                             return 2;
876                         case 0x4a:  /* LATIN CAPITAL LETTER J */
877                             out.append(jDot);
878                             return 2;
879                         case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
880                             out.append(iOgonekDot);
881                             return 2;
882                         case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
883                             out.append(iDotGrave);
884                             return 3;
885                         case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
886                             out.append(iDotAcute);
887                             return 3;
888                         case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
889                             out.append(iDotTilde);
890                             return 3;
891                         default:
892                             return 0; /* will not occur */
893                         }
894                     } catch (IOException e) {
895                         throw new ICUUncheckedIOException(e);
896                     }
897                 /* # Turkish and Azeri */
898                 } else if(caseLocale==LOC_TURKISH && c==0x130) {
899                     /*
900                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
901                         # The following rules handle those cases.
902 
903                         0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
904                         0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
905                      */
906                     return 0x69;
907                 } else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
908                     /*
909                         # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
910                         # This matches the behavior of the canonically equivalent I-dot_above
911 
912                         0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
913                         0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
914                      */
915                     return 0; /* remove the dot (continue without output) */
916                 } else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
917                     /*
918                         # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
919 
920                         0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
921                         0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
922                      */
923                     return 0x131;
924                 } else if(c==0x130) {
925                     /*
926                         # Preserve canonical equivalence for I with dot. Turkic is handled below.
927 
928                         0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
929                      */
930                     try {
931                         out.append(iDot);
932                         return 2;
933                     } catch (IOException e) {
934                         throw new ICUUncheckedIOException(e);
935                     }
936                 } else if(  c==0x3a3 &&
937                             !isFollowedByCasedLetter(iter, 1) &&
938                             isFollowedByCasedLetter(iter, -1) /* -1=preceded */
939                 ) {
940                     /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
941                     /*
942                         # Special case for final form of sigma
943 
944                         03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
945                      */
946                     return 0x3c2; /* greek small final sigma */
947                 } else {
948                     /* no known conditional special case mapping, use a normal mapping */
949                 }
950             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
951                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
952                 full=(int)value&FULL_LOWER;
953                 if(full!=0) {
954                     /* start of full case mapping strings */
955                     excOffset=(int)(value>>32)+1;
956 
957                     try {
958                         // append the lowercase mapping
959                         out.append(exceptions, excOffset, excOffset+full);
960 
961                         /* return the string length */
962                         return full;
963                     } catch (IOException e) {
964                         throw new ICUUncheckedIOException(e);
965                     }
966                 }
967             }
968 
969             if(hasSlot(excWord, EXC_LOWER)) {
970                 result=getSlotValue(excWord, EXC_LOWER, excOffset2);
971             }
972         }
973 
974         return (result==c) ? ~result : result;
975     }
976 
977     /* internal */
toUpperOrTitle(int c, ContextIterator iter, Appendable out, int loc, boolean upperNotTitle)978     private final int toUpperOrTitle(int c, ContextIterator iter,
979                                      Appendable out,
980                                      int loc,
981                                      boolean upperNotTitle) {
982         int result;
983         int props;
984 
985         result=c;
986         props=trie.get(c);
987         if(!propsHasException(props)) {
988             if(getTypeFromProps(props)==LOWER) {
989                 result=c+getDelta(props);
990             }
991         } else {
992             int excOffset=getExceptionsOffset(props), excOffset2;
993             int excWord=exceptions.charAt(excOffset++);
994             int full, index;
995 
996             excOffset2=excOffset;
997 
998             if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
999                 /* use hardcoded conditions and mappings */
1000                 if(loc==LOC_TURKISH && c==0x69) {
1001                     /*
1002                         # Turkish and Azeri
1003 
1004                         # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1005                         # The following rules handle those cases.
1006 
1007                         # When uppercasing, i turns into a dotted capital I
1008 
1009                         0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1010                         0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1011                     */
1012                     return 0x130;
1013                 } else if(loc==LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter)) {
1014                     /*
1015                         # Lithuanian
1016 
1017                         # Lithuanian retains the dot in a lowercase i when followed by accents.
1018 
1019                         # Remove DOT ABOVE after "i" with upper or titlecase
1020 
1021                         0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1022                      */
1023                     return 0; /* remove the dot (continue without output) */
1024                 } else {
1025                     /* no known conditional special case mapping, use a normal mapping */
1026                 }
1027             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1028                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
1029                 full=(int)value&0xffff;
1030 
1031                 /* start of full case mapping strings */
1032                 excOffset=(int)(value>>32)+1;
1033 
1034                 /* skip the lowercase and case-folding result strings */
1035                 excOffset+=full&FULL_LOWER;
1036                 full>>=4;
1037                 excOffset+=full&0xf;
1038                 full>>=4;
1039 
1040                 if(upperNotTitle) {
1041                     full&=0xf;
1042                 } else {
1043                     /* skip the uppercase result string */
1044                     excOffset+=full&0xf;
1045                     full=(full>>4)&0xf;
1046                 }
1047 
1048                 if(full!=0) {
1049                     try {
1050                         // append the result string
1051                         out.append(exceptions, excOffset, excOffset+full);
1052 
1053                         /* return the string length */
1054                         return full;
1055                     } catch (IOException e) {
1056                         throw new ICUUncheckedIOException(e);
1057                     }
1058                 }
1059             }
1060 
1061             if(!upperNotTitle && hasSlot(excWord, EXC_TITLE)) {
1062                 index=EXC_TITLE;
1063             } else if(hasSlot(excWord, EXC_UPPER)) {
1064                 /* here, titlecase is same as uppercase */
1065                 index=EXC_UPPER;
1066             } else {
1067                 return ~c;
1068             }
1069             result=getSlotValue(excWord, index, excOffset2);
1070         }
1071 
1072         return (result==c) ? ~result : result;
1073     }
1074 
/**
 * Get the full uppercase mapping for c.
 * Delegates to toUpperOrTitle() with upperNotTitle=true.
 *
 * @param c Character to be mapped.
 * @param iter Context iterator for context-sensitive mappings; passed through unchanged.
 * @param out If the mapping result is a string, then it is appended to out.
 * @param caseLocale Case locale value; passed through as the loc argument.
 * @return The result of toUpperOrTitle() for the uppercase mapping.
 */
public final int toFullUpper(int c, ContextIterator iter,
                             Appendable out,
                             int caseLocale) {
    return toUpperOrTitle(c, iter, out, caseLocale, true);
}
1080 
/**
 * Get the full titlecase mapping for c.
 * Delegates to toUpperOrTitle() with upperNotTitle=false.
 *
 * @param c Character to be mapped.
 * @param iter Context iterator for context-sensitive mappings; passed through unchanged.
 * @param out If the mapping result is a string, then it is appended to out.
 * @param caseLocale Case locale value; passed through as the loc argument.
 * @return The result of toUpperOrTitle() for the titlecase mapping.
 */
public final int toFullTitle(int c, ContextIterator iter,
                             Appendable out,
                             int caseLocale) {
    return toUpperOrTitle(c, iter, out, caseLocale, false);
}
1086 
1087     /* case folding ------------------------------------------------------------- */
1088 
1089     /*
1090      * Case folding is similar to lowercasing.
1091      * The result may be a simple mapping, i.e., a single code point, or
1092      * a full mapping, i.e., a string.
1093      * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1094      * then only the lowercase mapping is stored.
1095      *
1096      * Some special cases are hardcoded because their conditions cannot be
1097      * parsed and processed from CaseFolding.txt.
1098      *
1099      * Unicode 3.2 CaseFolding.txt specifies for its status field:
1100 
1101     # C: common case folding, common mappings shared by both simple and full mappings.
1102     # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1103     # S: simple case folding, mappings to single characters where different from F.
1104     # T: special case for uppercase I and dotted uppercase I
1105     #    - For non-Turkic languages, this mapping is normally not used.
1106     #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1107     #
1108     # Usage:
1109     #  A. To do a simple case folding, use the mappings with status C + S.
1110     #  B. To do a full case folding, use the mappings with status C + F.
1111     #
1112     #    The mappings with status T can be used or omitted depending on the desired case-folding
1113     #    behavior. (The default option is to exclude them.)
1114 
1115      * Unicode 3.2 has 'T' mappings as follows:
1116 
1117     0049; T; 0131; # LATIN CAPITAL LETTER I
1118     0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1119 
1120      * while the default mappings for these code points are:
1121 
1122     0049; C; 0069; # LATIN CAPITAL LETTER I
1123     0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1124 
1125      * U+0130 has no simple case folding (simple-case-folds to itself).
1126      */
1127 
/**
 * Bit mask for getting just the options from a string compare options word
 * that are relevant for case folding (of a single string or code point).
 * The masked value is compared against UCharacter.FOLD_CASE_DEFAULT to choose
 * between the default and the Turkic hardcoded fold mappings.
 * @hide draft / provisional / internal are hidden on Android
 */
private static final int FOLD_CASE_OPTIONS_MASK = 0xff;
1134 
1135     /* return the simple case folding mapping for c */
fold(int c, int options)1136     public final int fold(int c, int options) {
1137         int props=trie.get(c);
1138         if(!propsHasException(props)) {
1139             if(getTypeFromProps(props)>=UPPER) {
1140                 c+=getDelta(props);
1141             }
1142         } else {
1143             int excOffset=getExceptionsOffset(props);
1144             int excWord=exceptions.charAt(excOffset++);
1145             int index;
1146             if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
1147                 /* special case folding mappings, hardcoded */
1148                 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
1149                     /* default mappings */
1150                     if(c==0x49) {
1151                         /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1152                         return 0x69;
1153                     } else if(c==0x130) {
1154                         /* no simple case folding for U+0130 */
1155                         return c;
1156                     }
1157                 } else {
1158                     /* Turkic mappings */
1159                     if(c==0x49) {
1160                         /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1161                         return 0x131;
1162                     } else if(c==0x130) {
1163                         /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1164                         return 0x69;
1165                     }
1166                 }
1167             }
1168             if(hasSlot(excWord, EXC_FOLD)) {
1169                 index=EXC_FOLD;
1170             } else if(hasSlot(excWord, EXC_LOWER)) {
1171                 index=EXC_LOWER;
1172             } else {
1173                 return c;
1174             }
1175             c=getSlotValue(excWord, index, excOffset);
1176         }
1177         return c;
1178     }
1179 
1180     /*
1181      * Issue for canonical caseless match (UAX #21):
1182      * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1183      * canonical equivalence, unlike default-option casefolding.
1184      * For example, I-grave and I + grave fold to strings that are not canonically
1185      * equivalent.
1186      * For more details, see the comment in unorm_compare() in unorm.cpp
1187      * and the intermediate prototype changes for Jitterbug 2021.
1188      * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1189      *
1190      * This did not get fixed because it appears that it is not possible to fix
1191      * it for uppercase and lowercase characters (I-grave vs. i-grave)
1192      * together in a way that they still fold to common result strings.
1193      */
1194 
toFullFolding(int c, Appendable out, int options)1195     public final int toFullFolding(int c, Appendable out, int options) {
1196         int result;
1197         int props;
1198 
1199         result=c;
1200         props=trie.get(c);
1201         if(!propsHasException(props)) {
1202             if(getTypeFromProps(props)>=UPPER) {
1203                 result=c+getDelta(props);
1204             }
1205         } else {
1206             int excOffset=getExceptionsOffset(props), excOffset2;
1207             int excWord=exceptions.charAt(excOffset++);
1208             int full, index;
1209 
1210             excOffset2=excOffset;
1211 
1212             if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
1213                 /* use hardcoded conditions and mappings */
1214                 if((options&FOLD_CASE_OPTIONS_MASK)==UCharacter.FOLD_CASE_DEFAULT) {
1215                     /* default mappings */
1216                     if(c==0x49) {
1217                         /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1218                         return 0x69;
1219                     } else if(c==0x130) {
1220                         /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1221                         try {
1222                             out.append(iDot);
1223                             return 2;
1224                         } catch (IOException e) {
1225                             throw new ICUUncheckedIOException(e);
1226                         }
1227                     }
1228                 } else {
1229                     /* Turkic mappings */
1230                     if(c==0x49) {
1231                         /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1232                         return 0x131;
1233                     } else if(c==0x130) {
1234                         /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1235                         return 0x69;
1236                     }
1237                 }
1238             } else if(hasSlot(excWord, EXC_FULL_MAPPINGS)) {
1239                 long value=getSlotValueAndOffset(excWord, EXC_FULL_MAPPINGS, excOffset);
1240                 full=(int)value&0xffff;
1241 
1242                 /* start of full case mapping strings */
1243                 excOffset=(int)(value>>32)+1;
1244 
1245                 /* skip the lowercase result string */
1246                 excOffset+=full&FULL_LOWER;
1247                 full=(full>>4)&0xf;
1248 
1249                 if(full!=0) {
1250                     try {
1251                         // append the result string
1252                         out.append(exceptions, excOffset, excOffset+full);
1253 
1254                         /* return the string length */
1255                         return full;
1256                     } catch (IOException e) {
1257                         throw new ICUUncheckedIOException(e);
1258                     }
1259                 }
1260             }
1261 
1262             if(hasSlot(excWord, EXC_FOLD)) {
1263                 index=EXC_FOLD;
1264             } else if(hasSlot(excWord, EXC_LOWER)) {
1265                 index=EXC_LOWER;
1266             } else {
1267                 return ~c;
1268             }
1269             result=getSlotValue(excWord, index, excOffset2);
1270         }
1271 
1272         return (result==c) ? ~result : result;
1273     }
1274 
1275     /* case mapping properties API ---------------------------------------------- */
1276 
/*
 * We need a StringBuilder for multi-code point output from the
 * full case mapping functions. However, we do not actually use that output,
 * we just check whether the input character was mapped to anything else.
 * We use a shared StringBuilder to avoid allocating a new one in each call.
 * We remove its contents each time so that it does not grow large over time.
 *
 * NOTE(review): java.lang.StringBuilder is not synchronized, and this instance
 * is a public static shared by all callers; concurrent use from several threads
 * looks unsafe -- confirm how callers outside this class use it.
 *
 * @internal
 */
public static final StringBuilder dummyStringBuilder = new StringBuilder();
1287 
hasBinaryProperty(int c, int which)1288     public final boolean hasBinaryProperty(int c, int which) {
1289         switch(which) {
1290         case UProperty.LOWERCASE:
1291             return LOWER==getType(c);
1292         case UProperty.UPPERCASE:
1293             return UPPER==getType(c);
1294         case UProperty.SOFT_DOTTED:
1295             return isSoftDotted(c);
1296         case UProperty.CASE_SENSITIVE:
1297             return isCaseSensitive(c);
1298         case UProperty.CASED:
1299             return NONE!=getType(c);
1300         case UProperty.CASE_IGNORABLE:
1301             return (getTypeOrIgnorable(c)>>2)!=0;
1302         /*
1303          * Note: The following Changes_When_Xyz are defined as testing whether
1304          * the NFD form of the input changes when Xyz-case-mapped.
1305          * However, this simpler implementation of these properties,
1306          * ignoring NFD, passes the tests.
1307          * The implementation needs to be changed if the tests start failing.
1308          * When that happens, optimizations should be used to work with the
1309          * per-single-code point ucase_toFullXyz() functions unless
1310          * the NFD form has more than one code point,
1311          * and the property starts set needs to be the union of the
1312          * start sets for normalization and case mappings.
1313          */
1314         case UProperty.CHANGES_WHEN_LOWERCASED:
1315             dummyStringBuilder.setLength(0);
1316             return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0;
1317         case UProperty.CHANGES_WHEN_UPPERCASED:
1318             dummyStringBuilder.setLength(0);
1319             return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0;
1320         case UProperty.CHANGES_WHEN_TITLECASED:
1321             dummyStringBuilder.setLength(0);
1322             return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
1323         /* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */
1324         case UProperty.CHANGES_WHEN_CASEMAPPED:
1325             dummyStringBuilder.setLength(0);
1326             return
1327                 toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
1328                 toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
1329                 toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
1330         default:
1331             return false;
1332         }
1333     }
1334 
// data members -------------------------------------------------------- ***
// These fields are not final; presumably they are filled in by readData() -- confirm there.
private int indexes[];
// Exceptions data; the mapping functions read 16-bit words from it via charAt()
// and append substrings of it as full mapping results.
private String exceptions;
private char unfold[];

// Code point trie; trie.get(c) yields the case properties word for c.
private Trie2_16 trie;

// data format constants ----------------------------------------------- ***
private static final String DATA_NAME="ucase";
private static final String DATA_TYPE="icu";
// Name of the binary data file loaded by the constructor ("ucase.icu").
private static final String DATA_FILE_NAME=DATA_NAME+"."+DATA_TYPE;

/* format "cAsE" */
private static final int FMT=0x63415345;

/* indexes into indexes[] */
//private static final int IX_INDEX_TOP=0;
//private static final int IX_LENGTH=1;
private static final int IX_TRIE_SIZE=2;
private static final int IX_EXC_LENGTH=3;
private static final int IX_UNFOLD_LENGTH=4;

//private static final int IX_MAX_FULL_LENGTH=15;
/* minimum number of indexes[] entries; readData() rejects data with fewer */
private static final int IX_TOP=16;
1359 
// definitions for 16-bit case properties word ------------------------- ***

/* 2-bit constants for types of cased characters */
/* mask for the two lowest bits of the properties word, which hold the case type */
public static final int TYPE_MASK=3;
public static final int NONE=0;
public static final int LOWER=1;
public static final int UPPER=2;
public static final int TITLE=3;
1368 
1369     /** @return NONE, LOWER, UPPER, TITLE */
getTypeFromProps(int props)1370     private static final int getTypeFromProps(int props) {
1371         return props&TYPE_MASK;
1372     }
1373 
1374     /** @return like getTypeFromProps() but also sets IGNORABLE if props indicate case-ignorable */
getTypeAndIgnorableFromProps(int props)1375     private static final int getTypeAndIgnorableFromProps(int props) {
1376         return props&7;
1377     }
1378 
/* single-bit flags in the case properties word, above the 2-bit case type */
static final int IGNORABLE=4;
private static final int SENSITIVE=     8;
private static final int EXCEPTION=     0x10;

/* 2-bit "dot type" field of the properties word */
private static final int DOT_MASK=      0x60;
//private static final int NO_DOT=        0;      /* normal characters with cc=0 */
private static final int SOFT_DOTTED=   0x20;   /* soft-dotted characters with cc=0 */
private static final int ABOVE=         0x40;   /* "above" accents with cc=230 */
private static final int OTHER_ACCENT=  0x60;   /* other accent character (0<cc!=230) */

/* no exception: bits 15..7 are a 9-bit signed case mapping delta */
private static final int DELTA_SHIFT=   7;
//private static final int DELTA_MASK=    0xff80;
//private static final int MAX_DELTA=     0xff;
//private static final int MIN_DELTA=     (-MAX_DELTA-1);
1394 
getDelta(int props)1395     private static final int getDelta(int props) {
1396         return (short)props>>DELTA_SHIFT;
1397     }
1398 
/* exception: bits 15..5 are an unsigned 11-bit index into the exceptions array */
private static final int EXC_SHIFT=     5;
//private static final int EXC_MASK=      0xffe0;
//private static final int MAX_EXCEPTIONS=((EXC_MASK>>EXC_SHIFT)+1);

/* definitions for 16-bit main exceptions word ------------------------------ */

/* first 8 bits indicate values in optional slots */
/* the EXC_* values below are slot indexes passed to hasSlot()/getSlotValue() */
private static final int EXC_LOWER=0;
private static final int EXC_FOLD=1;
private static final int EXC_UPPER=2;
private static final int EXC_TITLE=3;
//private static final int EXC_4=4;           /* reserved */
//private static final int EXC_5=5;           /* reserved */
private static final int EXC_CLOSURE=6;
private static final int EXC_FULL_MAPPINGS=7;
//private static final int EXC_ALL_SLOTS=8;   /* one past the last slot */

/* each slot is 2 uint16_t instead of 1 */
private static final int EXC_DOUBLE_SLOTS=          0x100;

/* reserved: exception bits 11..9 */

/* EXC_DOT_MASK=DOT_MASK<<EXC_DOT_SHIFT */
private static final int EXC_DOT_SHIFT=7;

/* normally stored in the main word, but pushed out for larger exception indexes */
//private static final int EXC_DOT_MASK=              0x3000;
//private static final int EXC_NO_DOT=                0;
//private static final int EXC_SOFT_DOTTED=           0x1000;
//private static final int EXC_ABOVE=                 0x2000; /* "above" accents with cc=230 */
//private static final int EXC_OTHER_ACCENT=          0x3000; /* other character (0<cc!=230) */

/* complex/conditional mappings */
/* set when the mapping functions must use their hardcoded conditional mappings */
private static final int EXC_CONDITIONAL_SPECIAL=   0x4000;
/* set when the case folding functions must use their hardcoded I / I-dot mappings */
private static final int EXC_CONDITIONAL_FOLD=      0x8000;

/* definitions for lengths word for full case mappings */
/* each full mapping string length occupies 4 bits of the lengths word */
private static final int FULL_LOWER=    0xf;
//private static final int FULL_FOLDING=  0xf0;
//private static final int FULL_UPPER=    0xf00;
//private static final int FULL_TITLE=    0xf000;

/* maximum lengths */
//private static final int FULL_MAPPINGS_MAX_LENGTH=4*0xf;
private static final int CLOSURE_MAX_LENGTH=0xf;

/* constants for reverse case folding ("unfold") data */
private static final int UNFOLD_ROWS=0;
private static final int UNFOLD_ROW_WIDTH=1;
private static final int UNFOLD_STRING_WIDTH=2;
1450 
/*
 * public singleton instance
 *
 * Created once in the static initializer below. The private constructor
 * loads the binary data file DATA_FILE_NAME and parses it with readData().
 */
public static final UCaseProps INSTANCE;

// This static initializer block must be placed after
// other static member initialization
// (the constructor uses static constants such as DATA_FILE_NAME, which must
// already be initialized when it runs).
static {
    try {
        INSTANCE = new UCaseProps();
    } catch (IOException e) {
        // Loading the data failed: rethrow unchecked, keeping the IOException as the cause.
        throw new ICUUncheckedIOException(e);
    }
}
1465 }
1466