• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  *   Copyright (C) 2009-2015, International Business Machines
7  *   Corporation and others.  All Rights Reserved.
8  *******************************************************************************
9  */
10 
11 package ohos.global.icu.impl;
12 
13 import java.io.IOException;
14 import java.nio.ByteBuffer;
15 import java.util.ArrayList;
16 
17 import ohos.global.icu.text.UTF16;
18 import ohos.global.icu.text.UnicodeSet;
19 import ohos.global.icu.util.CodePointMap;
20 import ohos.global.icu.util.CodePointTrie;
21 import ohos.global.icu.util.ICUUncheckedIOException;
22 import ohos.global.icu.util.MutableCodePointTrie;
23 import ohos.global.icu.util.VersionInfo;
24 
25 /**
26  * Low-level implementation of the Unicode Normalization Algorithm.
27  * For the data structure and details see the documentation at the end of
28  * C++ normalizer2impl.h and in the design doc at
29  * http://site.icu-project.org/design/normalization/custom
30  * @hide exposed on OHOS
31  */
32 public final class Normalizer2Impl {
33     /**
34      * @hide exposed on OHOS
35      */
36     public static final class Hangul {
37         /* Korean Hangul and Jamo constants */
38         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
39         public static final int JAMO_L_END=0x1112;
40         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
41         public static final int JAMO_V_END=0x1175;
42         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
43         public static final int JAMO_T_END=0x11c2;
44 
45         public static final int HANGUL_BASE=0xac00;
46         public static final int HANGUL_END=0xd7a3;
47 
48         public static final int JAMO_L_COUNT=19;
49         public static final int JAMO_V_COUNT=21;
50         public static final int JAMO_T_COUNT=28;
51 
52         public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
53         public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
54 
55         public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
56 
57         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
58         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
59 
isHangul(int c)60         public static boolean isHangul(int c) {
61             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
62         }
isHangulLV(int c)63         public static boolean isHangulLV(int c) {
64             c-=HANGUL_BASE;
65             return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
66         }
isJamoL(int c)67         public static boolean isJamoL(int c) {
68             return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
69         }
isJamoV(int c)70         public static boolean isJamoV(int c) {
71             return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
72         }
isJamoT(int c)73         public static boolean isJamoT(int c) {
74             int t=c-JAMO_T_BASE;
75             return 0<t && t<JAMO_T_COUNT;  // not JAMO_T_BASE itself
76         }
isJamo(int c)77         public static boolean isJamo(int c) {
78             return JAMO_L_BASE<=c && c<=JAMO_T_END &&
79                 (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
80         }
81 
82         /**
83          * Decomposes c, which must be a Hangul syllable, into buffer
84          * and returns the length of the decomposition (2 or 3).
85          */
decompose(int c, Appendable buffer)86         public static int decompose(int c, Appendable buffer) {
87             try {
88                 c-=HANGUL_BASE;
89                 int c2=c%JAMO_T_COUNT;
90                 c/=JAMO_T_COUNT;
91                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
92                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
93                 if(c2==0) {
94                     return 2;
95                 } else {
96                     buffer.append((char)(JAMO_T_BASE+c2));
97                     return 3;
98                 }
99             } catch(IOException e) {
100                 // Will not occur because we do not write to I/O.
101                 throw new ICUUncheckedIOException(e);
102             }
103         }
104 
105         /**
106          * Decomposes c, which must be a Hangul syllable, into buffer.
107          * This is the raw, not recursive, decomposition. Its length is always 2.
108          */
getRawDecomposition(int c, Appendable buffer)109         public static void getRawDecomposition(int c, Appendable buffer) {
110             try {
111                 int orig=c;
112                 c-=HANGUL_BASE;
113                 int c2=c%JAMO_T_COUNT;
114                 if(c2==0) {
115                     c/=JAMO_T_COUNT;
116                     buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
117                     buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
118                 } else {
119                     buffer.append((char)(orig-c2));  // LV syllable
120                     buffer.append((char)(JAMO_T_BASE+c2));
121                 }
122             } catch(IOException e) {
123                 // Will not occur because we do not write to I/O.
124                 throw new ICUUncheckedIOException(e);
125             }
126         }
127     }
128 
129     /**
130      * Writable buffer that takes care of canonical ordering.
131      * Its Appendable methods behave like the C++ implementation's
132      * appendZeroCC() methods.
133      * <p>
134      * If dest is a StringBuilder, then the buffer writes directly to it.
135      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
136      * until no further changes are necessary and whole segments are appended.
137      * append() methods that take combining-class values always write to the StringBuilder.
138      * Other append() methods flush and append to the Appendable.
139      * @hide exposed on OHOS
140      */
141     public static final class ReorderingBuffer implements Appendable {
ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity)142         public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
143             impl=ni;
144             app=dest;
145             if(app instanceof StringBuilder) {
146                 appIsStringBuilder=true;
147                 str=(StringBuilder)dest;
148                 // In Java, the constructor subsumes public void init(int destCapacity) {
149                 str.ensureCapacity(destCapacity);
150                 reorderStart=0;
151                 if(str.length()==0) {
152                     lastCC=0;
153                 } else {
154                     setIterator();
155                     lastCC=previousCC();
156                     // Set reorderStart after the last code point with cc<=1 if there is one.
157                     if(lastCC>1) {
158                         while(previousCC()>1) {}
159                     }
160                     reorderStart=codePointLimit;
161                 }
162             } else {
163                 appIsStringBuilder=false;
164                 str=new StringBuilder();
165                 reorderStart=0;
166                 lastCC=0;
167             }
168         }
169 
isEmpty()170         public boolean isEmpty() { return str.length()==0; }
length()171         public int length() { return str.length(); }
getLastCC()172         public int getLastCC() { return lastCC; }
173 
getStringBuilder()174         public StringBuilder getStringBuilder() { return str; }
175 
equals(CharSequence s, int start, int limit)176         public boolean equals(CharSequence s, int start, int limit) {
177             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
178         }
179 
append(int c, int cc)180         public void append(int c, int cc) {
181             if(lastCC<=cc || cc==0) {
182                 str.appendCodePoint(c);
183                 lastCC=cc;
184                 if(cc<=1) {
185                     reorderStart=str.length();
186                 }
187             } else {
188                 insert(c, cc);
189             }
190         }
append(CharSequence s, int start, int limit, boolean isNFD, int leadCC, int trailCC)191         public void append(CharSequence s, int start, int limit, boolean isNFD,
192                            int leadCC, int trailCC) {
193             if(start==limit) {
194                 return;
195             }
196             if(lastCC<=leadCC || leadCC==0) {
197                 if(trailCC<=1) {
198                     reorderStart=str.length()+(limit-start);
199                 } else if(leadCC<=1) {
200                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
201                 }
202                 str.append(s, start, limit);
203                 lastCC=trailCC;
204             } else {
205                 int c=Character.codePointAt(s, start);
206                 start+=Character.charCount(c);
207                 insert(c, leadCC);  // insert first code point
208                 while(start<limit) {
209                     c=Character.codePointAt(s, start);
210                     start+=Character.charCount(c);
211                     if(start<limit) {
212                         if (isNFD) {
213                             leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
214                         } else {
215                             leadCC = impl.getCC(impl.getNorm16(c));
216                         }
217                     } else {
218                         leadCC=trailCC;
219                     }
220                     append(c, leadCC);
221                 }
222             }
223         }
224         // The following append() methods work like C++ appendZeroCC().
225         // They assume that the cc or trailCC of their input is 0.
226         // Most of them implement Appendable interface methods.
227         @Override
append(char c)228         public ReorderingBuffer append(char c) {
229             str.append(c);
230             lastCC=0;
231             reorderStart=str.length();
232             return this;
233         }
appendZeroCC(int c)234         public void appendZeroCC(int c) {
235             str.appendCodePoint(c);
236             lastCC=0;
237             reorderStart=str.length();
238         }
239         @Override
append(CharSequence s)240         public ReorderingBuffer append(CharSequence s) {
241             if(s.length()!=0) {
242                 str.append(s);
243                 lastCC=0;
244                 reorderStart=str.length();
245             }
246             return this;
247         }
248         @Override
append(CharSequence s, int start, int limit)249         public ReorderingBuffer append(CharSequence s, int start, int limit) {
250             if(start!=limit) {
251                 str.append(s, start, limit);
252                 lastCC=0;
253                 reorderStart=str.length();
254             }
255             return this;
256         }
257         /**
258          * Flushes from the intermediate StringBuilder to the Appendable,
259          * if they are different objects.
260          * Used after recomposition.
261          * Must be called at the end when writing to a non-StringBuilder Appendable.
262          */
flush()263         public void flush() {
264             if(appIsStringBuilder) {
265                 reorderStart=str.length();
266             } else {
267                 try {
268                     app.append(str);
269                     str.setLength(0);
270                     reorderStart=0;
271                 } catch(IOException e) {
272                     throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
273                 }
274             }
275             lastCC=0;
276         }
277         /**
278          * Flushes from the intermediate StringBuilder to the Appendable,
279          * if they are different objects.
280          * Then appends the new text to the Appendable or StringBuilder.
281          * Normally used after quick check loops find a non-empty sequence.
282          */
flushAndAppendZeroCC(CharSequence s, int start, int limit)283         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
284             if(appIsStringBuilder) {
285                 str.append(s, start, limit);
286                 reorderStart=str.length();
287             } else {
288                 try {
289                     app.append(str).append(s, start, limit);
290                     str.setLength(0);
291                     reorderStart=0;
292                 } catch(IOException e) {
293                     throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
294                 }
295             }
296             lastCC=0;
297             return this;
298         }
remove()299         public void remove() {
300             str.setLength(0);
301             lastCC=0;
302             reorderStart=0;
303         }
removeSuffix(int suffixLength)304         public void removeSuffix(int suffixLength) {
305             int oldLength=str.length();
306             str.delete(oldLength-suffixLength, oldLength);
307             lastCC=0;
308             reorderStart=str.length();
309         }
310 
311         /*
312          * TODO: Revisit whether it makes sense to track reorderStart.
313          * It is set to after the last known character with cc<=1,
314          * which stops previousCC() before it reads that character and looks up its cc.
315          * previousCC() is normally only called from insert().
316          * In other words, reorderStart speeds up the insertion of a combining mark
317          * into a multi-combining mark sequence where it does not belong at the end.
318          * This might not be worth the trouble.
319          * On the other hand, it's not a huge amount of trouble.
320          *
321          * We probably need it for UNORM_SIMPLE_APPEND.
322          */
323 
324         // Inserts c somewhere before the last character.
325         // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(int c, int cc)326         private void insert(int c, int cc) {
327             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
328             // insert c at codePointLimit, after the character with prevCC<=cc
329             if(c<=0xffff) {
330                 str.insert(codePointLimit, (char)c);
331                 if(cc<=1) {
332                     reorderStart=codePointLimit+1;
333                 }
334             } else {
335                 str.insert(codePointLimit, Character.toChars(c));
336                 if(cc<=1) {
337                     reorderStart=codePointLimit+2;
338                 }
339             }
340         }
341 
342         private final Normalizer2Impl impl;
343         private final Appendable app;
344         private final StringBuilder str;
345         private final boolean appIsStringBuilder;
346         private int reorderStart;
347         private int lastCC;
348 
349         // private backward iterator
setIterator()350         private void setIterator() { codePointStart=str.length(); }
skipPrevious()351         private void skipPrevious() {  // Requires 0<codePointStart.
352             codePointLimit=codePointStart;
353             codePointStart=str.offsetByCodePoints(codePointStart, -1);
354         }
previousCC()355         private int previousCC() {  // Returns 0 if there is no previous character.
356             codePointLimit=codePointStart;
357             if(reorderStart>=codePointStart) {
358                 return 0;
359             }
360             int c=str.codePointBefore(codePointStart);
361             codePointStart-=Character.charCount(c);
362             return impl.getCCFromYesOrMaybeCP(c);
363         }
364 
365         private int codePointStart, codePointLimit;
366     }
367 
368     // TODO: Propose as public API on the UTF16 class.
369     // TODO: Propose widening UTF16 methods that take char to take int.
370     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
371     /**
372      * @hide exposed on OHOS
373      */
374     public static final class UTF16Plus {
375         /**
376          * Is this code point a lead surrogate (U+d800..U+dbff)?
377          * @param c code unit or code point
378          * @return true or false
379          */
isLeadSurrogate(int c)380         public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
381         /**
382          * Is this code point a trail surrogate (U+dc00..U+dfff)?
383          * @param c code unit or code point
384          * @return true or false
385          */
isTrailSurrogate(int c)386         public static boolean isTrailSurrogate(int c) { return (c & 0xfffffc00) == 0xdc00; }
387         /**
388          * Is this code point a surrogate (U+d800..U+dfff)?
389          * @param c code unit or code point
390          * @return true or false
391          */
isSurrogate(int c)392         public static boolean isSurrogate(int c) { return (c & 0xfffff800) == 0xd800; }
393         /**
394          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
395          * is it a lead surrogate?
396          * @param c code unit or code point
397          * @return true or false
398          */
isSurrogateLead(int c)399         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
400         /**
401          * Compares two CharSequence objects for binary equality.
402          * @param s1 first sequence
403          * @param s2 second sequence
404          * @return true if s1 contains the same text as s2
405          */
equal(CharSequence s1, CharSequence s2)406         public static boolean equal(CharSequence s1,  CharSequence s2) {
407             if(s1==s2) {
408                 return true;
409             }
410             int length=s1.length();
411             if(length!=s2.length()) {
412                 return false;
413             }
414             for(int i=0; i<length; ++i) {
415                 if(s1.charAt(i)!=s2.charAt(i)) {
416                     return false;
417                 }
418             }
419             return true;
420         }
421         /**
422          * Compares two CharSequence subsequences for binary equality.
423          * @param s1 first sequence
424          * @param start1 start offset in first sequence
425          * @param limit1 limit offset in first sequence
426          * @param s2 second sequence
427          * @param start2 start offset in second sequence
428          * @param limit2 limit offset in second sequence
429          * @return true if s1.subSequence(start1, limit1) contains the same text
430          *              as s2.subSequence(start2, limit2)
431          */
equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2)432         public static boolean equal(CharSequence s1, int start1, int limit1,
433                                     CharSequence s2, int start2, int limit2) {
434             if((limit1-start1)!=(limit2-start2)) {
435                 return false;
436             }
437             if(s1==s2 && start1==start2) {
438                 return true;
439             }
440             while(start1<limit1) {
441                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
442                     return false;
443                 }
444             }
445             return true;
446         }
447     }
448 
Normalizer2Impl()449     public Normalizer2Impl() {}
450 
451     private static final class IsAcceptable implements ICUBinary.Authenticate {
452         @Override
isDataVersionAcceptable(byte version[])453         public boolean isDataVersionAcceptable(byte version[]) {
454             return version[0]==4;
455         }
456     }
457     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
458     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
459 
load(ByteBuffer bytes)460     public Normalizer2Impl load(ByteBuffer bytes) {
461         try {
462             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
463             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
464             if(indexesLength<=IX_MIN_LCCC_CP) {
465                 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
466             }
467             int[] inIndexes=new int[indexesLength];
468             inIndexes[0]=indexesLength*4;
469             for(int i=1; i<indexesLength; ++i) {
470                 inIndexes[i]=bytes.getInt();
471             }
472 
473             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
474             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
475             minLcccCP=inIndexes[IX_MIN_LCCC_CP];
476 
477             minYesNo=inIndexes[IX_MIN_YES_NO];
478             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
479             minNoNo=inIndexes[IX_MIN_NO_NO];
480             minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
481             minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
482             minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
483             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
484             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
485             assert((minMaybeYes&7)==0);  // 8-aligned for noNoDelta bit fields
486             centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
487 
488             // Read the normTrie.
489             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
490             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
491             int triePosition = bytes.position();
492             normTrie = CodePointTrie.Fast16.fromBinary(bytes);
493             int trieLength = bytes.position() - triePosition;
494             if(trieLength>(nextOffset-offset)) {
495                 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
496             }
497             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
498 
499             // Read the composition and mapping data.
500             offset=nextOffset;
501             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
502             int numChars=(nextOffset-offset)/2;
503             if(numChars!=0) {
504                 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
505                 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
506             }
507 
508             // smallFCD: new in formatVersion 2
509             offset=nextOffset;
510             smallFCD=new byte[0x100];
511             bytes.get(smallFCD);
512 
513             return this;
514         } catch(IOException e) {
515             throw new ICUUncheckedIOException(e);
516         }
517     }
load(String name)518     public Normalizer2Impl load(String name) {
519         return load(ICUBinary.getRequiredData(name));
520     }
521 
addLcccChars(UnicodeSet set)522     public void addLcccChars(UnicodeSet set) {
523         int start = 0;
524         CodePointMap.Range range = new CodePointMap.Range();
525         while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
526                 null, range)) {
527             int end = range.getEnd();
528             int norm16 = range.getValue();
529             if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) {
530                 set.add(start, end);
531             } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
532                 int fcd16 = getFCD16(start);
533                 if (fcd16 > 0xff) { set.add(start, end); }
534             }
535             start = end + 1;
536         }
537     }
538 
addPropertyStarts(UnicodeSet set)539     public void addPropertyStarts(UnicodeSet set) {
540         // Add the start code point of each same-value range of the trie.
541         int start = 0;
542         CodePointMap.Range range = new CodePointMap.Range();
543         while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
544                 null, range)) {
545             int end = range.getEnd();
546             int value = range.getValue();
547             set.add(start);
548             if (start != end && isAlgorithmicNoNo(value) &&
549                     (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) {
550                 // Range of code points with same-norm16-value algorithmic decompositions.
551                 // They might have different non-zero FCD16 values.
552                 int prevFCD16 = getFCD16(start);
553                 while (++start <= end) {
554                     int fcd16 = getFCD16(start);
555                     if (fcd16 != prevFCD16) {
556                         set.add(start);
557                         prevFCD16 = fcd16;
558                     }
559                 }
560             }
561             start = end + 1;
562         }
563 
564         /* add Hangul LV syllables and LV+1 because of skippables */
565         for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
566             set.add(c);
567             set.add(c+1);
568         }
569         set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
570     }
571 
addCanonIterPropertyStarts(UnicodeSet set)572     public void addCanonIterPropertyStarts(UnicodeSet set) {
573         // Add the start code point of each same-value range of the canonical iterator data trie.
574         ensureCanonIterData();
575         // Currently only used for the SEGMENT_STARTER property.
576         int start = 0;
577         CodePointMap.Range range = new CodePointMap.Range();
578         while (canonIterData.getRange(start, segmentStarterMapper, range)) {
579             set.add(start);
580             start = range.getEnd() + 1;
581         }
582     }
583     private static final CodePointMap.ValueFilter segmentStarterMapper =
584             new CodePointMap.ValueFilter() {
585         @Override
586         public int apply(int value) {
587             return value & CANON_NOT_SEGMENT_STARTER;
588         }
589     };
590 
591     // low-level properties ------------------------------------------------ ***
592 
593     // Note: Normalizer2Impl.java r30983 (2011-nov-27)
594     // still had getFCDTrie() which built and cached an FCD trie.
595     // That provided faster access to FCD data than getFCD16FromNormData()
596     // but required synchronization and consumed some 10kB of heap memory
597     // in any process that uses FCD (e.g., via collation).
598     // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
599     // at least for ASCII & CJK.
600 
601     /**
602      * Builds the canonical-iterator data for this instance.
603      * This is required before any of {@link #isCanonSegmentStarter(int)} or
604      * {@link #getCanonStartSet(int, UnicodeSet)} are called,
605      * or else they crash.
606      * @return this
607      */
ensureCanonIterData()608     public synchronized Normalizer2Impl ensureCanonIterData() {
609         if(canonIterData==null) {
610             MutableCodePointTrie mutableTrie = new MutableCodePointTrie(0, 0);
611             canonStartSets=new ArrayList<UnicodeSet>();
612             int start = 0;
613             CodePointMap.Range range = new CodePointMap.Range();
614             while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT,
615                     null, range)) {
616                 final int end = range.getEnd();
617                 final int norm16 = range.getValue();
618                 if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) {
619                     // Inert, or 2-way mapping (including Hangul syllable).
620                     // We do not write a canonStartSet for any yesNo character.
621                     // Composites from 2-way mappings are added at runtime from the
622                     // starter's compositions list, and the other characters in
623                     // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
624                     // "maybe" characters.
625                     start = end + 1;
626                     continue;
627                 }
628                 for (int c = start; c <= end; ++c) {
629                     final int oldValue = mutableTrie.get(c);
630                     int newValue=oldValue;
631                     if(isMaybeOrNonZeroCC(norm16)) {
632                         // not a segment starter if it occurs in a decomposition or has cc!=0
633                         newValue|=CANON_NOT_SEGMENT_STARTER;
634                         if(norm16<MIN_NORMAL_MAYBE_YES) {
635                             newValue|=CANON_HAS_COMPOSITIONS;
636                         }
637                     } else if(norm16<minYesNo) {
638                         newValue|=CANON_HAS_COMPOSITIONS;
639                     } else {
640                         // c has a one-way decomposition
641                         int c2=c;
642                         // Do not modify the whole-range norm16 value.
643                         int norm16_2=norm16;
644                         if (isDecompNoAlgorithmic(norm16_2)) {
645                             // Maps to an isCompYesAndZeroCC.
646                             c2 = mapAlgorithmic(c2, norm16_2);
647                             norm16_2 = getRawNorm16(c2);
648                             // No compatibility mappings for the CanonicalIterator.
649                             assert(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
650                         }
651                         if (norm16_2 > minYesNo) {
652                             // c decomposes, get everything from the variable-length extra data
653                             int mapping=norm16_2>>OFFSET_SHIFT;
654                             int firstUnit=extraData.charAt(mapping);
655                             int length=firstUnit&MAPPING_LENGTH_MASK;
656                             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
657                                 if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) {
658                                     newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
659                                 }
660                             }
661                             // Skip empty mappings (no characters in the decomposition).
662                             if(length!=0) {
663                                 ++mapping;  // skip over the firstUnit
664                                 // add c to first code point's start set
665                                 int limit=mapping+length;
666                                 c2=extraData.codePointAt(mapping);
667                                 addToStartSet(mutableTrie, c, c2);
668                                 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
669                                 // one-way mapping. A 2-way mapping is possible here after
670                                 // intermediate algorithmic mapping.
671                                 if(norm16_2>=minNoNo) {
672                                     while((mapping+=Character.charCount(c2))<limit) {
673                                         c2=extraData.codePointAt(mapping);
674                                         int c2Value = mutableTrie.get(c2);
675                                         if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
676                                             mutableTrie.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
677                                         }
678                                     }
679                                 }
680                             }
681                         } else {
682                             // c decomposed to c2 algorithmically; c has cc==0
683                             addToStartSet(mutableTrie, c, c2);
684                         }
685                     }
686                     if(newValue!=oldValue) {
687                         mutableTrie.set(c, newValue);
688                     }
689                 }
690                 start = end + 1;
691             }
692             canonIterData = mutableTrie.buildImmutable(
693                     CodePointTrie.Type.SMALL, CodePointTrie.ValueWidth.BITS_32);
694         }
695         return this;
696     }
697 
698     // The trie stores values for lead surrogate code *units*.
699     // Surrogate code *points* are inert.
getNorm16(int c)700     public int getNorm16(int c) {
701         return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
702     }
getRawNorm16(int c)703     public int getRawNorm16(int c) { return normTrie.get(c); }
704 
getCompQuickCheck(int norm16)705     public int getCompQuickCheck(int norm16) {
706         if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
707             return 1;  // yes
708         } else if(minMaybeYes<=norm16) {
709             return 2;  // maybe
710         } else {
711             return 0;  // no
712         }
713     }
isAlgorithmicNoNo(int norm16)714     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
isCompNo(int norm16)715     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
isDecompYes(int norm16)716     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
717 
getCC(int norm16)718     public int getCC(int norm16) {
719         if(norm16>=MIN_NORMAL_MAYBE_YES) {
720             return getCCFromNormalYesOrMaybe(norm16);
721         }
722         if(norm16<minNoNo || limitNoNo<=norm16) {
723             return 0;
724         }
725         return getCCFromNoNo(norm16);
726     }
getCCFromNormalYesOrMaybe(int norm16)727     public static int getCCFromNormalYesOrMaybe(int norm16) {
728         return (norm16 >> OFFSET_SHIFT) & 0xff;
729     }
getCCFromYesOrMaybe(int norm16)730     public static int getCCFromYesOrMaybe(int norm16) {
731         return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
732     }
getCCFromYesOrMaybeCP(int c)733     public int getCCFromYesOrMaybeCP(int c) {
734         if (c < minCompNoMaybeCP) { return 0; }
735         return getCCFromYesOrMaybe(getNorm16(c));
736     }
737 
738     /**
739      * Returns the FCD data for code point c.
740      * @param c A Unicode code point.
741      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
742      */
getFCD16(int c)743     public int getFCD16(int c) {
744         if(c<minDecompNoCP) {
745             return 0;
746         } else if(c<=0xffff) {
747             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
748         }
749         return getFCD16FromNormData(c);
750     }
751     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
singleLeadMightHaveNonZeroFCD16(int lead)752     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
753         // 0<=lead<=0xffff
754         byte bits=smallFCD[lead>>8];
755         if(bits==0) { return false; }
756         return ((bits>>((lead>>5)&7))&1)!=0;
757     }
758 
759     /** Gets the FCD value from the regular normalization data. */
getFCD16FromNormData(int c)760     public int getFCD16FromNormData(int c) {
761         int norm16=getNorm16(c);
762         if (norm16 >= limitNoNo) {
763             if(norm16>=MIN_NORMAL_MAYBE_YES) {
764                 // combining mark
765                 norm16=getCCFromNormalYesOrMaybe(norm16);
766                 return norm16|(norm16<<8);
767             } else if(norm16>=minMaybeYes) {
768                 return 0;
769             } else {  // isDecompNoAlgorithmic(norm16)
770                 int deltaTrailCC = norm16 & DELTA_TCCC_MASK;
771                 if (deltaTrailCC <= DELTA_TCCC_1) {
772                     return deltaTrailCC >> OFFSET_SHIFT;
773                 }
774                 // Maps to an isCompYesAndZeroCC.
775                 c=mapAlgorithmic(c, norm16);
776                 norm16 = getRawNorm16(c);
777             }
778         }
779         if(norm16<=minYesNo || isHangulLVT(norm16)) {
780             // no decomposition or Hangul syllable, all zeros
781             return 0;
782         }
783         // c decomposes, get everything from the variable-length extra data
784         int mapping=norm16>>OFFSET_SHIFT;
785         int firstUnit=extraData.charAt(mapping);
786         int fcd16=firstUnit>>8;  // tccc
787         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
788             fcd16|=extraData.charAt(mapping-1)&0xff00;  // lccc
789         }
790         return fcd16;
791     }
792 
793     /**
794      * Gets the decomposition for one code point.
795      * @param c code point
796      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
797      */
getDecomposition(int c)798     public String getDecomposition(int c) {
799         int norm16;
800         if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) {
801             // c does not decompose
802             return null;
803         }
804         int decomp = -1;
805         if(isDecompNoAlgorithmic(norm16)) {
806             // Maps to an isCompYesAndZeroCC.
807             decomp=c=mapAlgorithmic(c, norm16);
808             // The mapping might decompose further.
809             norm16 = getRawNorm16(c);
810         }
811         if (norm16 < minYesNo) {
812             if(decomp<0) {
813                 return null;
814             } else {
815                 return UTF16.valueOf(decomp);
816             }
817         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
818             // Hangul syllable: decompose algorithmically
819             StringBuilder buffer=new StringBuilder();
820             Hangul.decompose(c, buffer);
821             return buffer.toString();
822         }
823         // c decomposes, get everything from the variable-length extra data
824         int mapping=norm16>>OFFSET_SHIFT;
825         int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK;
826         return extraData.substring(mapping, mapping+length);
827     }
828 
829     /**
830      * Gets the raw decomposition for one code point.
831      * @param c code point
832      * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
833      */
getRawDecomposition(int c)834     public String getRawDecomposition(int c) {
835         int norm16;
836         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
837             // c does not decompose
838             return null;
839         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
840             // Hangul syllable: decompose algorithmically
841             StringBuilder buffer=new StringBuilder();
842             Hangul.getRawDecomposition(c, buffer);
843             return buffer.toString();
844         } else if(isDecompNoAlgorithmic(norm16)) {
845             return UTF16.valueOf(mapAlgorithmic(c, norm16));
846         }
847         // c decomposes, get everything from the variable-length extra data
848         int mapping=norm16>>OFFSET_SHIFT;
849         int firstUnit=extraData.charAt(mapping);
850         int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
851         if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
852             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
853             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
854             int rawMapping=mapping-((firstUnit>>7)&1)-1;
855             char rm0=extraData.charAt(rawMapping);
856             if(rm0<=MAPPING_LENGTH_MASK) {
857                 return extraData.substring(rawMapping-rm0, rawMapping);
858             } else {
859                 // Copy the normal mapping and replace its first two code units with rm0.
860                 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
861                 mapping+=1+2;  // skip over the firstUnit and the first two mapping code units
862                 return buffer.append(extraData, mapping, mapping+mLength-2).toString();
863             }
864         } else {
865             mapping+=1;  // skip over the firstUnit
866             return extraData.substring(mapping, mapping+mLength);
867         }
868     }
869 
870     /**
871      * Returns true if code point c starts a canonical-iterator string segment.
872      * <b>{@link #ensureCanonIterData()} must have been called before this method,
873      * or else this method will crash.</b>
874      * @param c A Unicode code point.
875      * @return true if c starts a canonical-iterator string segment.
876      */
isCanonSegmentStarter(int c)877     public boolean isCanonSegmentStarter(int c) {
878         return canonIterData.get(c)>=0;
879     }
880     /**
881      * Returns true if there are characters whose decomposition starts with c.
882      * If so, then the set is cleared and then filled with those characters.
883      * <b>{@link #ensureCanonIterData()} must have been called before this method,
884      * or else this method will crash.</b>
885      * @param c A Unicode code point.
886      * @param set A UnicodeSet to receive the characters whose decompositions
887      *        start with c, if there are any.
888      * @return true if there are characters whose decomposition starts with c.
889      */
getCanonStartSet(int c, UnicodeSet set)890     public boolean getCanonStartSet(int c, UnicodeSet set) {
891         int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
892         if(canonValue==0) {
893             return false;
894         }
895         set.clear();
896         int value=canonValue&CANON_VALUE_MASK;
897         if((canonValue&CANON_HAS_SET)!=0) {
898             set.addAll(canonStartSets.get(value));
899         } else if(value!=0) {
900             set.add(value);
901         }
902         if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
903             int norm16 = getRawNorm16(c);
904             if(norm16==JAMO_L) {
905                 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
906                 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
907             } else {
908                 addComposites(getCompositionsList(norm16), set);
909             }
910         }
911         return true;
912     }
913 
    // Fixed norm16 values.
    // getCompQuickCheck() reports "yes" for every norm16 at or above MIN_YES_YES_WITH_CC.
    public static final int MIN_YES_YES_WITH_CC=0xfe02;
    public static final int JAMO_VT=0xfe00;
    // getCC() reads the combining class directly out of norm16 values at or above this threshold.
    public static final int MIN_NORMAL_MAYBE_YES=0xfc00;
    public static final int JAMO_L=2;  // offset=1 hasCompBoundaryAfter=FALSE
    public static final int INERT=1;  // offset=0 hasCompBoundaryAfter=TRUE

    // norm16 bit 0 is comp-boundary-after.
    public static final int HAS_COMP_BOUNDARY_AFTER=1;
    // norm16>>OFFSET_SHIFT drops bit 0; the result is used as the index into extraData
    // or, for values >= MIN_NORMAL_MAYBE_YES, its low byte is the combining class.
    public static final int OFFSET_SHIFT=1;

    // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
    // tccc (0, 1, >1) for quick FCC boundary-after tests.
    public static final int DELTA_TCCC_0=0;
    public static final int DELTA_TCCC_1=2;
    public static final int DELTA_TCCC_GT_1=4;
    public static final int DELTA_TCCC_MASK=6;
    public static final int DELTA_SHIFT=3;

    public static final int MAX_DELTA=0x40;

    // Byte offsets from the start of the data, after the generic header.
    public static final int IX_NORM_TRIE_OFFSET=0;
    public static final int IX_EXTRA_DATA_OFFSET=1;
    public static final int IX_SMALL_FCD_OFFSET=2;
    public static final int IX_RESERVED3_OFFSET=3;
    public static final int IX_TOTAL_SIZE=7;

    // Code point thresholds for quick check codes.
    public static final int IX_MIN_DECOMP_NO_CP=8;
    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;

    // Norm16 value thresholds for quick check combinations and types of extra data.

    /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
    public static final int IX_MIN_YES_NO=10;
    /** Mappings are comp-normalized. */
    public static final int IX_MIN_NO_NO=11;
    public static final int IX_LIMIT_NO_NO=12;
    public static final int IX_MIN_MAYBE_YES=13;

    /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
    /** Mappings are not comp-normalized but have a comp boundary before. */
    public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
    /** Mappings do not have a comp boundary before. */
    public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
    /** Mappings to the empty string. */
    public static final int IX_MIN_NO_NO_EMPTY=17;

    public static final int IX_MIN_LCCC_CP=18;
    public static final int IX_COUNT=20;

    // Bits of the first unit of a mapping in extraData.
    // MAPPING_HAS_CCC_LCCC_WORD: the unit before the first unit holds ccc (low byte) and lccc (high byte).
    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
    // MAPPING_HAS_RAW_MAPPING: a raw mapping is stored before the first unit (and before the ccc/lccc word).
    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
    // Bit 0x20 is unused.
    // MAPPING_LENGTH_MASK: the low five bits are the mapping length in code units.
    public static final int MAPPING_LENGTH_MASK=0x1f;

    // NOTE(review): the COMP_* constants are not referenced in this part of the file;
    // presumably they describe the encoding of the compositions lists -- confirm.
    public static final int COMP_1_LAST_TUPLE=0x8000;
    public static final int COMP_1_TRIPLE=1;
    public static final int COMP_1_TRAIL_LIMIT=0x3400;
    public static final int COMP_1_TRAIL_MASK=0x7ffe;
    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
    public static final int COMP_2_TRAIL_SHIFT=6;
    public static final int COMP_2_TRAIL_MASK=0xffc0;
979 
980     // higher-level functionality ------------------------------------------ ***
981 
982     // NFD without an NFD Normalizer2 instance.
decompose(CharSequence s, StringBuilder dest)983     public Appendable decompose(CharSequence s, StringBuilder dest) {
984         decompose(s, 0, s.length(), dest, s.length());
985         return dest;
986     }
987     /**
988      * Decomposes s[src, limit[ and writes the result to dest.
989      * limit can be NULL if src is NUL-terminated.
990      * destLengthEstimate is the initial dest buffer capacity and can be -1.
991      */
decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate)992     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
993                    int destLengthEstimate) {
994         if(destLengthEstimate<0) {
995             destLengthEstimate=limit-src;
996         }
997         dest.setLength(0);
998         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
999         decompose(s, src, limit, buffer);
1000     }
1001 
1002     // Dual functionality:
1003     // buffer!=NULL: normalize
1004     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer)1005     public int decompose(CharSequence s, int src, int limit,
1006                          ReorderingBuffer buffer) {
1007         int minNoCP=minDecompNoCP;
1008 
1009         int prevSrc;
1010         int c=0;
1011         int norm16=0;
1012 
1013         // only for quick check
1014         int prevBoundary=src;
1015         int prevCC=0;
1016 
1017         for(;;) {
1018             // count code units below the minimum or with irrelevant data for the quick check
1019             for(prevSrc=src; src!=limit;) {
1020                 if( (c=s.charAt(src))<minNoCP ||
1021                     isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
1022                 ) {
1023                     ++src;
1024                 } else if (!UTF16Plus.isLeadSurrogate(c)) {
1025                     break;
1026                 } else {
1027                     char c2;
1028                     if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
1029                         c = Character.toCodePoint((char)c, c2);
1030                         norm16 = normTrie.suppGet(c);
1031                         if (isMostDecompYesAndZeroCC(norm16)) {
1032                             src += 2;
1033                         } else {
1034                             break;
1035                         }
1036                     } else {
1037                         ++src;  // unpaired lead surrogate: inert
1038                     }
1039                 }
1040             }
1041             // copy these code units all at once
1042             if(src!=prevSrc) {
1043                 if(buffer!=null) {
1044                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
1045                 } else {
1046                     prevCC=0;
1047                     prevBoundary=src;
1048                 }
1049             }
1050             if(src==limit) {
1051                 break;
1052             }
1053 
1054             // Check one above-minimum, relevant code point.
1055             src+=Character.charCount(c);
1056             if(buffer!=null) {
1057                 decompose(c, norm16, buffer);
1058             } else {
1059                 if(isDecompYes(norm16)) {
1060                     int cc=getCCFromYesOrMaybe(norm16);
1061                     if(prevCC<=cc || cc==0) {
1062                         prevCC=cc;
1063                         if(cc<=1) {
1064                             prevBoundary=src;
1065                         }
1066                         continue;
1067                     }
1068                 }
1069                 return prevBoundary;  // "no" or cc out of order
1070             }
1071         }
1072         return src;
1073     }
decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer)1074     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
1075         int limit=s.length();
1076         if(limit==0) {
1077             return;
1078         }
1079         if(doDecompose) {
1080             decompose(s, 0, limit, buffer);
1081             return;
1082         }
1083         // Just merge the strings at the boundary.
1084         int c=Character.codePointAt(s, 0);
1085         int src=0;
1086         int firstCC, prevCC, cc;
1087         firstCC=prevCC=cc=getCC(getNorm16(c));
1088         while(cc!=0) {
1089             prevCC=cc;
1090             src+=Character.charCount(c);
1091             if(src>=limit) {
1092                 break;
1093             }
1094             c=Character.codePointAt(s, src);
1095             cc=getCC(getNorm16(c));
1096         };
1097         buffer.append(s, 0, src, false, firstCC, prevCC);
1098         buffer.append(s, src, limit);
1099     }
1100 
    /**
     * Composes s[src, limit[ into buffer, or checks whether that text is already composed.
     * Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
     *
     * @param s the input text
     * @param src start index in s
     * @param limit end index (exclusive) in s
     * @param onlyContiguous passed through to the boundary tests and to recompose();
     *        the inline comments below label the true case "FCC"
     * @param doCompose true: normalize into buffer;
     *        false: isNormalized (buffer must be empty and initialized)
     * @param buffer receives the output; with !doCompose it is only used as scratch
     *        space for the slow path and is emptied again after each comparison
     * @return true once limit has been reached; false only if !doCompose and
     *         a change to the text would be required
     */
    public boolean compose(CharSequence s, int src, int limit,
                           boolean onlyContiguous,
                           boolean doCompose,
                           ReorderingBuffer buffer) {
        // Start of the text that has been examined but not yet written to the buffer.
        int prevBoundary=src;
        int minNoMaybeCP=minCompNoMaybeCP;

        for (;;) {
            // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
            // or with (compYes && ccc==0) properties.
            int prevSrc;
            int c = 0;
            int norm16 = 0;
            for (;;) {
                if (src == limit) {
                    if (prevBoundary != limit && doCompose) {
                        buffer.append(s, prevBoundary, limit);
                    }
                    return true;
                }
                if( (c=s.charAt(src))<minNoMaybeCP ||
                    isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
                ) {
                    ++src;
                } else {
                    // prevSrc is the start of the current character, src is now after its first unit.
                    prevSrc = src++;
                    if (!UTF16Plus.isLeadSurrogate(c)) {
                        break;
                    } else {
                        char c2;
                        if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
                            ++src;
                            c = Character.toCodePoint((char)c, c2);
                            norm16 = normTrie.suppGet(c);
                            if (!isCompYesAndZeroCC(norm16)) {
                                break;
                            }
                        }
                        // Otherwise: unpaired lead surrogate, keep scanning.
                    }
                }
            }
            // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
            // The current character is either a "noNo" (has a mapping)
            // or a "maybeYes" (combines backward)
            // or a "yesYes" with ccc!=0.
            // It is not a Hangul syllable or Jamo L because those have "yes" properties.

            // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
            if (!isMaybeOrNonZeroCC(norm16)) {  // minNoNo <= norm16 < minMaybeYes
                if (!doCompose) {
                    return false;
                }
                // Fast path for mapping a character that is immediately surrounded by boundaries.
                // In this case, we need not decompose around the current character.
                if (isDecompNoAlgorithmic(norm16)) {
                    // Maps to a single isCompYesAndZeroCC character
                    // which also implies hasCompBoundaryBefore.
                    if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                            hasCompBoundaryBefore(s, src, limit)) {
                        if (prevBoundary != prevSrc) {
                            buffer.append(s, prevBoundary, prevSrc);
                        }
                        buffer.append(mapAlgorithmic(c, norm16), 0);
                        prevBoundary = src;
                        continue;
                    }
                } else if (norm16 < minNoNoCompBoundaryBefore) {
                    // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
                    if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) ||
                            hasCompBoundaryBefore(s, src, limit)) {
                        if (prevBoundary != prevSrc) {
                            buffer.append(s, prevBoundary, prevSrc);
                        }
                        // Copy the stored mapping: the first unit's low bits are its length.
                        int mapping = norm16 >> OFFSET_SHIFT;
                        int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK;
                        buffer.append(extraData, mapping, mapping + length);
                        prevBoundary = src;
                        continue;
                    }
                } else if (norm16 >= minNoNoEmpty) {
                    // The current character maps to nothing.
                    // Simply omit it from the output if there is a boundary before _or_ after it.
                    // The character itself implies no boundaries.
                    if (hasCompBoundaryBefore(s, src, limit) ||
                            hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) {
                        if (prevBoundary != prevSrc) {
                            buffer.append(s, prevBoundary, prevSrc);
                        }
                        prevBoundary = src;
                        continue;
                    }
                }
                // Other "noNo" type, or need to examine more text around this character:
                // Fall through to the slow path.
            } else if (isJamoVT(norm16) && prevBoundary != prevSrc) {
                char prev=s.charAt(prevSrc-1);
                if(c<Hangul.JAMO_T_BASE) {
                    // The current character is a Jamo Vowel,
                    // compose with previous Jamo L and following Jamo T.
                    // The char cast makes a prev below JAMO_L_BASE wrap to a large value,
                    // so the single comparison below is a range check.
                    char l = (char)(prev-Hangul.JAMO_L_BASE);
                    if(l<Hangul.JAMO_L_COUNT) {
                        if (!doCompose) {
                            return false;
                        }
                        // t: >0 index of a following Jamo T, 0 for none, -1 for "cannot tell here".
                        int t;
                        if (src != limit &&
                                0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) &&
                                t < Hangul.JAMO_T_COUNT) {
                            // The next character is a Jamo T.
                            ++src;
                        } else if (hasCompBoundaryBefore(s, src, limit)) {
                            // No Jamo T follows, not even via decomposition.
                            t = 0;
                        } else {
                            t = -1;
                        }
                        if (t >= 0) {
                            int syllable = Hangul.HANGUL_BASE +
                                (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) *
                                Hangul.JAMO_T_COUNT + t;
                            --prevSrc;  // Replace the Jamo L as well.
                            if (prevBoundary != prevSrc) {
                                buffer.append(s, prevBoundary, prevSrc);
                            }
                            buffer.append((char)syllable);
                            prevBoundary = src;
                            continue;
                        }
                        // If we see L+V+x where x!=T then we drop to the slow path,
                        // decompose and recompose.
                        // This is to deal with NFKC finding normal L and V but a
                        // compatibility variant of a T.
                        // We need to either fully compose that combination here
                        // (which would complicate the code and may not work with strange custom data)
                        // or use the slow path.
                    }
                } else if (Hangul.isHangulLV(prev)) {
                    // The current character is a Jamo Trailing consonant,
                    // compose with previous Hangul LV that does not contain a Jamo T.
                    if (!doCompose) {
                        return false;
                    }
                    int syllable = prev + c - Hangul.JAMO_T_BASE;
                    --prevSrc;  // Replace the Hangul LV as well.
                    if (prevBoundary != prevSrc) {
                        buffer.append(s, prevBoundary, prevSrc);
                    }
                    buffer.append((char)syllable);
                    prevBoundary = src;
                    continue;
                }
                // No matching context, or may need to decompose surrounding text first:
                // Fall through to the slow path.
            } else if (norm16 > JAMO_VT) {  // norm16 >= MIN_YES_YES_WITH_CC
                // One or more combining marks that do not combine-back:
                // Check for canonical order, copy unchanged if ok and
                // if followed by a character with a boundary-before.
                int cc = getCCFromNormalYesOrMaybe(norm16);  // cc!=0
                if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) {
                    // Fails FCD test, need to decompose and contiguously recompose.
                    if (!doCompose) {
                        return false;
                    }
                } else {
                    // If !onlyContiguous (not FCC), then we ignore the tccc of
                    // the previous character which passed the quick check "yes && ccc==0" test.
                    int n16;
                    // Advance src over further combining marks as long as they are in canonical order.
                    for (;;) {
                        if (src == limit) {
                            if (doCompose) {
                                buffer.append(s, prevBoundary, limit);
                            }
                            return true;
                        }
                        int prevCC = cc;
                        c = Character.codePointAt(s, src);
                        n16 = normTrie.get(c);
                        if (n16 >= MIN_YES_YES_WITH_CC) {
                            cc = getCCFromNormalYesOrMaybe(n16);
                            if (prevCC > cc) {
                                if (!doCompose) {
                                    return false;
                                }
                                break;
                            }
                        } else {
                            break;
                        }
                        src += Character.charCount(c);
                    }
                    // src is after the last in-order combining mark.
                    // If there is a boundary here, then we continue with no change.
                    if (norm16HasCompBoundaryBefore(n16)) {
                        if (isCompYesAndZeroCC(n16)) {
                            src += Character.charCount(c);
                        }
                        continue;
                    }
                    // Use the slow path. There is no boundary in [prevSrc, src[.
                }
            }

            // Slow path: Find the nearest boundaries around the current character,
            // decompose and recompose.
            if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
                // Include the preceding character unless there is a boundary after it.
                c = Character.codePointBefore(s, prevSrc);
                norm16 = normTrie.get(c);
                if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
                    prevSrc -= Character.charCount(c);
                }
            }
            if (doCompose && prevBoundary != prevSrc) {
                buffer.append(s, prevBoundary, prevSrc);
            }
            int recomposeStartIndex=buffer.length();
            // We know there is not a boundary here.
            decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous,
                           buffer);
            // Decompose until the next boundary.
            src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous,
                                 buffer);
            recompose(buffer, recomposeStartIndex, onlyContiguous);
            if(!doCompose) {
                // isNormalized: the recomposed piece must equal the input piece;
                // then empty the scratch buffer for the next round.
                if(!buffer.equals(s, prevSrc, src)) {
                    return false;
                }
                buffer.remove();
            }
            prevBoundary=src;
        }
    }
1335 
    /**
     * Quick-check / span variant of the composition loop.
     * Very similar to compose(): Make the same changes in both places if relevant.
     *
     * <p>The method scans {@code s} from {@code src} up to (not including) {@code limit}
     * without modifying anything, and reports how far the text is known to be composed.
     * <ul>
     * <li>doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
     * <li>!doSpan: quickCheck
     * </ul>
     *
     * @param s the text to check
     * @param src index of the first code unit to check
     * @param limit index after the last code unit to check
     * @param onlyContiguous if true (FCC), the trail combining class of the preceding
     *        "yes &amp;&amp; ccc==0" character is also compared against the current combining mark
     * @param doSpan if true, the scan stops and returns at the first "maybe" character
     *        instead of recording it in bit 0
     * @return bits 31..1: spanQuickCheckYes (==limit if "yes") and
     *         bit 0: set if "maybe"; otherwise, if the span length&lt;limit
     *         then the quick check result is "no"
     */
    public int composeQuickCheck(CharSequence s, int src, int limit,
                                 boolean onlyContiguous, boolean doSpan) {
        // Bit 0 of the result: becomes 1 once a "maybe" character has been seen (only if !doSpan).
        int qcResult=0;
        // Start of the segment that would have to be re-examined if a problem is found.
        int prevBoundary=src;
        int minNoMaybeCP=minCompNoMaybeCP;

        for(;;) {
            // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
            // or with (compYes && ccc==0) properties.
            int prevSrc;
            int c = 0;
            int norm16 = 0;
            for (;;) {
                if(src==limit) {
                    return (src<<1)|qcResult;  // "yes" or "maybe"
                }
                if( (c=s.charAt(src))<minNoMaybeCP ||
                    isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
                ) {
                    ++src;
                } else {
                    // prevSrc is the start of the character that failed the fast-path test.
                    prevSrc = src++;
                    if (!UTF16Plus.isLeadSurrogate(c)) {
                        break;
                    } else {
                        // Lead surrogate: combine with a following trail surrogate, if any,
                        // and look up the supplementary code point instead.
                        char c2;
                        if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
                            ++src;
                            c = Character.toCodePoint((char)c, c2);
                            norm16 = normTrie.suppGet(c);
                            if (!isCompYesAndZeroCC(norm16)) {
                                break;
                            }
                        }
                        // Otherwise (unpaired lead surrogate, or the pair passed the test)
                        // the fast-path scan simply continues.
                    }
                }
            }
            // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
            // The current character is either a "noNo" (has a mapping)
            // or a "maybeYes" (combines backward)
            // or a "yesYes" with ccc!=0.
            // It is not a Hangul syllable or Jamo L because those have "yes" properties.

            // Move prevBoundary up to the current character, or to the character before it
            // if there is no composition boundary between the two.
            int prevNorm16 = INERT;
            if (prevBoundary != prevSrc) {
                prevBoundary = prevSrc;
                if (!norm16HasCompBoundaryBefore(norm16)) {
                    c = Character.codePointBefore(s, prevSrc);
                    int n16 = getNorm16(c);
                    if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
                        prevBoundary -= Character.charCount(c);
                        prevNorm16 = n16;
                    }
                }
            }

            if(isMaybeOrNonZeroCC(norm16)) {
                int cc=getCCFromYesOrMaybe(norm16);
                if (onlyContiguous /* FCC */ && cc != 0 &&
                        getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) {
                    // The [prevBoundary..prevSrc[ character
                    // passed the quick check "yes && ccc==0" test
                    // but is out of canonical order with the current combining mark.
                    // Nothing to do here: control falls through to the "no" return below.
                } else {
                    // If !onlyContiguous (not FCC), then we ignore the tccc of
                    // the previous character which passed the quick check "yes && ccc==0" test.
                    // Walk over the following run of maybe/ccc!=0 characters
                    // as long as their combining classes stay in order.
                    for (;;) {
                        if (norm16 < MIN_YES_YES_WITH_CC) {
                            if (!doSpan) {
                                qcResult = 1;
                            } else {
                                return prevBoundary << 1;  // spanYes does not care to know it's "maybe"
                            }
                        }
                        if (src == limit) {
                            return (src<<1) | qcResult;  // "yes" or "maybe"
                        }
                        int prevCC = cc;
                        c = Character.codePointAt(s, src);
                        norm16 = getNorm16(c);
                        if (isMaybeOrNonZeroCC(norm16)) {
                            cc = getCCFromYesOrMaybe(norm16);
                            if (!(prevCC <= cc || cc == 0)) {
                                // Out of order: leave the loop with src still at this character.
                                break;
                            }
                        } else {
                            break;
                        }
                        src += Character.charCount(c);
                    }
                    // src is after the last in-order combining mark.
                    if (isCompYesAndZeroCC(norm16)) {
                        // The character at src is a clean boundary: skip it and resume the fast path.
                        prevBoundary = src;
                        src += Character.charCount(c);
                        continue;
                    }
                }
            }
            return prevBoundary<<1;  // "no"
        }
    }
composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer)1445     public void composeAndAppend(CharSequence s,
1446                                  boolean doCompose,
1447                                  boolean onlyContiguous,
1448                                  ReorderingBuffer buffer) {
1449         int src=0, limit=s.length();
1450         if(!buffer.isEmpty()) {
1451             int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous);
1452             if(0!=firstStarterInSrc) {
1453                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1454                                                                buffer.length(), onlyContiguous);
1455                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1456                                                        firstStarterInSrc+16);
1457                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1458                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1459                 middle.append(s, 0, firstStarterInSrc);
1460                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1461                 src=firstStarterInSrc;
1462             }
1463         }
1464         if(doCompose) {
1465             compose(s, src, limit, onlyContiguous, true, buffer);
1466         } else {
1467             buffer.append(s, src, limit);
1468         }
1469     }
    /**
     * Checks or produces FCD form for the text in {@code s} between {@code src} and {@code limit}.
     *
     * <p>Dual functionality:
     * <ul>
     * <li>buffer!=null: normalize, writing the result to {@code buffer}
     * <li>buffer==null: isNormalized/quickCheck/spanQuickCheckYes
     * </ul>
     *
     * @param s the source text
     * @param src index of the first code unit to process
     * @param limit index after the last code unit to process
     * @param buffer destination buffer, or null for check-only operation
     * @return with buffer==null, the last FCD-safe boundary before the first
     *         out-of-order character if there is one; otherwise the index reached
     *         when the loop ends (src==limit)
     */
    public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
        // Note: In this function we use buffer.appendZeroCC() because we track
        // the lead and trail combining classes here, rather than leaving it to
        // the ReorderingBuffer.
        // The exception is the call to decomposeShort() which uses the buffer
        // in the normal way.

        // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
        // Similar to the prevBoundary in the compose() implementation.
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        // fcd16 values hold lccc in bits 15..8 and tccc in bits 7..0 (see the tests below).
        // A negative prevFCD16 is ~c for a character whose lookup was deferred.
        int prevFCD16=0;
        int fcd16=0;

        for(;;) {
            // count code units with lccc==0
            for(prevSrc=src; src!=limit;) {
                if((c=s.charAt(src))<minLcccCP) {
                    // Defer the data lookup: remember the code unit as a negative marker.
                    prevFCD16=~c;
                    ++src;
                } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                    prevFCD16=0;
                    ++src;
                } else {
                    if (UTF16Plus.isLeadSurrogate(c)) {
                        char c2;
                        if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
                            c = Character.toCodePoint((char)c, c2);
                        }
                    }
                    if((fcd16=getFCD16FromNormData(c))<=0xff) {
                        // lccc==0: keep scanning.
                        prevFCD16=fcd16;
                        src+=Character.charCount(c);
                    } else {
                        // lccc!=0: stop with src at the start of c.
                        break;
                    }
                }
            }
            // copy these code units all at once
            if(src!=prevSrc) {
                if(src==limit) {
                    if(buffer!=null) {
                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
                    }
                    break;
                }
                prevBoundary=src;
                // We know that the previous character's lccc==0.
                if(prevFCD16<0) {
                    // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
                    int prev=~prevFCD16;
                    if(prev<minDecompNoCP) {
                        prevFCD16=0;
                    } else {
                        prevFCD16=getFCD16FromNormData(prev);
                        if(prevFCD16>1) {
                            // tccc>1: the boundary is before this previous (single-unit) character.
                            --prevBoundary;
                        }
                    }
                } else {
                    int p=src-1;
                    if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
                        Character.isHighSurrogate(s.charAt(p-1))
                    ) {
                        --p;
                        // Need to fetch the previous character's FCD value because
                        // prevFCD16 was just for the trail surrogate code point.
                        prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
                        // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                    }
                    if(prevFCD16>1) {
                        prevBoundary=p;
                    }
                }
                if(buffer!=null) {
                    // The last lccc==0 character is excluded from the
                    // flush-and-append call in case it needs to be modified.
                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
                    buffer.append(s, prevBoundary, src);
                }
                // The start of the current character (c).
                prevSrc=src;
            } else if(src==limit) {
                break;
            }

            src+=Character.charCount(c);
            // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
            // Check for proper order, and decompose locally if necessary.
            if((prevFCD16&0xff)<=(fcd16>>8)) {
                // proper order: prev tccc <= current lccc
                if((fcd16&0xff)<=1) {
                    prevBoundary=src;
                }
                if(buffer!=null) {
                    buffer.appendZeroCC(c);
                }
                prevFCD16=fcd16;
                continue;
            } else if(buffer==null) {
                return prevBoundary;  // quick check "no"
            } else {
                /*
                 * Back out the part of the source that we copied or appended
                 * already but is now going to be decomposed.
                 * prevSrc is set to after what was copied/appended.
                 */
                buffer.removeSuffix(prevSrc-prevBoundary);
                /*
                 * Find the part of the source that needs to be decomposed,
                 * up to the next safe boundary.
                 */
                src=findNextFCDBoundary(s, src, limit);
                /*
                 * The source text does not fulfill the conditions for FCD.
                 * Decompose and reorder a limited piece of the text.
                 */
                decomposeShort(s, prevBoundary, src, false, false, buffer);
                prevBoundary=src;
                prevFCD16=0;
            }
        }
        return src;
    }
makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer)1598     public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
1599         int src=0, limit=s.length();
1600         if(!buffer.isEmpty()) {
1601             int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
1602             if(0!=firstBoundaryInSrc) {
1603                 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
1604                                                                buffer.length());
1605                 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
1606                                                        firstBoundaryInSrc+16);
1607                 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
1608                 buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
1609                 middle.append(s, 0, firstBoundaryInSrc);
1610                 makeFCD(middle, 0, middle.length(), buffer);
1611                 src=firstBoundaryInSrc;
1612             }
1613         }
1614         if(doMakeFCD) {
1615             makeFCD(s, src, limit, buffer);
1616         } else {
1617             buffer.append(s, src, limit);
1618         }
1619     }
1620 
hasDecompBoundaryBefore(int c)1621     public boolean hasDecompBoundaryBefore(int c) {
1622         return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) ||
1623             norm16HasDecompBoundaryBefore(getNorm16(c));
1624     }
norm16HasDecompBoundaryBefore(int norm16)1625     public boolean norm16HasDecompBoundaryBefore(int norm16) {
1626         if (norm16 < minNoNoCompNoMaybeCC) {
1627             return true;
1628         }
1629         if (norm16 >= limitNoNo) {
1630             return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1631         }
1632         // c decomposes, get everything from the variable-length extra data
1633         int mapping=norm16>>OFFSET_SHIFT;
1634         int firstUnit=extraData.charAt(mapping);
1635         // true if leadCC==0 (hasFCDBoundaryBefore())
1636         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1637     }
hasDecompBoundaryAfter(int c)1638     public boolean hasDecompBoundaryAfter(int c) {
1639         if (c < minDecompNoCP) {
1640             return true;
1641         }
1642         if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) {
1643             return true;
1644         }
1645         return norm16HasDecompBoundaryAfter(getNorm16(c));
1646     }
norm16HasDecompBoundaryAfter(int norm16)1647     public boolean norm16HasDecompBoundaryAfter(int norm16) {
1648         if(norm16 <= minYesNo || isHangulLVT(norm16)) {
1649             return true;
1650         }
1651         if (norm16 >= limitNoNo) {
1652             if (isMaybeOrNonZeroCC(norm16)) {
1653                 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT;
1654             }
1655             // Maps to an isCompYesAndZeroCC.
1656             return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
1657         }
1658         // c decomposes, get everything from the variable-length extra data
1659         int mapping=norm16>>OFFSET_SHIFT;
1660         int firstUnit=extraData.charAt(mapping);
1661         // decomp after-boundary: same as hasFCDBoundaryAfter(),
1662         // fcd16<=1 || trailCC==0
1663         if(firstUnit>0x1ff) {
1664             return false;  // trailCC>1
1665         }
1666         if(firstUnit<=0xff) {
1667             return true;  // trailCC==0
1668         }
1669         // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1670         // true if leadCC==0 (hasFCDBoundaryBefore())
1671         return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0;
1672     }
isDecompInert(int c)1673     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1674 
hasCompBoundaryBefore(int c)1675     public boolean hasCompBoundaryBefore(int c) {
1676         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
1677     }
hasCompBoundaryAfter(int c, boolean onlyContiguous)1678     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) {
1679         return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
1680     }
isCompInert(int c, boolean onlyContiguous)1681     public boolean isCompInert(int c, boolean onlyContiguous) {
1682         int norm16=getNorm16(c);
1683         return isCompYesAndZeroCC(norm16) &&
1684             (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
1685             (!onlyContiguous || isInert(norm16) || extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff);
1686     }
1687 
hasFCDBoundaryBefore(int c)1688     public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); }
hasFCDBoundaryAfter(int c)1689     public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); }
isFCDInert(int c)1690     public boolean isFCDInert(int c) { return getFCD16(c)<=1; }
1691 
isMaybe(int norm16)1692     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
isMaybeOrNonZeroCC(int norm16)1693     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
isInert(int norm16)1694     private static boolean isInert(int norm16) { return norm16==INERT; }
isJamoL(int norm16)1695     private static boolean isJamoL(int norm16) { return norm16==JAMO_L; }
isJamoVT(int norm16)1696     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
hangulLVT()1697     private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
isHangulLV(int norm16)1698     private boolean isHangulLV(int norm16) { return norm16==minYesNo; }
isHangulLVT(int norm16)1699     private boolean isHangulLVT(int norm16) {
1700         return norm16==hangulLVT();
1701     }
isCompYesAndZeroCC(int norm16)1702     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1703     // UBool isCompYes(uint16_t norm16) const {
1704     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1705     // }
1706     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1707     //     return norm16<minNoNo || minMaybeYes<=norm16;
1708     // }
1709     // private boolean hasZeroCCFromDecompYes(int norm16) {
1710     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1711     // }
isDecompYesAndZeroCC(int norm16)1712     private boolean isDecompYesAndZeroCC(int norm16) {
1713         return norm16<minYesNo ||
1714                norm16==JAMO_VT ||
1715                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1716     }
1717     /**
1718      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1719      * the MaybeYes which combine-forward and have ccc=0.
1720      * (Standard Unicode 10 normalization does not have such characters.)
1721      */
isMostDecompYesAndZeroCC(int norm16)1722     private boolean isMostDecompYesAndZeroCC(int norm16) {
1723         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1724     }
isDecompNoAlgorithmic(int norm16)1725     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1726 
1727     // For use with isCompYes().
1728     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1729     // static uint8_t getCCFromYes(uint16_t norm16) {
1730     //     return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
1731     // }
getCCFromNoNo(int norm16)1732     private int getCCFromNoNo(int norm16) {
1733         int mapping=norm16>>OFFSET_SHIFT;
1734         if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1735             return extraData.charAt(mapping-1)&0xff;
1736         } else {
1737             return 0;
1738         }
1739     }
getTrailCCFromCompYesAndZeroCC(int norm16)1740     int getTrailCCFromCompYesAndZeroCC(int norm16) {
1741         if(norm16<=minYesNo) {
1742             return 0;  // yesYes and Hangul LV have ccc=tccc=0
1743         } else {
1744             // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
1745             return extraData.charAt(norm16>>OFFSET_SHIFT)>>8;  // tccc from yesNo
1746         }
1747     }
1748 
1749     // Requires algorithmic-NoNo.
mapAlgorithmic(int c, int norm16)1750     private int mapAlgorithmic(int c, int norm16) {
1751         return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
1752     }
1753 
1754     // Requires minYesNo<norm16<limitNoNo.
1755     // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); }
1756 
1757     /**
1758      * @return index into maybeYesCompositions, or -1
1759      */
getCompositionsListForDecompYes(int norm16)1760     private int getCompositionsListForDecompYes(int norm16) {
1761         if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
1762             return -1;
1763         } else {
1764             if((norm16-=minMaybeYes)<0) {
1765                 // norm16<minMaybeYes: index into extraData which is a substring at
1766                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1767                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1768                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1769             }
1770             return norm16>>OFFSET_SHIFT;
1771         }
1772     }
1773     /**
1774      * @return index into maybeYesCompositions
1775      */
getCompositionsListForComposite(int norm16)1776     private int getCompositionsListForComposite(int norm16) {
1777         // A composite has both mapping & compositions list.
1778         int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
1779         int firstUnit=maybeYesCompositions.charAt(list);
1780         return list+  // mapping in maybeYesCompositions
1781             1+  // +1 to skip the first unit with the mapping length
1782             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1783     }
getCompositionsListForMaybe(int norm16)1784     private int getCompositionsListForMaybe(int norm16) {
1785         // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
1786         return (norm16-minMaybeYes)>>OFFSET_SHIFT;
1787     }
1788     /**
1789      * @param c code point must have compositions
1790      * @return index into maybeYesCompositions
1791      */
getCompositionsList(int norm16)1792     private int getCompositionsList(int norm16) {
1793         return isDecompYes(norm16) ?
1794                 getCompositionsListForDecompYes(norm16) :
1795                 getCompositionsListForComposite(norm16);
1796     }
1797 
1798     // Decompose a short piece of text which is likely to contain characters that
1799     // fail the quick check loop and/or where the quick check loop's overhead
1800     // is unlikely to be amortized.
1801     // Called by the compose() and makeFCD() implementations.
1802     // Public in Java for collation implementation code.
decomposeShort( CharSequence s, int src, int limit, boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer)1803     private int decomposeShort(
1804             CharSequence s, int src, int limit,
1805             boolean stopAtCompBoundary, boolean onlyContiguous,
1806             ReorderingBuffer buffer) {
1807         while(src<limit) {
1808             int c=Character.codePointAt(s, src);
1809             if (stopAtCompBoundary && c < minCompNoMaybeCP) {
1810                 return src;
1811             }
1812             int norm16 = getNorm16(c);
1813             if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
1814                 return src;
1815             }
1816             src+=Character.charCount(c);
1817             decompose(c, norm16, buffer);
1818             if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
1819                 return src;
1820             }
1821         }
1822         return src;
1823     }
decompose(int c, int norm16, ReorderingBuffer buffer)1824     private void decompose(int c, int norm16, ReorderingBuffer buffer) {
1825         // get the decomposition and the lead and trail cc's
1826         if (norm16 >= limitNoNo) {
1827             if (isMaybeOrNonZeroCC(norm16)) {
1828                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1829                 return;
1830             }
1831             // Maps to an isCompYesAndZeroCC.
1832             c=mapAlgorithmic(c, norm16);
1833             norm16 = getRawNorm16(c);
1834         }
1835         if (norm16 < minYesNo) {
1836             // c does not decompose
1837             buffer.append(c, 0);
1838         } else if(isHangulLV(norm16) || isHangulLVT(norm16)) {
1839             // Hangul syllable: decompose algorithmically
1840             Hangul.decompose(c, buffer);
1841         } else {
1842             // c decomposes, get everything from the variable-length extra data
1843             int mapping=norm16>>OFFSET_SHIFT;
1844             int firstUnit=extraData.charAt(mapping);
1845             int length=firstUnit&MAPPING_LENGTH_MASK;
1846             int leadCC, trailCC;
1847             trailCC=firstUnit>>8;
1848             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1849                 leadCC=extraData.charAt(mapping-1)>>8;
1850             } else {
1851                 leadCC=0;
1852             }
1853             ++mapping;  // skip over the firstUnit
1854             buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
1855         }
1856     }
1857 
    /**
     * Finds the recomposition result for
     * a forward-combining "lead" character,
     * specified with a pointer to its compositions list,
     * and a backward-combining "trail" character.
     *
     * <p>If the lead and trail characters combine, then this function returns
     * the following "compositeAndFwd" value:
     * <pre>
     * Bits 21..1  composite character
     * Bit      0  set if the composite is a forward-combining starter
     * </pre>
     * otherwise it returns -1.
     *
     * <p>The compositions list has (trail, compositeAndFwd) pair entries,
     * encoded as either pairs or triples of 16-bit units.
     * The last entry has the high bit of its first unit set.
     *
     * <p>The list is sorted by ascending trail characters (there are no duplicates).
     * A linear search is used.
     *
     * <p>See normalizer2impl.h for a more detailed description
     * of the compositions list format.
     *
     * @param compositions the string that holds the compositions list
     * @param list index of the first unit of the lead character's compositions list
     * @param trail the trail character (code point)
     * @return the compositeAndFwd value, or -1 if lead and trail do not combine
     */
    private static int combine(String compositions, int list, int trail) {
        int key1, firstUnit;
        if(trail<COMP_1_TRAIL_LIMIT) {
            // trail character is 0..33FF
            // result entry may have 2 or 3 units
            key1=(trail<<1);
            // Skip entries whose first unit sorts below the key;
            // each entry is 2 units long, plus one more if its triple flag is set.
            while(key1>(firstUnit=compositions.charAt(list))) {
                list+=2+(firstUnit&COMP_1_TRIPLE);
            }
            if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
                if((firstUnit&COMP_1_TRIPLE)!=0) {
                    return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
                } else {
                    return compositions.charAt(list+1);
                }
            }
        } else {
            // trail character is 3400..10FFFF
            // result entry has 3 units
            // The trail character is split into two keys that are matched against
            // the first and second units of an entry.
            key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
            int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
            int secondUnit;
            for(;;) {
                if(key1>(firstUnit=compositions.charAt(list))) {
                    list+=2+(firstUnit&COMP_1_TRIPLE);
                } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
                    if(key2>(secondUnit=compositions.charAt(list+1))) {
                        if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
                            // End of the list: no match.
                            break;
                        } else {
                            list+=3;
                        }
                    } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
                        return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }
        }
        return -1;
    }
1926     /**
1927      * @param list some character's compositions list
1928      * @param set recursively receives the composites from these compositions
1929      */
addComposites(int list, UnicodeSet set)1930     private void addComposites(int list, UnicodeSet set) {
1931         int firstUnit, compositeAndFwd;
1932         do {
1933             firstUnit=maybeYesCompositions.charAt(list);
1934             if((firstUnit&COMP_1_TRIPLE)==0) {
1935                 compositeAndFwd=maybeYesCompositions.charAt(list+1);
1936                 list+=2;
1937             } else {
1938                 compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
1939                                 maybeYesCompositions.charAt(list+2);
1940                 list+=3;
1941             }
1942             int composite=compositeAndFwd>>1;
1943             if((compositeAndFwd&1)!=0) {
1944                 addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
1945             }
1946             set.add(composite);
1947         } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1948     }
1949     /*
1950      * Recomposes the buffer text starting at recomposeStartIndex
1951      * (which is in NFD - decomposed and canonically ordered),
1952      * and truncates the buffer contents.
1953      *
1954      * Note that recomposition never lengthens the text:
1955      * Any character consists of either one or two code units;
1956      * a composition may contain at most one more code unit than the original starter,
1957      * while the combining mark that is removed has at least one code unit.
1958      */
recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous)1959     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1960                            boolean onlyContiguous) {
1961         StringBuilder sb=buffer.getStringBuilder();
1962         int p=recomposeStartIndex;
1963         if(p==sb.length()) {
1964             return;
1965         }
1966 
1967         int starter, pRemove;
1968         int compositionsList;
1969         int c, compositeAndFwd;
1970         int norm16;
1971         int cc, prevCC;
1972         boolean starterIsSupplementary;
1973 
1974         // Some of the following variables are not used until we have a forward-combining starter
1975         // and are only initialized now to avoid compiler warnings.
1976         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1977         starter=-1;
1978         starterIsSupplementary=false;
1979         prevCC=0;
1980 
1981         for(;;) {
1982             c=sb.codePointAt(p);
1983             p+=Character.charCount(c);
1984             norm16=getNorm16(c);
1985             cc=getCCFromYesOrMaybe(norm16);
1986             if( // this character combines backward and
1987                 isMaybe(norm16) &&
1988                 // we have seen a starter that combines forward and
1989                 compositionsList>=0 &&
1990                 // the backward-combining character is not blocked
1991                 (prevCC<cc || prevCC==0)
1992             ) {
1993                 if(isJamoVT(norm16)) {
1994                     // c is a Jamo V/T, see if we can compose it with the previous character.
1995                     if(c<Hangul.JAMO_T_BASE) {
1996                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1997                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1998                         if(prev<Hangul.JAMO_L_COUNT) {
1999                             pRemove=p-1;
2000                             char syllable=(char)
2001                                 (Hangul.HANGUL_BASE+
2002                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
2003                                  Hangul.JAMO_T_COUNT);
2004                             char t;
2005                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
2006                                 ++p;
2007                                 syllable+=t;  // The next character was a Jamo T.
2008                             }
2009                             sb.setCharAt(starter, syllable);
2010                             // remove the Jamo V/T
2011                             sb.delete(pRemove, p);
2012                             p=pRemove;
2013                         }
2014                     }
2015                     /*
2016                      * No "else" for Jamo T:
2017                      * Since the input is in NFD, there are no Hangul LV syllables that
2018                      * a Jamo T could combine with.
2019                      * All Jamo Ts are combined above when handling Jamo Vs.
2020                      */
2021                     if(p==sb.length()) {
2022                         break;
2023                     }
2024                     compositionsList=-1;
2025                     continue;
2026                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
2027                     // The starter and the combining mark (c) do combine.
2028                     int composite=compositeAndFwd>>1;
2029 
2030                     // Remove the combining mark.
2031                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
2032                     sb.delete(pRemove, p);
2033                     p=pRemove;
2034                     // Replace the starter with the composite.
2035                     if(starterIsSupplementary) {
2036                         if(composite>0xffff) {
2037                             // both are supplementary
2038                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2039                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
2040                         } else {
2041                             sb.setCharAt(starter, (char)c);
2042                             sb.deleteCharAt(starter+1);
2043                             // The composite is shorter than the starter,
2044                             // move the intermediate characters forward one.
2045                             starterIsSupplementary=false;
2046                             --p;
2047                         }
2048                     } else if(composite>0xffff) {
2049                         // The composite is longer than the starter,
2050                         // move the intermediate characters back one.
2051                         starterIsSupplementary=true;
2052                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2053                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
2054                         ++p;
2055                     } else {
2056                         // both are on the BMP
2057                         sb.setCharAt(starter, (char)composite);
2058                     }
2059 
2060                     // Keep prevCC because we removed the combining mark.
2061 
2062                     if(p==sb.length()) {
2063                         break;
2064                     }
2065                     // Is the composite a starter that combines forward?
2066                     if((compositeAndFwd&1)!=0) {
2067                         compositionsList=
2068                             getCompositionsListForComposite(getRawNorm16(composite));
2069                     } else {
2070                         compositionsList=-1;
2071                     }
2072 
2073                     // We combined; continue with looking for compositions.
2074                     continue;
2075                 }
2076             }
2077 
2078             // no combination this time
2079             prevCC=cc;
2080             if(p==sb.length()) {
2081                 break;
2082             }
2083 
2084             // If c did not combine, then check if it is a starter.
2085             if(cc==0) {
2086                 // Found a new starter.
2087                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
2088                     // It may combine with something, prepare for it.
2089                     if(c<=0xffff) {
2090                         starterIsSupplementary=false;
2091                         starter=p-1;
2092                     } else {
2093                         starterIsSupplementary=true;
2094                         starter=p-2;
2095                     }
2096                 }
2097             } else if(onlyContiguous) {
2098                 // FCC: no discontiguous compositions; any intervening character blocks.
2099                 compositionsList=-1;
2100             }
2101         }
2102         buffer.flush();
2103     }
2104 
composePair(int a, int b)2105     public int composePair(int a, int b) {
2106         int norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16
2107         int list;
2108         if(isInert(norm16)) {
2109             return -1;
2110         } else if(norm16<minYesNoMappingsOnly) {
2111             // a combines forward.
2112             if(isJamoL(norm16)) {
2113                 b-=Hangul.JAMO_V_BASE;
2114                 if(0<=b && b<Hangul.JAMO_V_COUNT) {
2115                     return
2116                         (Hangul.HANGUL_BASE+
2117                          ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
2118                          Hangul.JAMO_T_COUNT);
2119                 } else {
2120                     return -1;
2121                 }
2122             } else if(isHangulLV(norm16)) {
2123                 b-=Hangul.JAMO_T_BASE;
2124                 if(0<b && b<Hangul.JAMO_T_COUNT) {  // not b==0!
2125                     return a+b;
2126                 } else {
2127                     return -1;
2128                 }
2129             } else {
2130                 // 'a' has a compositions list in extraData
2131                 list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT;
2132                 if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
2133                     list+=  // mapping pointer
2134                         1+  // +1 to skip the first unit with the mapping length
2135                         (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK);  // + mapping length
2136                 }
2137             }
2138         } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
2139             return -1;
2140         } else {
2141             list=getCompositionsListForMaybe(norm16);  // offset into maybeYesCompositions
2142         }
2143         if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
2144             return -1;
2145         }
2146         return combine(maybeYesCompositions, list, b)>>1;
2147     }
2148 
2149     /**
2150      * Does c have a composition boundary before it?
2151      * True if its decomposition begins with a character that has
2152      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
2153      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
2154      * (isCompYesAndZeroCC()) so we need not decompose.
2155      */
hasCompBoundaryBefore(int c, int norm16)2156     private boolean hasCompBoundaryBefore(int c, int norm16) {
2157         return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
2158     }
norm16HasCompBoundaryBefore(int norm16)2159     private boolean norm16HasCompBoundaryBefore(int norm16) {
2160         return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
2161     }
hasCompBoundaryBefore(CharSequence s, int src, int limit)2162     private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) {
2163         return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src));
2164     }
norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous)2165     private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) {
2166         return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
2167             (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
2168     }
hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous)2169     private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) {
2170         return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous);
2171     }
2172     /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
isTrailCC01ForCompBoundaryAfter(int norm16)2173     private boolean isTrailCC01ForCompBoundaryAfter(int norm16) {
2174         return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
2175             (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff);
2176     }
2177 
findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous)2178     private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) {
2179         while(p>0) {
2180             int c=Character.codePointBefore(s, p);
2181             int norm16 = getNorm16(c);
2182             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2183                 break;
2184             }
2185             p-=Character.charCount(c);
2186             if(hasCompBoundaryBefore(c, norm16)) {
2187                 break;
2188             }
2189         }
2190         return p;
2191     }
findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous)2192     private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) {
2193         while(p<limit) {
2194             int c=Character.codePointAt(s, p);
2195             int norm16=normTrie.get(c);
2196             if(hasCompBoundaryBefore(c, norm16)) {
2197                 break;
2198             }
2199             p+=Character.charCount(c);
2200             if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
2201                 break;
2202             }
2203         }
2204         return p;
2205     }
2206 
findPreviousFCDBoundary(CharSequence s, int p)2207     private int findPreviousFCDBoundary(CharSequence s, int p) {
2208         while(p>0) {
2209             int c=Character.codePointBefore(s, p);
2210             int norm16;
2211             if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) {
2212                 break;
2213             }
2214             p-=Character.charCount(c);
2215             if (norm16HasDecompBoundaryBefore(norm16)) {
2216                 break;
2217             }
2218         }
2219         return p;
2220     }
findNextFCDBoundary(CharSequence s, int p, int limit)2221     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
2222         while(p<limit) {
2223             int c=Character.codePointAt(s, p);
2224             int norm16;
2225             if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) {
2226                 break;
2227             }
2228             p+=Character.charCount(c);
2229             if (norm16HasDecompBoundaryAfter(norm16)) {
2230                 break;
2231             }
2232         }
2233         return p;
2234     }
2235 
getPreviousTrailCC(CharSequence s, int start, int p)2236     private int getPreviousTrailCC(CharSequence s, int start, int p) {
2237         if (start == p) {
2238             return 0;
2239         }
2240         return getFCD16(Character.codePointBefore(s, p));
2241     }
2242 
addToStartSet(MutableCodePointTrie mutableTrie, int origin, int decompLead)2243     private void addToStartSet(MutableCodePointTrie mutableTrie, int origin, int decompLead) {
2244         int canonValue = mutableTrie.get(decompLead);
2245         if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2246             // origin is the first character whose decomposition starts with
2247             // the character for which we are setting the value.
2248             mutableTrie.set(decompLead, canonValue|origin);
2249         } else {
2250             // origin is not the first character, or it is U+0000.
2251             UnicodeSet set;
2252             if((canonValue&CANON_HAS_SET)==0) {
2253                 int firstOrigin=canonValue&CANON_VALUE_MASK;
2254                 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
2255                 mutableTrie.set(decompLead, canonValue);
2256                 canonStartSets.add(set=new UnicodeSet());
2257                 if(firstOrigin!=0) {
2258                     set.add(firstOrigin);
2259                 }
2260             } else {
2261                 set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
2262             }
2263             set.add(origin);
2264         }
2265     }
2266 
2267     @SuppressWarnings("unused")
2268     private VersionInfo dataVersion;
2269 
2270     // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
2271     private int minDecompNoCP;  // findPreviousFCDBoundary() stops at any code point below this
2272     private int minCompNoMaybeCP;  // hasCompBoundaryBefore(c, norm16) is true for any c below this
2273     private int minLcccCP;  // findNextFCDBoundary() stops at any code point below this
2274 
2275     // Norm16 value thresholds for quick check combinations and types of extra data.
2276     private int minYesNo;  // composePair(): a forward-combining norm16 above this also has a mapping
2277     private int minYesNoMappingsOnly;  // composePair(): a non-inert norm16 below this combines forward
2278     private int minNoNo;
2279     private int minNoNoCompBoundaryBefore;
2280     private int minNoNoCompNoMaybeCC;  // norm16HasCompBoundaryBefore() is true for any norm16 below this
2281     private int minNoNoEmpty;
2282     private int limitNoNo;
2283     private int centerNoNoDelta;
2284     private int minMaybeYes;  // composePair(): lower bound of the range handled via getCompositionsListForMaybe()
2285 
2286     private CodePointTrie.Fast16 normTrie;  // code point -> norm16 (see findNextCompBoundary())
2287     private String maybeYesCompositions;  // compositions lists, read per UTF-16 code unit by addComposites()/composePair()
2288     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2289     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2290 
2291     private CodePointTrie canonIterData;
2292     private ArrayList<UnicodeSet> canonStartSets;  // indexed by the CANON_VALUE_MASK bits of a value with CANON_HAS_SET (see addToStartSet())
2293 
2294     // bits in canonIterData
2295     private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
2296     private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
2297     private static final int CANON_HAS_SET = 0x200000;  // when set, the value bits are an index into canonStartSets
2298     private static final int CANON_VALUE_MASK = 0x1fffff;  // a single origin code point, or the set index if CANON_HAS_SET
2299 }
2300