• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 * Copyright (C) 2010-2014, International Business Machines
7 * Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 * CollationIterator.java, ported from collationiterator.h/.cpp
10 *
11 * C++ version created on: 2010oct27
12 * created by: Markus W. Scherer
13 */
14 
15 package ohos.global.icu.impl.coll;
16 
17 import ohos.global.icu.impl.Normalizer2Impl.Hangul;
18 import ohos.global.icu.impl.Trie2_32;
19 import ohos.global.icu.util.BytesTrie;
20 import ohos.global.icu.util.CharsTrie;
21 import ohos.global.icu.util.ICUException;
22 
23 /**
24  * Collation element iterator and abstract character iterator.
25  *
26  * When a method returns a code point value, it must be in 0..10FFFF,
27  * except it can be negative as a sentinel value.
28  * @hide exposed on OHOS
29  */
30 public abstract class CollationIterator {
31     private static final class CEBuffer {
32         /** Large enough for CEs of most short strings. */
33         private static final int INITIAL_CAPACITY = 40;
34 
CEBuffer()35         CEBuffer() {}
36 
append(long ce)37         void append(long ce) {
38             if(length >= INITIAL_CAPACITY) {
39                 ensureAppendCapacity(1);
40             }
41             buffer[length++] = ce;
42         }
43 
appendUnsafe(long ce)44         void appendUnsafe(long ce) {
45             buffer[length++] = ce;
46         }
47 
ensureAppendCapacity(int appCap)48         void ensureAppendCapacity(int appCap) {
49             int capacity = buffer.length;
50             if((length + appCap) <= capacity) { return; }
51             do {
52                 if(capacity < 1000) {
53                     capacity *= 4;
54                 } else {
55                     capacity *= 2;
56                 }
57             } while(capacity < (length + appCap));
58             long[] newBuffer = new long[capacity];
59             System.arraycopy(buffer, 0, newBuffer, 0, length);
60             buffer = newBuffer;
61         }
62 
incLength()63         void incLength() {
64             // Use INITIAL_CAPACITY for a very simple fastpath.
65             // (Rather than buffer.getCapacity().)
66             if(length >= INITIAL_CAPACITY) {
67                 ensureAppendCapacity(1);
68             }
69             ++length;
70         }
71 
set(int i, long ce)72         long set(int i, long ce) {
73             return buffer[i] = ce;
74         }
get(int i)75         long get(int i) { return buffer[i]; }
76 
getCEs()77         long[] getCEs() { return buffer; }
78 
79         int length = 0;
80 
81         private long[] buffer = new long[INITIAL_CAPACITY];
82     }
83 
84     // State of combining marks skipped in discontiguous contraction.
85     // We create a state object on first use and keep it around deactivated between uses.
86     private static final class SkippedState {
87         // Born active but empty.
SkippedState()88         SkippedState() {}
clear()89         void clear() {
90             oldBuffer.setLength(0);
91             pos = 0;
92             // The newBuffer is reset by setFirstSkipped().
93         }
94 
isEmpty()95         boolean isEmpty() { return oldBuffer.length() == 0; }
96 
hasNext()97         boolean hasNext() { return pos < oldBuffer.length(); }
98 
99         // Requires hasNext().
next()100         int next() {
101             int c = oldBuffer.codePointAt(pos);
102             pos += Character.charCount(c);
103             return c;
104         }
105 
106         // Accounts for one more input code point read beyond the end of the marks buffer.
incBeyond()107         void incBeyond() {
108             assert(!hasNext());
109             ++pos;
110         }
111 
112         // Goes backward through the skipped-marks buffer.
113         // Returns the number of code points read beyond the skipped marks
114         // that need to be backtracked through normal input.
backwardNumCodePoints(int n)115         int backwardNumCodePoints(int n) {
116             int length = oldBuffer.length();
117             int beyond = pos - length;
118             if(beyond > 0) {
119                 if(beyond >= n) {
120                     // Not back far enough to re-enter the oldBuffer.
121                     pos -= n;
122                     return n;
123                 } else {
124                     // Back out all beyond-oldBuffer code points and re-enter the buffer.
125                     pos = oldBuffer.offsetByCodePoints(length, beyond - n);
126                     return beyond;
127                 }
128             } else {
129                 // Go backwards from inside the oldBuffer.
130                 pos = oldBuffer.offsetByCodePoints(pos, -n);
131                 return 0;
132             }
133         }
134 
setFirstSkipped(int c)135         void setFirstSkipped(int c) {
136             skipLengthAtMatch = 0;
137             newBuffer.setLength(0);
138             newBuffer.appendCodePoint(c);
139         }
140 
skip(int c)141         void skip(int c) {
142             newBuffer.appendCodePoint(c);
143         }
144 
recordMatch()145         void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
146 
147         // Replaces the characters we consumed with the newly skipped ones.
replaceMatch()148         void replaceMatch() {
149             // Note: UnicodeString.replace() pins pos to at most length().
150             int oldLength = oldBuffer.length();
151             if(pos > oldLength) { pos = oldLength; }
152             oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch);
153             pos = 0;
154         }
155 
saveTrieState(CharsTrie trie)156         void saveTrieState(CharsTrie trie) { trie.saveState(state); }
resetToTrieState(CharsTrie trie)157         void resetToTrieState(CharsTrie trie) { trie.resetToState(state); }
158 
159         // Combining marks skipped in previous discontiguous-contraction matching.
160         // After that discontiguous contraction was completed, we start reading them from here.
161         private final StringBuilder oldBuffer = new StringBuilder();
162         // Combining marks newly skipped in current discontiguous-contraction matching.
163         // These might have been read from the normal text or from the oldBuffer.
164         private final StringBuilder newBuffer = new StringBuilder();
165         // Reading index in oldBuffer,
166         // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
167         private int pos;
168         // newBuffer.length() at the time of the last matching character.
169         // When a partial match fails, we back out skipped and partial-matching input characters.
170         private int skipLengthAtMatch;
171         // We save the trie state before we attempt to match a character,
172         // so that we can skip it and try the next one.
173         private CharsTrie.State state = new CharsTrie.State();
174     };
175 
176     /**
177      * Partially constructs the iterator.
178      * In Java, we cache partially constructed iterators
179      * and finish their setup when starting to work on text
180      * (via reset(boolean) and the setText(numeric, ...) methods of subclasses).
181      * This avoids memory allocations for iterators that remain unused.
182      *
183      * <p>In C++, there is only one constructor, and iterators are
184      * stack-allocated as needed.
185      */
CollationIterator(CollationData d)186     public CollationIterator(CollationData d) {
187         trie = d.trie;
188         data = d;
189         numCpFwd = -1;
190         isNumeric = false;
191         ceBuffer = null;
192     }
193 
CollationIterator(CollationData d, boolean numeric)194     public CollationIterator(CollationData d, boolean numeric) {
195         trie = d.trie;
196         data = d;
197         numCpFwd = -1;
198         isNumeric = numeric;
199         ceBuffer = new CEBuffer();
200     }
201 
202     @Override
equals(Object other)203     public boolean equals(Object other) {
204         // Subclasses: Call this method and then add more specific checks.
205         // Compare the iterator state but not the collation data (trie & data fields):
206         // Assume that the caller compares the data.
207         // Ignore skipped since that should be unused between calls to nextCE().
208         // (It only stays around to avoid another memory allocation.)
209         if(other == null) { return false; }
210         if(!this.getClass().equals(other.getClass())) { return false; }
211         CollationIterator o = (CollationIterator)other;
212         if(!(ceBuffer.length == o.ceBuffer.length &&
213                 cesIndex == o.cesIndex &&
214                 numCpFwd == o.numCpFwd &&
215                 isNumeric == o.isNumeric)) {
216             return false;
217         }
218         for(int i = 0; i < ceBuffer.length; ++i) {
219             if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; }
220         }
221         return true;
222     }
223 
224     @Override
hashCode()225     public int hashCode() {
226         // Dummy return to prevent compile warnings.
227         return 0;
228     }
229 
230     /**
231      * Resets the iterator state and sets the position to the specified offset.
232      * Subclasses must implement, and must call the parent class method,
233      * or CollationIterator.reset().
234      */
resetToOffset(int newOffset)235     public abstract void resetToOffset(int newOffset);
236 
getOffset()237     public abstract int getOffset();
238 
239     /**
240      * Returns the next collation element.
241      */
nextCE()242     public final long nextCE() {
243         if(cesIndex < ceBuffer.length) {
244             // Return the next buffered CE.
245             return ceBuffer.get(cesIndex++);
246         }
247         assert cesIndex == ceBuffer.length;
248         ceBuffer.incLength();
249         long cAndCE32 = handleNextCE32();
250         int c = (int)(cAndCE32 >> 32);
251         int ce32 = (int)cAndCE32;
252         int t = ce32 & 0xff;
253         if(t < Collation.SPECIAL_CE32_LOW_BYTE) {  // Forced-inline of isSpecialCE32(ce32).
254             // Normal CE from the main data.
255             // Forced-inline of ceFromSimpleCE32(ce32).
256             return ceBuffer.set(cesIndex++,
257                     ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
258         }
259         CollationData d;
260         // The compiler should be able to optimize the previous and the following
261         // comparisons of t with the same constant.
262         if(t == Collation.SPECIAL_CE32_LOW_BYTE) {
263             if(c < 0) {
264                 return ceBuffer.set(cesIndex++, Collation.NO_CE);
265             }
266             d = data.base;
267             ce32 = d.getCE32(c);
268             t = ce32 & 0xff;
269             if(t < Collation.SPECIAL_CE32_LOW_BYTE) {
270                 // Normal CE from the base data.
271                 return ceBuffer.set(cesIndex++,
272                         ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
273             }
274         } else {
275             d = data;
276         }
277         if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) {
278             // Forced-inline of ceFromLongPrimaryCE32(ce32).
279             return ceBuffer.set(cesIndex++,
280                     ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE);
281         }
282         return nextCEFromCE32(d, c, ce32);
283     }
284 
285     /**
286      * Fetches all CEs.
287      * @return getCEsLength()
288      */
fetchCEs()289     public final int fetchCEs() {
290         while(nextCE() != Collation.NO_CE) {
291             // No need to loop for each expansion CE.
292             cesIndex = ceBuffer.length;
293         }
294         return ceBuffer.length;
295     }
296 
297     /**
298      * Overwrites the current CE (the last one returned by nextCE()).
299      */
setCurrentCE(long ce)300     final void setCurrentCE(long ce) {
301         assert cesIndex > 0;
302         ceBuffer.set(cesIndex - 1, ce);
303     }
304 
305     /**
306      * Returns the previous collation element.
307      */
previousCE(UVector32 offsets)308     public final long previousCE(UVector32 offsets) {
309         if(ceBuffer.length > 0) {
310             // Return the previous buffered CE.
311             return ceBuffer.get(--ceBuffer.length);
312         }
313         offsets.removeAllElements();
314         int limitOffset = getOffset();
315         int c = previousCodePoint();
316         if(c < 0) { return Collation.NO_CE; }
317         if(data.isUnsafeBackward(c, isNumeric)) {
318             return previousCEUnsafe(c, offsets);
319         }
320         // Simple, safe-backwards iteration:
321         // Get a CE going backwards, handle prefixes but no contractions.
322         int ce32 = data.getCE32(c);
323         CollationData d;
324         if(ce32 == Collation.FALLBACK_CE32) {
325             d = data.base;
326             ce32 = d.getCE32(c);
327         } else {
328             d = data;
329         }
330         if(Collation.isSimpleOrLongCE32(ce32)) {
331             return Collation.ceFromCE32(ce32);
332         }
333         appendCEsFromCE32(d, c, ce32, false);
334         if(ceBuffer.length > 1) {
335             offsets.addElement(getOffset());
336             // For an expansion, the offset of each non-initial CE is the limit offset,
337             // consistent with forward iteration.
338             while(offsets.size() <= ceBuffer.length) {
339                 offsets.addElement(limitOffset);
340             };
341         }
342         return ceBuffer.get(--ceBuffer.length);
343     }
344 
getCEsLength()345     public final int getCEsLength() {
346         return ceBuffer.length;
347     }
348 
getCE(int i)349     public final long getCE(int i) {
350         return ceBuffer.get(i);
351     }
352 
getCEs()353     public final long[] getCEs() {
354         return ceBuffer.getCEs();
355     }
356 
clearCEs()357     final void clearCEs() {
358         cesIndex = ceBuffer.length = 0;
359     }
360 
clearCEsIfNoneRemaining()361     public final void clearCEsIfNoneRemaining() {
362         if(cesIndex == ceBuffer.length) { clearCEs(); }
363     }
364 
365     /**
366      * Returns the next code point (with post-increment).
367      * Public for identical-level comparison and for testing.
368      */
nextCodePoint()369     public abstract int nextCodePoint();
370 
371     /**
372      * Returns the previous code point (with pre-decrement).
373      * Public for identical-level comparison and for testing.
374      */
previousCodePoint()375     public abstract int previousCodePoint();
376 
reset()377     protected final void reset() {
378         cesIndex = ceBuffer.length = 0;
379         if(skipped != null) { skipped.clear(); }
380     }
381     /**
382      * Resets the state as well as the numeric setting,
383      * and completes the initialization.
384      * Only exists in Java where we reset cached CollationIterator instances
385      * rather than stack-allocating temporary ones.
386      * (See also the constructor comments.)
387      */
reset(boolean numeric)388     protected final void reset(boolean numeric) {
389         if(ceBuffer == null) {
390             ceBuffer = new CEBuffer();
391         }
392         reset();
393         isNumeric = numeric;
394     }
395 
396     /**
397      * Returns the next code point and its local CE32 value.
398      * Returns Collation.FALLBACK_CE32 at the end of the text (c<0)
399      * or when c's CE32 value is to be looked up in the base data (fallback).
400      *
401      * The code point is used for fallbacks, context and implicit weights.
402      * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
403      *
404      * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0.
405      */
handleNextCE32()406     protected long handleNextCE32() {
407         int c = nextCodePoint();
408         if(c < 0) { return NO_CP_AND_CE32; }
409         return makeCodePointAndCE32Pair(c, data.getCE32(c));
410     }
makeCodePointAndCE32Pair(int c, int ce32)411     protected long makeCodePointAndCE32Pair(int c, int ce32) {
412         return ((long)c << 32) | (ce32 & 0xffffffffL);
413     }
414     protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL);
415 
416     /**
417      * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
418      * Returns the trail surrogate in that case and advances past it,
419      * if a trail surrogate follows the lead surrogate.
420      * Otherwise returns any other code unit and does not advance.
421      */
handleGetTrailSurrogate()422     protected char handleGetTrailSurrogate() {
423         return 0;
424     }
425 
426     /**
427      * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
428      * (Not needed in Java.)
429      */
430     /*protected boolean foundNULTerminator() {
431         return false;
432     }*/
433 
434     /**
435      * @return false if surrogate code points U+D800..U+DFFF
436      *         map to their own implicit primary weights (for UTF-16),
437      *         or true if they map to CE(U+FFFD) (for UTF-8)
438      */
forbidSurrogateCodePoints()439     protected boolean forbidSurrogateCodePoints() {
440         return false;
441     }
442 
forwardNumCodePoints(int num)443     protected abstract void forwardNumCodePoints(int num);
444 
backwardNumCodePoints(int num)445     protected abstract void backwardNumCodePoints(int num);
446 
447     /**
448      * Returns the CE32 from the data trie.
449      * Normally the same as data.getCE32(), but overridden in the builder.
450      * Call this only when the faster data.getCE32() cannot be used.
451      */
getDataCE32(int c)452     protected int getDataCE32(int c) {
453         return data.getCE32(c);
454     }
455 
getCE32FromBuilderData(int ce32)456     protected int getCE32FromBuilderData(int ce32) {
457         throw new ICUException("internal program error: should be unreachable");
458     }
459 
appendCEsFromCE32(CollationData d, int c, int ce32, boolean forward)460     protected final void appendCEsFromCE32(CollationData d, int c, int ce32,
461                            boolean forward) {
462         while(Collation.isSpecialCE32(ce32)) {
463             switch(Collation.tagFromCE32(ce32)) {
464             case Collation.FALLBACK_TAG:
465             case Collation.RESERVED_TAG_3:
466                 throw new ICUException("internal program error: should be unreachable");
467             case Collation.LONG_PRIMARY_TAG:
468                 ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32));
469                 return;
470             case Collation.LONG_SECONDARY_TAG:
471                 ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32));
472                 return;
473             case Collation.LATIN_EXPANSION_TAG:
474                 ceBuffer.ensureAppendCapacity(2);
475                 ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32));
476                 ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32));
477                 ceBuffer.length += 2;
478                 return;
479             case Collation.EXPANSION32_TAG: {
480                 int index = Collation.indexFromCE32(ce32);
481                 int length = Collation.lengthFromCE32(ce32);
482                 ceBuffer.ensureAppendCapacity(length);
483                 do {
484                     ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++]));
485                 } while(--length > 0);
486                 return;
487             }
488             case Collation.EXPANSION_TAG: {
489                 int index = Collation.indexFromCE32(ce32);
490                 int length = Collation.lengthFromCE32(ce32);
491                 ceBuffer.ensureAppendCapacity(length);
492                 do {
493                     ceBuffer.appendUnsafe(d.ces[index++]);
494                 } while(--length > 0);
495                 return;
496             }
497             case Collation.BUILDER_DATA_TAG:
498                 ce32 = getCE32FromBuilderData(ce32);
499                 if(ce32 == Collation.FALLBACK_CE32) {
500                     d = data.base;
501                     ce32 = d.getCE32(c);
502                 }
503                 break;
504             case Collation.PREFIX_TAG:
505                 if(forward) { backwardNumCodePoints(1); }
506                 ce32 = getCE32FromPrefix(d, ce32);
507                 if(forward) { forwardNumCodePoints(1); }
508                 break;
509             case Collation.CONTRACTION_TAG: {
510                 int index = Collation.indexFromCE32(ce32);
511                 int defaultCE32 = d.getCE32FromContexts(index);  // Default if no suffix match.
512                 if(!forward) {
513                     // Backward contractions are handled by previousCEUnsafe().
514                     // c has contractions but they were not found.
515                     ce32 = defaultCE32;
516                     break;
517                 }
518                 int nextCp;
519                 if(skipped == null && numCpFwd < 0) {
520                     // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
521                     // avoiding the function call and the nextSkippedCodePoint() overhead.
522                     nextCp = nextCodePoint();
523                     if(nextCp < 0) {
524                         // No more text.
525                         ce32 = defaultCE32;
526                         break;
527                     } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
528                             !CollationFCD.mayHaveLccc(nextCp)) {
529                         // All contraction suffixes start with characters with lccc!=0
530                         // but the next code point has lccc==0.
531                         backwardNumCodePoints(1);
532                         ce32 = defaultCE32;
533                         break;
534                     }
535                 } else {
536                     nextCp = nextSkippedCodePoint();
537                     if(nextCp < 0) {
538                         // No more text.
539                         ce32 = defaultCE32;
540                         break;
541                     } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
542                             !CollationFCD.mayHaveLccc(nextCp)) {
543                         // All contraction suffixes start with characters with lccc!=0
544                         // but the next code point has lccc==0.
545                         backwardNumSkipped(1);
546                         ce32 = defaultCE32;
547                         break;
548                     }
549                 }
550                 ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp);
551                 if(ce32 == Collation.NO_CE32) {
552                     // CEs from a discontiguous contraction plus the skipped combining marks
553                     // have been appended already.
554                     return;
555                 }
556                 break;
557             }
558             case Collation.DIGIT_TAG:
559                 if(isNumeric) {
560                     appendNumericCEs(ce32, forward);
561                     return;
562                 } else {
563                     // Fetch the non-numeric-collation CE32 and continue.
564                     ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
565                     break;
566                 }
567             case Collation.U0000_TAG:
568                 assert(c == 0);
569                 // NUL-terminated input not supported in Java.
570                 // Fetch the normal ce32 for U+0000 and continue.
571                 ce32 = d.ce32s[0];
572                 break;
573             case Collation.HANGUL_TAG: {
574                 int[] jamoCE32s = d.jamoCE32s;
575                 c -= Hangul.HANGUL_BASE;
576                 int t = c % Hangul.JAMO_T_COUNT;
577                 c /= Hangul.JAMO_T_COUNT;
578                 int v = c % Hangul.JAMO_V_COUNT;
579                 c /= Hangul.JAMO_V_COUNT;
580                 if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) {
581                     // None of the Jamo CE32s are isSpecialCE32().
582                     // Avoid recursive function calls and per-Jamo tests.
583                     ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3);
584                     ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c]));
585                     ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v]));
586                     ceBuffer.length += 2;
587                     if(t != 0) {
588                         ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t]));
589                     }
590                     return;
591                 } else {
592                     // We should not need to compute each Jamo code point.
593                     // In particular, there should be no offset or implicit ce32.
594                     appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward);
595                     appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward);
596                     if(t == 0) { return; }
597                     // offset 39 = 19 + 21 - 1:
598                     // 19 = JAMO_L_COUNT
599                     // 21 = JAMO_T_COUNT
600                     // -1 = omit t==0
601                     ce32 = jamoCE32s[39 + t];
602                     c = Collation.SENTINEL_CP;
603                     break;
604                 }
605             }
606             case Collation.LEAD_SURROGATE_TAG: {
607                 assert(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
608                 assert(isLeadSurrogate(c));
609                 char trail;
610                 if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) {
611                     c = Character.toCodePoint((char)c, trail);
612                     ce32 &= Collation.LEAD_TYPE_MASK;
613                     if(ce32 == Collation.LEAD_ALL_UNASSIGNED) {
614                         ce32 = Collation.UNASSIGNED_CE32;  // unassigned-implicit
615                     } else if(ce32 == Collation.LEAD_ALL_FALLBACK ||
616                             (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) {
617                         // fall back to the base data
618                         d = d.base;
619                         ce32 = d.getCE32FromSupplementary(c);
620                     }
621                 } else {
622                     // c is an unpaired surrogate.
623                     ce32 = Collation.UNASSIGNED_CE32;
624                 }
625                 break;
626             }
627             case Collation.OFFSET_TAG:
628                 assert(c >= 0);
629                 ceBuffer.append(d.getCEFromOffsetCE32(c, ce32));
630                 return;
631             case Collation.IMPLICIT_TAG:
632                 assert(c >= 0);
633                 if(isSurrogate(c) && forbidSurrogateCodePoints()) {
634                     ce32 = Collation.FFFD_CE32;
635                     break;
636                 } else {
637                     ceBuffer.append(Collation.unassignedCEFromCodePoint(c));
638                     return;
639                 }
640             }
641         }
642         ceBuffer.append(Collation.ceFromSimpleCE32(ce32));
643     }
644 
645     // TODO: Propose widening the UTF16 method.
isSurrogate(int c)646     private static final boolean isSurrogate(int c) {
647         return (c & 0xfffff800) == 0xd800;
648     }
649 
650     // TODO: Propose widening the UTF16 method.
isLeadSurrogate(int c)651     protected static final boolean isLeadSurrogate(int c) {
652         return (c & 0xfffffc00) == 0xd800;
653     }
654 
655     // TODO: Propose widening the UTF16 method.
isTrailSurrogate(int c)656     protected static final boolean isTrailSurrogate(int c) {
657         return (c & 0xfffffc00) == 0xdc00;
658     }
659 
660     // Main lookup trie of the data object.
661     protected final Trie2_32 trie;
662     protected final CollationData data;
663 
nextCEFromCE32(CollationData d, int c, int ce32)664     private final long nextCEFromCE32(CollationData d, int c, int ce32) {
665         --ceBuffer.length;  // Undo ceBuffer.incLength().
666         appendCEsFromCE32(d, c, ce32, true);
667         return ceBuffer.get(cesIndex++);
668     }
669 
getCE32FromPrefix(CollationData d, int ce32)670     private final int getCE32FromPrefix(CollationData d, int ce32) {
671         int index = Collation.indexFromCE32(ce32);
672         ce32 = d.getCE32FromContexts(index);  // Default if no prefix match.
673         index += 2;
674         // Number of code points read before the original code point.
675         int lookBehind = 0;
676         CharsTrie prefixes = new CharsTrie(d.contexts, index);
677         for(;;) {
678             int c = previousCodePoint();
679             if(c < 0) { break; }
680             ++lookBehind;
681             BytesTrie.Result match = prefixes.nextForCodePoint(c);
682             if(match.hasValue()) {
683                 ce32 = prefixes.getValue();
684             }
685             if(!match.hasNext()) { break; }
686         }
687         forwardNumCodePoints(lookBehind);
688         return ce32;
689     }
690 
nextSkippedCodePoint()691     private final int nextSkippedCodePoint() {
692         if(skipped != null && skipped.hasNext()) { return skipped.next(); }
693         if(numCpFwd == 0) { return Collation.SENTINEL_CP; }
694         int c = nextCodePoint();
695         if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); }
696         if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
697         return c;
698     }
699 
backwardNumSkipped(int n)700     private final void backwardNumSkipped(int n) {
701         if(skipped != null && !skipped.isEmpty()) {
702             n = skipped.backwardNumCodePoints(n);
703         }
704         backwardNumCodePoints(n);
705         if(numCpFwd >= 0) { numCpFwd += n; }
706     }
707 
nextCE32FromContraction( CollationData d, int contractionCE32, CharSequence trieChars, int trieOffset, int ce32, int c)708     private final int nextCE32FromContraction(
709             CollationData d, int contractionCE32,
710             CharSequence trieChars, int trieOffset, int ce32, int c) {
711         // c: next code point after the original one
712 
713         // Number of code points read beyond the original code point.
714         // Needed for discontiguous contraction matching.
715         int lookAhead = 1;
716         // Number of code points read since the last match (initially only c).
717         int sinceMatch = 1;
718         // Normally we only need a contiguous match,
719         // and therefore need not remember the suffixes state from before a mismatch for retrying.
720         // If we are already processing skipped combining marks, then we do track the state.
721         CharsTrie suffixes = new CharsTrie(trieChars, trieOffset);
722         if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
723         BytesTrie.Result match = suffixes.firstForCodePoint(c);
724         for(;;) {
725             int nextCp;
726             if(match.hasValue()) {
727                 ce32 = suffixes.getValue();
728                 if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) {
729                     return ce32;
730                 }
731                 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
732                 sinceMatch = 1;
733             } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) {
734                 // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text.
735                 // Back up if necessary, and try a discontiguous contraction.
736                 if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 &&
737                         // Discontiguous contraction matching extends an existing match.
738                         // If there is no match yet, then there is nothing to do.
739                         ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
740                             sinceMatch < lookAhead)) {
741                     // The last character of at least one suffix has lccc!=0,
742                     // allowing for discontiguous contractions.
743                     // UCA S2.1.1 only processes non-starters immediately following
744                     // "a match in the table" (sinceMatch=1).
745                     if(sinceMatch > 1) {
746                         // Return to the state after the last match.
747                         // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
748                         backwardNumSkipped(sinceMatch);
749                         c = nextSkippedCodePoint();
750                         lookAhead -= sinceMatch - 1;
751                         sinceMatch = 1;
752                     }
753                     if(d.getFCD16(c) > 0xff) {
754                         return nextCE32FromDiscontiguousContraction(
755                             d, suffixes, ce32, lookAhead, c);
756                     }
757                 }
758                 break;
759             } else {
760                 // Continue after partial match (BytesTrie.Result.NO_VALUE) for c.
761                 // It does not have a result value, therefore it is not itself "a match in the table".
762                 // If a partially-matched c has ccc!=0 then
763                 // it might be skipped in discontiguous contraction.
764                 c = nextCp;
765                 ++sinceMatch;
766             }
767             ++lookAhead;
768             match = suffixes.nextForCodePoint(c);
769         }
770         backwardNumSkipped(sinceMatch);
771         return ce32;
772     }
773 
nextCE32FromDiscontiguousContraction( CollationData d, CharsTrie suffixes, int ce32, int lookAhead, int c)774     private final int nextCE32FromDiscontiguousContraction(
775             CollationData d, CharsTrie suffixes, int ce32,
776             int lookAhead, int c) {
777         // UCA section 3.3.2 Contractions:
778         // Contractions that end with non-starter characters
779         // are known as discontiguous contractions.
780         // ... discontiguous contractions must be detected in input text
781         // whenever the final sequence of non-starter characters could be rearranged
782         // so as to make a contiguous matching sequence that is canonically equivalent.
783 
784         // UCA: http://www.unicode.org/reports/tr10/#S2.1
785         // S2.1 Find the longest initial substring S at each point that has a match in the table.
786         // S2.1.1 If there are any non-starters following S, process each non-starter C.
787         // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
788         //     Note: A non-starter in a string is called blocked
789         //     if there is another non-starter of the same canonical combining class or zero
790         //     between it and the last character of canonical combining class 0.
791         // S2.1.3 If there is a match, replace S by S + C, and remove C.
792 
793         // First: Is a discontiguous contraction even possible?
794         int fcd16 = d.getFCD16(c);
795         assert(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
796         int nextCp = nextSkippedCodePoint();
797         if(nextCp < 0) {
798             // No further text.
799             backwardNumSkipped(1);
800             return ce32;
801         }
802         ++lookAhead;
803         int prevCC = fcd16 & 0xff;
804         fcd16 = d.getFCD16(nextCp);
805         if(fcd16 <= 0xff) {
806             // The next code point after c is a starter (S2.1.1 "process each non-starter").
807             backwardNumSkipped(2);
808             return ce32;
809         }
810 
811         // We have read and matched (lookAhead-2) code points,
812         // read non-matching c and peeked ahead at nextCp.
813         // Return to the state before the mismatch and continue matching with nextCp.
814         if(skipped == null || skipped.isEmpty()) {
815             if(skipped == null) {
816                 skipped = new SkippedState();
817             }
818             suffixes.reset();
819             if(lookAhead > 2) {
820                 // Replay the partial match so far.
821                 backwardNumCodePoints(lookAhead);
822                 suffixes.firstForCodePoint(nextCodePoint());
823                 for(int i = 3; i < lookAhead; ++i) {
824                     suffixes.nextForCodePoint(nextCodePoint());
825                 }
826                 // Skip c (which did not match) and nextCp (which we will try now).
827                 forwardNumCodePoints(2);
828             }
829             skipped.saveTrieState(suffixes);
830         } else {
831             // Reset to the trie state before the failed match of c.
832             skipped.resetToTrieState(suffixes);
833         }
834 
835         skipped.setFirstSkipped(c);
836         // Number of code points read since the last match (at this point: c and nextCp).
837         int sinceMatch = 2;
838         c = nextCp;
839         for(;;) {
840             BytesTrie.Result match;
841             // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
842             if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) {
843                 // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
844                 // Keep prevCC unchanged.
845                 ce32 = suffixes.getValue();
846                 sinceMatch = 0;
847                 skipped.recordMatch();
848                 if(!match.hasNext()) { break; }
849                 skipped.saveTrieState(suffixes);
850             } else {
851                 // No match for "S + C", skip C.
852                 skipped.skip(c);
853                 skipped.resetToTrieState(suffixes);
854                 prevCC = fcd16 & 0xff;
855             }
856             if((c = nextSkippedCodePoint()) < 0) { break; }
857             ++sinceMatch;
858             fcd16 = d.getFCD16(c);
859             if(fcd16 <= 0xff) {
860                 // The next code point after c is a starter (S2.1.1 "process each non-starter").
861                 break;
862             }
863         }
864         backwardNumSkipped(sinceMatch);
865         boolean isTopDiscontiguous = skipped.isEmpty();
866         skipped.replaceMatch();
867         if(isTopDiscontiguous && !skipped.isEmpty()) {
868             // We did get a match after skipping one or more combining marks,
869             // and we are not in a recursive discontiguous contraction.
870             // Append CEs from the contraction ce32
871             // and then from the combining marks that we skipped before the match.
872             c = Collation.SENTINEL_CP;
873             for(;;) {
874                 appendCEsFromCE32(d, c, ce32, true);
875                 // Fetch CE32s for skipped combining marks from the normal data, with fallback,
876                 // rather than from the CollationData where we found the contraction.
877                 if(!skipped.hasNext()) { break; }
878                 c = skipped.next();
879                 ce32 = getDataCE32(c);
880                 if(ce32 == Collation.FALLBACK_CE32) {
881                     d = data.base;
882                     ce32 = d.getCE32(c);
883                 } else {
884                     d = data;
885                 }
886                 // Note: A nested discontiguous-contraction match
887                 // replaces consumed combining marks with newly skipped ones
888                 // and resets the reading position to the beginning.
889             }
890             skipped.clear();
891             ce32 = Collation.NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
892         }
893         return ce32;
894     }
895 
896     /**
897      * Returns the previous CE when data.isUnsafeBackward(c, isNumeric).
898      */
previousCEUnsafe(int c, UVector32 offsets)899     private final long previousCEUnsafe(int c, UVector32 offsets) {
900         // We just move through the input counting safe and unsafe code points
901         // without collecting the unsafe-backward substring into a buffer and
902         // switching to it.
903         // This is to keep the logic simple. Otherwise we would have to handle
904         // prefix matching going before the backward buffer, switching
905         // to iteration and back, etc.
906         // In the most important case of iterating over a normal string,
907         // reading from the string itself is already maximally fast.
908         // The only drawback there is that after getting the CEs we always
909         // skip backward to the safe character rather than switching out
910         // of a backwardBuffer.
911         // But this should not be the common case for previousCE(),
912         // and correctness and maintainability are more important than
913         // complex optimizations.
914         // Find the first safe character before c.
915         int numBackward = 1;
916         while((c = previousCodePoint()) >= 0) {
917             ++numBackward;
918             if(!data.isUnsafeBackward(c, isNumeric)) {
919                 break;
920             }
921         }
922         // Set the forward iteration limit.
923         // Note: This counts code points.
924         // We cannot enforce a limit in the middle of a surrogate pair or similar.
925         numCpFwd = numBackward;
926         // Reset the forward iterator.
927         cesIndex = 0;
928         assert(ceBuffer.length == 0);
929         // Go forward and collect the CEs.
930         int offset = getOffset();
931         while(numCpFwd > 0) {
932             // nextCE() normally reads one code point.
933             // Contraction matching and digit specials read more and check numCpFwd.
934             --numCpFwd;
935             // Append one or more CEs to the ceBuffer.
936             nextCE();
937             assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE);
938             // No need to loop for getting each expansion CE from nextCE().
939             cesIndex = ceBuffer.length;
940             // However, we need to write an offset for each CE.
941             // This is for CollationElementIterator.getOffset() to return
942             // intermediate offsets from the unsafe-backwards segment.
943             assert(offsets.size() < ceBuffer.length);
944             offsets.addElement(offset);
945             // For an expansion, the offset of each non-initial CE is the limit offset,
946             // consistent with forward iteration.
947             offset = getOffset();
948             while(offsets.size() < ceBuffer.length) {
949                 offsets.addElement(offset);
950             };
951         }
952         assert(offsets.size() == ceBuffer.length);
953         // End offset corresponding to just after the unsafe-backwards segment.
954         offsets.addElement(offset);
955         // Reset the forward iteration limit
956         // and move backward to before the segment for which we fetched CEs.
957         numCpFwd = -1;
958         backwardNumCodePoints(numBackward);
959         // Use the collected CEs and return the last one.
960         cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
961         return ceBuffer.get(--ceBuffer.length);
962     }
963 
964     /**
965      * Turns a string of digits (bytes 0..9)
966      * into a sequence of CEs that will sort in numeric order.
967      *
968      * Starts from this ce32's digit value and consumes the following/preceding digits.
969      * The digits string must not be empty and must not have leading zeros.
970      */
971     private final void appendNumericCEs(int ce32, boolean forward) {
972         // Collect digits.
973         // TODO: Use some kind of a byte buffer? We only store values 0..9.
974         StringBuilder digits = new StringBuilder();
975         if(forward) {
976             for(;;) {
977                 char digit = Collation.digitFromCE32(ce32);
978                 digits.append(digit);
979                 if(numCpFwd == 0) { break; }
980                 int c = nextCodePoint();
981                 if(c < 0) { break; }
982                 ce32 = data.getCE32(c);
983                 if(ce32 == Collation.FALLBACK_CE32) {
984                     ce32 = data.base.getCE32(c);
985                 }
986                 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
987                     backwardNumCodePoints(1);
988                     break;
989                 }
990                 if(numCpFwd > 0) { --numCpFwd; }
991             }
992         } else {
993             for(;;) {
994                 char digit = Collation.digitFromCE32(ce32);
995                 digits.append(digit);
996                 int c = previousCodePoint();
997                 if(c < 0) { break; }
998                 ce32 = data.getCE32(c);
999                 if(ce32 == Collation.FALLBACK_CE32) {
1000                     ce32 = data.base.getCE32(c);
1001                 }
1002                 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
1003                     forwardNumCodePoints(1);
1004                     break;
1005                 }
1006             }
1007             // Reverse the digit string.
1008             digits.reverse();
1009         }
1010         int pos = 0;
1011         do {
1012             // Skip leading zeros.
1013             while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; }
1014             // Write a sequence of CEs for at most 254 digits at a time.
1015             int segmentLength = digits.length() - pos;
1016             if(segmentLength > 254) { segmentLength = 254; }
1017             appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength));
1018             pos += segmentLength;
1019         } while(pos < digits.length());
1020     }
1021 
1022     /**
1023      * Turns 1..254 digits into a sequence of CEs.
1024      * Called by appendNumericCEs() for each segment of at most 254 digits.
1025      */
1026     private final void appendNumericSegmentCEs(CharSequence digits) {
1027         int length = digits.length();
1028         assert(1 <= length && length <= 254);
1029         assert(length == 1 || digits.charAt(0) != 0);
1030         long numericPrimary = data.numericPrimary;
1031         // Note: We use primary byte values 2..255: digits are not compressible.
1032         if(length <= 7) {
1033             // Very dense encoding for small numbers.
1034             int value = digits.charAt(0);
1035             for(int i = 1; i < length; ++i) {
1036                 value = value * 10 + digits.charAt(i);
1037             }
1038             // Primary weight second byte values:
1039             //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
1040             //     40 byte values  76..115 for medium numbers in three-byte primary weights.
1041             //     16 byte values 116..131 for large numbers in four-byte primary weights.
1042             //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
1043             int firstByte = 2;
1044             int numBytes = 74;
1045             if(value < numBytes) {
1046                 // Two-byte primary for 0..73, good for day & month numbers etc.
1047                 long primary = numericPrimary | ((firstByte + value) << 16);
1048                 ceBuffer.append(Collation.makeCE(primary));
1049                 return;
1050             }
1051             value -= numBytes;
1052             firstByte += numBytes;
1053             numBytes = 40;
1054             if(value < numBytes * 254) {
1055                 // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
1056                 long primary = numericPrimary |
1057                     ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
1058                 ceBuffer.append(Collation.makeCE(primary));
1059                 return;
1060             }
1061             value -= numBytes * 254;
1062             firstByte += numBytes;
1063             numBytes = 16;
1064             if(value < numBytes * 254 * 254) {
1065                 // Four-byte primary for 10234..1042489=10234+16*254*254-1.
1066                 long primary = numericPrimary | (2 + value % 254);
1067                 value /= 254;
1068                 primary |= (2 + value % 254) << 8;
1069                 value /= 254;
1070                 primary |= (firstByte + value % 254) << 16;
1071                 ceBuffer.append(Collation.makeCE(primary));
1072                 return;
1073             }
1074             // original value > 1042489
1075         }
1076         assert(length >= 7);
1077 
1078         // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
1079         // then we generate primary bytes with those pairs.
1080         // Omit trailing 00 pairs.
1081         // Decrement the value for the last pair.
1082 
1083         // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255.
1084         int numPairs = (length + 1) / 2;
1085         long primary = numericPrimary | ((132 - 4 + numPairs) << 16);
1086         // Find the length without trailing 00 pairs.
1087         while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) {
1088             length -= 2;
1089         }
1090         // Read the first pair.
1091         int pair;
1092         int pos;
1093         if((length & 1) != 0) {
1094             // Only "half a pair" if we have an odd number of digits.
1095             pair = digits.charAt(0);
1096             pos = 1;
1097         } else {
1098             pair = digits.charAt(0) * 10 + digits.charAt(1);
1099             pos = 2;
1100         }
1101         pair = 11 + 2 * pair;
1102         // Add the pairs of digits between pos and length.
1103         int shift = 8;
1104         while(pos < length) {
1105             if(shift == 0) {
1106                 // Every three pairs/bytes we need to store a 4-byte-primary CE
1107                 // and start with a new CE with the '0' primary lead byte.
1108                 primary |= pair;
1109                 ceBuffer.append(Collation.makeCE(primary));
1110                 primary = numericPrimary;
1111                 shift = 16;
1112             } else {
1113                 primary |= pair << shift;
1114                 shift -= 8;
1115             }
1116             pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1));
1117             pos += 2;
1118         }
1119         primary |= (pair - 1) << shift;
1120         ceBuffer.append(Collation.makeCE(primary));
1121     }
1122 
1123     private CEBuffer ceBuffer;
1124     private int cesIndex;
1125 
1126     private SkippedState skipped;
1127 
1128     // Number of code points to read forward, or -1.
1129     // Used as a forward iteration limit in previousCEUnsafe().
1130     private int numCpFwd;
1131     // Numeric collation (CollationSettings.NUMERIC).
1132     private boolean isNumeric;
1133 }
1134