• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationdatabuilder.cpp
7 *
8 * (replaced the former ucol_elm.cpp)
9 *
10 * created on: 2012apr01
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/localpointer.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ucharstrie.h"
21 #include "unicode/ucharstriebuilder.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/usetiter.h"
25 #include "unicode/utf16.h"
26 #include "cmemory.h"
27 #include "collation.h"
28 #include "collationdata.h"
29 #include "collationdatabuilder.h"
30 #include "collationfastlatinbuilder.h"
31 #include "collationiterator.h"
32 #include "normalizer2impl.h"
33 #include "utrie2.h"
34 #include "uvectr32.h"
35 #include "uvectr64.h"
36 #include "uvector.h"
37 
38 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
39 
40 U_NAMESPACE_BEGIN
41 
~CEModifier()42 CollationDataBuilder::CEModifier::~CEModifier() {}
43 
44 /**
45  * Build-time context and CE32 for a code point.
46  * If a code point has contextual mappings, then the default (no-context) mapping
47  * and all conditional mappings are stored in a singly-linked list
48  * of ConditionalCE32, sorted by context strings.
49  *
50  * Context strings sort by prefix length, then by prefix, then by contraction suffix.
51  * Context strings must be unique and in ascending order.
52  */
53 struct ConditionalCE32 : public UMemory {
ConditionalCE32ConditionalCE3254     ConditionalCE32(const UnicodeString &ct, uint32_t ce)
55             : context(ct),
56               ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
57               next(-1) {}
58 
hasContextConditionalCE3259     inline UBool hasContext() const { return context.length() > 1; }
prefixLengthConditionalCE3260     inline int32_t prefixLength() const { return context.charAt(0); }
61 
62     /**
63      * "\0" for the first entry for any code point, with its default CE32.
64      *
65      * Otherwise one unit with the length of the prefix string,
66      * then the prefix string, then the contraction suffix.
67      */
68     UnicodeString context;
69     /**
70      * CE32 for the code point and its context.
71      * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
72      */
73     uint32_t ce32;
74     /**
75      * Default CE32 for all contexts with this same prefix.
76      * Initially NO_CE32. Set only while building runtime data structures,
77      * and only on one of the nodes of a sub-list with the same prefix.
78      */
79     uint32_t defaultCE32;
80     /**
81      * CE32 for the built contexts.
82      * When fetching CEs from the builder, the contexts are built into their runtime form
83      * so that the normal collation implementation can process them.
84      * The result is cached in the list head. It is reset when the contexts are modified.
85      */
86     uint32_t builtCE32;
87     /**
88      * Index of the next ConditionalCE32.
89      * Negative for the end of the list.
90      */
91     int32_t next;
92 };
93 
94 U_CDECL_BEGIN
95 
96 U_CAPI void U_CALLCONV
uprv_deleteConditionalCE32(void * obj)97 uprv_deleteConditionalCE32(void *obj) {
98     delete static_cast<ConditionalCE32 *>(obj);
99 }
100 
101 U_CDECL_END
102 
103 /**
104  * Build-time collation element and character iterator.
105  * Uses the runtime CollationIterator for fetching CEs for a string
106  * but reads from the builder's unfinished data structures.
107  * In particular, this class reads from the unfinished trie
108  * and has to avoid CollationIterator::nextCE() and redirect other
109  * calls to data->getCE32() and data->getCE32FromSupplementary().
110  *
111  * We do this so that we need not implement the collation algorithm
112  * again for the builder and make it behave exactly like the runtime code.
113  * That would be more difficult to test and maintain than this indirection.
114  *
115  * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
116  * so the data accesses from those code paths need not be modified.
117  *
118  * This class iterates directly over whole code points
119  * so that the CollationIterator does not need the finished trie
120  * for handling the LEAD_SURROGATE_TAG.
121  */
122 class DataBuilderCollationIterator : public CollationIterator {
123 public:
124     DataBuilderCollationIterator(CollationDataBuilder &b);
125 
126     virtual ~DataBuilderCollationIterator();
127 
128     int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength);
129 
130     virtual void resetToOffset(int32_t newOffset);
131     virtual int32_t getOffset() const;
132 
133     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
134     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
135 
136 protected:
137     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
138     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
139 
140     virtual uint32_t getDataCE32(UChar32 c) const;
141     virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode);
142 
143     CollationDataBuilder &builder;
144     CollationData builderData;
145     uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
146     const UnicodeString *s;
147     int32_t pos;
148 };
149 
DataBuilderCollationIterator(CollationDataBuilder & b)150 DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b)
151         : CollationIterator(&builderData, /*numeric=*/ FALSE),
152           builder(b), builderData(b.nfcImpl),
153           s(NULL), pos(0) {
154     builderData.base = builder.base;
155     // Set all of the jamoCE32s[] to indirection CE32s.
156     for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
157         UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j);
158         jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) |
159                 CollationDataBuilder::IS_BUILDER_JAMO_CE32;
160     }
161     builderData.jamoCE32s = jamoCE32s;
162 }
163 
~DataBuilderCollationIterator()164 DataBuilderCollationIterator::~DataBuilderCollationIterator() {}
165 
166 int32_t
fetchCEs(const UnicodeString & str,int32_t start,int64_t ces[],int32_t cesLength)167 DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start,
168                                        int64_t ces[], int32_t cesLength) {
169     // Set the pointers each time, in case they changed due to reallocation.
170     builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer());
171     builderData.ces = builder.ce64s.getBuffer();
172     builderData.contexts = builder.contexts.getBuffer();
173     // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
174     reset();
175     s = &str;
176     pos = start;
177     UErrorCode errorCode = U_ZERO_ERROR;
178     while(U_SUCCESS(errorCode) && pos < s->length()) {
179         // No need to keep all CEs in the iterator buffer.
180         clearCEs();
181         UChar32 c = s->char32At(pos);
182         pos += U16_LENGTH(c);
183         uint32_t ce32 = utrie2_get32(builder.trie, c);
184         const CollationData *d;
185         if(ce32 == Collation::FALLBACK_CE32) {
186             d = builder.base;
187             ce32 = builder.base->getCE32(c);
188         } else {
189             d = &builderData;
190         }
191         appendCEsFromCE32(d, c, ce32, /*forward=*/ TRUE, errorCode);
192         U_ASSERT(U_SUCCESS(errorCode));
193         for(int32_t i = 0; i < getCEsLength(); ++i) {
194             int64_t ce = getCE(i);
195             if(ce != 0) {
196                 if(cesLength < Collation::MAX_EXPANSION_LENGTH) {
197                     ces[cesLength] = ce;
198                 }
199                 ++cesLength;
200             }
201         }
202     }
203     return cesLength;
204 }
205 
206 void
resetToOffset(int32_t newOffset)207 DataBuilderCollationIterator::resetToOffset(int32_t newOffset) {
208     reset();
209     pos = newOffset;
210 }
211 
212 int32_t
getOffset() const213 DataBuilderCollationIterator::getOffset() const {
214     return pos;
215 }
216 
217 UChar32
nextCodePoint(UErrorCode &)218 DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
219     if(pos == s->length()) {
220         return U_SENTINEL;
221     }
222     UChar32 c = s->char32At(pos);
223     pos += U16_LENGTH(c);
224     return c;
225 }
226 
227 UChar32
previousCodePoint(UErrorCode &)228 DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
229     if(pos == 0) {
230         return U_SENTINEL;
231     }
232     UChar32 c = s->char32At(pos - 1);
233     pos -= U16_LENGTH(c);
234     return c;
235 }
236 
237 void
forwardNumCodePoints(int32_t num,UErrorCode &)238 DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
239     pos = s->moveIndex32(pos, num);
240 }
241 
242 void
backwardNumCodePoints(int32_t num,UErrorCode &)243 DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
244     pos = s->moveIndex32(pos, -num);
245 }
246 
247 uint32_t
getDataCE32(UChar32 c) const248 DataBuilderCollationIterator::getDataCE32(UChar32 c) const {
249     return utrie2_get32(builder.trie, c);
250 }
251 
252 uint32_t
getCE32FromBuilderData(uint32_t ce32,UErrorCode & errorCode)253 DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) {
254     U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG));
255     if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) {
256         UChar32 jamo = Collation::indexFromCE32(ce32);
257         return utrie2_get32(builder.trie, jamo);
258     } else {
259         ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32);
260         if(cond->builtCE32 == Collation::NO_CE32) {
261             // Build the context-sensitive mappings into their runtime form and cache the result.
262             cond->builtCE32 = builder.buildContext(cond, errorCode);
263             if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
264                 errorCode = U_ZERO_ERROR;
265                 builder.clearContexts();
266                 cond->builtCE32 = builder.buildContext(cond, errorCode);
267             }
268             builderData.contexts = builder.contexts.getBuffer();
269         }
270         return cond->builtCE32;
271     }
272 }
273 
274 // ------------------------------------------------------------------------- ***
275 
CollationDataBuilder(UErrorCode & errorCode)276 CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode)
277         : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
278           base(NULL), baseSettings(NULL),
279           trie(NULL),
280           ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode),
281           modified(FALSE),
282           fastLatinEnabled(FALSE), fastLatinBuilder(NULL),
283           collIter(NULL) {
284     // Reserve the first CE32 for U+0000.
285     ce32s.addElement(0, errorCode);
286     conditionalCE32s.setDeleter(uprv_deleteConditionalCE32);
287 }
288 
~CollationDataBuilder()289 CollationDataBuilder::~CollationDataBuilder() {
290     utrie2_close(trie);
291     delete fastLatinBuilder;
292     delete collIter;
293 }
294 
295 void
initForTailoring(const CollationData * b,UErrorCode & errorCode)296 CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) {
297     if(U_FAILURE(errorCode)) { return; }
298     if(trie != NULL) {
299         errorCode = U_INVALID_STATE_ERROR;
300         return;
301     }
302     if(b == NULL) {
303         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
304         return;
305     }
306     base = b;
307 
308     // For a tailoring, the default is to fall back to the base.
309     trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode);
310 
311     // Set the Latin-1 letters block so that it is allocated first in the data array,
312     // to try to improve locality of reference when sorting Latin-1 text.
313     // Do not use utrie2_setRange32() since that will not actually allocate blocks
314     // that are filled with the default value.
315     // ASCII (0..7F) is already preallocated anyway.
316     for(UChar32 c = 0xc0; c <= 0xff; ++c) {
317         utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
318     }
319 
320     // Hangul syllables are not tailorable (except via tailoring Jamos).
321     // Always set the Hangul tag to help performance.
322     // Do this here, rather than in buildMappings(),
323     // so that we see the HANGUL_TAG in various assertions.
324     uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
325     utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
326 
327     // Copy the set contents but don't copy/clone the set as a whole because
328     // that would copy the isFrozen state too.
329     unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
330 
331     if(U_FAILURE(errorCode)) { return; }
332 }
333 
334 UBool
maybeSetPrimaryRange(UChar32 start,UChar32 end,uint32_t primary,int32_t step,UErrorCode & errorCode)335 CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end,
336                                            uint32_t primary, int32_t step,
337                                            UErrorCode &errorCode) {
338     if(U_FAILURE(errorCode)) { return FALSE; }
339     U_ASSERT(start <= end);
340     // TODO: Do we need to check what values are currently set for start..end?
341     // An offset range is worth it only if we can achieve an overlap between
342     // adjacent UTrie2 blocks of 32 code points each.
343     // An offset CE is also a little more expensive to look up and compute
344     // than a simple CE.
345     // If the range spans at least three UTrie2 block boundaries (> 64 code points),
346     // then we take it.
347     // If the range spans one or two block boundaries and there are
348     // at least 4 code points on either side, then we take it.
349     // (We could additionally require a minimum range length of, say, 16.)
350     int32_t blockDelta = (end >> 5) - (start >> 5);
351     if(2 <= step && step <= 0x7f &&
352             (blockDelta >= 3 ||
353             (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) {
354         int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step;
355         if(isCompressiblePrimary(primary)) { dataCE |= 0x80; }
356         int32_t index = addCE(dataCE, errorCode);
357         if(U_FAILURE(errorCode)) { return 0; }
358         if(index > Collation::MAX_INDEX) {
359             errorCode = U_BUFFER_OVERFLOW_ERROR;
360             return 0;
361         }
362         uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index);
363         utrie2_setRange32(trie, start, end, offsetCE32, TRUE, &errorCode);
364         modified = TRUE;
365         return TRUE;
366     } else {
367         return FALSE;
368     }
369 }
370 
371 uint32_t
setPrimaryRangeAndReturnNext(UChar32 start,UChar32 end,uint32_t primary,int32_t step,UErrorCode & errorCode)372 CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
373                                                    uint32_t primary, int32_t step,
374                                                    UErrorCode &errorCode) {
375     if(U_FAILURE(errorCode)) { return 0; }
376     UBool isCompressible = isCompressiblePrimary(primary);
377     if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) {
378         return Collation::incThreeBytePrimaryByOffset(primary, isCompressible,
379                                                       (end - start + 1) * step);
380     } else {
381         // Short range: Set individual CE32s.
382         for(;;) {
383             utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode);
384             ++start;
385             primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step);
386             if(start > end) { return primary; }
387         }
388         modified = TRUE;
389     }
390 }
391 
392 uint32_t
getCE32FromOffsetCE32(UBool fromBase,UChar32 c,uint32_t ce32) const393 CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const {
394     int32_t i = Collation::indexFromCE32(ce32);
395     int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i);
396     uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
397     return Collation::makeLongPrimaryCE32(p);
398 }
399 
400 UBool
isCompressibleLeadByte(uint32_t b) const401 CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const {
402     return base->isCompressibleLeadByte(b);
403 }
404 
405 UBool
isAssigned(UChar32 c) const406 CollationDataBuilder::isAssigned(UChar32 c) const {
407     return Collation::isAssignedCE32(utrie2_get32(trie, c));
408 }
409 
410 uint32_t
getLongPrimaryIfSingleCE(UChar32 c) const411 CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const {
412     uint32_t ce32 = utrie2_get32(trie, c);
413     if(Collation::isLongPrimaryCE32(ce32)) {
414         return Collation::primaryFromLongPrimaryCE32(ce32);
415     } else {
416         return 0;
417     }
418 }
419 
420 int64_t
getSingleCE(UChar32 c,UErrorCode & errorCode) const421 CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
422     if(U_FAILURE(errorCode)) { return 0; }
423     UBool fromBase = FALSE;
424     uint32_t ce32 = utrie2_get32(trie, c);
425     if(ce32 == Collation::FALLBACK_CE32) {
426         fromBase = TRUE;
427         ce32 = base->getCE32(c);
428     }
429     while(Collation::isSpecialCE32(ce32)) {
430         switch(Collation::tagFromCE32(ce32)) {
431         case Collation::LATIN_EXPANSION_TAG:
432         case Collation::BUILDER_DATA_TAG:
433         case Collation::PREFIX_TAG:
434         case Collation::CONTRACTION_TAG:
435         case Collation::HANGUL_TAG:
436         case Collation::LEAD_SURROGATE_TAG:
437             errorCode = U_UNSUPPORTED_ERROR;
438             return 0;
439         case Collation::FALLBACK_TAG:
440         case Collation::RESERVED_TAG_3:
441             errorCode = U_INTERNAL_PROGRAM_ERROR;
442             return 0;
443         case Collation::LONG_PRIMARY_TAG:
444             return Collation::ceFromLongPrimaryCE32(ce32);
445         case Collation::LONG_SECONDARY_TAG:
446             return Collation::ceFromLongSecondaryCE32(ce32);
447         case Collation::EXPANSION32_TAG:
448             if(Collation::lengthFromCE32(ce32) == 1) {
449                 int32_t i = Collation::indexFromCE32(ce32);
450                 ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i);
451                 break;
452             } else {
453                 errorCode = U_UNSUPPORTED_ERROR;
454                 return 0;
455             }
456         case Collation::EXPANSION_TAG: {
457             if(Collation::lengthFromCE32(ce32) == 1) {
458                 int32_t i = Collation::indexFromCE32(ce32);
459                 return fromBase ? base->ces[i] : ce64s.elementAti(i);
460             } else {
461                 errorCode = U_UNSUPPORTED_ERROR;
462                 return 0;
463             }
464         }
465         case Collation::DIGIT_TAG:
466             // Fetch the non-numeric-collation CE32 and continue.
467             ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32));
468             break;
469         case Collation::U0000_TAG:
470             U_ASSERT(c == 0);
471             // Fetch the normal ce32 for U+0000 and continue.
472             ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0);
473             break;
474         case Collation::OFFSET_TAG:
475             ce32 = getCE32FromOffsetCE32(fromBase, c, ce32);
476             break;
477         case Collation::IMPLICIT_TAG:
478             return Collation::unassignedCEFromCodePoint(c);
479         }
480     }
481     return Collation::ceFromSimpleCE32(ce32);
482 }
483 
484 int32_t
addCE(int64_t ce,UErrorCode & errorCode)485 CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) {
486     int32_t length = ce64s.size();
487     for(int32_t i = 0; i < length; ++i) {
488         if(ce == ce64s.elementAti(i)) { return i; }
489     }
490     ce64s.addElement(ce, errorCode);
491     return length;
492 }
493 
494 int32_t
addCE32(uint32_t ce32,UErrorCode & errorCode)495 CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) {
496     int32_t length = ce32s.size();
497     for(int32_t i = 0; i < length; ++i) {
498         if(ce32 == (uint32_t)ce32s.elementAti(i)) { return i; }
499     }
500     ce32s.addElement((int32_t)ce32, errorCode);
501     return length;
502 }
503 
504 int32_t
addConditionalCE32(const UnicodeString & context,uint32_t ce32,UErrorCode & errorCode)505 CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32,
506                                          UErrorCode &errorCode) {
507     if(U_FAILURE(errorCode)) { return -1; }
508     U_ASSERT(!context.isEmpty());
509     int32_t index = conditionalCE32s.size();
510     if(index > Collation::MAX_INDEX) {
511         errorCode = U_BUFFER_OVERFLOW_ERROR;
512         return -1;
513     }
514     ConditionalCE32 *cond = new ConditionalCE32(context, ce32);
515     if(cond == NULL) {
516         errorCode = U_MEMORY_ALLOCATION_ERROR;
517         return -1;
518     }
519     conditionalCE32s.addElement(cond, errorCode);
520     return index;
521 }
522 
523 void
add(const UnicodeString & prefix,const UnicodeString & s,const int64_t ces[],int32_t cesLength,UErrorCode & errorCode)524 CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s,
525                           const int64_t ces[], int32_t cesLength,
526                           UErrorCode &errorCode) {
527     uint32_t ce32 = encodeCEs(ces, cesLength, errorCode);
528     addCE32(prefix, s, ce32, errorCode);
529 }
530 
531 void
addCE32(const UnicodeString & prefix,const UnicodeString & s,uint32_t ce32,UErrorCode & errorCode)532 CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s,
533                               uint32_t ce32, UErrorCode &errorCode) {
534     if(U_FAILURE(errorCode)) { return; }
535     if(s.isEmpty()) {
536         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
537         return;
538     }
539     if(trie == NULL || utrie2_isFrozen(trie)) {
540         errorCode = U_INVALID_STATE_ERROR;
541         return;
542     }
543     UChar32 c = s.char32At(0);
544     int32_t cLength = U16_LENGTH(c);
545     uint32_t oldCE32 = utrie2_get32(trie, c);
546     UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
547     if(oldCE32 == Collation::FALLBACK_CE32) {
548         // First tailoring for c.
549         // If c has contextual base mappings or if we add a contextual mapping,
550         // then copy the base mappings.
551         // Otherwise we just override the base mapping.
552         uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c));
553         if(hasContext || Collation::ce32HasContext(baseCE32)) {
554             oldCE32 = copyFromBaseCE32(c, baseCE32, TRUE, errorCode);
555             utrie2_set32(trie, c, oldCE32, &errorCode);
556             if(U_FAILURE(errorCode)) { return; }
557         }
558     }
559     if(!hasContext) {
560         // No prefix, no contraction.
561         if(!isBuilderContextCE32(oldCE32)) {
562             utrie2_set32(trie, c, ce32, &errorCode);
563         } else {
564             ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32);
565             cond->builtCE32 = Collation::NO_CE32;
566             cond->ce32 = ce32;
567         }
568     } else {
569         ConditionalCE32 *cond;
570         if(!isBuilderContextCE32(oldCE32)) {
571             // Replace the simple oldCE32 with a builder context CE32
572             // pointing to a new ConditionalCE32 list head.
573             int32_t index = addConditionalCE32(UnicodeString((UChar)0), oldCE32, errorCode);
574             if(U_FAILURE(errorCode)) { return; }
575             uint32_t contextCE32 = makeBuilderContextCE32(index);
576             utrie2_set32(trie, c, contextCE32, &errorCode);
577             contextChars.add(c);
578             cond = getConditionalCE32(index);
579         } else {
580             cond = getConditionalCE32ForCE32(oldCE32);
581             cond->builtCE32 = Collation::NO_CE32;
582         }
583         UnicodeString suffix(s, cLength);
584         UnicodeString context((UChar)prefix.length());
585         context.append(prefix).append(suffix);
586         unsafeBackwardSet.addAll(suffix);
587         for(;;) {
588             // invariant: context > cond->context
589             int32_t next = cond->next;
590             if(next < 0) {
591                 // Append a new ConditionalCE32 after cond.
592                 int32_t index = addConditionalCE32(context, ce32, errorCode);
593                 if(U_FAILURE(errorCode)) { return; }
594                 cond->next = index;
595                 break;
596             }
597             ConditionalCE32 *nextCond = getConditionalCE32(next);
598             int8_t cmp = context.compare(nextCond->context);
599             if(cmp < 0) {
600                 // Insert a new ConditionalCE32 between cond and nextCond.
601                 int32_t index = addConditionalCE32(context, ce32, errorCode);
602                 if(U_FAILURE(errorCode)) { return; }
603                 cond->next = index;
604                 getConditionalCE32(index)->next = next;
605                 break;
606             } else if(cmp == 0) {
607                 // Same context as before, overwrite its ce32.
608                 nextCond->ce32 = ce32;
609                 break;
610             }
611             cond = nextCond;
612         }
613     }
614     modified = TRUE;
615 }
616 
617 uint32_t
encodeOneCEAsCE32(int64_t ce)618 CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) {
619     uint32_t p = (uint32_t)(ce >> 32);
620     uint32_t lower32 = (uint32_t)ce;
621     uint32_t t = (uint32_t)(ce & 0xffff);
622     U_ASSERT((t & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
623     if((ce & INT64_C(0xffff00ff00ff)) == 0) {
624         // normal form ppppsstt
625         return p | (lower32 >> 16) | (t >> 8);
626     } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) {
627         // long-primary form ppppppC1
628         return Collation::makeLongPrimaryCE32(p);
629     } else if(p == 0 && (t & 0xff) == 0) {
630         // long-secondary form ssssttC2
631         return Collation::makeLongSecondaryCE32(lower32);
632     }
633     return Collation::NO_CE32;
634 }
635 
636 uint32_t
encodeOneCE(int64_t ce,UErrorCode & errorCode)637 CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) {
638     // Try to encode one CE as one CE32.
639     uint32_t ce32 = encodeOneCEAsCE32(ce);
640     if(ce32 != Collation::NO_CE32) { return ce32; }
641     int32_t index = addCE(ce, errorCode);
642     if(U_FAILURE(errorCode)) { return 0; }
643     if(index > Collation::MAX_INDEX) {
644         errorCode = U_BUFFER_OVERFLOW_ERROR;
645         return 0;
646     }
647     return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1);
648 }
649 
650 uint32_t
encodeCEs(const int64_t ces[],int32_t cesLength,UErrorCode & errorCode)651 CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
652                                 UErrorCode &errorCode) {
653     if(U_FAILURE(errorCode)) { return 0; }
654     if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) {
655         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
656         return 0;
657     }
658     if(trie == NULL || utrie2_isFrozen(trie)) {
659         errorCode = U_INVALID_STATE_ERROR;
660         return 0;
661     }
662     if(cesLength == 0) {
663         // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
664         // Do this here so that callers need not do it.
665         return encodeOneCEAsCE32(0);
666     } else if(cesLength == 1) {
667         return encodeOneCE(ces[0], errorCode);
668     } else if(cesLength == 2) {
669         // Try to encode two CEs as one CE32.
670         int64_t ce0 = ces[0];
671         int64_t ce1 = ces[1];
672         uint32_t p0 = (uint32_t)(ce0 >> 32);
673         if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE &&
674                 (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE &&
675                 p0 != 0) {
676             // Latin mini expansion
677             return
678                 p0 |
679                 (((uint32_t)ce0 & 0xff00u) << 8) |
680                 (uint32_t)(ce1 >> 16) |
681                 Collation::SPECIAL_CE32_LOW_BYTE |
682                 Collation::LATIN_EXPANSION_TAG;
683         }
684     }
685     // Try to encode two or more CEs as CE32s.
686     int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH];
687     for(int32_t i = 0;; ++i) {
688         if(i == cesLength) {
689             return encodeExpansion32(newCE32s, cesLength, errorCode);
690         }
691         uint32_t ce32 = encodeOneCEAsCE32(ces[i]);
692         if(ce32 == Collation::NO_CE32) { break; }
693         newCE32s[i] = (int32_t)ce32;
694     }
695     return encodeExpansion(ces, cesLength, errorCode);
696 }
697 
698 uint32_t
encodeExpansion(const int64_t ces[],int32_t length,UErrorCode & errorCode)699 CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) {
700     if(U_FAILURE(errorCode)) { return 0; }
701     // See if this sequence of CEs has already been stored.
702     int64_t first = ces[0];
703     int32_t ce64sMax = ce64s.size() - length;
704     for(int32_t i = 0; i <= ce64sMax; ++i) {
705         if(first == ce64s.elementAti(i)) {
706             if(i > Collation::MAX_INDEX) {
707                 errorCode = U_BUFFER_OVERFLOW_ERROR;
708                 return 0;
709             }
710             for(int32_t j = 1;; ++j) {
711                 if(j == length) {
712                     return Collation::makeCE32FromTagIndexAndLength(
713                             Collation::EXPANSION_TAG, i, length);
714                 }
715                 if(ce64s.elementAti(i + j) != ces[j]) { break; }
716             }
717         }
718     }
719     // Store the new sequence.
720     int32_t i = ce64s.size();
721     if(i > Collation::MAX_INDEX) {
722         errorCode = U_BUFFER_OVERFLOW_ERROR;
723         return 0;
724     }
725     for(int32_t j = 0; j < length; ++j) {
726         ce64s.addElement(ces[j], errorCode);
727     }
728     return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length);
729 }
730 
731 uint32_t
encodeExpansion32(const int32_t newCE32s[],int32_t length,UErrorCode & errorCode)732 CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length,
733                                         UErrorCode &errorCode) {
734     if(U_FAILURE(errorCode)) { return 0; }
735     // See if this sequence of CE32s has already been stored.
736     int32_t first = newCE32s[0];
737     int32_t ce32sMax = ce32s.size() - length;
738     for(int32_t i = 0; i <= ce32sMax; ++i) {
739         if(first == ce32s.elementAti(i)) {
740             if(i > Collation::MAX_INDEX) {
741                 errorCode = U_BUFFER_OVERFLOW_ERROR;
742                 return 0;
743             }
744             for(int32_t j = 1;; ++j) {
745                 if(j == length) {
746                     return Collation::makeCE32FromTagIndexAndLength(
747                             Collation::EXPANSION32_TAG, i, length);
748                 }
749                 if(ce32s.elementAti(i + j) != newCE32s[j]) { break; }
750             }
751         }
752     }
753     // Store the new sequence.
754     int32_t i = ce32s.size();
755     if(i > Collation::MAX_INDEX) {
756         errorCode = U_BUFFER_OVERFLOW_ERROR;
757         return 0;
758     }
759     for(int32_t j = 0; j < length; ++j) {
760         ce32s.addElement(newCE32s[j], errorCode);
761     }
762     return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length);
763 }
764 
765 uint32_t
copyFromBaseCE32(UChar32 c,uint32_t ce32,UBool withContext,UErrorCode & errorCode)766 CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext,
767                                        UErrorCode &errorCode) {
768     if(U_FAILURE(errorCode)) { return 0; }
769     if(!Collation::isSpecialCE32(ce32)) { return ce32; }
770     switch(Collation::tagFromCE32(ce32)) {
771     case Collation::LONG_PRIMARY_TAG:
772     case Collation::LONG_SECONDARY_TAG:
773     case Collation::LATIN_EXPANSION_TAG:
774         // copy as is
775         break;
776     case Collation::EXPANSION32_TAG: {
777         const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32);
778         int32_t length = Collation::lengthFromCE32(ce32);
779         ce32 = encodeExpansion32(
780             reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode);
781         break;
782     }
783     case Collation::EXPANSION_TAG: {
784         const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32);
785         int32_t length = Collation::lengthFromCE32(ce32);
786         ce32 = encodeExpansion(baseCEs, length, errorCode);
787         break;
788     }
789     case Collation::PREFIX_TAG: {
790         // Flatten prefixes and nested suffixes (contractions)
791         // into a linear list of ConditionalCE32.
792         const UChar *p = base->contexts + Collation::indexFromCE32(ce32);
793         ce32 = CollationData::readCE32(p);  // Default if no prefix match.
794         if(!withContext) {
795             return copyFromBaseCE32(c, ce32, FALSE, errorCode);
796         }
797         ConditionalCE32 head(UnicodeString(), 0);
798         UnicodeString context((UChar)0);
799         int32_t index;
800         if(Collation::isContractionCE32(ce32)) {
801             index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
802         } else {
803             ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
804             head.next = index = addConditionalCE32(context, ce32, errorCode);
805         }
806         if(U_FAILURE(errorCode)) { return 0; }
807         ConditionalCE32 *cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
808         UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
809         while(prefixes.next(errorCode)) {
810             context = prefixes.getString();
811             context.reverse();
812             context.insert(0, (UChar)context.length());
813             ce32 = (uint32_t)prefixes.getValue();
814             if(Collation::isContractionCE32(ce32)) {
815                 index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode);
816             } else {
817                 ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
818                 cond->next = index = addConditionalCE32(context, ce32, errorCode);
819             }
820             if(U_FAILURE(errorCode)) { return 0; }
821             cond = getConditionalCE32(index);
822         }
823         ce32 = makeBuilderContextCE32(head.next);
824         contextChars.add(c);
825         break;
826     }
827     case Collation::CONTRACTION_TAG: {
828         if(!withContext) {
829             const UChar *p = base->contexts + Collation::indexFromCE32(ce32);
830             ce32 = CollationData::readCE32(p);  // Default if no suffix match.
831             return copyFromBaseCE32(c, ce32, FALSE, errorCode);
832         }
833         ConditionalCE32 head(UnicodeString(), 0);
834         UnicodeString context((UChar)0);
835         copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
836         ce32 = makeBuilderContextCE32(head.next);
837         contextChars.add(c);
838         break;
839     }
840     case Collation::HANGUL_TAG:
841         errorCode = U_UNSUPPORTED_ERROR;  // We forbid tailoring of Hangul syllables.
842         break;
843     case Collation::OFFSET_TAG:
844         ce32 = getCE32FromOffsetCE32(TRUE, c, ce32);
845         break;
846     case Collation::IMPLICIT_TAG:
847         ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode);
848         break;
849     default:
850         U_ASSERT(FALSE);  // require ce32 == base->getFinalCE32(ce32)
851         break;
852     }
853     return ce32;
854 }
855 
856 int32_t
copyContractionsFromBaseCE32(UnicodeString & context,UChar32 c,uint32_t ce32,ConditionalCE32 * cond,UErrorCode & errorCode)857 CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
858                                                    ConditionalCE32 *cond, UErrorCode &errorCode) {
859     if(U_FAILURE(errorCode)) { return 0; }
860     const UChar *p = base->contexts + Collation::indexFromCE32(ce32);
861     int32_t index;
862     if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
863         // No match on the single code point.
864         // We are underneath a prefix, and the default mapping is just
865         // a fallback to the mappings for a shorter prefix.
866         U_ASSERT(context.length() > 1);
867         index = -1;
868     } else {
869         ce32 = CollationData::readCE32(p);  // Default if no suffix match.
870         U_ASSERT(!Collation::isContractionCE32(ce32));
871         ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
872         cond->next = index = addConditionalCE32(context, ce32, errorCode);
873         if(U_FAILURE(errorCode)) { return 0; }
874         cond = getConditionalCE32(index);
875     }
876 
877     int32_t suffixStart = context.length();
878     UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
879     while(suffixes.next(errorCode)) {
880         context.append(suffixes.getString());
881         ce32 = copyFromBaseCE32(c, (uint32_t)suffixes.getValue(), TRUE, errorCode);
882         cond->next = index = addConditionalCE32(context, ce32, errorCode);
883         if(U_FAILURE(errorCode)) { return 0; }
884         // No need to update the unsafeBackwardSet because the tailoring set
885         // is already a copy of the base set.
886         cond = getConditionalCE32(index);
887         context.truncate(suffixStart);
888     }
889     U_ASSERT(index >= 0);
890     return index;
891 }
892 
893 class CopyHelper {
894 public:
CopyHelper(const CollationDataBuilder & s,CollationDataBuilder & d,const CollationDataBuilder::CEModifier & m,UErrorCode & initialErrorCode)895     CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d,
896                const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode)
897             : src(s), dest(d), modifier(m),
898               errorCode(initialErrorCode) {}
899 
copyRangeCE32(UChar32 start,UChar32 end,uint32_t ce32)900     UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) {
901         ce32 = copyCE32(ce32);
902         utrie2_setRange32(dest.trie, start, end, ce32, TRUE, &errorCode);
903         if(CollationDataBuilder::isBuilderContextCE32(ce32)) {
904             dest.contextChars.add(start, end);
905         }
906         return U_SUCCESS(errorCode);
907     }
908 
copyCE32(uint32_t ce32)909     uint32_t copyCE32(uint32_t ce32) {
910         if(!Collation::isSpecialCE32(ce32)) {
911             int64_t ce = modifier.modifyCE32(ce32);
912             if(ce != Collation::NO_CE) {
913                 ce32 = dest.encodeOneCE(ce, errorCode);
914             }
915         } else {
916             int32_t tag = Collation::tagFromCE32(ce32);
917             if(tag == Collation::EXPANSION32_TAG) {
918                 const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer());
919                 srcCE32s += Collation::indexFromCE32(ce32);
920                 int32_t length = Collation::lengthFromCE32(ce32);
921                 // Inspect the source CE32s. Just copy them if none are modified.
922                 // Otherwise copy to modifiedCEs, with modifications.
923                 UBool isModified = FALSE;
924                 for(int32_t i = 0; i < length; ++i) {
925                     ce32 = srcCE32s[i];
926                     int64_t ce;
927                     if(Collation::isSpecialCE32(ce32) ||
928                             (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) {
929                         if(isModified) {
930                             modifiedCEs[i] = Collation::ceFromCE32(ce32);
931                         }
932                     } else {
933                         if(!isModified) {
934                             for(int32_t j = 0; j < i; ++j) {
935                                 modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]);
936                             }
937                             isModified = TRUE;
938                         }
939                         modifiedCEs[i] = ce;
940                     }
941                 }
942                 if(isModified) {
943                     ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
944                 } else {
945                     ce32 = dest.encodeExpansion32(
946                         reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode);
947                 }
948             } else if(tag == Collation::EXPANSION_TAG) {
949                 const int64_t *srcCEs = src.ce64s.getBuffer();
950                 srcCEs += Collation::indexFromCE32(ce32);
951                 int32_t length = Collation::lengthFromCE32(ce32);
952                 // Inspect the source CEs. Just copy them if none are modified.
953                 // Otherwise copy to modifiedCEs, with modifications.
954                 UBool isModified = FALSE;
955                 for(int32_t i = 0; i < length; ++i) {
956                     int64_t srcCE = srcCEs[i];
957                     int64_t ce = modifier.modifyCE(srcCE);
958                     if(ce == Collation::NO_CE) {
959                         if(isModified) {
960                             modifiedCEs[i] = srcCE;
961                         }
962                     } else {
963                         if(!isModified) {
964                             for(int32_t j = 0; j < i; ++j) {
965                                 modifiedCEs[j] = srcCEs[j];
966                             }
967                             isModified = TRUE;
968                         }
969                         modifiedCEs[i] = ce;
970                     }
971                 }
972                 if(isModified) {
973                     ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
974                 } else {
975                     ce32 = dest.encodeExpansion(srcCEs, length, errorCode);
976                 }
977             } else if(tag == Collation::BUILDER_DATA_TAG) {
978                 // Copy the list of ConditionalCE32.
979                 ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32);
980                 U_ASSERT(!cond->hasContext());
981                 int32_t destIndex = dest.addConditionalCE32(
982                         cond->context, copyCE32(cond->ce32), errorCode);
983                 ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex);
984                 while(cond->next >= 0) {
985                     cond = src.getConditionalCE32(cond->next);
986                     ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex);
987                     destIndex = dest.addConditionalCE32(
988                             cond->context, copyCE32(cond->ce32), errorCode);
989                     int32_t suffixStart = cond->prefixLength() + 1;
990                     dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart));
991                     prevDestCond->next = destIndex;
992                 }
993             } else {
994                 // Just copy long CEs and Latin mini expansions (and other expected values) as is,
995                 // assuming that the modifier would not modify them.
996                 U_ASSERT(tag == Collation::LONG_PRIMARY_TAG ||
997                         tag == Collation::LONG_SECONDARY_TAG ||
998                         tag == Collation::LATIN_EXPANSION_TAG ||
999                         tag == Collation::HANGUL_TAG);
1000             }
1001         }
1002         return ce32;
1003     }
1004 
1005     const CollationDataBuilder &src;
1006     CollationDataBuilder &dest;
1007     const CollationDataBuilder::CEModifier &modifier;
1008     int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH];
1009     UErrorCode errorCode;
1010 };
1011 
1012 U_CDECL_BEGIN
1013 
1014 static UBool U_CALLCONV
enumRangeForCopy(const void * context,UChar32 start,UChar32 end,uint32_t value)1015 enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1016     return
1017         value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 ||
1018         ((CopyHelper *)context)->copyRangeCE32(start, end, value);
1019 }
1020 
1021 U_CDECL_END
1022 
1023 void
copyFrom(const CollationDataBuilder & src,const CEModifier & modifier,UErrorCode & errorCode)1024 CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
1025                                UErrorCode &errorCode) {
1026     if(U_FAILURE(errorCode)) { return; }
1027     if(trie == NULL || utrie2_isFrozen(trie)) {
1028         errorCode = U_INVALID_STATE_ERROR;
1029         return;
1030     }
1031     CopyHelper helper(src, *this, modifier, errorCode);
1032     utrie2_enum(src.trie, NULL, enumRangeForCopy, &helper);
1033     errorCode = helper.errorCode;
1034     // Update the contextChars and the unsafeBackwardSet while copying,
1035     // in case a character had conditional mappings in the source builder
1036     // and they were removed later.
1037     modified |= src.modified;
1038 }
1039 
1040 void
optimize(const UnicodeSet & set,UErrorCode & errorCode)1041 CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) {
1042     if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1043     UnicodeSetIterator iter(set);
1044     while(iter.next() && !iter.isString()) {
1045         UChar32 c = iter.getCodepoint();
1046         uint32_t ce32 = utrie2_get32(trie, c);
1047         if(ce32 == Collation::FALLBACK_CE32) {
1048             ce32 = base->getFinalCE32(base->getCE32(c));
1049             ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
1050             utrie2_set32(trie, c, ce32, &errorCode);
1051         }
1052     }
1053     modified = TRUE;
1054 }
1055 
1056 void
suppressContractions(const UnicodeSet & set,UErrorCode & errorCode)1057 CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) {
1058     if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1059     UnicodeSetIterator iter(set);
1060     while(iter.next() && !iter.isString()) {
1061         UChar32 c = iter.getCodepoint();
1062         uint32_t ce32 = utrie2_get32(trie, c);
1063         if(ce32 == Collation::FALLBACK_CE32) {
1064             ce32 = base->getFinalCE32(base->getCE32(c));
1065             if(Collation::ce32HasContext(ce32)) {
1066                 ce32 = copyFromBaseCE32(c, ce32, FALSE /* without context */, errorCode);
1067                 utrie2_set32(trie, c, ce32, &errorCode);
1068             }
1069         } else if(isBuilderContextCE32(ce32)) {
1070             ce32 = getConditionalCE32ForCE32(ce32)->ce32;
1071             // Simply abandon the list of ConditionalCE32.
1072             // The caller will copy this builder in the end,
1073             // eliminating unreachable data.
1074             utrie2_set32(trie, c, ce32, &errorCode);
1075             contextChars.remove(c);
1076         }
1077     }
1078     modified = TRUE;
1079 }
1080 
1081 UBool
getJamoCE32s(uint32_t jamoCE32s[],UErrorCode & errorCode)1082 CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) {
1083     if(U_FAILURE(errorCode)) { return FALSE; }
1084     UBool anyJamoAssigned = base == NULL;  // always set jamoCE32s in the base data
1085     UBool needToCopyFromBase = FALSE;
1086     for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
1087         UChar32 jamo = jamoCpFromIndex(j);
1088         UBool fromBase = FALSE;
1089         uint32_t ce32 = utrie2_get32(trie, jamo);
1090         anyJamoAssigned |= Collation::isAssignedCE32(ce32);
1091         // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
1092         // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
1093         if(ce32 == Collation::FALLBACK_CE32) {
1094             fromBase = TRUE;
1095             ce32 = base->getCE32(jamo);
1096         }
1097         if(Collation::isSpecialCE32(ce32)) {
1098             switch(Collation::tagFromCE32(ce32)) {
1099             case Collation::LONG_PRIMARY_TAG:
1100             case Collation::LONG_SECONDARY_TAG:
1101             case Collation::LATIN_EXPANSION_TAG:
1102                 // Copy the ce32 as-is.
1103                 break;
1104             case Collation::EXPANSION32_TAG:
1105             case Collation::EXPANSION_TAG:
1106             case Collation::PREFIX_TAG:
1107             case Collation::CONTRACTION_TAG:
1108                 if(fromBase) {
1109                     // Defer copying until we know if anyJamoAssigned.
1110                     ce32 = Collation::FALLBACK_CE32;
1111                     needToCopyFromBase = TRUE;
1112                 }
1113                 break;
1114             case Collation::IMPLICIT_TAG:
1115                 // An unassigned Jamo should only occur in tests with incomplete bases.
1116                 U_ASSERT(fromBase);
1117                 ce32 = Collation::FALLBACK_CE32;
1118                 needToCopyFromBase = TRUE;
1119                 break;
1120             case Collation::OFFSET_TAG:
1121                 ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
1122                 break;
1123             case Collation::FALLBACK_TAG:
1124             case Collation::RESERVED_TAG_3:
1125             case Collation::BUILDER_DATA_TAG:
1126             case Collation::DIGIT_TAG:
1127             case Collation::U0000_TAG:
1128             case Collation::HANGUL_TAG:
1129             case Collation::LEAD_SURROGATE_TAG:
1130                 errorCode = U_INTERNAL_PROGRAM_ERROR;
1131                 return FALSE;
1132             }
1133         }
1134         jamoCE32s[j] = ce32;
1135     }
1136     if(anyJamoAssigned && needToCopyFromBase) {
1137         for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {
1138             if(jamoCE32s[j] == Collation::FALLBACK_CE32) {
1139                 UChar32 jamo = jamoCpFromIndex(j);
1140                 jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo),
1141                                                 /*withContext=*/ TRUE, errorCode);
1142             }
1143         }
1144     }
1145     return anyJamoAssigned && U_SUCCESS(errorCode);
1146 }
1147 
1148 void
setDigitTags(UErrorCode & errorCode)1149 CollationDataBuilder::setDigitTags(UErrorCode &errorCode) {
1150     UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode);
1151     if(U_FAILURE(errorCode)) { return; }
1152     UnicodeSetIterator iter(digits);
1153     while(iter.next()) {
1154         U_ASSERT(!iter.isString());
1155         UChar32 c = iter.getCodepoint();
1156         uint32_t ce32 = utrie2_get32(trie, c);
1157         if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) {
1158             int32_t index = addCE32(ce32, errorCode);
1159             if(U_FAILURE(errorCode)) { return; }
1160             if(index > Collation::MAX_INDEX) {
1161                 errorCode = U_BUFFER_OVERFLOW_ERROR;
1162                 return;
1163             }
1164             ce32 = Collation::makeCE32FromTagIndexAndLength(
1165                     Collation::DIGIT_TAG, index, u_charDigitValue(c));
1166             utrie2_set32(trie, c, ce32, &errorCode);
1167         }
1168     }
1169 }
1170 
1171 U_CDECL_BEGIN
1172 
1173 static UBool U_CALLCONV
enumRangeLeadValue(const void * context,UChar32,UChar32,uint32_t value)1174 enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1175     int32_t *pValue = (int32_t *)context;
1176     if(value == Collation::UNASSIGNED_CE32) {
1177         value = Collation::LEAD_ALL_UNASSIGNED;
1178     } else if(value == Collation::FALLBACK_CE32) {
1179         value = Collation::LEAD_ALL_FALLBACK;
1180     } else {
1181         *pValue = Collation::LEAD_MIXED;
1182         return FALSE;
1183     }
1184     if(*pValue < 0) {
1185         *pValue = (int32_t)value;
1186     } else if(*pValue != (int32_t)value) {
1187         *pValue = Collation::LEAD_MIXED;
1188         return FALSE;
1189     }
1190     return TRUE;
1191 }
1192 
1193 U_CDECL_END
1194 
1195 void
setLeadSurrogates(UErrorCode & errorCode)1196 CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) {
1197     for(UChar lead = 0xd800; lead < 0xdc00; ++lead) {
1198         int32_t value = -1;
1199         utrie2_enumForLeadSurrogate(trie, lead, NULL, enumRangeLeadValue, &value);
1200         utrie2_set32ForLeadSurrogateCodeUnit(
1201             trie, lead,
1202             Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value,
1203             &errorCode);
1204     }
1205 }
1206 
1207 void
build(CollationData & data,UErrorCode & errorCode)1208 CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
1209     buildMappings(data, errorCode);
1210     if(base != NULL) {
1211         data.numericPrimary = base->numericPrimary;
1212         data.compressibleBytes = base->compressibleBytes;
1213         data.scripts = base->scripts;
1214         data.scriptsLength = base->scriptsLength;
1215     }
1216     buildFastLatinTable(data, errorCode);
1217 }
1218 
1219 void
buildMappings(CollationData & data,UErrorCode & errorCode)1220 CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) {
1221     if(U_FAILURE(errorCode)) { return; }
1222     if(trie == NULL || utrie2_isFrozen(trie)) {
1223         errorCode = U_INVALID_STATE_ERROR;
1224         return;
1225     }
1226 
1227     buildContexts(errorCode);
1228 
1229     uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
1230     int32_t jamoIndex = -1;
1231     if(getJamoCE32s(jamoCE32s, errorCode)) {
1232         jamoIndex = ce32s.size();
1233         for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1234             ce32s.addElement((int32_t)jamoCE32s[i], errorCode);
1235         }
1236         // Small optimization: Use a bit in the Hangul ce32
1237         // to indicate that none of the Jamo CE32s are isSpecialCE32()
1238         // (as it should be in the root collator).
1239         // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
1240         // In order to still have good trie compression and keep this code simple,
1241         // we only set this flag if a whole block of 588 Hangul syllables starting with
1242         // a common leading consonant (Jamo L) has this property.
1243         UBool isAnyJamoVTSpecial = FALSE;
1244         for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1245             if(Collation::isSpecialCE32(jamoCE32s[i])) {
1246                 isAnyJamoVTSpecial = TRUE;
1247                 break;
1248             }
1249         }
1250         uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
1251         UChar32 c = Hangul::HANGUL_BASE;
1252         for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) {  // iterate over the Jamo L
1253             uint32_t ce32 = hangulCE32;
1254             if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) {
1255                 ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO;
1256             }
1257             UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1258             utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode);
1259             c = limit;
1260         }
1261     } else {
1262         // Copy the Hangul CE32s from the base in blocks per Jamo L,
1263         // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
1264         for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) {
1265             uint32_t ce32 = base->getCE32(c);
1266             U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG));
1267             UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1268             utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode);
1269             c = limit;
1270         }
1271     }
1272 
1273     setDigitTags(errorCode);
1274     setLeadSurrogates(errorCode);
1275 
1276     // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1277     ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
1278     utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
1279 
1280     utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode);
1281     if(U_FAILURE(errorCode)) { return; }
1282 
1283     // Mark each lead surrogate as "unsafe"
1284     // if any of its 1024 associated supplementary code points is "unsafe".
1285     UChar32 c = 0x10000;
1286     for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
1287         if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
1288             unsafeBackwardSet.add(lead);
1289         }
1290     }
1291     unsafeBackwardSet.freeze();
1292 
1293     data.trie = trie;
1294     data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer());
1295     data.ces = ce64s.getBuffer();
1296     data.contexts = contexts.getBuffer();
1297 
1298     data.ce32sLength = ce32s.size();
1299     data.cesLength = ce64s.size();
1300     data.contextsLength = contexts.length();
1301 
1302     data.base = base;
1303     if(jamoIndex >= 0) {
1304         data.jamoCE32s = data.ce32s + jamoIndex;
1305     } else {
1306         data.jamoCE32s = base->jamoCE32s;
1307     }
1308     data.unsafeBackwardSet = &unsafeBackwardSet;
1309 }
1310 
1311 void
clearContexts()1312 CollationDataBuilder::clearContexts() {
1313     contexts.remove();
1314     UnicodeSetIterator iter(contextChars);
1315     while(iter.next()) {
1316         U_ASSERT(!iter.isString());
1317         uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint());
1318         U_ASSERT(isBuilderContextCE32(ce32));
1319         getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32;
1320     }
1321 }
1322 
1323 void
buildContexts(UErrorCode & errorCode)1324 CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
1325     if(U_FAILURE(errorCode)) { return; }
1326     // Ignore abandoned lists and the cached builtCE32,
1327     // and build all contexts from scratch.
1328     contexts.remove();
1329     UnicodeSetIterator iter(contextChars);
1330     while(U_SUCCESS(errorCode) && iter.next()) {
1331         U_ASSERT(!iter.isString());
1332         UChar32 c = iter.getCodepoint();
1333         uint32_t ce32 = utrie2_get32(trie, c);
1334         if(!isBuilderContextCE32(ce32)) {
1335             // Impossible: No context data for c in contextChars.
1336             errorCode = U_INTERNAL_PROGRAM_ERROR;
1337             return;
1338         }
1339         ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32);
1340         ce32 = buildContext(cond, errorCode);
1341         utrie2_set32(trie, c, ce32, &errorCode);
1342     }
1343 }
1344 
1345 uint32_t
buildContext(ConditionalCE32 * head,UErrorCode & errorCode)1346 CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) {
1347     if(U_FAILURE(errorCode)) { return 0; }
1348     // The list head must have no context.
1349     U_ASSERT(!head->hasContext());
1350     // The list head must be followed by one or more nodes that all do have context.
1351     U_ASSERT(head->next >= 0);
1352     UCharsTrieBuilder prefixBuilder(errorCode);
1353     UCharsTrieBuilder contractionBuilder(errorCode);
1354     for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
1355         // After the list head, the prefix or suffix can be empty, but not both.
1356         U_ASSERT(cond == head || cond->hasContext());
1357         int32_t prefixLength = cond->prefixLength();
1358         UnicodeString prefix(cond->context, 0, prefixLength + 1);
1359         // Collect all contraction suffixes for one prefix.
1360         ConditionalCE32 *firstCond = cond;
1361         ConditionalCE32 *lastCond = cond;
1362         while(cond->next >= 0 &&
1363                 (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) {
1364             lastCond = cond;
1365         }
1366         uint32_t ce32;
1367         int32_t suffixStart = prefixLength + 1;  // == prefix.length()
1368         if(lastCond->context.length() == suffixStart) {
1369             // One prefix without contraction suffix.
1370             U_ASSERT(firstCond == lastCond);
1371             ce32 = lastCond->ce32;
1372             cond = lastCond;
1373         } else {
1374             // Build the contractions trie.
1375             contractionBuilder.clear();
1376             // Entry for an empty suffix, to be stored before the trie.
1377             uint32_t emptySuffixCE32;
1378             uint32_t flags = 0;
1379             if(firstCond->context.length() == suffixStart) {
1380                 // There is a mapping for the prefix and the single character c. (p|c)
1381                 // If no other suffix matches, then we return this value.
1382                 emptySuffixCE32 = firstCond->ce32;
1383                 cond = getConditionalCE32(firstCond->next);
1384             } else {
1385                 // There is no mapping for the prefix and just the single character.
1386                 // (There is no p|c, only p|cd, p|ce etc.)
1387                 flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH;
1388                 // When the prefix matches but none of the prefix-specific suffixes,
1389                 // then we fall back to the mappings with the next-longest prefix,
1390                 // and ultimately to mappings with no prefix.
1391                 // Each fallback might be another set of contractions.
1392                 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1393                 // then in text "pch" we find the ch contraction.
1394                 for(cond = head;; cond = getConditionalCE32(cond->next)) {
1395                     int32_t length = cond->prefixLength();
1396                     if(length == prefixLength) { break; }
1397                     if(cond->defaultCE32 != Collation::NO_CE32 &&
1398                             (length==0 || prefix.endsWith(cond->context, 1, length))) {
1399                         emptySuffixCE32 = cond->defaultCE32;
1400                     }
1401                 }
1402                 cond = firstCond;
1403             }
1404             // Optimization: Set a flag when
1405             // the first character of every contraction suffix has lccc!=0.
1406             // Short-circuits contraction matching when a normal letter follows.
1407             flags |= Collation::CONTRACT_NEXT_CCC;
1408             // Add all of the non-empty suffixes into the contraction trie.
1409             for(;;) {
1410                 UnicodeString suffix(cond->context, suffixStart);
1411                 uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0));
1412                 if(fcd16 <= 0xff) {
1413                     flags &= ~Collation::CONTRACT_NEXT_CCC;
1414                 }
1415                 fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1));
1416                 if(fcd16 > 0xff) {
1417                     // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1418                     flags |= Collation::CONTRACT_TRAILING_CCC;
1419                 }
1420                 contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode);
1421                 if(cond == lastCond) { break; }
1422                 cond = getConditionalCE32(cond->next);
1423             }
1424             int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode);
1425             if(U_FAILURE(errorCode)) { return 0; }
1426             if(index > Collation::MAX_INDEX) {
1427                 errorCode = U_BUFFER_OVERFLOW_ERROR;
1428                 return 0;
1429             }
1430             ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags;
1431         }
1432         U_ASSERT(cond == lastCond);
1433         firstCond->defaultCE32 = ce32;
1434         if(prefixLength == 0) {
1435             if(cond->next < 0) {
1436                 // No non-empty prefixes, only contractions.
1437                 return ce32;
1438             }
1439         } else {
1440             prefix.remove(0, 1);  // Remove the length unit.
1441             prefix.reverse();
1442             prefixBuilder.add(prefix, (int32_t)ce32, errorCode);
1443             if(cond->next < 0) { break; }
1444         }
1445     }
1446     U_ASSERT(head->defaultCE32 != Collation::NO_CE32);
1447     int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode);
1448     if(U_FAILURE(errorCode)) { return 0; }
1449     if(index > Collation::MAX_INDEX) {
1450         errorCode = U_BUFFER_OVERFLOW_ERROR;
1451         return 0;
1452     }
1453     return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index);
1454 }
1455 
1456 int32_t
addContextTrie(uint32_t defaultCE32,UCharsTrieBuilder & trieBuilder,UErrorCode & errorCode)1457 CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
1458                                      UErrorCode &errorCode) {
1459     UnicodeString context;
1460     context.append((UChar)(defaultCE32 >> 16)).append((UChar)defaultCE32);
1461     UnicodeString trieString;
1462     context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode));
1463     if(U_FAILURE(errorCode)) { return -1; }
1464     int32_t index = contexts.indexOf(context);
1465     if(index < 0) {
1466         index = contexts.length();
1467         contexts.append(context);
1468     }
1469     return index;
1470 }
1471 
1472 void
buildFastLatinTable(CollationData & data,UErrorCode & errorCode)1473 CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) {
1474     if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; }
1475 
1476     delete fastLatinBuilder;
1477     fastLatinBuilder = new CollationFastLatinBuilder(errorCode);
1478     if(fastLatinBuilder == NULL) {
1479         errorCode = U_MEMORY_ALLOCATION_ERROR;
1480         return;
1481     }
1482     if(fastLatinBuilder->forData(data, errorCode)) {
1483         const uint16_t *table = fastLatinBuilder->getTable();
1484         int32_t length = fastLatinBuilder->lengthOfTable();
1485         if(base != NULL && length == base->fastLatinTableLength &&
1486                 uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) {
1487             // Same fast Latin table as in the base, use that one instead.
1488             delete fastLatinBuilder;
1489             fastLatinBuilder = NULL;
1490             table = base->fastLatinTable;
1491         }
1492         data.fastLatinTable = table;
1493         data.fastLatinTableLength = length;
1494     } else {
1495         delete fastLatinBuilder;
1496         fastLatinBuilder = NULL;
1497     }
1498 }
1499 
1500 int32_t
getCEs(const UnicodeString & s,int64_t ces[],int32_t cesLength)1501 CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) {
1502     return getCEs(s, 0, ces, cesLength);
1503 }
1504 
1505 int32_t
getCEs(const UnicodeString & prefix,const UnicodeString & s,int64_t ces[],int32_t cesLength)1506 CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s,
1507                              int64_t ces[], int32_t cesLength) {
1508     int32_t prefixLength = prefix.length();
1509     if(prefixLength == 0) {
1510         return getCEs(s, 0, ces, cesLength);
1511     } else {
1512         return getCEs(prefix + s, prefixLength, ces, cesLength);
1513     }
1514 }
1515 
1516 int32_t
getCEs(const UnicodeString & s,int32_t start,int64_t ces[],int32_t cesLength)1517 CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start,
1518                              int64_t ces[], int32_t cesLength) {
1519     if(collIter == NULL) {
1520         collIter = new DataBuilderCollationIterator(*this);
1521         if(collIter == NULL) { return 0; }
1522     }
1523     return collIter->fetchCEs(s, start, ces, cesLength);
1524 }
1525 
1526 U_NAMESPACE_END
1527 
1528 #endif  // !UCONFIG_NO_COLLATION
1529