1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdatabuilder.cpp
7 *
8 * (replaced the former ucol_elm.cpp)
9 *
10 * created on: 2012apr01
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/localpointer.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ucharstrie.h"
21 #include "unicode/ucharstriebuilder.h"
22 #include "unicode/uniset.h"
23 #include "unicode/unistr.h"
24 #include "unicode/usetiter.h"
25 #include "unicode/utf16.h"
26 #include "cmemory.h"
27 #include "collation.h"
28 #include "collationdata.h"
29 #include "collationdatabuilder.h"
30 #include "collationfastlatinbuilder.h"
31 #include "collationiterator.h"
32 #include "normalizer2impl.h"
33 #include "utrie2.h"
34 #include "uvectr32.h"
35 #include "uvectr64.h"
36 #include "uvector.h"
37
38 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
39
40 U_NAMESPACE_BEGIN
41
~CEModifier()42 CollationDataBuilder::CEModifier::~CEModifier() {}
43
44 /**
45 * Build-time context and CE32 for a code point.
46 * If a code point has contextual mappings, then the default (no-context) mapping
47 * and all conditional mappings are stored in a singly-linked list
48 * of ConditionalCE32, sorted by context strings.
49 *
50 * Context strings sort by prefix length, then by prefix, then by contraction suffix.
51 * Context strings must be unique and in ascending order.
52 */
53 struct ConditionalCE32 : public UMemory {
ConditionalCE32ConditionalCE3254 ConditionalCE32(const UnicodeString &ct, uint32_t ce)
55 : context(ct),
56 ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32),
57 next(-1) {}
58
hasContextConditionalCE3259 inline UBool hasContext() const { return context.length() > 1; }
prefixLengthConditionalCE3260 inline int32_t prefixLength() const { return context.charAt(0); }
61
62 /**
63 * "\0" for the first entry for any code point, with its default CE32.
64 *
65 * Otherwise one unit with the length of the prefix string,
66 * then the prefix string, then the contraction suffix.
67 */
68 UnicodeString context;
69 /**
70 * CE32 for the code point and its context.
71 * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
72 */
73 uint32_t ce32;
74 /**
75 * Default CE32 for all contexts with this same prefix.
76 * Initially NO_CE32. Set only while building runtime data structures,
77 * and only on one of the nodes of a sub-list with the same prefix.
78 */
79 uint32_t defaultCE32;
80 /**
81 * CE32 for the built contexts.
82 * When fetching CEs from the builder, the contexts are built into their runtime form
83 * so that the normal collation implementation can process them.
84 * The result is cached in the list head. It is reset when the contexts are modified.
85 */
86 uint32_t builtCE32;
87 /**
88 * Index of the next ConditionalCE32.
89 * Negative for the end of the list.
90 */
91 int32_t next;
92 };
93
94 U_CDECL_BEGIN
95
96 U_CAPI void U_CALLCONV
uprv_deleteConditionalCE32(void * obj)97 uprv_deleteConditionalCE32(void *obj) {
98 delete static_cast<ConditionalCE32 *>(obj);
99 }
100
101 U_CDECL_END
102
103 /**
104 * Build-time collation element and character iterator.
105 * Uses the runtime CollationIterator for fetching CEs for a string
106 * but reads from the builder's unfinished data structures.
107 * In particular, this class reads from the unfinished trie
108 * and has to avoid CollationIterator::nextCE() and redirect other
109 * calls to data->getCE32() and data->getCE32FromSupplementary().
110 *
111 * We do this so that we need not implement the collation algorithm
112 * again for the builder and make it behave exactly like the runtime code.
113 * That would be more difficult to test and maintain than this indirection.
114 *
115 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
116 * so the data accesses from those code paths need not be modified.
117 *
118 * This class iterates directly over whole code points
119 * so that the CollationIterator does not need the finished trie
120 * for handling the LEAD_SURROGATE_TAG.
121 */
122 class DataBuilderCollationIterator : public CollationIterator {
123 public:
124 DataBuilderCollationIterator(CollationDataBuilder &b);
125
126 virtual ~DataBuilderCollationIterator();
127
128 int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength);
129
130 virtual void resetToOffset(int32_t newOffset);
131 virtual int32_t getOffset() const;
132
133 virtual UChar32 nextCodePoint(UErrorCode &errorCode);
134 virtual UChar32 previousCodePoint(UErrorCode &errorCode);
135
136 protected:
137 virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
138 virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
139
140 virtual uint32_t getDataCE32(UChar32 c) const;
141 virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode);
142
143 CollationDataBuilder &builder;
144 CollationData builderData;
145 uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
146 const UnicodeString *s;
147 int32_t pos;
148 };
149
DataBuilderCollationIterator(CollationDataBuilder & b)150 DataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b)
151 : CollationIterator(&builderData, /*numeric=*/ FALSE),
152 builder(b), builderData(b.nfcImpl),
153 s(NULL), pos(0) {
154 builderData.base = builder.base;
155 // Set all of the jamoCE32s[] to indirection CE32s.
156 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types.
157 UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j);
158 jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) |
159 CollationDataBuilder::IS_BUILDER_JAMO_CE32;
160 }
161 builderData.jamoCE32s = jamoCE32s;
162 }
163
~DataBuilderCollationIterator()164 DataBuilderCollationIterator::~DataBuilderCollationIterator() {}
165
166 int32_t
fetchCEs(const UnicodeString & str,int32_t start,int64_t ces[],int32_t cesLength)167 DataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start,
168 int64_t ces[], int32_t cesLength) {
169 // Set the pointers each time, in case they changed due to reallocation.
170 builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer());
171 builderData.ces = builder.ce64s.getBuffer();
172 builderData.contexts = builder.contexts.getBuffer();
173 // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32().
174 reset();
175 s = &str;
176 pos = start;
177 UErrorCode errorCode = U_ZERO_ERROR;
178 while(U_SUCCESS(errorCode) && pos < s->length()) {
179 // No need to keep all CEs in the iterator buffer.
180 clearCEs();
181 UChar32 c = s->char32At(pos);
182 pos += U16_LENGTH(c);
183 uint32_t ce32 = utrie2_get32(builder.trie, c);
184 const CollationData *d;
185 if(ce32 == Collation::FALLBACK_CE32) {
186 d = builder.base;
187 ce32 = builder.base->getCE32(c);
188 } else {
189 d = &builderData;
190 }
191 appendCEsFromCE32(d, c, ce32, /*forward=*/ TRUE, errorCode);
192 U_ASSERT(U_SUCCESS(errorCode));
193 for(int32_t i = 0; i < getCEsLength(); ++i) {
194 int64_t ce = getCE(i);
195 if(ce != 0) {
196 if(cesLength < Collation::MAX_EXPANSION_LENGTH) {
197 ces[cesLength] = ce;
198 }
199 ++cesLength;
200 }
201 }
202 }
203 return cesLength;
204 }
205
206 void
resetToOffset(int32_t newOffset)207 DataBuilderCollationIterator::resetToOffset(int32_t newOffset) {
208 reset();
209 pos = newOffset;
210 }
211
212 int32_t
getOffset() const213 DataBuilderCollationIterator::getOffset() const {
214 return pos;
215 }
216
217 UChar32
nextCodePoint(UErrorCode &)218 DataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
219 if(pos == s->length()) {
220 return U_SENTINEL;
221 }
222 UChar32 c = s->char32At(pos);
223 pos += U16_LENGTH(c);
224 return c;
225 }
226
227 UChar32
previousCodePoint(UErrorCode &)228 DataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
229 if(pos == 0) {
230 return U_SENTINEL;
231 }
232 UChar32 c = s->char32At(pos - 1);
233 pos -= U16_LENGTH(c);
234 return c;
235 }
236
237 void
forwardNumCodePoints(int32_t num,UErrorCode &)238 DataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
239 pos = s->moveIndex32(pos, num);
240 }
241
242 void
backwardNumCodePoints(int32_t num,UErrorCode &)243 DataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
244 pos = s->moveIndex32(pos, -num);
245 }
246
247 uint32_t
getDataCE32(UChar32 c) const248 DataBuilderCollationIterator::getDataCE32(UChar32 c) const {
249 return utrie2_get32(builder.trie, c);
250 }
251
252 uint32_t
getCE32FromBuilderData(uint32_t ce32,UErrorCode & errorCode)253 DataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) {
254 U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG));
255 if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) {
256 UChar32 jamo = Collation::indexFromCE32(ce32);
257 return utrie2_get32(builder.trie, jamo);
258 } else {
259 ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32);
260 if(cond->builtCE32 == Collation::NO_CE32) {
261 // Build the context-sensitive mappings into their runtime form and cache the result.
262 cond->builtCE32 = builder.buildContext(cond, errorCode);
263 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
264 errorCode = U_ZERO_ERROR;
265 builder.clearContexts();
266 cond->builtCE32 = builder.buildContext(cond, errorCode);
267 }
268 builderData.contexts = builder.contexts.getBuffer();
269 }
270 return cond->builtCE32;
271 }
272 }
273
274 // ------------------------------------------------------------------------- ***
275
CollationDataBuilder(UErrorCode & errorCode)276 CollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode)
277 : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
278 base(NULL), baseSettings(NULL),
279 trie(NULL),
280 ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode),
281 modified(FALSE),
282 fastLatinEnabled(FALSE), fastLatinBuilder(NULL),
283 collIter(NULL) {
284 // Reserve the first CE32 for U+0000.
285 ce32s.addElement(0, errorCode);
286 conditionalCE32s.setDeleter(uprv_deleteConditionalCE32);
287 }
288
~CollationDataBuilder()289 CollationDataBuilder::~CollationDataBuilder() {
290 utrie2_close(trie);
291 delete fastLatinBuilder;
292 delete collIter;
293 }
294
295 void
initForTailoring(const CollationData * b,UErrorCode & errorCode)296 CollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) {
297 if(U_FAILURE(errorCode)) { return; }
298 if(trie != NULL) {
299 errorCode = U_INVALID_STATE_ERROR;
300 return;
301 }
302 if(b == NULL) {
303 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
304 return;
305 }
306 base = b;
307
308 // For a tailoring, the default is to fall back to the base.
309 trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode);
310
311 // Set the Latin-1 letters block so that it is allocated first in the data array,
312 // to try to improve locality of reference when sorting Latin-1 text.
313 // Do not use utrie2_setRange32() since that will not actually allocate blocks
314 // that are filled with the default value.
315 // ASCII (0..7F) is already preallocated anyway.
316 for(UChar32 c = 0xc0; c <= 0xff; ++c) {
317 utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode);
318 }
319
320 // Hangul syllables are not tailorable (except via tailoring Jamos).
321 // Always set the Hangul tag to help performance.
322 // Do this here, rather than in buildMappings(),
323 // so that we see the HANGUL_TAG in various assertions.
324 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
325 utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode);
326
327 // Copy the set contents but don't copy/clone the set as a whole because
328 // that would copy the isFrozen state too.
329 unsafeBackwardSet.addAll(*b->unsafeBackwardSet);
330
331 if(U_FAILURE(errorCode)) { return; }
332 }
333
334 UBool
maybeSetPrimaryRange(UChar32 start,UChar32 end,uint32_t primary,int32_t step,UErrorCode & errorCode)335 CollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end,
336 uint32_t primary, int32_t step,
337 UErrorCode &errorCode) {
338 if(U_FAILURE(errorCode)) { return FALSE; }
339 U_ASSERT(start <= end);
340 // TODO: Do we need to check what values are currently set for start..end?
341 // An offset range is worth it only if we can achieve an overlap between
342 // adjacent UTrie2 blocks of 32 code points each.
343 // An offset CE is also a little more expensive to look up and compute
344 // than a simple CE.
345 // If the range spans at least three UTrie2 block boundaries (> 64 code points),
346 // then we take it.
347 // If the range spans one or two block boundaries and there are
348 // at least 4 code points on either side, then we take it.
349 // (We could additionally require a minimum range length of, say, 16.)
350 int32_t blockDelta = (end >> 5) - (start >> 5);
351 if(2 <= step && step <= 0x7f &&
352 (blockDelta >= 3 ||
353 (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) {
354 int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step;
355 if(isCompressiblePrimary(primary)) { dataCE |= 0x80; }
356 int32_t index = addCE(dataCE, errorCode);
357 if(U_FAILURE(errorCode)) { return 0; }
358 if(index > Collation::MAX_INDEX) {
359 errorCode = U_BUFFER_OVERFLOW_ERROR;
360 return 0;
361 }
362 uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index);
363 utrie2_setRange32(trie, start, end, offsetCE32, TRUE, &errorCode);
364 modified = TRUE;
365 return TRUE;
366 } else {
367 return FALSE;
368 }
369 }
370
371 uint32_t
setPrimaryRangeAndReturnNext(UChar32 start,UChar32 end,uint32_t primary,int32_t step,UErrorCode & errorCode)372 CollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end,
373 uint32_t primary, int32_t step,
374 UErrorCode &errorCode) {
375 if(U_FAILURE(errorCode)) { return 0; }
376 UBool isCompressible = isCompressiblePrimary(primary);
377 if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) {
378 return Collation::incThreeBytePrimaryByOffset(primary, isCompressible,
379 (end - start + 1) * step);
380 } else {
381 // Short range: Set individual CE32s.
382 for(;;) {
383 utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode);
384 ++start;
385 primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step);
386 if(start > end) { return primary; }
387 }
388 modified = TRUE;
389 }
390 }
391
392 uint32_t
getCE32FromOffsetCE32(UBool fromBase,UChar32 c,uint32_t ce32) const393 CollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const {
394 int32_t i = Collation::indexFromCE32(ce32);
395 int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i);
396 uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
397 return Collation::makeLongPrimaryCE32(p);
398 }
399
400 UBool
isCompressibleLeadByte(uint32_t b) const401 CollationDataBuilder::isCompressibleLeadByte(uint32_t b) const {
402 return base->isCompressibleLeadByte(b);
403 }
404
405 UBool
isAssigned(UChar32 c) const406 CollationDataBuilder::isAssigned(UChar32 c) const {
407 return Collation::isAssignedCE32(utrie2_get32(trie, c));
408 }
409
410 uint32_t
getLongPrimaryIfSingleCE(UChar32 c) const411 CollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const {
412 uint32_t ce32 = utrie2_get32(trie, c);
413 if(Collation::isLongPrimaryCE32(ce32)) {
414 return Collation::primaryFromLongPrimaryCE32(ce32);
415 } else {
416 return 0;
417 }
418 }
419
420 int64_t
getSingleCE(UChar32 c,UErrorCode & errorCode) const421 CollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
422 if(U_FAILURE(errorCode)) { return 0; }
423 UBool fromBase = FALSE;
424 uint32_t ce32 = utrie2_get32(trie, c);
425 if(ce32 == Collation::FALLBACK_CE32) {
426 fromBase = TRUE;
427 ce32 = base->getCE32(c);
428 }
429 while(Collation::isSpecialCE32(ce32)) {
430 switch(Collation::tagFromCE32(ce32)) {
431 case Collation::LATIN_EXPANSION_TAG:
432 case Collation::BUILDER_DATA_TAG:
433 case Collation::PREFIX_TAG:
434 case Collation::CONTRACTION_TAG:
435 case Collation::HANGUL_TAG:
436 case Collation::LEAD_SURROGATE_TAG:
437 errorCode = U_UNSUPPORTED_ERROR;
438 return 0;
439 case Collation::FALLBACK_TAG:
440 case Collation::RESERVED_TAG_3:
441 errorCode = U_INTERNAL_PROGRAM_ERROR;
442 return 0;
443 case Collation::LONG_PRIMARY_TAG:
444 return Collation::ceFromLongPrimaryCE32(ce32);
445 case Collation::LONG_SECONDARY_TAG:
446 return Collation::ceFromLongSecondaryCE32(ce32);
447 case Collation::EXPANSION32_TAG:
448 if(Collation::lengthFromCE32(ce32) == 1) {
449 int32_t i = Collation::indexFromCE32(ce32);
450 ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i);
451 break;
452 } else {
453 errorCode = U_UNSUPPORTED_ERROR;
454 return 0;
455 }
456 case Collation::EXPANSION_TAG: {
457 if(Collation::lengthFromCE32(ce32) == 1) {
458 int32_t i = Collation::indexFromCE32(ce32);
459 return fromBase ? base->ces[i] : ce64s.elementAti(i);
460 } else {
461 errorCode = U_UNSUPPORTED_ERROR;
462 return 0;
463 }
464 }
465 case Collation::DIGIT_TAG:
466 // Fetch the non-numeric-collation CE32 and continue.
467 ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32));
468 break;
469 case Collation::U0000_TAG:
470 U_ASSERT(c == 0);
471 // Fetch the normal ce32 for U+0000 and continue.
472 ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0);
473 break;
474 case Collation::OFFSET_TAG:
475 ce32 = getCE32FromOffsetCE32(fromBase, c, ce32);
476 break;
477 case Collation::IMPLICIT_TAG:
478 return Collation::unassignedCEFromCodePoint(c);
479 }
480 }
481 return Collation::ceFromSimpleCE32(ce32);
482 }
483
484 int32_t
addCE(int64_t ce,UErrorCode & errorCode)485 CollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) {
486 int32_t length = ce64s.size();
487 for(int32_t i = 0; i < length; ++i) {
488 if(ce == ce64s.elementAti(i)) { return i; }
489 }
490 ce64s.addElement(ce, errorCode);
491 return length;
492 }
493
494 int32_t
addCE32(uint32_t ce32,UErrorCode & errorCode)495 CollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) {
496 int32_t length = ce32s.size();
497 for(int32_t i = 0; i < length; ++i) {
498 if(ce32 == (uint32_t)ce32s.elementAti(i)) { return i; }
499 }
500 ce32s.addElement((int32_t)ce32, errorCode);
501 return length;
502 }
503
504 int32_t
addConditionalCE32(const UnicodeString & context,uint32_t ce32,UErrorCode & errorCode)505 CollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32,
506 UErrorCode &errorCode) {
507 if(U_FAILURE(errorCode)) { return -1; }
508 U_ASSERT(!context.isEmpty());
509 int32_t index = conditionalCE32s.size();
510 if(index > Collation::MAX_INDEX) {
511 errorCode = U_BUFFER_OVERFLOW_ERROR;
512 return -1;
513 }
514 ConditionalCE32 *cond = new ConditionalCE32(context, ce32);
515 if(cond == NULL) {
516 errorCode = U_MEMORY_ALLOCATION_ERROR;
517 return -1;
518 }
519 conditionalCE32s.addElement(cond, errorCode);
520 return index;
521 }
522
523 void
add(const UnicodeString & prefix,const UnicodeString & s,const int64_t ces[],int32_t cesLength,UErrorCode & errorCode)524 CollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s,
525 const int64_t ces[], int32_t cesLength,
526 UErrorCode &errorCode) {
527 uint32_t ce32 = encodeCEs(ces, cesLength, errorCode);
528 addCE32(prefix, s, ce32, errorCode);
529 }
530
531 void
addCE32(const UnicodeString & prefix,const UnicodeString & s,uint32_t ce32,UErrorCode & errorCode)532 CollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s,
533 uint32_t ce32, UErrorCode &errorCode) {
534 if(U_FAILURE(errorCode)) { return; }
535 if(s.isEmpty()) {
536 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
537 return;
538 }
539 if(trie == NULL || utrie2_isFrozen(trie)) {
540 errorCode = U_INVALID_STATE_ERROR;
541 return;
542 }
543 UChar32 c = s.char32At(0);
544 int32_t cLength = U16_LENGTH(c);
545 uint32_t oldCE32 = utrie2_get32(trie, c);
546 UBool hasContext = !prefix.isEmpty() || s.length() > cLength;
547 if(oldCE32 == Collation::FALLBACK_CE32) {
548 // First tailoring for c.
549 // If c has contextual base mappings or if we add a contextual mapping,
550 // then copy the base mappings.
551 // Otherwise we just override the base mapping.
552 uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c));
553 if(hasContext || Collation::ce32HasContext(baseCE32)) {
554 oldCE32 = copyFromBaseCE32(c, baseCE32, TRUE, errorCode);
555 utrie2_set32(trie, c, oldCE32, &errorCode);
556 if(U_FAILURE(errorCode)) { return; }
557 }
558 }
559 if(!hasContext) {
560 // No prefix, no contraction.
561 if(!isBuilderContextCE32(oldCE32)) {
562 utrie2_set32(trie, c, ce32, &errorCode);
563 } else {
564 ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32);
565 cond->builtCE32 = Collation::NO_CE32;
566 cond->ce32 = ce32;
567 }
568 } else {
569 ConditionalCE32 *cond;
570 if(!isBuilderContextCE32(oldCE32)) {
571 // Replace the simple oldCE32 with a builder context CE32
572 // pointing to a new ConditionalCE32 list head.
573 int32_t index = addConditionalCE32(UnicodeString((UChar)0), oldCE32, errorCode);
574 if(U_FAILURE(errorCode)) { return; }
575 uint32_t contextCE32 = makeBuilderContextCE32(index);
576 utrie2_set32(trie, c, contextCE32, &errorCode);
577 contextChars.add(c);
578 cond = getConditionalCE32(index);
579 } else {
580 cond = getConditionalCE32ForCE32(oldCE32);
581 cond->builtCE32 = Collation::NO_CE32;
582 }
583 UnicodeString suffix(s, cLength);
584 UnicodeString context((UChar)prefix.length());
585 context.append(prefix).append(suffix);
586 unsafeBackwardSet.addAll(suffix);
587 for(;;) {
588 // invariant: context > cond->context
589 int32_t next = cond->next;
590 if(next < 0) {
591 // Append a new ConditionalCE32 after cond.
592 int32_t index = addConditionalCE32(context, ce32, errorCode);
593 if(U_FAILURE(errorCode)) { return; }
594 cond->next = index;
595 break;
596 }
597 ConditionalCE32 *nextCond = getConditionalCE32(next);
598 int8_t cmp = context.compare(nextCond->context);
599 if(cmp < 0) {
600 // Insert a new ConditionalCE32 between cond and nextCond.
601 int32_t index = addConditionalCE32(context, ce32, errorCode);
602 if(U_FAILURE(errorCode)) { return; }
603 cond->next = index;
604 getConditionalCE32(index)->next = next;
605 break;
606 } else if(cmp == 0) {
607 // Same context as before, overwrite its ce32.
608 nextCond->ce32 = ce32;
609 break;
610 }
611 cond = nextCond;
612 }
613 }
614 modified = TRUE;
615 }
616
617 uint32_t
encodeOneCEAsCE32(int64_t ce)618 CollationDataBuilder::encodeOneCEAsCE32(int64_t ce) {
619 uint32_t p = (uint32_t)(ce >> 32);
620 uint32_t lower32 = (uint32_t)ce;
621 uint32_t t = (uint32_t)(ce & 0xffff);
622 U_ASSERT((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s.
623 if((ce & INT64_C(0xffff00ff00ff)) == 0) {
624 // normal form ppppsstt
625 return p | (lower32 >> 16) | (t >> 8);
626 } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) {
627 // long-primary form ppppppC1
628 return Collation::makeLongPrimaryCE32(p);
629 } else if(p == 0 && (t & 0xff) == 0) {
630 // long-secondary form ssssttC2
631 return Collation::makeLongSecondaryCE32(lower32);
632 }
633 return Collation::NO_CE32;
634 }
635
636 uint32_t
encodeOneCE(int64_t ce,UErrorCode & errorCode)637 CollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) {
638 // Try to encode one CE as one CE32.
639 uint32_t ce32 = encodeOneCEAsCE32(ce);
640 if(ce32 != Collation::NO_CE32) { return ce32; }
641 int32_t index = addCE(ce, errorCode);
642 if(U_FAILURE(errorCode)) { return 0; }
643 if(index > Collation::MAX_INDEX) {
644 errorCode = U_BUFFER_OVERFLOW_ERROR;
645 return 0;
646 }
647 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1);
648 }
649
650 uint32_t
encodeCEs(const int64_t ces[],int32_t cesLength,UErrorCode & errorCode)651 CollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength,
652 UErrorCode &errorCode) {
653 if(U_FAILURE(errorCode)) { return 0; }
654 if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) {
655 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
656 return 0;
657 }
658 if(trie == NULL || utrie2_isFrozen(trie)) {
659 errorCode = U_INVALID_STATE_ERROR;
660 return 0;
661 }
662 if(cesLength == 0) {
663 // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
664 // Do this here so that callers need not do it.
665 return encodeOneCEAsCE32(0);
666 } else if(cesLength == 1) {
667 return encodeOneCE(ces[0], errorCode);
668 } else if(cesLength == 2) {
669 // Try to encode two CEs as one CE32.
670 int64_t ce0 = ces[0];
671 int64_t ce1 = ces[1];
672 uint32_t p0 = (uint32_t)(ce0 >> 32);
673 if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE &&
674 (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE &&
675 p0 != 0) {
676 // Latin mini expansion
677 return
678 p0 |
679 (((uint32_t)ce0 & 0xff00u) << 8) |
680 (uint32_t)(ce1 >> 16) |
681 Collation::SPECIAL_CE32_LOW_BYTE |
682 Collation::LATIN_EXPANSION_TAG;
683 }
684 }
685 // Try to encode two or more CEs as CE32s.
686 int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH];
687 for(int32_t i = 0;; ++i) {
688 if(i == cesLength) {
689 return encodeExpansion32(newCE32s, cesLength, errorCode);
690 }
691 uint32_t ce32 = encodeOneCEAsCE32(ces[i]);
692 if(ce32 == Collation::NO_CE32) { break; }
693 newCE32s[i] = (int32_t)ce32;
694 }
695 return encodeExpansion(ces, cesLength, errorCode);
696 }
697
698 uint32_t
encodeExpansion(const int64_t ces[],int32_t length,UErrorCode & errorCode)699 CollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) {
700 if(U_FAILURE(errorCode)) { return 0; }
701 // See if this sequence of CEs has already been stored.
702 int64_t first = ces[0];
703 int32_t ce64sMax = ce64s.size() - length;
704 for(int32_t i = 0; i <= ce64sMax; ++i) {
705 if(first == ce64s.elementAti(i)) {
706 if(i > Collation::MAX_INDEX) {
707 errorCode = U_BUFFER_OVERFLOW_ERROR;
708 return 0;
709 }
710 for(int32_t j = 1;; ++j) {
711 if(j == length) {
712 return Collation::makeCE32FromTagIndexAndLength(
713 Collation::EXPANSION_TAG, i, length);
714 }
715 if(ce64s.elementAti(i + j) != ces[j]) { break; }
716 }
717 }
718 }
719 // Store the new sequence.
720 int32_t i = ce64s.size();
721 if(i > Collation::MAX_INDEX) {
722 errorCode = U_BUFFER_OVERFLOW_ERROR;
723 return 0;
724 }
725 for(int32_t j = 0; j < length; ++j) {
726 ce64s.addElement(ces[j], errorCode);
727 }
728 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length);
729 }
730
731 uint32_t
encodeExpansion32(const int32_t newCE32s[],int32_t length,UErrorCode & errorCode)732 CollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length,
733 UErrorCode &errorCode) {
734 if(U_FAILURE(errorCode)) { return 0; }
735 // See if this sequence of CE32s has already been stored.
736 int32_t first = newCE32s[0];
737 int32_t ce32sMax = ce32s.size() - length;
738 for(int32_t i = 0; i <= ce32sMax; ++i) {
739 if(first == ce32s.elementAti(i)) {
740 if(i > Collation::MAX_INDEX) {
741 errorCode = U_BUFFER_OVERFLOW_ERROR;
742 return 0;
743 }
744 for(int32_t j = 1;; ++j) {
745 if(j == length) {
746 return Collation::makeCE32FromTagIndexAndLength(
747 Collation::EXPANSION32_TAG, i, length);
748 }
749 if(ce32s.elementAti(i + j) != newCE32s[j]) { break; }
750 }
751 }
752 }
753 // Store the new sequence.
754 int32_t i = ce32s.size();
755 if(i > Collation::MAX_INDEX) {
756 errorCode = U_BUFFER_OVERFLOW_ERROR;
757 return 0;
758 }
759 for(int32_t j = 0; j < length; ++j) {
760 ce32s.addElement(newCE32s[j], errorCode);
761 }
762 return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length);
763 }
764
765 uint32_t
copyFromBaseCE32(UChar32 c,uint32_t ce32,UBool withContext,UErrorCode & errorCode)766 CollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext,
767 UErrorCode &errorCode) {
768 if(U_FAILURE(errorCode)) { return 0; }
769 if(!Collation::isSpecialCE32(ce32)) { return ce32; }
770 switch(Collation::tagFromCE32(ce32)) {
771 case Collation::LONG_PRIMARY_TAG:
772 case Collation::LONG_SECONDARY_TAG:
773 case Collation::LATIN_EXPANSION_TAG:
774 // copy as is
775 break;
776 case Collation::EXPANSION32_TAG: {
777 const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32);
778 int32_t length = Collation::lengthFromCE32(ce32);
779 ce32 = encodeExpansion32(
780 reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode);
781 break;
782 }
783 case Collation::EXPANSION_TAG: {
784 const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32);
785 int32_t length = Collation::lengthFromCE32(ce32);
786 ce32 = encodeExpansion(baseCEs, length, errorCode);
787 break;
788 }
789 case Collation::PREFIX_TAG: {
790 // Flatten prefixes and nested suffixes (contractions)
791 // into a linear list of ConditionalCE32.
792 const UChar *p = base->contexts + Collation::indexFromCE32(ce32);
793 ce32 = CollationData::readCE32(p); // Default if no prefix match.
794 if(!withContext) {
795 return copyFromBaseCE32(c, ce32, FALSE, errorCode);
796 }
797 ConditionalCE32 head(UnicodeString(), 0);
798 UnicodeString context((UChar)0);
799 int32_t index;
800 if(Collation::isContractionCE32(ce32)) {
801 index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
802 } else {
803 ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
804 head.next = index = addConditionalCE32(context, ce32, errorCode);
805 }
806 if(U_FAILURE(errorCode)) { return 0; }
807 ConditionalCE32 *cond = getConditionalCE32(index); // the last ConditionalCE32 so far
808 UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
809 while(prefixes.next(errorCode)) {
810 context = prefixes.getString();
811 context.reverse();
812 context.insert(0, (UChar)context.length());
813 ce32 = (uint32_t)prefixes.getValue();
814 if(Collation::isContractionCE32(ce32)) {
815 index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode);
816 } else {
817 ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
818 cond->next = index = addConditionalCE32(context, ce32, errorCode);
819 }
820 if(U_FAILURE(errorCode)) { return 0; }
821 cond = getConditionalCE32(index);
822 }
823 ce32 = makeBuilderContextCE32(head.next);
824 contextChars.add(c);
825 break;
826 }
827 case Collation::CONTRACTION_TAG: {
828 if(!withContext) {
829 const UChar *p = base->contexts + Collation::indexFromCE32(ce32);
830 ce32 = CollationData::readCE32(p); // Default if no suffix match.
831 return copyFromBaseCE32(c, ce32, FALSE, errorCode);
832 }
833 ConditionalCE32 head(UnicodeString(), 0);
834 UnicodeString context((UChar)0);
835 copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode);
836 ce32 = makeBuilderContextCE32(head.next);
837 contextChars.add(c);
838 break;
839 }
840 case Collation::HANGUL_TAG:
841 errorCode = U_UNSUPPORTED_ERROR; // We forbid tailoring of Hangul syllables.
842 break;
843 case Collation::OFFSET_TAG:
844 ce32 = getCE32FromOffsetCE32(TRUE, c, ce32);
845 break;
846 case Collation::IMPLICIT_TAG:
847 ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode);
848 break;
849 default:
850 U_ASSERT(FALSE); // require ce32 == base->getFinalCE32(ce32)
851 break;
852 }
853 return ce32;
854 }
855
856 int32_t
copyContractionsFromBaseCE32(UnicodeString & context,UChar32 c,uint32_t ce32,ConditionalCE32 * cond,UErrorCode & errorCode)857 CollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32,
858 ConditionalCE32 *cond, UErrorCode &errorCode) {
859 if(U_FAILURE(errorCode)) { return 0; }
860 const UChar *p = base->contexts + Collation::indexFromCE32(ce32);
861 int32_t index;
862 if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
863 // No match on the single code point.
864 // We are underneath a prefix, and the default mapping is just
865 // a fallback to the mappings for a shorter prefix.
866 U_ASSERT(context.length() > 1);
867 index = -1;
868 } else {
869 ce32 = CollationData::readCE32(p); // Default if no suffix match.
870 U_ASSERT(!Collation::isContractionCE32(ce32));
871 ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
872 cond->next = index = addConditionalCE32(context, ce32, errorCode);
873 if(U_FAILURE(errorCode)) { return 0; }
874 cond = getConditionalCE32(index);
875 }
876
877 int32_t suffixStart = context.length();
878 UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
879 while(suffixes.next(errorCode)) {
880 context.append(suffixes.getString());
881 ce32 = copyFromBaseCE32(c, (uint32_t)suffixes.getValue(), TRUE, errorCode);
882 cond->next = index = addConditionalCE32(context, ce32, errorCode);
883 if(U_FAILURE(errorCode)) { return 0; }
884 // No need to update the unsafeBackwardSet because the tailoring set
885 // is already a copy of the base set.
886 cond = getConditionalCE32(index);
887 context.truncate(suffixStart);
888 }
889 U_ASSERT(index >= 0);
890 return index;
891 }
892
893 class CopyHelper {
894 public:
CopyHelper(const CollationDataBuilder & s,CollationDataBuilder & d,const CollationDataBuilder::CEModifier & m,UErrorCode & initialErrorCode)895 CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d,
896 const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode)
897 : src(s), dest(d), modifier(m),
898 errorCode(initialErrorCode) {}
899
copyRangeCE32(UChar32 start,UChar32 end,uint32_t ce32)900 UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) {
901 ce32 = copyCE32(ce32);
902 utrie2_setRange32(dest.trie, start, end, ce32, TRUE, &errorCode);
903 if(CollationDataBuilder::isBuilderContextCE32(ce32)) {
904 dest.contextChars.add(start, end);
905 }
906 return U_SUCCESS(errorCode);
907 }
908
copyCE32(uint32_t ce32)909 uint32_t copyCE32(uint32_t ce32) {
910 if(!Collation::isSpecialCE32(ce32)) {
911 int64_t ce = modifier.modifyCE32(ce32);
912 if(ce != Collation::NO_CE) {
913 ce32 = dest.encodeOneCE(ce, errorCode);
914 }
915 } else {
916 int32_t tag = Collation::tagFromCE32(ce32);
917 if(tag == Collation::EXPANSION32_TAG) {
918 const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer());
919 srcCE32s += Collation::indexFromCE32(ce32);
920 int32_t length = Collation::lengthFromCE32(ce32);
921 // Inspect the source CE32s. Just copy them if none are modified.
922 // Otherwise copy to modifiedCEs, with modifications.
923 UBool isModified = FALSE;
924 for(int32_t i = 0; i < length; ++i) {
925 ce32 = srcCE32s[i];
926 int64_t ce;
927 if(Collation::isSpecialCE32(ce32) ||
928 (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) {
929 if(isModified) {
930 modifiedCEs[i] = Collation::ceFromCE32(ce32);
931 }
932 } else {
933 if(!isModified) {
934 for(int32_t j = 0; j < i; ++j) {
935 modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]);
936 }
937 isModified = TRUE;
938 }
939 modifiedCEs[i] = ce;
940 }
941 }
942 if(isModified) {
943 ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
944 } else {
945 ce32 = dest.encodeExpansion32(
946 reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode);
947 }
948 } else if(tag == Collation::EXPANSION_TAG) {
949 const int64_t *srcCEs = src.ce64s.getBuffer();
950 srcCEs += Collation::indexFromCE32(ce32);
951 int32_t length = Collation::lengthFromCE32(ce32);
952 // Inspect the source CEs. Just copy them if none are modified.
953 // Otherwise copy to modifiedCEs, with modifications.
954 UBool isModified = FALSE;
955 for(int32_t i = 0; i < length; ++i) {
956 int64_t srcCE = srcCEs[i];
957 int64_t ce = modifier.modifyCE(srcCE);
958 if(ce == Collation::NO_CE) {
959 if(isModified) {
960 modifiedCEs[i] = srcCE;
961 }
962 } else {
963 if(!isModified) {
964 for(int32_t j = 0; j < i; ++j) {
965 modifiedCEs[j] = srcCEs[j];
966 }
967 isModified = TRUE;
968 }
969 modifiedCEs[i] = ce;
970 }
971 }
972 if(isModified) {
973 ce32 = dest.encodeCEs(modifiedCEs, length, errorCode);
974 } else {
975 ce32 = dest.encodeExpansion(srcCEs, length, errorCode);
976 }
977 } else if(tag == Collation::BUILDER_DATA_TAG) {
978 // Copy the list of ConditionalCE32.
979 ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32);
980 U_ASSERT(!cond->hasContext());
981 int32_t destIndex = dest.addConditionalCE32(
982 cond->context, copyCE32(cond->ce32), errorCode);
983 ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex);
984 while(cond->next >= 0) {
985 cond = src.getConditionalCE32(cond->next);
986 ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex);
987 destIndex = dest.addConditionalCE32(
988 cond->context, copyCE32(cond->ce32), errorCode);
989 int32_t suffixStart = cond->prefixLength() + 1;
990 dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart));
991 prevDestCond->next = destIndex;
992 }
993 } else {
994 // Just copy long CEs and Latin mini expansions (and other expected values) as is,
995 // assuming that the modifier would not modify them.
996 U_ASSERT(tag == Collation::LONG_PRIMARY_TAG ||
997 tag == Collation::LONG_SECONDARY_TAG ||
998 tag == Collation::LATIN_EXPANSION_TAG ||
999 tag == Collation::HANGUL_TAG);
1000 }
1001 }
1002 return ce32;
1003 }
1004
1005 const CollationDataBuilder &src;
1006 CollationDataBuilder &dest;
1007 const CollationDataBuilder::CEModifier &modifier;
1008 int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH];
1009 UErrorCode errorCode;
1010 };
1011
1012 U_CDECL_BEGIN
1013
1014 static UBool U_CALLCONV
enumRangeForCopy(const void * context,UChar32 start,UChar32 end,uint32_t value)1015 enumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1016 return
1017 value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 ||
1018 ((CopyHelper *)context)->copyRangeCE32(start, end, value);
1019 }
1020
1021 U_CDECL_END
1022
1023 void
copyFrom(const CollationDataBuilder & src,const CEModifier & modifier,UErrorCode & errorCode)1024 CollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier,
1025 UErrorCode &errorCode) {
1026 if(U_FAILURE(errorCode)) { return; }
1027 if(trie == NULL || utrie2_isFrozen(trie)) {
1028 errorCode = U_INVALID_STATE_ERROR;
1029 return;
1030 }
1031 CopyHelper helper(src, *this, modifier, errorCode);
1032 utrie2_enum(src.trie, NULL, enumRangeForCopy, &helper);
1033 errorCode = helper.errorCode;
1034 // Update the contextChars and the unsafeBackwardSet while copying,
1035 // in case a character had conditional mappings in the source builder
1036 // and they were removed later.
1037 modified |= src.modified;
1038 }
1039
1040 void
optimize(const UnicodeSet & set,UErrorCode & errorCode)1041 CollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) {
1042 if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1043 UnicodeSetIterator iter(set);
1044 while(iter.next() && !iter.isString()) {
1045 UChar32 c = iter.getCodepoint();
1046 uint32_t ce32 = utrie2_get32(trie, c);
1047 if(ce32 == Collation::FALLBACK_CE32) {
1048 ce32 = base->getFinalCE32(base->getCE32(c));
1049 ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode);
1050 utrie2_set32(trie, c, ce32, &errorCode);
1051 }
1052 }
1053 modified = TRUE;
1054 }
1055
1056 void
suppressContractions(const UnicodeSet & set,UErrorCode & errorCode)1057 CollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) {
1058 if(U_FAILURE(errorCode) || set.isEmpty()) { return; }
1059 UnicodeSetIterator iter(set);
1060 while(iter.next() && !iter.isString()) {
1061 UChar32 c = iter.getCodepoint();
1062 uint32_t ce32 = utrie2_get32(trie, c);
1063 if(ce32 == Collation::FALLBACK_CE32) {
1064 ce32 = base->getFinalCE32(base->getCE32(c));
1065 if(Collation::ce32HasContext(ce32)) {
1066 ce32 = copyFromBaseCE32(c, ce32, FALSE /* without context */, errorCode);
1067 utrie2_set32(trie, c, ce32, &errorCode);
1068 }
1069 } else if(isBuilderContextCE32(ce32)) {
1070 ce32 = getConditionalCE32ForCE32(ce32)->ce32;
1071 // Simply abandon the list of ConditionalCE32.
1072 // The caller will copy this builder in the end,
1073 // eliminating unreachable data.
1074 utrie2_set32(trie, c, ce32, &errorCode);
1075 contextChars.remove(c);
1076 }
1077 }
1078 modified = TRUE;
1079 }
1080
1081 UBool
getJamoCE32s(uint32_t jamoCE32s[],UErrorCode & errorCode)1082 CollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) {
1083 if(U_FAILURE(errorCode)) { return FALSE; }
1084 UBool anyJamoAssigned = base == NULL; // always set jamoCE32s in the base data
1085 UBool needToCopyFromBase = FALSE;
1086 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types.
1087 UChar32 jamo = jamoCpFromIndex(j);
1088 UBool fromBase = FALSE;
1089 uint32_t ce32 = utrie2_get32(trie, jamo);
1090 anyJamoAssigned |= Collation::isAssignedCE32(ce32);
1091 // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
1092 // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
1093 if(ce32 == Collation::FALLBACK_CE32) {
1094 fromBase = TRUE;
1095 ce32 = base->getCE32(jamo);
1096 }
1097 if(Collation::isSpecialCE32(ce32)) {
1098 switch(Collation::tagFromCE32(ce32)) {
1099 case Collation::LONG_PRIMARY_TAG:
1100 case Collation::LONG_SECONDARY_TAG:
1101 case Collation::LATIN_EXPANSION_TAG:
1102 // Copy the ce32 as-is.
1103 break;
1104 case Collation::EXPANSION32_TAG:
1105 case Collation::EXPANSION_TAG:
1106 case Collation::PREFIX_TAG:
1107 case Collation::CONTRACTION_TAG:
1108 if(fromBase) {
1109 // Defer copying until we know if anyJamoAssigned.
1110 ce32 = Collation::FALLBACK_CE32;
1111 needToCopyFromBase = TRUE;
1112 }
1113 break;
1114 case Collation::IMPLICIT_TAG:
1115 // An unassigned Jamo should only occur in tests with incomplete bases.
1116 U_ASSERT(fromBase);
1117 ce32 = Collation::FALLBACK_CE32;
1118 needToCopyFromBase = TRUE;
1119 break;
1120 case Collation::OFFSET_TAG:
1121 ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
1122 break;
1123 case Collation::FALLBACK_TAG:
1124 case Collation::RESERVED_TAG_3:
1125 case Collation::BUILDER_DATA_TAG:
1126 case Collation::DIGIT_TAG:
1127 case Collation::U0000_TAG:
1128 case Collation::HANGUL_TAG:
1129 case Collation::LEAD_SURROGATE_TAG:
1130 errorCode = U_INTERNAL_PROGRAM_ERROR;
1131 return FALSE;
1132 }
1133 }
1134 jamoCE32s[j] = ce32;
1135 }
1136 if(anyJamoAssigned && needToCopyFromBase) {
1137 for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) {
1138 if(jamoCE32s[j] == Collation::FALLBACK_CE32) {
1139 UChar32 jamo = jamoCpFromIndex(j);
1140 jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo),
1141 /*withContext=*/ TRUE, errorCode);
1142 }
1143 }
1144 }
1145 return anyJamoAssigned && U_SUCCESS(errorCode);
1146 }
1147
1148 void
setDigitTags(UErrorCode & errorCode)1149 CollationDataBuilder::setDigitTags(UErrorCode &errorCode) {
1150 UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode);
1151 if(U_FAILURE(errorCode)) { return; }
1152 UnicodeSetIterator iter(digits);
1153 while(iter.next()) {
1154 U_ASSERT(!iter.isString());
1155 UChar32 c = iter.getCodepoint();
1156 uint32_t ce32 = utrie2_get32(trie, c);
1157 if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) {
1158 int32_t index = addCE32(ce32, errorCode);
1159 if(U_FAILURE(errorCode)) { return; }
1160 if(index > Collation::MAX_INDEX) {
1161 errorCode = U_BUFFER_OVERFLOW_ERROR;
1162 return;
1163 }
1164 ce32 = Collation::makeCE32FromTagIndexAndLength(
1165 Collation::DIGIT_TAG, index, u_charDigitValue(c));
1166 utrie2_set32(trie, c, ce32, &errorCode);
1167 }
1168 }
1169 }
1170
1171 U_CDECL_BEGIN
1172
1173 static UBool U_CALLCONV
enumRangeLeadValue(const void * context,UChar32,UChar32,uint32_t value)1174 enumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1175 int32_t *pValue = (int32_t *)context;
1176 if(value == Collation::UNASSIGNED_CE32) {
1177 value = Collation::LEAD_ALL_UNASSIGNED;
1178 } else if(value == Collation::FALLBACK_CE32) {
1179 value = Collation::LEAD_ALL_FALLBACK;
1180 } else {
1181 *pValue = Collation::LEAD_MIXED;
1182 return FALSE;
1183 }
1184 if(*pValue < 0) {
1185 *pValue = (int32_t)value;
1186 } else if(*pValue != (int32_t)value) {
1187 *pValue = Collation::LEAD_MIXED;
1188 return FALSE;
1189 }
1190 return TRUE;
1191 }
1192
1193 U_CDECL_END
1194
1195 void
setLeadSurrogates(UErrorCode & errorCode)1196 CollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) {
1197 for(UChar lead = 0xd800; lead < 0xdc00; ++lead) {
1198 int32_t value = -1;
1199 utrie2_enumForLeadSurrogate(trie, lead, NULL, enumRangeLeadValue, &value);
1200 utrie2_set32ForLeadSurrogateCodeUnit(
1201 trie, lead,
1202 Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value,
1203 &errorCode);
1204 }
1205 }
1206
1207 void
build(CollationData & data,UErrorCode & errorCode)1208 CollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) {
1209 buildMappings(data, errorCode);
1210 if(base != NULL) {
1211 data.numericPrimary = base->numericPrimary;
1212 data.compressibleBytes = base->compressibleBytes;
1213 data.scripts = base->scripts;
1214 data.scriptsLength = base->scriptsLength;
1215 }
1216 buildFastLatinTable(data, errorCode);
1217 }
1218
1219 void
buildMappings(CollationData & data,UErrorCode & errorCode)1220 CollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) {
1221 if(U_FAILURE(errorCode)) { return; }
1222 if(trie == NULL || utrie2_isFrozen(trie)) {
1223 errorCode = U_INVALID_STATE_ERROR;
1224 return;
1225 }
1226
1227 buildContexts(errorCode);
1228
1229 uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH];
1230 int32_t jamoIndex = -1;
1231 if(getJamoCE32s(jamoCE32s, errorCode)) {
1232 jamoIndex = ce32s.size();
1233 for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1234 ce32s.addElement((int32_t)jamoCE32s[i], errorCode);
1235 }
1236 // Small optimization: Use a bit in the Hangul ce32
1237 // to indicate that none of the Jamo CE32s are isSpecialCE32()
1238 // (as it should be in the root collator).
1239 // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
1240 // In order to still have good trie compression and keep this code simple,
1241 // we only set this flag if a whole block of 588 Hangul syllables starting with
1242 // a common leading consonant (Jamo L) has this property.
1243 UBool isAnyJamoVTSpecial = FALSE;
1244 for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) {
1245 if(Collation::isSpecialCE32(jamoCE32s[i])) {
1246 isAnyJamoVTSpecial = TRUE;
1247 break;
1248 }
1249 }
1250 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0);
1251 UChar32 c = Hangul::HANGUL_BASE;
1252 for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L
1253 uint32_t ce32 = hangulCE32;
1254 if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) {
1255 ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO;
1256 }
1257 UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1258 utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode);
1259 c = limit;
1260 }
1261 } else {
1262 // Copy the Hangul CE32s from the base in blocks per Jamo L,
1263 // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
1264 for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) {
1265 uint32_t ce32 = base->getCE32(c);
1266 U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG));
1267 UChar32 limit = c + Hangul::JAMO_VT_COUNT;
1268 utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode);
1269 c = limit;
1270 }
1271 }
1272
1273 setDigitTags(errorCode);
1274 setLeadSurrogates(errorCode);
1275
1276 // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
1277 ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0);
1278 utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode);
1279
1280 utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode);
1281 if(U_FAILURE(errorCode)) { return; }
1282
1283 // Mark each lead surrogate as "unsafe"
1284 // if any of its 1024 associated supplementary code points is "unsafe".
1285 UChar32 c = 0x10000;
1286 for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
1287 if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
1288 unsafeBackwardSet.add(lead);
1289 }
1290 }
1291 unsafeBackwardSet.freeze();
1292
1293 data.trie = trie;
1294 data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer());
1295 data.ces = ce64s.getBuffer();
1296 data.contexts = contexts.getBuffer();
1297
1298 data.ce32sLength = ce32s.size();
1299 data.cesLength = ce64s.size();
1300 data.contextsLength = contexts.length();
1301
1302 data.base = base;
1303 if(jamoIndex >= 0) {
1304 data.jamoCE32s = data.ce32s + jamoIndex;
1305 } else {
1306 data.jamoCE32s = base->jamoCE32s;
1307 }
1308 data.unsafeBackwardSet = &unsafeBackwardSet;
1309 }
1310
1311 void
clearContexts()1312 CollationDataBuilder::clearContexts() {
1313 contexts.remove();
1314 UnicodeSetIterator iter(contextChars);
1315 while(iter.next()) {
1316 U_ASSERT(!iter.isString());
1317 uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint());
1318 U_ASSERT(isBuilderContextCE32(ce32));
1319 getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32;
1320 }
1321 }
1322
1323 void
buildContexts(UErrorCode & errorCode)1324 CollationDataBuilder::buildContexts(UErrorCode &errorCode) {
1325 if(U_FAILURE(errorCode)) { return; }
1326 // Ignore abandoned lists and the cached builtCE32,
1327 // and build all contexts from scratch.
1328 contexts.remove();
1329 UnicodeSetIterator iter(contextChars);
1330 while(U_SUCCESS(errorCode) && iter.next()) {
1331 U_ASSERT(!iter.isString());
1332 UChar32 c = iter.getCodepoint();
1333 uint32_t ce32 = utrie2_get32(trie, c);
1334 if(!isBuilderContextCE32(ce32)) {
1335 // Impossible: No context data for c in contextChars.
1336 errorCode = U_INTERNAL_PROGRAM_ERROR;
1337 return;
1338 }
1339 ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32);
1340 ce32 = buildContext(cond, errorCode);
1341 utrie2_set32(trie, c, ce32, &errorCode);
1342 }
1343 }
1344
1345 uint32_t
buildContext(ConditionalCE32 * head,UErrorCode & errorCode)1346 CollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) {
1347 if(U_FAILURE(errorCode)) { return 0; }
1348 // The list head must have no context.
1349 U_ASSERT(!head->hasContext());
1350 // The list head must be followed by one or more nodes that all do have context.
1351 U_ASSERT(head->next >= 0);
1352 UCharsTrieBuilder prefixBuilder(errorCode);
1353 UCharsTrieBuilder contractionBuilder(errorCode);
1354 for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) {
1355 // After the list head, the prefix or suffix can be empty, but not both.
1356 U_ASSERT(cond == head || cond->hasContext());
1357 int32_t prefixLength = cond->prefixLength();
1358 UnicodeString prefix(cond->context, 0, prefixLength + 1);
1359 // Collect all contraction suffixes for one prefix.
1360 ConditionalCE32 *firstCond = cond;
1361 ConditionalCE32 *lastCond = cond;
1362 while(cond->next >= 0 &&
1363 (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) {
1364 lastCond = cond;
1365 }
1366 uint32_t ce32;
1367 int32_t suffixStart = prefixLength + 1; // == prefix.length()
1368 if(lastCond->context.length() == suffixStart) {
1369 // One prefix without contraction suffix.
1370 U_ASSERT(firstCond == lastCond);
1371 ce32 = lastCond->ce32;
1372 cond = lastCond;
1373 } else {
1374 // Build the contractions trie.
1375 contractionBuilder.clear();
1376 // Entry for an empty suffix, to be stored before the trie.
1377 uint32_t emptySuffixCE32;
1378 uint32_t flags = 0;
1379 if(firstCond->context.length() == suffixStart) {
1380 // There is a mapping for the prefix and the single character c. (p|c)
1381 // If no other suffix matches, then we return this value.
1382 emptySuffixCE32 = firstCond->ce32;
1383 cond = getConditionalCE32(firstCond->next);
1384 } else {
1385 // There is no mapping for the prefix and just the single character.
1386 // (There is no p|c, only p|cd, p|ce etc.)
1387 flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH;
1388 // When the prefix matches but none of the prefix-specific suffixes,
1389 // then we fall back to the mappings with the next-longest prefix,
1390 // and ultimately to mappings with no prefix.
1391 // Each fallback might be another set of contractions.
1392 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1393 // then in text "pch" we find the ch contraction.
1394 for(cond = head;; cond = getConditionalCE32(cond->next)) {
1395 int32_t length = cond->prefixLength();
1396 if(length == prefixLength) { break; }
1397 if(cond->defaultCE32 != Collation::NO_CE32 &&
1398 (length==0 || prefix.endsWith(cond->context, 1, length))) {
1399 emptySuffixCE32 = cond->defaultCE32;
1400 }
1401 }
1402 cond = firstCond;
1403 }
1404 // Optimization: Set a flag when
1405 // the first character of every contraction suffix has lccc!=0.
1406 // Short-circuits contraction matching when a normal letter follows.
1407 flags |= Collation::CONTRACT_NEXT_CCC;
1408 // Add all of the non-empty suffixes into the contraction trie.
1409 for(;;) {
1410 UnicodeString suffix(cond->context, suffixStart);
1411 uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0));
1412 if(fcd16 <= 0xff) {
1413 flags &= ~Collation::CONTRACT_NEXT_CCC;
1414 }
1415 fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1));
1416 if(fcd16 > 0xff) {
1417 // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1418 flags |= Collation::CONTRACT_TRAILING_CCC;
1419 }
1420 contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode);
1421 if(cond == lastCond) { break; }
1422 cond = getConditionalCE32(cond->next);
1423 }
1424 int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode);
1425 if(U_FAILURE(errorCode)) { return 0; }
1426 if(index > Collation::MAX_INDEX) {
1427 errorCode = U_BUFFER_OVERFLOW_ERROR;
1428 return 0;
1429 }
1430 ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags;
1431 }
1432 U_ASSERT(cond == lastCond);
1433 firstCond->defaultCE32 = ce32;
1434 if(prefixLength == 0) {
1435 if(cond->next < 0) {
1436 // No non-empty prefixes, only contractions.
1437 return ce32;
1438 }
1439 } else {
1440 prefix.remove(0, 1); // Remove the length unit.
1441 prefix.reverse();
1442 prefixBuilder.add(prefix, (int32_t)ce32, errorCode);
1443 if(cond->next < 0) { break; }
1444 }
1445 }
1446 U_ASSERT(head->defaultCE32 != Collation::NO_CE32);
1447 int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode);
1448 if(U_FAILURE(errorCode)) { return 0; }
1449 if(index > Collation::MAX_INDEX) {
1450 errorCode = U_BUFFER_OVERFLOW_ERROR;
1451 return 0;
1452 }
1453 return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index);
1454 }
1455
1456 int32_t
addContextTrie(uint32_t defaultCE32,UCharsTrieBuilder & trieBuilder,UErrorCode & errorCode)1457 CollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder,
1458 UErrorCode &errorCode) {
1459 UnicodeString context;
1460 context.append((UChar)(defaultCE32 >> 16)).append((UChar)defaultCE32);
1461 UnicodeString trieString;
1462 context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode));
1463 if(U_FAILURE(errorCode)) { return -1; }
1464 int32_t index = contexts.indexOf(context);
1465 if(index < 0) {
1466 index = contexts.length();
1467 contexts.append(context);
1468 }
1469 return index;
1470 }
1471
1472 void
buildFastLatinTable(CollationData & data,UErrorCode & errorCode)1473 CollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) {
1474 if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; }
1475
1476 delete fastLatinBuilder;
1477 fastLatinBuilder = new CollationFastLatinBuilder(errorCode);
1478 if(fastLatinBuilder == NULL) {
1479 errorCode = U_MEMORY_ALLOCATION_ERROR;
1480 return;
1481 }
1482 if(fastLatinBuilder->forData(data, errorCode)) {
1483 const uint16_t *table = fastLatinBuilder->getTable();
1484 int32_t length = fastLatinBuilder->lengthOfTable();
1485 if(base != NULL && length == base->fastLatinTableLength &&
1486 uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) {
1487 // Same fast Latin table as in the base, use that one instead.
1488 delete fastLatinBuilder;
1489 fastLatinBuilder = NULL;
1490 table = base->fastLatinTable;
1491 }
1492 data.fastLatinTable = table;
1493 data.fastLatinTableLength = length;
1494 } else {
1495 delete fastLatinBuilder;
1496 fastLatinBuilder = NULL;
1497 }
1498 }
1499
1500 int32_t
getCEs(const UnicodeString & s,int64_t ces[],int32_t cesLength)1501 CollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) {
1502 return getCEs(s, 0, ces, cesLength);
1503 }
1504
1505 int32_t
getCEs(const UnicodeString & prefix,const UnicodeString & s,int64_t ces[],int32_t cesLength)1506 CollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s,
1507 int64_t ces[], int32_t cesLength) {
1508 int32_t prefixLength = prefix.length();
1509 if(prefixLength == 0) {
1510 return getCEs(s, 0, ces, cesLength);
1511 } else {
1512 return getCEs(prefix + s, prefixLength, ces, cesLength);
1513 }
1514 }
1515
1516 int32_t
getCEs(const UnicodeString & s,int32_t start,int64_t ces[],int32_t cesLength)1517 CollationDataBuilder::getCEs(const UnicodeString &s, int32_t start,
1518 int64_t ces[], int32_t cesLength) {
1519 if(collIter == NULL) {
1520 collIter = new DataBuilderCollationIterator(*this);
1521 if(collIter == NULL) { return 0; }
1522 }
1523 return collIter->fetchCEs(s, start, ces, cesLength);
1524 }
1525
1526 U_NAMESPACE_END
1527
1528 #endif // !UCONFIG_NO_COLLATION
1529