1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2001-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: utrie2_builder.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2008sep26 (split off from utrie2.c)
16 * created by: Markus W. Scherer
17 *
18 * This is a common implementation of a Unicode trie.
19 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
20 * Unicode code points (0..0x10ffff).
21 * This is the second common version of a Unicode trie (hence the name UTrie2).
22 * See utrie2.h for a comparison.
23 *
24 * This file contains only the builder code.
25 * See utrie2.c for the runtime and enumeration code.
26 */
27 // #define UTRIE2_DEBUG
28 #ifdef UTRIE2_DEBUG
29 # include <stdio.h>
30 #endif
31 // #define UCPTRIE_DEBUG
32
33 #include "unicode/utypes.h"
34 #ifdef UCPTRIE_DEBUG
35 #include "unicode/ucptrie.h"
36 #include "unicode/umutablecptrie.h"
37 #include "ucptrie_impl.h"
38 #endif
39 #include "cmemory.h"
40 #include "utrie2.h"
41 #include "utrie2_impl.h"
42
43 #include "utrie.h" // for utrie2_fromUTrie()
44
45 /* Implementation notes ----------------------------------------------------- */
46
47 /*
48 * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
49 * have been chosen to minimize trie sizes overall.
50 * Most of the code is flexible enough to work with a range of values,
51 * within certain limits.
52 *
53 * Exception: Support for separate values for lead surrogate code _units_
54 * vs. code _points_ was added after the constants were fixed,
55 * and has not been tested nor particularly designed for different constant values.
56 * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
57 * part and back.)
58 *
59 * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
60 * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
61 * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
62 * remapping) stops working.
63 *
64 * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
65 * assumes that a single index-2 block is used for 0x400 code points
66 * corresponding to one lead surrogate.
67 *
68 * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
69 * more than one Unicode plane, and the split of the index-2 table into a BMP
70 * part and a supplementary part, with a gap in between, would not work.
71 *
72 * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
73 * there is data with more than 64k distinct values,
74 * for example for Unihan collation with a separate collation weight per
75 * Han character.
76 */
77
78 /* Building a trie ----------------------------------------------------------*/
79
80 enum {
81 /** The null index-2 block, following the gap in the index-2 table. */
82 UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
83
84 /** The start of allocated index-2 blocks. */
85 UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
86
87 /**
88 * The null data block.
89 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
90 * to work with 6-bit trail bytes from 2-byte UTF-8.
91 */
92 UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
93
94 /** The start of allocated data blocks. */
95 UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
96
97 /**
98 * The start of data blocks for U+0800 and above.
99 * Below, compaction uses a block length of 64 for 2-byte UTF-8.
100 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
101 * Data values for 0x780 code points beyond ASCII.
102 */
103 UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
104 };
105
106 /* Start with allocation of 16k data entries. */
107 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
108
109 /* Grow about 8x each time. */
110 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
111
112 static int32_t
113 allocIndex2Block(UNewTrie2 *trie);
114
115 U_CAPI UTrie2 * U_EXPORT2
utrie2_open(uint32_t initialValue,uint32_t errorValue,UErrorCode * pErrorCode)116 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
117 UTrie2 *trie;
118 UNewTrie2 *newTrie;
119 uint32_t *data;
120 int32_t i, j;
121
122 if(U_FAILURE(*pErrorCode)) {
123 return nullptr;
124 }
125
126 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
127 newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
128 data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
129 if(trie==nullptr || newTrie==nullptr || data==nullptr) {
130 uprv_free(trie);
131 uprv_free(newTrie);
132 uprv_free(data);
133 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
134 return 0;
135 }
136
137 uprv_memset(trie, 0, sizeof(UTrie2));
138 trie->initialValue=initialValue;
139 trie->errorValue=errorValue;
140 trie->highStart=0x110000;
141 trie->newTrie=newTrie;
142 #ifdef UTRIE2_DEBUG
143 trie->name="open";
144 #endif
145
146 newTrie->data=data;
147 #ifdef UCPTRIE_DEBUG
148 newTrie->t3=umutablecptrie_open(initialValue, errorValue, pErrorCode);
149 #endif
150 newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
151 newTrie->initialValue=initialValue;
152 newTrie->errorValue=errorValue;
153 newTrie->highStart=0x110000;
154 newTrie->firstFreeBlock=0; /* no free block in the list */
155 newTrie->isCompacted=false;
156
157 /*
158 * preallocate and reset
159 * - ASCII
160 * - the bad-UTF-8-data block
161 * - the null data block
162 */
163 for(i=0; i<0x80; ++i) {
164 newTrie->data[i]=initialValue;
165 }
166 for(; i<0xc0; ++i) {
167 newTrie->data[i]=errorValue;
168 }
169 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
170 newTrie->data[i]=initialValue;
171 }
172 newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
173 newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
174
175 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
176 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
177 newTrie->index2[i]=j;
178 newTrie->map[i]=1;
179 }
180 /* reference counts for the bad-UTF-8-data block */
181 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
182 newTrie->map[i]=0;
183 }
184 /*
185 * Reference counts for the null data block: all blocks except for the ASCII blocks.
186 * Plus 1 so that we don't drop this block during compaction.
187 * Plus as many as needed for lead surrogate code points.
188 */
189 /* i==newTrie->dataNullOffset */
190 newTrie->map[i++]=
191 (0x110000>>UTRIE2_SHIFT_2)-
192 (0x80>>UTRIE2_SHIFT_2)+
193 1+
194 UTRIE2_LSCP_INDEX_2_LENGTH;
195 j+=UTRIE2_DATA_BLOCK_LENGTH;
196 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
197 newTrie->map[i]=0;
198 }
199
200 /*
201 * set the remaining indexes in the BMP index-2 block
202 * to the null data block
203 */
204 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
205 newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
206 }
207
208 /*
209 * Fill the index gap with impossible values so that compaction
210 * does not overlap other index-2 blocks with the gap.
211 */
212 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
213 newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
214 }
215
216 /* set the indexes in the null index-2 block */
217 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
218 newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
219 }
220 newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
221 newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
222
223 /* set the index-1 indexes for the linear index-2 block */
224 for(i=0, j=0;
225 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
226 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
227 ) {
228 newTrie->index1[i]=j;
229 }
230
231 /* set the remaining index-1 indexes to the null index-2 block */
232 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
233 newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
234 }
235
236 /*
237 * Preallocate and reset data for U+0080..U+07ff,
238 * for 2-byte UTF-8 which will be compacted in 64-blocks
239 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
240 */
241 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
242 utrie2_set32(trie, i, initialValue, pErrorCode);
243 }
244
245 return trie;
246 }
247
248 static UNewTrie2 *
cloneBuilder(const UNewTrie2 * other)249 cloneBuilder(const UNewTrie2 *other) {
250 UNewTrie2 *trie;
251
252 trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
253 if(trie==nullptr) {
254 return nullptr;
255 }
256
257 trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
258 if(trie->data==nullptr) {
259 uprv_free(trie);
260 return nullptr;
261 }
262 #ifdef UCPTRIE_DEBUG
263 if(other->t3==nullptr) {
264 trie->t3=nullptr;
265 } else {
266 UErrorCode errorCode=U_ZERO_ERROR;
267 trie->t3=umutablecptrie_clone(other->t3, &errorCode);
268 }
269 #endif
270 trie->dataCapacity=other->dataCapacity;
271
272 /* clone data */
273 uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
274 uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4);
275 trie->index2NullOffset=other->index2NullOffset;
276 trie->index2Length=other->index2Length;
277
278 uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
279 trie->dataNullOffset=other->dataNullOffset;
280 trie->dataLength=other->dataLength;
281
282 /* reference counters */
283 if(other->isCompacted) {
284 trie->firstFreeBlock=0;
285 } else {
286 uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4);
287 trie->firstFreeBlock=other->firstFreeBlock;
288 }
289
290 trie->initialValue=other->initialValue;
291 trie->errorValue=other->errorValue;
292 trie->highStart=other->highStart;
293 trie->isCompacted=other->isCompacted;
294
295 return trie;
296 }
297
298 U_CAPI UTrie2 * U_EXPORT2
utrie2_clone(const UTrie2 * other,UErrorCode * pErrorCode)299 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
300 UTrie2 *trie;
301
302 if(U_FAILURE(*pErrorCode)) {
303 return nullptr;
304 }
305 if(other==nullptr || (other->memory==nullptr && other->newTrie==nullptr)) {
306 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
307 return nullptr;
308 }
309
310 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
311 if(trie==nullptr) {
312 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
313 return nullptr;
314 }
315 uprv_memcpy(trie, other, sizeof(UTrie2));
316
317 if(other->memory!=nullptr) {
318 trie->memory=uprv_malloc(other->length);
319 if(trie->memory!=nullptr) {
320 trie->isMemoryOwned=true;
321 uprv_memcpy(trie->memory, other->memory, other->length);
322
323 /* make the clone's pointers point to its own memory */
324 trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
325 if(other->data16!=nullptr) {
326 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
327 }
328 if(other->data32!=nullptr) {
329 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
330 }
331 }
332 } else /* other->newTrie!=nullptr */ {
333 trie->newTrie=cloneBuilder(other->newTrie);
334 }
335
336 if(trie->memory==nullptr && trie->newTrie==nullptr) {
337 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
338 uprv_free(trie);
339 trie=nullptr;
340 }
341 return trie;
342 }
343
344 typedef struct NewTrieAndStatus {
345 UTrie2 *trie;
346 UErrorCode errorCode;
347 UBool exclusiveLimit; /* rather than inclusive range end */
348 } NewTrieAndStatus;
349
350 static UBool U_CALLCONV
copyEnumRange(const void * context,UChar32 start,UChar32 end,uint32_t value)351 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
352 NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
353 if(value!=nt->trie->initialValue) {
354 if(nt->exclusiveLimit) {
355 --end;
356 }
357 if(start==end) {
358 utrie2_set32(nt->trie, start, value, &nt->errorCode);
359 } else {
360 utrie2_setRange32(nt->trie, start, end, value, true, &nt->errorCode);
361 }
362 return U_SUCCESS(nt->errorCode);
363 } else {
364 return true;
365 }
366 }
367
368 #ifdef UTRIE2_DEBUG
countInitial(const UTrie2 * trie)369 static long countInitial(const UTrie2 *trie) {
370 uint32_t initialValue=trie->initialValue;
371 int32_t length=trie->dataLength;
372 long count=0;
373 if(trie->data16!=nullptr) {
374 for(int32_t i=0; i<length; ++i) {
375 if(trie->data16[i]==initialValue) { ++count; }
376 }
377 } else {
378 for(int32_t i=0; i<length; ++i) {
379 if(trie->data32[i]==initialValue) { ++count; }
380 }
381 }
382 return count;
383 }
384
385 static void
utrie_printLengths(const UTrie * trie)386 utrie_printLengths(const UTrie *trie) {
387 long indexLength=trie->indexLength;
388 long dataLength=(long)trie->dataLength;
389 long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2);
390 printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n",
391 indexLength, dataLength, totalLength);
392 }
393
394 static void
utrie2_printLengths(const UTrie2 * trie,const char * which)395 utrie2_printLengths(const UTrie2 *trie, const char *which) {
396 long indexLength=trie->indexLength;
397 long dataLength=(long)trie->dataLength;
398 long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=nullptr ? 4 : 2);
399 printf("**UTrie2Lengths(%s %s)** index:%6ld data:%6ld countInitial:%6ld serialized:%6ld\n",
400 which, trie->name, indexLength, dataLength, countInitial(trie), totalLength);
401 }
402 #endif
403
404 U_CAPI UTrie2 * U_EXPORT2
utrie2_cloneAsThawed(const UTrie2 * other,UErrorCode * pErrorCode)405 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
406 NewTrieAndStatus context;
407 char16_t lead;
408
409 if(U_FAILURE(*pErrorCode)) {
410 return nullptr;
411 }
412 if(other==nullptr || (other->memory==nullptr && other->newTrie==nullptr)) {
413 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
414 return nullptr;
415 }
416 if(other->newTrie!=nullptr && !other->newTrie->isCompacted) {
417 return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */
418 }
419
420 /* Clone the frozen trie by enumerating it and building a new one. */
421 context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
422 if(U_FAILURE(*pErrorCode)) {
423 return nullptr;
424 }
425 context.exclusiveLimit=false;
426 context.errorCode=*pErrorCode;
427 utrie2_enum(other, nullptr, copyEnumRange, &context);
428 *pErrorCode=context.errorCode;
429 for(lead=0xd800; lead<0xdc00; ++lead) {
430 uint32_t value;
431 if(other->data32==nullptr) {
432 value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
433 } else {
434 value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
435 }
436 if(value!=other->initialValue) {
437 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
438 }
439 }
440 if(U_FAILURE(*pErrorCode)) {
441 utrie2_close(context.trie);
442 context.trie=nullptr;
443 }
444 return context.trie;
445 }
446
447 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
448 U_CAPI UTrie2 * U_EXPORT2
utrie2_fromUTrie(const UTrie * trie1,uint32_t errorValue,UErrorCode * pErrorCode)449 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
450 NewTrieAndStatus context;
451 char16_t lead;
452
453 if(U_FAILURE(*pErrorCode)) {
454 return nullptr;
455 }
456 if(trie1==nullptr) {
457 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
458 return nullptr;
459 }
460 context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
461 if(U_FAILURE(*pErrorCode)) {
462 return nullptr;
463 }
464 context.exclusiveLimit=true;
465 context.errorCode=*pErrorCode;
466 utrie_enum(trie1, nullptr, copyEnumRange, &context);
467 *pErrorCode=context.errorCode;
468 for(lead=0xd800; lead<0xdc00; ++lead) {
469 uint32_t value;
470 if(trie1->data32==nullptr) {
471 value=UTRIE_GET16_FROM_LEAD(trie1, lead);
472 } else {
473 value=UTRIE_GET32_FROM_LEAD(trie1, lead);
474 }
475 if(value!=trie1->initialValue) {
476 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
477 }
478 }
479 if(U_SUCCESS(*pErrorCode)) {
480 utrie2_freeze(context.trie,
481 trie1->data32!=nullptr ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
482 pErrorCode);
483 }
484 #ifdef UTRIE2_DEBUG
485 if(U_SUCCESS(*pErrorCode)) {
486 utrie_printLengths(trie1);
487 utrie2_printLengths(context.trie, "fromUTrie");
488 }
489 #endif
490 if(U_FAILURE(*pErrorCode)) {
491 utrie2_close(context.trie);
492 context.trie=nullptr;
493 }
494 return context.trie;
495 }
496
497 static inline UBool
isInNullBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)498 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
499 int32_t i2, block;
500
501 if(U_IS_LEAD(c) && forLSCP) {
502 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
503 (c>>UTRIE2_SHIFT_2);
504 } else {
505 i2=trie->index1[c>>UTRIE2_SHIFT_1]+
506 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
507 }
508 block=trie->index2[i2];
509 return (UBool)(block==trie->dataNullOffset);
510 }
511
512 static int32_t
allocIndex2Block(UNewTrie2 * trie)513 allocIndex2Block(UNewTrie2 *trie) {
514 int32_t newBlock, newTop;
515
516 newBlock=trie->index2Length;
517 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
518 if(newTop>UPRV_LENGTHOF(trie->index2)) {
519 /*
520 * Should never occur.
521 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
522 * or the code writes more values than should be possible.
523 */
524 return -1;
525 }
526 trie->index2Length=newTop;
527 uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
528 return newBlock;
529 }
530
531 static int32_t
getIndex2Block(UNewTrie2 * trie,UChar32 c,UBool forLSCP)532 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
533 int32_t i1, i2;
534
535 if(U_IS_LEAD(c) && forLSCP) {
536 return UTRIE2_LSCP_INDEX_2_OFFSET;
537 }
538
539 i1=c>>UTRIE2_SHIFT_1;
540 i2=trie->index1[i1];
541 if(i2==trie->index2NullOffset) {
542 i2=allocIndex2Block(trie);
543 if(i2<0) {
544 return -1; /* program error */
545 }
546 trie->index1[i1]=i2;
547 }
548 return i2;
549 }
550
551 static int32_t
allocDataBlock(UNewTrie2 * trie,int32_t copyBlock)552 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
553 int32_t newBlock, newTop;
554
555 if(trie->firstFreeBlock!=0) {
556 /* get the first free block */
557 newBlock=trie->firstFreeBlock;
558 trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
559 } else {
560 /* get a new block from the high end */
561 newBlock=trie->dataLength;
562 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
563 if(newTop>trie->dataCapacity) {
564 /* out of memory in the data array */
565 int32_t capacity;
566 uint32_t *data;
567
568 if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
569 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
570 } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
571 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
572 } else {
573 /*
574 * Should never occur.
575 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
576 * or the code writes more values than should be possible.
577 */
578 return -1;
579 }
580 data=(uint32_t *)uprv_malloc(capacity*4);
581 if(data==nullptr) {
582 return -1;
583 }
584 uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4);
585 uprv_free(trie->data);
586 trie->data=data;
587 trie->dataCapacity=capacity;
588 }
589 trie->dataLength=newTop;
590 }
591 uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
592 trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
593 return newBlock;
594 }
595
596 /* call when the block's reference counter reaches 0 */
597 static void
releaseDataBlock(UNewTrie2 * trie,int32_t block)598 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
599 /* put this block at the front of the free-block chain */
600 trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
601 trie->firstFreeBlock=block;
602 }
603
604 static inline UBool
isWritableBlock(UNewTrie2 * trie,int32_t block)605 isWritableBlock(UNewTrie2 *trie, int32_t block) {
606 return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
607 }
608
609 static inline void
setIndex2Entry(UNewTrie2 * trie,int32_t i2,int32_t block)610 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
611 int32_t oldBlock;
612 ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */
613 oldBlock=trie->index2[i2];
614 if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
615 releaseDataBlock(trie, oldBlock);
616 }
617 trie->index2[i2]=block;
618 }
619
620 /**
621 * No error checking for illegal arguments.
622 *
623 * @return -1 if no new data block available (out of memory in data array)
624 * @internal
625 */
626 static int32_t
getDataBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)627 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
628 int32_t i2, oldBlock, newBlock;
629
630 i2=getIndex2Block(trie, c, forLSCP);
631 if(i2<0) {
632 return -1; /* program error */
633 }
634
635 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
636 oldBlock=trie->index2[i2];
637 if(isWritableBlock(trie, oldBlock)) {
638 return oldBlock;
639 }
640
641 /* allocate a new data block */
642 newBlock=allocDataBlock(trie, oldBlock);
643 if(newBlock<0) {
644 /* out of memory in the data array */
645 return -1;
646 }
647 setIndex2Entry(trie, i2, newBlock);
648 return newBlock;
649 }
650
651 /**
652 * @return true if the value was successfully set
653 */
654 static void
set32(UNewTrie2 * trie,UChar32 c,UBool forLSCP,uint32_t value,UErrorCode * pErrorCode)655 set32(UNewTrie2 *trie,
656 UChar32 c, UBool forLSCP, uint32_t value,
657 UErrorCode *pErrorCode) {
658 int32_t block;
659
660 if(trie==nullptr || trie->isCompacted) {
661 *pErrorCode=U_NO_WRITE_PERMISSION;
662 return;
663 }
664 #ifdef UCPTRIE_DEBUG
665 umutablecptrie_set(trie->t3, c, value, pErrorCode);
666 #endif
667
668 block=getDataBlock(trie, c, forLSCP);
669 if(block<0) {
670 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
671 return;
672 }
673
674 trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
675 }
676
677 U_CAPI void U_EXPORT2
utrie2_set32(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)678 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
679 if(U_FAILURE(*pErrorCode)) {
680 return;
681 }
682 if((uint32_t)c>0x10ffff) {
683 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
684 return;
685 }
686 set32(trie->newTrie, c, true, value, pErrorCode);
687 }
688
689 U_CAPI void U_EXPORT2
utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)690 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
691 UChar32 c, uint32_t value,
692 UErrorCode *pErrorCode) {
693 if(U_FAILURE(*pErrorCode)) {
694 return;
695 }
696 if(!U_IS_LEAD(c)) {
697 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
698 return;
699 }
700 set32(trie->newTrie, c, false, value, pErrorCode);
701 }
702
703 static void
writeBlock(uint32_t * block,uint32_t value)704 writeBlock(uint32_t *block, uint32_t value) {
705 uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
706 while(block<limit) {
707 *block++=value;
708 }
709 }
710
711 /**
712 * initialValue is ignored if overwrite=true
713 * @internal
714 */
715 static void
fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)716 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
717 uint32_t value, uint32_t initialValue, UBool overwrite) {
718 uint32_t *pLimit;
719
720 pLimit=block+limit;
721 block+=start;
722 if(overwrite) {
723 while(block<pLimit) {
724 *block++=value;
725 }
726 } else {
727 while(block<pLimit) {
728 if(*block==initialValue) {
729 *block=value;
730 }
731 ++block;
732 }
733 }
734 }
735
736 U_CAPI void U_EXPORT2
utrie2_setRange32(UTrie2 * trie,UChar32 start,UChar32 end,uint32_t value,UBool overwrite,UErrorCode * pErrorCode)737 utrie2_setRange32(UTrie2 *trie,
738 UChar32 start, UChar32 end,
739 uint32_t value, UBool overwrite,
740 UErrorCode *pErrorCode) {
741 /*
742 * repeat value in [start..end]
743 * mark index values for repeat-data blocks by setting bit 31 of the index values
744 * fill around existing values if any, if(overwrite)
745 */
746 UNewTrie2 *newTrie;
747 int32_t block, rest, repeatBlock;
748 UChar32 limit;
749
750 if(U_FAILURE(*pErrorCode)) {
751 return;
752 }
753 if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
754 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
755 return;
756 }
757 newTrie=trie->newTrie;
758 if(newTrie==nullptr || newTrie->isCompacted) {
759 *pErrorCode=U_NO_WRITE_PERMISSION;
760 return;
761 }
762 #ifdef UCPTRIE_DEBUG
763 umutablecptrie_setRange(newTrie->t3, start, end, value, pErrorCode);
764 #endif
765 if(!overwrite && value==newTrie->initialValue) {
766 return; /* nothing to do */
767 }
768
769 limit=end+1;
770 if(start&UTRIE2_DATA_MASK) {
771 UChar32 nextStart;
772
773 /* set partial block at [start..following block boundary[ */
774 block=getDataBlock(newTrie, start, true);
775 if(block<0) {
776 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
777 return;
778 }
779
780 nextStart=(start+UTRIE2_DATA_MASK)&~UTRIE2_DATA_MASK;
781 if(nextStart<=limit) {
782 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
783 value, newTrie->initialValue, overwrite);
784 start=nextStart;
785 } else {
786 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
787 value, newTrie->initialValue, overwrite);
788 return;
789 }
790 }
791
792 /* number of positions in the last, partial block */
793 rest=limit&UTRIE2_DATA_MASK;
794
795 /* round down limit to a block boundary */
796 limit&=~UTRIE2_DATA_MASK;
797
798 /* iterate over all-value blocks */
799 if(value==newTrie->initialValue) {
800 repeatBlock=newTrie->dataNullOffset;
801 } else {
802 repeatBlock=-1;
803 }
804
805 while(start<limit) {
806 int32_t i2;
807 UBool setRepeatBlock=false;
808
809 if(value==newTrie->initialValue && isInNullBlock(newTrie, start, true)) {
810 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
811 continue;
812 }
813
814 /* get index value */
815 i2=getIndex2Block(newTrie, start, true);
816 if(i2<0) {
817 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
818 return;
819 }
820 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
821 block=newTrie->index2[i2];
822 if(isWritableBlock(newTrie, block)) {
823 /* already allocated */
824 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
825 /*
826 * We overwrite all values, and it's not a
827 * protected (ASCII-linear or 2-byte UTF-8) block:
828 * replace with the repeatBlock.
829 */
830 setRepeatBlock=true;
831 } else {
832 /* !overwrite, or protected block: just write the values into this block */
833 fillBlock(newTrie->data+block,
834 0, UTRIE2_DATA_BLOCK_LENGTH,
835 value, newTrie->initialValue, overwrite);
836 }
837 } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
838 /*
839 * Set the repeatBlock instead of the null block or previous repeat block:
840 *
841 * If !isWritableBlock() then all entries in the block have the same value
842 * because it's the null block or a range block (the repeatBlock from a previous
843 * call to utrie2_setRange32()).
844 * No other blocks are used multiple times before compacting.
845 *
846 * The null block is the only non-writable block with the initialValue because
847 * of the repeatBlock initialization above. (If value==initialValue, then
848 * the repeatBlock will be the null data block.)
849 *
850 * We set our repeatBlock if the desired value differs from the block's value,
851 * and if we overwrite any data or if the data is all initial values
852 * (which is the same as the block being the null block, see above).
853 */
854 setRepeatBlock=true;
855 }
856 if(setRepeatBlock) {
857 if(repeatBlock>=0) {
858 setIndex2Entry(newTrie, i2, repeatBlock);
859 } else {
860 /* create and set and fill the repeatBlock */
861 repeatBlock=getDataBlock(newTrie, start, true);
862 if(repeatBlock<0) {
863 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
864 return;
865 }
866 writeBlock(newTrie->data+repeatBlock, value);
867 }
868 }
869
870 start+=UTRIE2_DATA_BLOCK_LENGTH;
871 }
872
873 if(rest>0) {
874 /* set partial block at [last block boundary..limit[ */
875 block=getDataBlock(newTrie, start, true);
876 if(block<0) {
877 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
878 return;
879 }
880
881 fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
882 }
883
884 return;
885 }
886
887 /* compaction --------------------------------------------------------------- */
888
889 static inline UBool
equal_int32(const int32_t * s,const int32_t * t,int32_t length)890 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
891 while(length>0 && *s==*t) {
892 ++s;
893 ++t;
894 --length;
895 }
896 return (UBool)(length==0);
897 }
898
899 static inline UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)900 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
901 while(length>0 && *s==*t) {
902 ++s;
903 ++t;
904 --length;
905 }
906 return (UBool)(length==0);
907 }
908
909 static int32_t
findSameIndex2Block(const int32_t * idx,int32_t index2Length,int32_t otherBlock)910 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
911 int32_t block;
912
913 /* ensure that we do not even partially get past index2Length */
914 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
915
916 for(block=0; block<=index2Length; ++block) {
917 if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
918 return block;
919 }
920 }
921 return -1;
922 }
923
924 static int32_t
findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t blockLength)925 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
926 int32_t block;
927
928 /* ensure that we do not even partially get past dataLength */
929 dataLength-=blockLength;
930
931 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
932 if(equal_uint32(data+block, data+otherBlock, blockLength)) {
933 return block;
934 }
935 }
936 return -1;
937 }
938
939 /*
940 * Find the start of the last range in the trie by enumerating backward.
941 * Indexes for supplementary code points higher than this will be omitted.
942 */
943 static UChar32
findHighStart(UNewTrie2 * trie,uint32_t highValue)944 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
945 const uint32_t *data32;
946
947 uint32_t value, initialValue;
948 UChar32 c, prev;
949 int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
950
951 data32=trie->data;
952 initialValue=trie->initialValue;
953
954 index2NullOffset=trie->index2NullOffset;
955 nullBlock=trie->dataNullOffset;
956
957 /* set variables for previous range */
958 if(highValue==initialValue) {
959 prevI2Block=index2NullOffset;
960 prevBlock=nullBlock;
961 } else {
962 prevI2Block=-1;
963 prevBlock=-1;
964 }
965 prev=0x110000;
966
967 /* enumerate index-2 blocks */
968 i1=UNEWTRIE2_INDEX_1_LENGTH;
969 c=prev;
970 while(c>0) {
971 i2Block=trie->index1[--i1];
972 if(i2Block==prevI2Block) {
973 /* the index-2 block is the same as the previous one, and filled with highValue */
974 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
975 continue;
976 }
977 prevI2Block=i2Block;
978 if(i2Block==index2NullOffset) {
979 /* this is the null index-2 block */
980 if(highValue!=initialValue) {
981 return c;
982 }
983 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
984 } else {
985 /* enumerate data blocks for one index-2 block */
986 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
987 block=trie->index2[i2Block+ --i2];
988 if(block==prevBlock) {
989 /* the block is the same as the previous one, and filled with highValue */
990 c-=UTRIE2_DATA_BLOCK_LENGTH;
991 continue;
992 }
993 prevBlock=block;
994 if(block==nullBlock) {
995 /* this is the null data block */
996 if(highValue!=initialValue) {
997 return c;
998 }
999 c-=UTRIE2_DATA_BLOCK_LENGTH;
1000 } else {
1001 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
1002 value=data32[block+ --j];
1003 if(value!=highValue) {
1004 return c;
1005 }
1006 --c;
1007 }
1008 }
1009 }
1010 }
1011 }
1012
1013 /* deliver last range */
1014 return 0;
1015 }
1016
1017 /*
1018 * Compact a build-time trie.
1019 *
1020 * The compaction
1021 * - removes blocks that are identical with earlier ones
1022 * - overlaps adjacent blocks as much as possible (if overlap==true)
1023 * - moves blocks in steps of the data granularity
1024 * - moves and overlaps blocks that overlap with multiple values in the overlap region
1025 *
1026 * It does not
1027 * - try to move and overlap blocks that are not already adjacent
1028 */
1029 static void
compactData(UNewTrie2 * trie)1030 compactData(UNewTrie2 *trie) {
1031 #ifdef UTRIE2_DEBUG
1032 int32_t countSame=0, sumOverlaps=0;
1033 #endif
1034
1035 int32_t start, newStart, movedStart;
1036 int32_t blockLength, overlap;
1037 int32_t i, mapIndex, blockCount;
1038
1039 /* do not compact linear-ASCII data */
1040 newStart=UTRIE2_DATA_START_OFFSET;
1041 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
1042 trie->map[i]=start;
1043 }
1044
1045 /*
1046 * Start with a block length of 64 for 2-byte UTF-8,
1047 * then switch to UTRIE2_DATA_BLOCK_LENGTH.
1048 */
1049 blockLength=64;
1050 blockCount=blockLength>>UTRIE2_SHIFT_2;
1051 for(start=newStart; start<trie->dataLength;) {
1052 /*
1053 * start: index of first entry of current block
1054 * newStart: index where the current block is to be moved
1055 * (right after current end of already-compacted data)
1056 */
1057 if(start==UNEWTRIE2_DATA_0800_OFFSET) {
1058 blockLength=UTRIE2_DATA_BLOCK_LENGTH;
1059 blockCount=1;
1060 }
1061
1062 /* skip blocks that are not used */
1063 if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
1064 /* advance start to the next block */
1065 start+=blockLength;
1066
1067 /* leave newStart with the previous block! */
1068 continue;
1069 }
1070
1071 /* search for an identical block */
1072 if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
1073 >=0
1074 ) {
1075 #ifdef UTRIE2_DEBUG
1076 ++countSame;
1077 #endif
1078 /* found an identical block, set the other block's index value for the current block */
1079 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1080 trie->map[mapIndex++]=movedStart;
1081 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1082 }
1083
1084 /* advance start to the next block */
1085 start+=blockLength;
1086
1087 /* leave newStart with the previous block! */
1088 continue;
1089 }
1090
1091 /* see if the beginning of this block can be overlapped with the end of the previous block */
1092 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
1093 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
1094 overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
1095 overlap-=UTRIE2_DATA_GRANULARITY) {}
1096
1097 #ifdef UTRIE2_DEBUG
1098 sumOverlaps+=overlap;
1099 #endif
1100 if(overlap>0 || newStart<start) {
1101 /* some overlap, or just move the whole block */
1102 movedStart=newStart-overlap;
1103 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1104 trie->map[mapIndex++]=movedStart;
1105 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1106 }
1107
1108 /* move the non-overlapping indexes to their new positions */
1109 start+=overlap;
1110 for(i=blockLength-overlap; i>0; --i) {
1111 trie->data[newStart++]=trie->data[start++];
1112 }
1113 } else /* no overlap && newStart==start */ {
1114 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1115 trie->map[mapIndex++]=start;
1116 start+=UTRIE2_DATA_BLOCK_LENGTH;
1117 }
1118 newStart=start;
1119 }
1120 }
1121
1122 /* now adjust the index-2 table */
1123 for(i=0; i<trie->index2Length; ++i) {
1124 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
1125 /* Gap indexes are invalid (-1). Skip over the gap. */
1126 i+=UNEWTRIE2_INDEX_GAP_LENGTH;
1127 }
1128 trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
1129 }
1130 trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
1131
1132 /* ensure dataLength alignment */
1133 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1134 trie->data[newStart++]=trie->initialValue;
1135 }
1136
1137 #ifdef UTRIE2_DEBUG
1138 /* we saved some space */
1139 printf("compacting UTrie2: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n",
1140 (long)trie->dataLength, (long)newStart, (long)countSame, (long)sumOverlaps);
1141 #endif
1142
1143 trie->dataLength=newStart;
1144 }
1145
1146 static void
compactIndex2(UNewTrie2 * trie)1147 compactIndex2(UNewTrie2 *trie) {
1148 int32_t i, start, newStart, movedStart, overlap;
1149
1150 /* do not compact linear-BMP index-2 blocks */
1151 newStart=UTRIE2_INDEX_2_BMP_LENGTH;
1152 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
1153 trie->map[i]=start;
1154 }
1155
1156 /* Reduce the index table gap to what will be needed at runtime. */
1157 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
1158
1159 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
1160 /*
1161 * start: index of first entry of current block
1162 * newStart: index where the current block is to be moved
1163 * (right after current end of already-compacted data)
1164 */
1165
1166 /* search for an identical block */
1167 if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
1168 >=0
1169 ) {
1170 /* found an identical block, set the other block's index value for the current block */
1171 trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
1172
1173 /* advance start to the next block */
1174 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1175
1176 /* leave newStart with the previous block! */
1177 continue;
1178 }
1179
1180 /* see if the beginning of this block can be overlapped with the end of the previous block */
1181 /* look for maximum overlap with the previous, adjacent block */
1182 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
1183 overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
1184 --overlap) {}
1185
1186 if(overlap>0 || newStart<start) {
1187 /* some overlap, or just move the whole block */
1188 trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
1189
1190 /* move the non-overlapping indexes to their new positions */
1191 start+=overlap;
1192 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
1193 trie->index2[newStart++]=trie->index2[start++];
1194 }
1195 } else /* no overlap && newStart==start */ {
1196 trie->map[start>>UTRIE2_SHIFT_1_2]=start;
1197 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1198 newStart=start;
1199 }
1200 }
1201
1202 /* now adjust the index-1 table */
1203 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
1204 trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
1205 }
1206 trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
1207
1208 /*
1209 * Ensure data table alignment:
1210 * Needs to be granularity-aligned for 16-bit trie
1211 * (so that dataMove will be down-shiftable),
1212 * and 2-aligned for uint32_t data.
1213 */
1214 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
1215 /* Arbitrary value: 0x3fffc not possible for real data. */
1216 trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
1217 }
1218
1219 #ifdef UTRIE2_DEBUG
1220 /* we saved some space */
1221 printf("compacting UTrie2: count of 16-bit index words %lu->%lu\n",
1222 (long)trie->index2Length, (long)newStart);
1223 #endif
1224
1225 trie->index2Length=newStart;
1226 }
1227
1228 static void
compactTrie(UTrie2 * trie,UErrorCode * pErrorCode)1229 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
1230 UNewTrie2 *newTrie;
1231 UChar32 highStart, suppHighStart;
1232 uint32_t highValue;
1233
1234 newTrie=trie->newTrie;
1235
1236 /* find highStart and round it up */
1237 highValue=utrie2_get32(trie, 0x10ffff);
1238 highStart=findHighStart(newTrie, highValue);
1239 highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
1240 if(highStart==0x110000) {
1241 highValue=trie->errorValue;
1242 }
1243
1244 /*
1245 * Set trie->highStart only after utrie2_get32(trie, highStart).
1246 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
1247 */
1248 trie->highStart=newTrie->highStart=highStart;
1249
1250 #ifdef UTRIE2_DEBUG
1251 printf("UTrie2: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n",
1252 (long)highStart, (long)highValue, (long)trie->initialValue);
1253 #endif
1254
1255 if(highStart<0x110000) {
1256 /* Blank out [highStart..10ffff] to release associated data blocks. */
1257 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
1258 utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, true, pErrorCode);
1259 if(U_FAILURE(*pErrorCode)) {
1260 return;
1261 }
1262 }
1263
1264 compactData(newTrie);
1265 if(highStart>0x10000) {
1266 compactIndex2(newTrie);
1267 #ifdef UTRIE2_DEBUG
1268 } else {
1269 printf("UTrie2: highStart U+%04lx count of 16-bit index words %lu->%lu\n",
1270 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
1271 #endif
1272 }
1273
1274 /*
1275 * Store the highValue in the data array and round up the dataLength.
1276 * Must be done after compactData() because that assumes that dataLength
1277 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
1278 */
1279 newTrie->data[newTrie->dataLength++]=highValue;
1280 while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1281 newTrie->data[newTrie->dataLength++]=trie->initialValue;
1282 }
1283
1284 newTrie->isCompacted=true;
1285 }
1286
1287 /* serialization ------------------------------------------------------------ */
1288
1289 /**
1290 * Maximum length of the runtime index array.
1291 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1292 * (The actual maximum length is lower,
1293 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1294 */
1295 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
1296
1297 /**
1298 * Maximum length of the runtime data array.
1299 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1300 * and by uint16_t UTrie2Header.shiftedDataLength.
1301 */
1302 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
1303
1304 /* Compact and internally serialize the trie. */
1305 U_CAPI void U_EXPORT2
utrie2_freeze(UTrie2 * trie,UTrie2ValueBits valueBits,UErrorCode * pErrorCode)1306 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
1307 UNewTrie2 *newTrie;
1308 UTrie2Header *header;
1309 uint32_t *p;
1310 uint16_t *dest16;
1311 int32_t i, length;
1312 int32_t allIndexesLength;
1313 int32_t dataMove; /* >0 if the data is moved to the end of the index array */
1314 UChar32 highStart;
1315
1316 /* argument check */
1317 if(U_FAILURE(*pErrorCode)) {
1318 return;
1319 }
1320 if( trie==nullptr ||
1321 valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
1322 ) {
1323 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1324 return;
1325 }
1326 newTrie=trie->newTrie;
1327 if(newTrie==nullptr) {
1328 /* already frozen */
1329 UTrie2ValueBits frozenValueBits=
1330 trie->data16!=nullptr ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
1331 if(valueBits!=frozenValueBits) {
1332 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1333 }
1334 return;
1335 }
1336
1337 /* compact if necessary */
1338 if(!newTrie->isCompacted) {
1339 compactTrie(trie, pErrorCode);
1340 if(U_FAILURE(*pErrorCode)) {
1341 return;
1342 }
1343 }
1344 highStart=trie->highStart;
1345
1346 if(highStart<=0x10000) {
1347 allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1348 } else {
1349 allIndexesLength=newTrie->index2Length;
1350 }
1351 if(valueBits==UTRIE2_16_VALUE_BITS) {
1352 dataMove=allIndexesLength;
1353 } else {
1354 dataMove=0;
1355 }
1356
1357 /* are indexLength and dataLength within limits? */
1358 if( /* for unshifted indexLength */
1359 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1360 /* for unshifted dataNullOffset */
1361 (dataMove+newTrie->dataNullOffset)>0xffff ||
1362 /* for unshifted 2-byte UTF-8 index-2 values */
1363 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1364 /* for shiftedDataLength */
1365 (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
1366 ) {
1367 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1368 return;
1369 }
1370
1371 /* calculate the total serialized length */
1372 length=sizeof(UTrie2Header)+allIndexesLength*2;
1373 if(valueBits==UTRIE2_16_VALUE_BITS) {
1374 length+=newTrie->dataLength*2;
1375 } else {
1376 length+=newTrie->dataLength*4;
1377 }
1378
1379 trie->memory=uprv_malloc(length);
1380 if(trie->memory==nullptr) {
1381 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1382 return;
1383 }
1384 trie->length=length;
1385 trie->isMemoryOwned=true;
1386
1387 trie->indexLength=allIndexesLength;
1388 trie->dataLength=newTrie->dataLength;
1389 if(highStart<=0x10000) {
1390 trie->index2NullOffset=0xffff;
1391 } else {
1392 trie->index2NullOffset=static_cast<uint16_t>(UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset);
1393 }
1394 trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
1395 trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
1396
1397 /* set the header fields */
1398 header=(UTrie2Header *)trie->memory;
1399
1400 header->signature=UTRIE2_SIG; /* "Tri2" */
1401 header->options=(uint16_t)valueBits;
1402
1403 header->indexLength=(uint16_t)trie->indexLength;
1404 header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
1405 header->index2NullOffset=trie->index2NullOffset;
1406 header->dataNullOffset=trie->dataNullOffset;
1407 header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
1408
1409 /* fill the index and data arrays */
1410 dest16=(uint16_t *)(header+1);
1411 trie->index=dest16;
1412
1413 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1414 p=(uint32_t *)newTrie->index2;
1415 for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
1416 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1417 }
1418
1419 /* write UTF-8 2-byte index-2 values, not right-shifted */
1420 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */
1421 *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1422 }
1423 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */
1424 *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
1425 }
1426
1427 if(highStart>0x10000) {
1428 int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
1429 int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
1430
1431 /* write 16-bit index-1 values for supplementary code points */
1432 p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1433 for(i=index1Length; i>0; --i) {
1434 *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1435 }
1436
1437 /*
1438 * write the index-2 array values for supplementary code points,
1439 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1440 */
1441 p=(uint32_t *)newTrie->index2+index2Offset;
1442 for(i=newTrie->index2Length-index2Offset; i>0; --i) {
1443 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1444 }
1445 }
1446
1447 /* write the 16/32-bit data array */
1448 switch(valueBits) {
1449 case UTRIE2_16_VALUE_BITS:
1450 /* write 16-bit data values */
1451 trie->data16=dest16;
1452 trie->data32=nullptr;
1453 p=newTrie->data;
1454 for(i=newTrie->dataLength; i>0; --i) {
1455 *dest16++=(uint16_t)*p++;
1456 }
1457 break;
1458 case UTRIE2_32_VALUE_BITS:
1459 /* write 32-bit data values */
1460 trie->data16=nullptr;
1461 trie->data32=(uint32_t *)dest16;
1462 uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4);
1463 break;
1464 default:
1465 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1466 return;
1467 }
1468
1469 #ifdef UTRIE2_DEBUG
1470 utrie2_printLengths(trie, "");
1471 #endif
1472
1473 #ifdef UCPTRIE_DEBUG
1474 umutablecptrie_setName(newTrie->t3, trie->name);
1475 ucptrie_close(
1476 umutablecptrie_buildImmutable(
1477 newTrie->t3, UCPTRIE_TYPE_FAST, (UCPTrieValueWidth)valueBits, pErrorCode));
1478 #endif
1479 /* Delete the UNewTrie2. */
1480 uprv_free(newTrie->data);
1481 uprv_free(newTrie);
1482 trie->newTrie=nullptr;
1483 }
1484