1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2001-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: utrie2_builder.cpp
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2008sep26 (split off from utrie2.c)
16 * created by: Markus W. Scherer
17 *
18 * This is a common implementation of a Unicode trie.
19 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
20 * Unicode code points (0..0x10ffff).
21 * This is the second common version of a Unicode trie (hence the name UTrie2).
22 * See utrie2.h for a comparison.
23 *
24 * This file contains only the builder code.
25 * See utrie2.c for the runtime and enumeration code.
26 */
27 #ifdef UTRIE2_DEBUG
28 # include <stdio.h>
29 #endif
30
31 #include "unicode/utypes.h"
32 #include "cmemory.h"
33 #include "utrie2.h"
34 #include "utrie2_impl.h"
35
36 #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */
37
38 /* Implementation notes ----------------------------------------------------- */
39
40 /*
41 * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
42 * have been chosen to minimize trie sizes overall.
43 * Most of the code is flexible enough to work with a range of values,
44 * within certain limits.
45 *
46 * Exception: Support for separate values for lead surrogate code _units_
47 * vs. code _points_ was added after the constants were fixed,
48 * and has not been tested nor particularly designed for different constant values.
49 * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
50 * part and back.)
51 *
52 * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
53 * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
54 * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
55 * remapping) stops working.
56 *
57 * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
58 * assumes that a single index-2 block is used for 0x400 code points
59 * corresponding to one lead surrogate.
60 *
61 * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
62 * more than one Unicode plane, and the split of the index-2 table into a BMP
63 * part and a supplementary part, with a gap in between, would not work.
64 *
65 * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
66 * there is data with more than 64k distinct values,
67 * for example for Unihan collation with a separate collation weight per
68 * Han character.
69 */
70
71 /* Building a trie ----------------------------------------------------------*/
72
73 enum {
74 /** The null index-2 block, following the gap in the index-2 table. */
75 UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
76
77 /** The start of allocated index-2 blocks. */
78 UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
79
80 /**
81 * The null data block.
82 * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
83 * to work with 6-bit trail bytes from 2-byte UTF-8.
84 */
85 UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
86
87 /** The start of allocated data blocks. */
88 UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
89
90 /**
91 * The start of data blocks for U+0800 and above.
92 * Below, compaction uses a block length of 64 for 2-byte UTF-8.
93 * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
94 * Data values for 0x780 code points beyond ASCII.
95 */
96 UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
97 };
98
99 /* Start with allocation of 16k data entries. */
100 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
101
102 /* Grow about 8x each time. */
103 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
104
105 static int32_t
106 allocIndex2Block(UNewTrie2 *trie);
107
108 U_CAPI UTrie2 * U_EXPORT2
utrie2_open(uint32_t initialValue,uint32_t errorValue,UErrorCode * pErrorCode)109 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
110 UTrie2 *trie;
111 UNewTrie2 *newTrie;
112 uint32_t *data;
113 int32_t i, j;
114
115 if(U_FAILURE(*pErrorCode)) {
116 return NULL;
117 }
118
119 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
120 newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
121 data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
122 if(trie==NULL || newTrie==NULL || data==NULL) {
123 uprv_free(trie);
124 uprv_free(newTrie);
125 uprv_free(data);
126 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
127 return 0;
128 }
129
130 uprv_memset(trie, 0, sizeof(UTrie2));
131 trie->initialValue=initialValue;
132 trie->errorValue=errorValue;
133 trie->highStart=0x110000;
134 trie->newTrie=newTrie;
135
136 newTrie->data=data;
137 newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
138 newTrie->initialValue=initialValue;
139 newTrie->errorValue=errorValue;
140 newTrie->highStart=0x110000;
141 newTrie->firstFreeBlock=0; /* no free block in the list */
142 newTrie->isCompacted=FALSE;
143
144 /*
145 * preallocate and reset
146 * - ASCII
147 * - the bad-UTF-8-data block
148 * - the null data block
149 */
150 for(i=0; i<0x80; ++i) {
151 newTrie->data[i]=initialValue;
152 }
153 for(; i<0xc0; ++i) {
154 newTrie->data[i]=errorValue;
155 }
156 for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
157 newTrie->data[i]=initialValue;
158 }
159 newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
160 newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
161
162 /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
163 for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
164 newTrie->index2[i]=j;
165 newTrie->map[i]=1;
166 }
167 /* reference counts for the bad-UTF-8-data block */
168 for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
169 newTrie->map[i]=0;
170 }
171 /*
172 * Reference counts for the null data block: all blocks except for the ASCII blocks.
173 * Plus 1 so that we don't drop this block during compaction.
174 * Plus as many as needed for lead surrogate code points.
175 */
176 /* i==newTrie->dataNullOffset */
177 newTrie->map[i++]=
178 (0x110000>>UTRIE2_SHIFT_2)-
179 (0x80>>UTRIE2_SHIFT_2)+
180 1+
181 UTRIE2_LSCP_INDEX_2_LENGTH;
182 j+=UTRIE2_DATA_BLOCK_LENGTH;
183 for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
184 newTrie->map[i]=0;
185 }
186
187 /*
188 * set the remaining indexes in the BMP index-2 block
189 * to the null data block
190 */
191 for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
192 newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
193 }
194
195 /*
196 * Fill the index gap with impossible values so that compaction
197 * does not overlap other index-2 blocks with the gap.
198 */
199 for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
200 newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
201 }
202
203 /* set the indexes in the null index-2 block */
204 for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
205 newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
206 }
207 newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
208 newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
209
210 /* set the index-1 indexes for the linear index-2 block */
211 for(i=0, j=0;
212 i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
213 ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
214 ) {
215 newTrie->index1[i]=j;
216 }
217
218 /* set the remaining index-1 indexes to the null index-2 block */
219 for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
220 newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
221 }
222
223 /*
224 * Preallocate and reset data for U+0080..U+07ff,
225 * for 2-byte UTF-8 which will be compacted in 64-blocks
226 * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
227 */
228 for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
229 utrie2_set32(trie, i, initialValue, pErrorCode);
230 }
231
232 return trie;
233 }
234
235 static UNewTrie2 *
cloneBuilder(const UNewTrie2 * other)236 cloneBuilder(const UNewTrie2 *other) {
237 UNewTrie2 *trie;
238
239 trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
240 if(trie==NULL) {
241 return NULL;
242 }
243
244 trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
245 if(trie->data==NULL) {
246 uprv_free(trie);
247 return NULL;
248 }
249 trie->dataCapacity=other->dataCapacity;
250
251 /* clone data */
252 uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
253 uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4);
254 trie->index2NullOffset=other->index2NullOffset;
255 trie->index2Length=other->index2Length;
256
257 uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
258 trie->dataNullOffset=other->dataNullOffset;
259 trie->dataLength=other->dataLength;
260
261 /* reference counters */
262 if(other->isCompacted) {
263 trie->firstFreeBlock=0;
264 } else {
265 uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4);
266 trie->firstFreeBlock=other->firstFreeBlock;
267 }
268
269 trie->initialValue=other->initialValue;
270 trie->errorValue=other->errorValue;
271 trie->highStart=other->highStart;
272 trie->isCompacted=other->isCompacted;
273
274 return trie;
275 }
276
277 U_CAPI UTrie2 * U_EXPORT2
utrie2_clone(const UTrie2 * other,UErrorCode * pErrorCode)278 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
279 UTrie2 *trie;
280
281 if(U_FAILURE(*pErrorCode)) {
282 return NULL;
283 }
284 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
285 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
286 return NULL;
287 }
288
289 trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
290 if(trie==NULL) {
291 return NULL;
292 }
293 uprv_memcpy(trie, other, sizeof(UTrie2));
294
295 if(other->memory!=NULL) {
296 trie->memory=uprv_malloc(other->length);
297 if(trie->memory!=NULL) {
298 trie->isMemoryOwned=TRUE;
299 uprv_memcpy(trie->memory, other->memory, other->length);
300
301 /* make the clone's pointers point to its own memory */
302 trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
303 if(other->data16!=NULL) {
304 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
305 }
306 if(other->data32!=NULL) {
307 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
308 }
309 }
310 } else /* other->newTrie!=NULL */ {
311 trie->newTrie=cloneBuilder(other->newTrie);
312 }
313
314 if(trie->memory==NULL && trie->newTrie==NULL) {
315 uprv_free(trie);
316 trie=NULL;
317 }
318 return trie;
319 }
320
321 typedef struct NewTrieAndStatus {
322 UTrie2 *trie;
323 UErrorCode errorCode;
324 UBool exclusiveLimit; /* rather than inclusive range end */
325 } NewTrieAndStatus;
326
327 static UBool U_CALLCONV
copyEnumRange(const void * context,UChar32 start,UChar32 end,uint32_t value)328 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
329 NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
330 if(value!=nt->trie->initialValue) {
331 if(nt->exclusiveLimit) {
332 --end;
333 }
334 if(start==end) {
335 utrie2_set32(nt->trie, start, value, &nt->errorCode);
336 } else {
337 utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
338 }
339 return U_SUCCESS(nt->errorCode);
340 } else {
341 return TRUE;
342 }
343 }
344
345 #ifdef UTRIE2_DEBUG
346 static void
utrie_printLengths(const UTrie * trie)347 utrie_printLengths(const UTrie *trie) {
348 long indexLength=trie->indexLength;
349 long dataLength=(long)trie->dataLength;
350 long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
351 printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n",
352 indexLength, dataLength, totalLength);
353 }
354
355 static void
utrie2_printLengths(const UTrie2 * trie,const char * which)356 utrie2_printLengths(const UTrie2 *trie, const char *which) {
357 long indexLength=trie->indexLength;
358 long dataLength=(long)trie->dataLength;
359 long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
360 printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n",
361 which, indexLength, dataLength, totalLength);
362 }
363 #endif
364
365 U_CAPI UTrie2 * U_EXPORT2
utrie2_cloneAsThawed(const UTrie2 * other,UErrorCode * pErrorCode)366 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
367 NewTrieAndStatus context;
368 UChar lead;
369
370 if(U_FAILURE(*pErrorCode)) {
371 return NULL;
372 }
373 if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
374 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
375 return NULL;
376 }
377 if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
378 return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */
379 }
380
381 /* Clone the frozen trie by enumerating it and building a new one. */
382 context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
383 if(U_FAILURE(*pErrorCode)) {
384 return NULL;
385 }
386 context.exclusiveLimit=FALSE;
387 context.errorCode=*pErrorCode;
388 utrie2_enum(other, NULL, copyEnumRange, &context);
389 *pErrorCode=context.errorCode;
390 for(lead=0xd800; lead<0xdc00; ++lead) {
391 uint32_t value;
392 if(other->data32==NULL) {
393 value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
394 } else {
395 value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
396 }
397 if(value!=other->initialValue) {
398 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
399 }
400 }
401 if(U_FAILURE(*pErrorCode)) {
402 utrie2_close(context.trie);
403 context.trie=NULL;
404 }
405 return context.trie;
406 }
407
408 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
409 U_CAPI UTrie2 * U_EXPORT2
utrie2_fromUTrie(const UTrie * trie1,uint32_t errorValue,UErrorCode * pErrorCode)410 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
411 NewTrieAndStatus context;
412 UChar lead;
413
414 if(U_FAILURE(*pErrorCode)) {
415 return NULL;
416 }
417 if(trie1==NULL) {
418 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
419 return NULL;
420 }
421 context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
422 if(U_FAILURE(*pErrorCode)) {
423 return NULL;
424 }
425 context.exclusiveLimit=TRUE;
426 context.errorCode=*pErrorCode;
427 utrie_enum(trie1, NULL, copyEnumRange, &context);
428 *pErrorCode=context.errorCode;
429 for(lead=0xd800; lead<0xdc00; ++lead) {
430 uint32_t value;
431 if(trie1->data32==NULL) {
432 value=UTRIE_GET16_FROM_LEAD(trie1, lead);
433 } else {
434 value=UTRIE_GET32_FROM_LEAD(trie1, lead);
435 }
436 if(value!=trie1->initialValue) {
437 utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
438 }
439 }
440 if(U_SUCCESS(*pErrorCode)) {
441 utrie2_freeze(context.trie,
442 trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
443 pErrorCode);
444 }
445 #ifdef UTRIE2_DEBUG
446 if(U_SUCCESS(*pErrorCode)) {
447 utrie_printLengths(trie1);
448 utrie2_printLengths(context.trie, "fromUTrie");
449 }
450 #endif
451 if(U_FAILURE(*pErrorCode)) {
452 utrie2_close(context.trie);
453 context.trie=NULL;
454 }
455 return context.trie;
456 }
457
458 static inline UBool
isInNullBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)459 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
460 int32_t i2, block;
461
462 if(U_IS_LEAD(c) && forLSCP) {
463 i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
464 (c>>UTRIE2_SHIFT_2);
465 } else {
466 i2=trie->index1[c>>UTRIE2_SHIFT_1]+
467 ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
468 }
469 block=trie->index2[i2];
470 return (UBool)(block==trie->dataNullOffset);
471 }
472
473 static int32_t
allocIndex2Block(UNewTrie2 * trie)474 allocIndex2Block(UNewTrie2 *trie) {
475 int32_t newBlock, newTop;
476
477 newBlock=trie->index2Length;
478 newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
479 if(newTop>UPRV_LENGTHOF(trie->index2)) {
480 /*
481 * Should never occur.
482 * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
483 * or the code writes more values than should be possible.
484 */
485 return -1;
486 }
487 trie->index2Length=newTop;
488 uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
489 return newBlock;
490 }
491
492 static int32_t
getIndex2Block(UNewTrie2 * trie,UChar32 c,UBool forLSCP)493 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
494 int32_t i1, i2;
495
496 if(U_IS_LEAD(c) && forLSCP) {
497 return UTRIE2_LSCP_INDEX_2_OFFSET;
498 }
499
500 i1=c>>UTRIE2_SHIFT_1;
501 i2=trie->index1[i1];
502 if(i2==trie->index2NullOffset) {
503 i2=allocIndex2Block(trie);
504 if(i2<0) {
505 return -1; /* program error */
506 }
507 trie->index1[i1]=i2;
508 }
509 return i2;
510 }
511
512 static int32_t
allocDataBlock(UNewTrie2 * trie,int32_t copyBlock)513 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
514 int32_t newBlock, newTop;
515
516 if(trie->firstFreeBlock!=0) {
517 /* get the first free block */
518 newBlock=trie->firstFreeBlock;
519 trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
520 } else {
521 /* get a new block from the high end */
522 newBlock=trie->dataLength;
523 newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
524 if(newTop>trie->dataCapacity) {
525 /* out of memory in the data array */
526 int32_t capacity;
527 uint32_t *data;
528
529 if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
530 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
531 } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
532 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
533 } else {
534 /*
535 * Should never occur.
536 * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
537 * or the code writes more values than should be possible.
538 */
539 return -1;
540 }
541 data=(uint32_t *)uprv_malloc(capacity*4);
542 if(data==NULL) {
543 return -1;
544 }
545 uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4);
546 uprv_free(trie->data);
547 trie->data=data;
548 trie->dataCapacity=capacity;
549 }
550 trie->dataLength=newTop;
551 }
552 uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
553 trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
554 return newBlock;
555 }
556
557 /* call when the block's reference counter reaches 0 */
558 static void
releaseDataBlock(UNewTrie2 * trie,int32_t block)559 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
560 /* put this block at the front of the free-block chain */
561 trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
562 trie->firstFreeBlock=block;
563 }
564
565 static inline UBool
isWritableBlock(UNewTrie2 * trie,int32_t block)566 isWritableBlock(UNewTrie2 *trie, int32_t block) {
567 return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
568 }
569
570 static inline void
setIndex2Entry(UNewTrie2 * trie,int32_t i2,int32_t block)571 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
572 int32_t oldBlock;
573 ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */
574 oldBlock=trie->index2[i2];
575 if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
576 releaseDataBlock(trie, oldBlock);
577 }
578 trie->index2[i2]=block;
579 }
580
581 /**
582 * No error checking for illegal arguments.
583 *
584 * @return -1 if no new data block available (out of memory in data array)
585 * @internal
586 */
587 static int32_t
getDataBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)588 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
589 int32_t i2, oldBlock, newBlock;
590
591 i2=getIndex2Block(trie, c, forLSCP);
592 if(i2<0) {
593 return -1; /* program error */
594 }
595
596 i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
597 oldBlock=trie->index2[i2];
598 if(isWritableBlock(trie, oldBlock)) {
599 return oldBlock;
600 }
601
602 /* allocate a new data block */
603 newBlock=allocDataBlock(trie, oldBlock);
604 if(newBlock<0) {
605 /* out of memory in the data array */
606 return -1;
607 }
608 setIndex2Entry(trie, i2, newBlock);
609 return newBlock;
610 }
611
612 /**
613 * @return TRUE if the value was successfully set
614 */
615 static void
set32(UNewTrie2 * trie,UChar32 c,UBool forLSCP,uint32_t value,UErrorCode * pErrorCode)616 set32(UNewTrie2 *trie,
617 UChar32 c, UBool forLSCP, uint32_t value,
618 UErrorCode *pErrorCode) {
619 int32_t block;
620
621 if(trie==NULL || trie->isCompacted) {
622 *pErrorCode=U_NO_WRITE_PERMISSION;
623 return;
624 }
625
626 block=getDataBlock(trie, c, forLSCP);
627 if(block<0) {
628 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
629 return;
630 }
631
632 trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
633 }
634
635 U_CAPI void U_EXPORT2
utrie2_set32(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)636 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
637 if(U_FAILURE(*pErrorCode)) {
638 return;
639 }
640 if((uint32_t)c>0x10ffff) {
641 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
642 return;
643 }
644 set32(trie->newTrie, c, TRUE, value, pErrorCode);
645 }
646
647 U_CAPI void U_EXPORT2
utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)648 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
649 UChar32 c, uint32_t value,
650 UErrorCode *pErrorCode) {
651 if(U_FAILURE(*pErrorCode)) {
652 return;
653 }
654 if(!U_IS_LEAD(c)) {
655 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
656 return;
657 }
658 set32(trie->newTrie, c, FALSE, value, pErrorCode);
659 }
660
661 static void
writeBlock(uint32_t * block,uint32_t value)662 writeBlock(uint32_t *block, uint32_t value) {
663 uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
664 while(block<limit) {
665 *block++=value;
666 }
667 }
668
669 /**
670 * initialValue is ignored if overwrite=TRUE
671 * @internal
672 */
673 static void
fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)674 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
675 uint32_t value, uint32_t initialValue, UBool overwrite) {
676 uint32_t *pLimit;
677
678 pLimit=block+limit;
679 block+=start;
680 if(overwrite) {
681 while(block<pLimit) {
682 *block++=value;
683 }
684 } else {
685 while(block<pLimit) {
686 if(*block==initialValue) {
687 *block=value;
688 }
689 ++block;
690 }
691 }
692 }
693
694 U_CAPI void U_EXPORT2
utrie2_setRange32(UTrie2 * trie,UChar32 start,UChar32 end,uint32_t value,UBool overwrite,UErrorCode * pErrorCode)695 utrie2_setRange32(UTrie2 *trie,
696 UChar32 start, UChar32 end,
697 uint32_t value, UBool overwrite,
698 UErrorCode *pErrorCode) {
699 /*
700 * repeat value in [start..end]
701 * mark index values for repeat-data blocks by setting bit 31 of the index values
702 * fill around existing values if any, if(overwrite)
703 */
704 UNewTrie2 *newTrie;
705 int32_t block, rest, repeatBlock;
706 UChar32 limit;
707
708 if(U_FAILURE(*pErrorCode)) {
709 return;
710 }
711 if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
712 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
713 return;
714 }
715 newTrie=trie->newTrie;
716 if(newTrie==NULL || newTrie->isCompacted) {
717 *pErrorCode=U_NO_WRITE_PERMISSION;
718 return;
719 }
720 if(!overwrite && value==newTrie->initialValue) {
721 return; /* nothing to do */
722 }
723
724 limit=end+1;
725 if(start&UTRIE2_DATA_MASK) {
726 UChar32 nextStart;
727
728 /* set partial block at [start..following block boundary[ */
729 block=getDataBlock(newTrie, start, TRUE);
730 if(block<0) {
731 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
732 return;
733 }
734
735 nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
736 if(nextStart<=limit) {
737 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
738 value, newTrie->initialValue, overwrite);
739 start=nextStart;
740 } else {
741 fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
742 value, newTrie->initialValue, overwrite);
743 return;
744 }
745 }
746
747 /* number of positions in the last, partial block */
748 rest=limit&UTRIE2_DATA_MASK;
749
750 /* round down limit to a block boundary */
751 limit&=~UTRIE2_DATA_MASK;
752
753 /* iterate over all-value blocks */
754 if(value==newTrie->initialValue) {
755 repeatBlock=newTrie->dataNullOffset;
756 } else {
757 repeatBlock=-1;
758 }
759
760 while(start<limit) {
761 int32_t i2;
762 UBool setRepeatBlock=FALSE;
763
764 if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
765 start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
766 continue;
767 }
768
769 /* get index value */
770 i2=getIndex2Block(newTrie, start, TRUE);
771 if(i2<0) {
772 *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
773 return;
774 }
775 i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
776 block=newTrie->index2[i2];
777 if(isWritableBlock(newTrie, block)) {
778 /* already allocated */
779 if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
780 /*
781 * We overwrite all values, and it's not a
782 * protected (ASCII-linear or 2-byte UTF-8) block:
783 * replace with the repeatBlock.
784 */
785 setRepeatBlock=TRUE;
786 } else {
787 /* !overwrite, or protected block: just write the values into this block */
788 fillBlock(newTrie->data+block,
789 0, UTRIE2_DATA_BLOCK_LENGTH,
790 value, newTrie->initialValue, overwrite);
791 }
792 } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
793 /*
794 * Set the repeatBlock instead of the null block or previous repeat block:
795 *
796 * If !isWritableBlock() then all entries in the block have the same value
797 * because it's the null block or a range block (the repeatBlock from a previous
798 * call to utrie2_setRange32()).
799 * No other blocks are used multiple times before compacting.
800 *
801 * The null block is the only non-writable block with the initialValue because
802 * of the repeatBlock initialization above. (If value==initialValue, then
803 * the repeatBlock will be the null data block.)
804 *
805 * We set our repeatBlock if the desired value differs from the block's value,
806 * and if we overwrite any data or if the data is all initial values
807 * (which is the same as the block being the null block, see above).
808 */
809 setRepeatBlock=TRUE;
810 }
811 if(setRepeatBlock) {
812 if(repeatBlock>=0) {
813 setIndex2Entry(newTrie, i2, repeatBlock);
814 } else {
815 /* create and set and fill the repeatBlock */
816 repeatBlock=getDataBlock(newTrie, start, TRUE);
817 if(repeatBlock<0) {
818 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
819 return;
820 }
821 writeBlock(newTrie->data+repeatBlock, value);
822 }
823 }
824
825 start+=UTRIE2_DATA_BLOCK_LENGTH;
826 }
827
828 if(rest>0) {
829 /* set partial block at [last block boundary..limit[ */
830 block=getDataBlock(newTrie, start, TRUE);
831 if(block<0) {
832 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
833 return;
834 }
835
836 fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
837 }
838
839 return;
840 }
841
842 /* compaction --------------------------------------------------------------- */
843
844 static inline UBool
equal_int32(const int32_t * s,const int32_t * t,int32_t length)845 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
846 while(length>0 && *s==*t) {
847 ++s;
848 ++t;
849 --length;
850 }
851 return (UBool)(length==0);
852 }
853
854 static inline UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)855 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
856 while(length>0 && *s==*t) {
857 ++s;
858 ++t;
859 --length;
860 }
861 return (UBool)(length==0);
862 }
863
864 static int32_t
findSameIndex2Block(const int32_t * idx,int32_t index2Length,int32_t otherBlock)865 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
866 int32_t block;
867
868 /* ensure that we do not even partially get past index2Length */
869 index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
870
871 for(block=0; block<=index2Length; ++block) {
872 if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
873 return block;
874 }
875 }
876 return -1;
877 }
878
879 static int32_t
findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t blockLength)880 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
881 int32_t block;
882
883 /* ensure that we do not even partially get past dataLength */
884 dataLength-=blockLength;
885
886 for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
887 if(equal_uint32(data+block, data+otherBlock, blockLength)) {
888 return block;
889 }
890 }
891 return -1;
892 }
893
894 /*
895 * Find the start of the last range in the trie by enumerating backward.
896 * Indexes for supplementary code points higher than this will be omitted.
897 */
898 static UChar32
findHighStart(UNewTrie2 * trie,uint32_t highValue)899 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
900 const uint32_t *data32;
901
902 uint32_t value, initialValue;
903 UChar32 c, prev;
904 int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
905
906 data32=trie->data;
907 initialValue=trie->initialValue;
908
909 index2NullOffset=trie->index2NullOffset;
910 nullBlock=trie->dataNullOffset;
911
912 /* set variables for previous range */
913 if(highValue==initialValue) {
914 prevI2Block=index2NullOffset;
915 prevBlock=nullBlock;
916 } else {
917 prevI2Block=-1;
918 prevBlock=-1;
919 }
920 prev=0x110000;
921
922 /* enumerate index-2 blocks */
923 i1=UNEWTRIE2_INDEX_1_LENGTH;
924 c=prev;
925 while(c>0) {
926 i2Block=trie->index1[--i1];
927 if(i2Block==prevI2Block) {
928 /* the index-2 block is the same as the previous one, and filled with highValue */
929 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
930 continue;
931 }
932 prevI2Block=i2Block;
933 if(i2Block==index2NullOffset) {
934 /* this is the null index-2 block */
935 if(highValue!=initialValue) {
936 return c;
937 }
938 c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
939 } else {
940 /* enumerate data blocks for one index-2 block */
941 for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
942 block=trie->index2[i2Block+ --i2];
943 if(block==prevBlock) {
944 /* the block is the same as the previous one, and filled with highValue */
945 c-=UTRIE2_DATA_BLOCK_LENGTH;
946 continue;
947 }
948 prevBlock=block;
949 if(block==nullBlock) {
950 /* this is the null data block */
951 if(highValue!=initialValue) {
952 return c;
953 }
954 c-=UTRIE2_DATA_BLOCK_LENGTH;
955 } else {
956 for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
957 value=data32[block+ --j];
958 if(value!=highValue) {
959 return c;
960 }
961 --c;
962 }
963 }
964 }
965 }
966 }
967
968 /* deliver last range */
969 return 0;
970 }
971
972 /*
973 * Compact a build-time trie.
974 *
975 * The compaction
976 * - removes blocks that are identical with earlier ones
977 * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
978 * - moves blocks in steps of the data granularity
979 * - moves and overlaps blocks that overlap with multiple values in the overlap region
980 *
981 * It does not
982 * - try to move and overlap blocks that are not already adjacent
983 */
984 static void
compactData(UNewTrie2 * trie)985 compactData(UNewTrie2 *trie) {
986 int32_t start, newStart, movedStart;
987 int32_t blockLength, overlap;
988 int32_t i, mapIndex, blockCount;
989
990 /* do not compact linear-ASCII data */
991 newStart=UTRIE2_DATA_START_OFFSET;
992 for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
993 trie->map[i]=start;
994 }
995
996 /*
997 * Start with a block length of 64 for 2-byte UTF-8,
998 * then switch to UTRIE2_DATA_BLOCK_LENGTH.
999 */
1000 blockLength=64;
1001 blockCount=blockLength>>UTRIE2_SHIFT_2;
1002 for(start=newStart; start<trie->dataLength;) {
1003 /*
1004 * start: index of first entry of current block
1005 * newStart: index where the current block is to be moved
1006 * (right after current end of already-compacted data)
1007 */
1008 if(start==UNEWTRIE2_DATA_0800_OFFSET) {
1009 blockLength=UTRIE2_DATA_BLOCK_LENGTH;
1010 blockCount=1;
1011 }
1012
1013 /* skip blocks that are not used */
1014 if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
1015 /* advance start to the next block */
1016 start+=blockLength;
1017
1018 /* leave newStart with the previous block! */
1019 continue;
1020 }
1021
1022 /* search for an identical block */
1023 if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
1024 >=0
1025 ) {
1026 /* found an identical block, set the other block's index value for the current block */
1027 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1028 trie->map[mapIndex++]=movedStart;
1029 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1030 }
1031
1032 /* advance start to the next block */
1033 start+=blockLength;
1034
1035 /* leave newStart with the previous block! */
1036 continue;
1037 }
1038
1039 /* see if the beginning of this block can be overlapped with the end of the previous block */
1040 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
1041 for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
1042 overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
1043 overlap-=UTRIE2_DATA_GRANULARITY) {}
1044
1045 if(overlap>0 || newStart<start) {
1046 /* some overlap, or just move the whole block */
1047 movedStart=newStart-overlap;
1048 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1049 trie->map[mapIndex++]=movedStart;
1050 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1051 }
1052
1053 /* move the non-overlapping indexes to their new positions */
1054 start+=overlap;
1055 for(i=blockLength-overlap; i>0; --i) {
1056 trie->data[newStart++]=trie->data[start++];
1057 }
1058 } else /* no overlap && newStart==start */ {
1059 for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1060 trie->map[mapIndex++]=start;
1061 start+=UTRIE2_DATA_BLOCK_LENGTH;
1062 }
1063 newStart=start;
1064 }
1065 }
1066
1067 /* now adjust the index-2 table */
1068 for(i=0; i<trie->index2Length; ++i) {
1069 if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
1070 /* Gap indexes are invalid (-1). Skip over the gap. */
1071 i+=UNEWTRIE2_INDEX_GAP_LENGTH;
1072 }
1073 trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
1074 }
1075 trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
1076
1077 /* ensure dataLength alignment */
1078 while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1079 trie->data[newStart++]=trie->initialValue;
1080 }
1081
1082 #ifdef UTRIE2_DEBUG
1083 /* we saved some space */
1084 printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n",
1085 (long)trie->dataLength, (long)newStart);
1086 #endif
1087
1088 trie->dataLength=newStart;
1089 }
1090
1091 static void
compactIndex2(UNewTrie2 * trie)1092 compactIndex2(UNewTrie2 *trie) {
1093 int32_t i, start, newStart, movedStart, overlap;
1094
1095 /* do not compact linear-BMP index-2 blocks */
1096 newStart=UTRIE2_INDEX_2_BMP_LENGTH;
1097 for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
1098 trie->map[i]=start;
1099 }
1100
1101 /* Reduce the index table gap to what will be needed at runtime. */
1102 newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
1103
1104 for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
1105 /*
1106 * start: index of first entry of current block
1107 * newStart: index where the current block is to be moved
1108 * (right after current end of already-compacted data)
1109 */
1110
1111 /* search for an identical block */
1112 if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
1113 >=0
1114 ) {
1115 /* found an identical block, set the other block's index value for the current block */
1116 trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
1117
1118 /* advance start to the next block */
1119 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1120
1121 /* leave newStart with the previous block! */
1122 continue;
1123 }
1124
1125 /* see if the beginning of this block can be overlapped with the end of the previous block */
1126 /* look for maximum overlap with the previous, adjacent block */
1127 for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
1128 overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
1129 --overlap) {}
1130
1131 if(overlap>0 || newStart<start) {
1132 /* some overlap, or just move the whole block */
1133 trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
1134
1135 /* move the non-overlapping indexes to their new positions */
1136 start+=overlap;
1137 for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
1138 trie->index2[newStart++]=trie->index2[start++];
1139 }
1140 } else /* no overlap && newStart==start */ {
1141 trie->map[start>>UTRIE2_SHIFT_1_2]=start;
1142 start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1143 newStart=start;
1144 }
1145 }
1146
1147 /* now adjust the index-1 table */
1148 for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
1149 trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
1150 }
1151 trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
1152
1153 /*
1154 * Ensure data table alignment:
1155 * Needs to be granularity-aligned for 16-bit trie
1156 * (so that dataMove will be down-shiftable),
1157 * and 2-aligned for uint32_t data.
1158 */
1159 while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
1160 /* Arbitrary value: 0x3fffc not possible for real data. */
1161 trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
1162 }
1163
1164 #ifdef UTRIE2_DEBUG
1165 /* we saved some space */
1166 printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n",
1167 (long)trie->index2Length, (long)newStart);
1168 #endif
1169
1170 trie->index2Length=newStart;
1171 }
1172
1173 static void
compactTrie(UTrie2 * trie,UErrorCode * pErrorCode)1174 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
1175 UNewTrie2 *newTrie;
1176 UChar32 highStart, suppHighStart;
1177 uint32_t highValue;
1178
1179 newTrie=trie->newTrie;
1180
1181 /* find highStart and round it up */
1182 highValue=utrie2_get32(trie, 0x10ffff);
1183 highStart=findHighStart(newTrie, highValue);
1184 highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
1185 if(highStart==0x110000) {
1186 highValue=trie->errorValue;
1187 }
1188
1189 /*
1190 * Set trie->highStart only after utrie2_get32(trie, highStart).
1191 * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
1192 */
1193 trie->highStart=newTrie->highStart=highStart;
1194
1195 #ifdef UTRIE2_DEBUG
1196 printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n",
1197 (long)highStart, (long)highValue, (long)trie->initialValue);
1198 #endif
1199
1200 if(highStart<0x110000) {
1201 /* Blank out [highStart..10ffff] to release associated data blocks. */
1202 suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
1203 utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
1204 if(U_FAILURE(*pErrorCode)) {
1205 return;
1206 }
1207 }
1208
1209 compactData(newTrie);
1210 if(highStart>0x10000) {
1211 compactIndex2(newTrie);
1212 #ifdef UTRIE2_DEBUG
1213 } else {
1214 printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n",
1215 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
1216 #endif
1217 }
1218
1219 /*
1220 * Store the highValue in the data array and round up the dataLength.
1221 * Must be done after compactData() because that assumes that dataLength
1222 * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
1223 */
1224 newTrie->data[newTrie->dataLength++]=highValue;
1225 while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1226 newTrie->data[newTrie->dataLength++]=trie->initialValue;
1227 }
1228
1229 newTrie->isCompacted=TRUE;
1230 }
1231
1232 /* serialization ------------------------------------------------------------ */
1233
1234 /**
1235 * Maximum length of the runtime index array.
1236 * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1237 * (The actual maximum length is lower,
1238 * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1239 */
1240 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
1241
1242 /**
1243 * Maximum length of the runtime data array.
1244 * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1245 * and by uint16_t UTrie2Header.shiftedDataLength.
1246 */
1247 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
1248
1249 /* Compact and internally serialize the trie. */
1250 U_CAPI void U_EXPORT2
utrie2_freeze(UTrie2 * trie,UTrie2ValueBits valueBits,UErrorCode * pErrorCode)1251 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
1252 UNewTrie2 *newTrie;
1253 UTrie2Header *header;
1254 uint32_t *p;
1255 uint16_t *dest16;
1256 int32_t i, length;
1257 int32_t allIndexesLength;
1258 int32_t dataMove; /* >0 if the data is moved to the end of the index array */
1259 UChar32 highStart;
1260
1261 /* argument check */
1262 if(U_FAILURE(*pErrorCode)) {
1263 return;
1264 }
1265 if( trie==NULL ||
1266 valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
1267 ) {
1268 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1269 return;
1270 }
1271 newTrie=trie->newTrie;
1272 if(newTrie==NULL) {
1273 /* already frozen */
1274 UTrie2ValueBits frozenValueBits=
1275 trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
1276 if(valueBits!=frozenValueBits) {
1277 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1278 }
1279 return;
1280 }
1281
1282 /* compact if necessary */
1283 if(!newTrie->isCompacted) {
1284 compactTrie(trie, pErrorCode);
1285 if(U_FAILURE(*pErrorCode)) {
1286 return;
1287 }
1288 }
1289 highStart=trie->highStart;
1290
1291 if(highStart<=0x10000) {
1292 allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1293 } else {
1294 allIndexesLength=newTrie->index2Length;
1295 }
1296 if(valueBits==UTRIE2_16_VALUE_BITS) {
1297 dataMove=allIndexesLength;
1298 } else {
1299 dataMove=0;
1300 }
1301
1302 /* are indexLength and dataLength within limits? */
1303 if( /* for unshifted indexLength */
1304 allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1305 /* for unshifted dataNullOffset */
1306 (dataMove+newTrie->dataNullOffset)>0xffff ||
1307 /* for unshifted 2-byte UTF-8 index-2 values */
1308 (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1309 /* for shiftedDataLength */
1310 (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
1311 ) {
1312 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1313 return;
1314 }
1315
1316 /* calculate the total serialized length */
1317 length=sizeof(UTrie2Header)+allIndexesLength*2;
1318 if(valueBits==UTRIE2_16_VALUE_BITS) {
1319 length+=newTrie->dataLength*2;
1320 } else {
1321 length+=newTrie->dataLength*4;
1322 }
1323
1324 trie->memory=uprv_malloc(length);
1325 if(trie->memory==NULL) {
1326 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1327 return;
1328 }
1329 trie->length=length;
1330 trie->isMemoryOwned=TRUE;
1331
1332 trie->indexLength=allIndexesLength;
1333 trie->dataLength=newTrie->dataLength;
1334 if(highStart<=0x10000) {
1335 trie->index2NullOffset=0xffff;
1336 } else {
1337 trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset;
1338 }
1339 trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
1340 trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
1341
1342 /* set the header fields */
1343 header=(UTrie2Header *)trie->memory;
1344
1345 header->signature=UTRIE2_SIG; /* "Tri2" */
1346 header->options=(uint16_t)valueBits;
1347
1348 header->indexLength=(uint16_t)trie->indexLength;
1349 header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
1350 header->index2NullOffset=trie->index2NullOffset;
1351 header->dataNullOffset=trie->dataNullOffset;
1352 header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
1353
1354 /* fill the index and data arrays */
1355 dest16=(uint16_t *)(header+1);
1356 trie->index=dest16;
1357
1358 /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1359 p=(uint32_t *)newTrie->index2;
1360 for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
1361 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1362 }
1363
1364 /* write UTF-8 2-byte index-2 values, not right-shifted */
1365 for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */
1366 *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1367 }
1368 for(; i<(0xe0-0xc0); ++i) { /* C2..DF */
1369 *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
1370 }
1371
1372 if(highStart>0x10000) {
1373 int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
1374 int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
1375
1376 /* write 16-bit index-1 values for supplementary code points */
1377 p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1378 for(i=index1Length; i>0; --i) {
1379 *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1380 }
1381
1382 /*
1383 * write the index-2 array values for supplementary code points,
1384 * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1385 */
1386 p=(uint32_t *)newTrie->index2+index2Offset;
1387 for(i=newTrie->index2Length-index2Offset; i>0; --i) {
1388 *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1389 }
1390 }
1391
1392 /* write the 16/32-bit data array */
1393 switch(valueBits) {
1394 case UTRIE2_16_VALUE_BITS:
1395 /* write 16-bit data values */
1396 trie->data16=dest16;
1397 trie->data32=NULL;
1398 p=newTrie->data;
1399 for(i=newTrie->dataLength; i>0; --i) {
1400 *dest16++=(uint16_t)*p++;
1401 }
1402 break;
1403 case UTRIE2_32_VALUE_BITS:
1404 /* write 32-bit data values */
1405 trie->data16=NULL;
1406 trie->data32=(uint32_t *)dest16;
1407 uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4);
1408 break;
1409 default:
1410 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1411 return;
1412 }
1413
1414 /* Delete the UNewTrie2. */
1415 uprv_free(newTrie->data);
1416 uprv_free(newTrie);
1417 trie->newTrie=NULL;
1418 }
1419
1420 /*
1421 * This is here to avoid a dependency from utrie2.cpp on utrie.c.
1422 * This file already depends on utrie.c.
1423 * Otherwise, this should be in utrie2.cpp right after utrie2_swap().
1424 */
1425 U_CAPI int32_t U_EXPORT2
utrie2_swapAnyVersion(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1426 utrie2_swapAnyVersion(const UDataSwapper *ds,
1427 const void *inData, int32_t length, void *outData,
1428 UErrorCode *pErrorCode) {
1429 if(U_SUCCESS(*pErrorCode)) {
1430 switch(utrie2_getVersion(inData, length, TRUE)) {
1431 case 1:
1432 return utrie_swap(ds, inData, length, outData, pErrorCode);
1433 case 2:
1434 return utrie2_swap(ds, inData, length, outData, pErrorCode);
1435 default:
1436 *pErrorCode=U_INVALID_FORMAT_ERROR;
1437 return 0;
1438 }
1439 }
1440 return 0;
1441 }
1442