• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2001-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  utrie2_builder.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2008sep26 (split off from utrie2.c)
14 *   created by: Markus W. Scherer
15 *
16 *   This is a common implementation of a Unicode trie.
17 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 *   Unicode code points (0..0x10ffff).
19 *   This is the second common version of a Unicode trie (hence the name UTrie2).
20 *   See utrie2.h for a comparison.
21 *
22 *   This file contains only the builder code.
23 *   See utrie2.c for the runtime and enumeration code.
24 */
25 #ifdef UTRIE2_DEBUG
26 #   include <stdio.h>
27 #endif
28 
29 #include "unicode/utypes.h"
30 #include "cmemory.h"
31 #include "utrie2.h"
32 #include "utrie2_impl.h"
33 
34 #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */
35 
36 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
37 
38 /* Implementation notes ----------------------------------------------------- */
39 
40 /*
41  * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
42  * have been chosen to minimize trie sizes overall.
43  * Most of the code is flexible enough to work with a range of values,
44  * within certain limits.
45  *
46  * Exception: Support for separate values for lead surrogate code _units_
47  * vs. code _points_ was added after the constants were fixed,
48  * and has not been tested nor particularly designed for different constant values.
49  * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
50  * part and back.)
51  *
52  * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
53  * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
54  * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
55  * remapping) stops working.
56  *
57  * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
58  * assumes that a single index-2 block is used for 0x400 code points
59  * corresponding to one lead surrogate.
60  *
61  * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
62  * more than one Unicode plane, and the split of the index-2 table into a BMP
63  * part and a supplementary part, with a gap in between, would not work.
64  *
65  * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
66  * there is data with more than 64k distinct values,
67  * for example for Unihan collation with a separate collation weight per
68  * Han character.
69  */
70 
71 /* Building a trie ----------------------------------------------------------*/
72 
73 enum {
74     /** The null index-2 block, following the gap in the index-2 table. */
75     UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
76 
77     /** The start of allocated index-2 blocks. */
78     UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
79 
80     /**
81      * The null data block.
82      * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
83      * to work with 6-bit trail bytes from 2-byte UTF-8.
84      */
85     UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
86 
87     /** The start of allocated data blocks. */
88     UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
89 
90     /**
91      * The start of data blocks for U+0800 and above.
92      * Below, compaction uses a block length of 64 for 2-byte UTF-8.
93      * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
94      * Data values for 0x780 code points beyond ASCII.
95      */
96     UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
97 };
98 
99 /* Start with allocation of 16k data entries. */
100 #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
101 
102 /* Grow about 8x each time. */
103 #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
104 
105 static int32_t
106 allocIndex2Block(UNewTrie2 *trie);
107 
108 U_CAPI UTrie2 * U_EXPORT2
utrie2_open(uint32_t initialValue,uint32_t errorValue,UErrorCode * pErrorCode)109 utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
110     UTrie2 *trie;
111     UNewTrie2 *newTrie;
112     uint32_t *data;
113     int32_t i, j;
114 
115     if(U_FAILURE(*pErrorCode)) {
116         return NULL;
117     }
118 
119     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
120     newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
121     data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
122     if(trie==NULL || newTrie==NULL || data==NULL) {
123         uprv_free(trie);
124         uprv_free(newTrie);
125         uprv_free(data);
126         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
127         return 0;
128     }
129 
130     uprv_memset(trie, 0, sizeof(UTrie2));
131     trie->initialValue=initialValue;
132     trie->errorValue=errorValue;
133     trie->highStart=0x110000;
134     trie->newTrie=newTrie;
135 
136     newTrie->data=data;
137     newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
138     newTrie->initialValue=initialValue;
139     newTrie->errorValue=errorValue;
140     newTrie->highStart=0x110000;
141     newTrie->firstFreeBlock=0;  /* no free block in the list */
142     newTrie->isCompacted=FALSE;
143 
144     /*
145      * preallocate and reset
146      * - ASCII
147      * - the bad-UTF-8-data block
148      * - the null data block
149      */
150     for(i=0; i<0x80; ++i) {
151         newTrie->data[i]=initialValue;
152     }
153     for(; i<0xc0; ++i) {
154         newTrie->data[i]=errorValue;
155     }
156     for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
157         newTrie->data[i]=initialValue;
158     }
159     newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
160     newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
161 
162     /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
163     for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
164         newTrie->index2[i]=j;
165         newTrie->map[i]=1;
166     }
167     /* reference counts for the bad-UTF-8-data block */
168     for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
169         newTrie->map[i]=0;
170     }
171     /*
172      * Reference counts for the null data block: all blocks except for the ASCII blocks.
173      * Plus 1 so that we don't drop this block during compaction.
174      * Plus as many as needed for lead surrogate code points.
175      */
176     /* i==newTrie->dataNullOffset */
177     newTrie->map[i++]=
178         (0x110000>>UTRIE2_SHIFT_2)-
179         (0x80>>UTRIE2_SHIFT_2)+
180         1+
181         UTRIE2_LSCP_INDEX_2_LENGTH;
182     j+=UTRIE2_DATA_BLOCK_LENGTH;
183     for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
184         newTrie->map[i]=0;
185     }
186 
187     /*
188      * set the remaining indexes in the BMP index-2 block
189      * to the null data block
190      */
191     for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
192         newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
193     }
194 
195     /*
196      * Fill the index gap with impossible values so that compaction
197      * does not overlap other index-2 blocks with the gap.
198      */
199     for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
200         newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
201     }
202 
203     /* set the indexes in the null index-2 block */
204     for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
205         newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
206     }
207     newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
208     newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
209 
210     /* set the index-1 indexes for the linear index-2 block */
211     for(i=0, j=0;
212         i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
213         ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
214     ) {
215         newTrie->index1[i]=j;
216     }
217 
218     /* set the remaining index-1 indexes to the null index-2 block */
219     for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
220         newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
221     }
222 
223     /*
224      * Preallocate and reset data for U+0080..U+07ff,
225      * for 2-byte UTF-8 which will be compacted in 64-blocks
226      * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
227      */
228     for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
229         utrie2_set32(trie, i, initialValue, pErrorCode);
230     }
231 
232     return trie;
233 }
234 
235 static UNewTrie2 *
cloneBuilder(const UNewTrie2 * other)236 cloneBuilder(const UNewTrie2 *other) {
237     UNewTrie2 *trie;
238 
239     trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
240     if(trie==NULL) {
241         return NULL;
242     }
243 
244     trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
245     if(trie->data==NULL) {
246         uprv_free(trie);
247         return NULL;
248     }
249     trie->dataCapacity=other->dataCapacity;
250 
251     /* clone data */
252     uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
253     uprv_memcpy(trie->index2, other->index2, other->index2Length*4);
254     trie->index2NullOffset=other->index2NullOffset;
255     trie->index2Length=other->index2Length;
256 
257     uprv_memcpy(trie->data, other->data, other->dataLength*4);
258     trie->dataNullOffset=other->dataNullOffset;
259     trie->dataLength=other->dataLength;
260 
261     /* reference counters */
262     if(other->isCompacted) {
263         trie->firstFreeBlock=0;
264     } else {
265         uprv_memcpy(trie->map, other->map, (other->dataLength>>UTRIE2_SHIFT_2)*4);
266         trie->firstFreeBlock=other->firstFreeBlock;
267     }
268 
269     trie->initialValue=other->initialValue;
270     trie->errorValue=other->errorValue;
271     trie->highStart=other->highStart;
272     trie->isCompacted=other->isCompacted;
273 
274     return trie;
275 }
276 
277 U_CAPI UTrie2 * U_EXPORT2
utrie2_clone(const UTrie2 * other,UErrorCode * pErrorCode)278 utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
279     UTrie2 *trie;
280 
281     if(U_FAILURE(*pErrorCode)) {
282         return NULL;
283     }
284     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
285         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
286         return NULL;
287     }
288 
289     trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
290     if(trie==NULL) {
291         return NULL;
292     }
293     uprv_memcpy(trie, other, sizeof(UTrie2));
294 
295     if(other->memory!=NULL) {
296         trie->memory=uprv_malloc(other->length);
297         if(trie->memory!=NULL) {
298             trie->isMemoryOwned=TRUE;
299             uprv_memcpy(trie->memory, other->memory, other->length);
300 
301             /* make the clone's pointers point to its own memory */
302             trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
303             if(other->data16!=NULL) {
304                 trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
305             }
306             if(other->data32!=NULL) {
307                 trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
308             }
309         }
310     } else /* other->newTrie!=NULL */ {
311         trie->newTrie=cloneBuilder(other->newTrie);
312     }
313 
314     if(trie->memory==NULL && trie->newTrie==NULL) {
315         uprv_free(trie);
316         trie=NULL;
317     }
318     return trie;
319 }
320 
321 typedef struct NewTrieAndStatus {
322     UTrie2 *trie;
323     UErrorCode errorCode;
324     UBool exclusiveLimit;  /* rather than inclusive range end */
325 } NewTrieAndStatus;
326 
327 static UBool U_CALLCONV
copyEnumRange(const void * context,UChar32 start,UChar32 end,uint32_t value)328 copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
329     NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
330     if(value!=nt->trie->initialValue) {
331         if(nt->exclusiveLimit) {
332             --end;
333         }
334         if(start==end) {
335             utrie2_set32(nt->trie, start, value, &nt->errorCode);
336         } else {
337             utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
338         }
339         return U_SUCCESS(nt->errorCode);
340     } else {
341         return TRUE;
342     }
343 }
344 
345 #ifdef UTRIE2_DEBUG
346 static void
utrie_printLengths(const UTrie * trie)347 utrie_printLengths(const UTrie *trie) {
348     long indexLength=trie->indexLength;
349     long dataLength=(long)trie->dataLength;
350     long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
351     printf("**UTrieLengths** index:%6ld  data:%6ld  serialized:%6ld\n",
352            indexLength, dataLength, totalLength);
353 }
354 
355 static void
utrie2_printLengths(const UTrie2 * trie,const char * which)356 utrie2_printLengths(const UTrie2 *trie, const char *which) {
357     long indexLength=trie->indexLength;
358     long dataLength=(long)trie->dataLength;
359     long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
360     printf("**UTrie2Lengths(%s)** index:%6ld  data:%6ld  serialized:%6ld\n",
361            which, indexLength, dataLength, totalLength);
362 }
363 #endif
364 
365 U_CAPI UTrie2 * U_EXPORT2
utrie2_cloneAsThawed(const UTrie2 * other,UErrorCode * pErrorCode)366 utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
367     NewTrieAndStatus context;
368     UChar lead;
369 
370     if(U_FAILURE(*pErrorCode)) {
371         return NULL;
372     }
373     if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
374         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
375         return NULL;
376     }
377     if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
378         return utrie2_clone(other, pErrorCode);  /* clone an unfrozen trie */
379     }
380 
381     /* Clone the frozen trie by enumerating it and building a new one. */
382     context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
383     if(U_FAILURE(*pErrorCode)) {
384         return NULL;
385     }
386     context.exclusiveLimit=FALSE;
387     context.errorCode=*pErrorCode;
388     utrie2_enum(other, NULL, copyEnumRange, &context);
389     *pErrorCode=context.errorCode;
390     for(lead=0xd800; lead<0xdc00; ++lead) {
391         uint32_t value;
392         if(other->data32==NULL) {
393             value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
394         } else {
395             value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
396         }
397         if(value!=other->initialValue) {
398             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
399         }
400     }
401     if(U_FAILURE(*pErrorCode)) {
402         utrie2_close(context.trie);
403         context.trie=NULL;
404     }
405     return context.trie;
406 }
407 
408 /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
409 U_CAPI UTrie2 * U_EXPORT2
utrie2_fromUTrie(const UTrie * trie1,uint32_t errorValue,UErrorCode * pErrorCode)410 utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
411     NewTrieAndStatus context;
412     UChar lead;
413 
414     if(U_FAILURE(*pErrorCode)) {
415         return NULL;
416     }
417     if(trie1==NULL) {
418         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
419         return NULL;
420     }
421     context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
422     if(U_FAILURE(*pErrorCode)) {
423         return NULL;
424     }
425     context.exclusiveLimit=TRUE;
426     context.errorCode=*pErrorCode;
427     utrie_enum(trie1, NULL, copyEnumRange, &context);
428     *pErrorCode=context.errorCode;
429     for(lead=0xd800; lead<0xdc00; ++lead) {
430         uint32_t value;
431         if(trie1->data32==NULL) {
432             value=UTRIE_GET16_FROM_LEAD(trie1, lead);
433         } else {
434             value=UTRIE_GET32_FROM_LEAD(trie1, lead);
435         }
436         if(value!=trie1->initialValue) {
437             utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
438         }
439     }
440     if(U_SUCCESS(*pErrorCode)) {
441         utrie2_freeze(context.trie,
442                       trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
443                       pErrorCode);
444     }
445 #ifdef UTRIE2_DEBUG
446     if(U_SUCCESS(*pErrorCode)) {
447         utrie_printLengths(trie1);
448         utrie2_printLengths(context.trie, "fromUTrie");
449     }
450 #endif
451     if(U_FAILURE(*pErrorCode)) {
452         utrie2_close(context.trie);
453         context.trie=NULL;
454     }
455     return context.trie;
456 }
457 
458 static U_INLINE UBool
isInNullBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)459 isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
460     int32_t i2, block;
461 
462     if(U_IS_LEAD(c) && forLSCP) {
463         i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
464             (c>>UTRIE2_SHIFT_2);
465     } else {
466         i2=trie->index1[c>>UTRIE2_SHIFT_1]+
467             ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
468     }
469     block=trie->index2[i2];
470     return (UBool)(block==trie->dataNullOffset);
471 }
472 
473 static int32_t
allocIndex2Block(UNewTrie2 * trie)474 allocIndex2Block(UNewTrie2 *trie) {
475     int32_t newBlock, newTop;
476 
477     newBlock=trie->index2Length;
478     newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
479     if(newTop>LENGTHOF(trie->index2)) {
480         /*
481          * Should never occur.
482          * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
483          * or the code writes more values than should be possible.
484          */
485         return -1;
486     }
487     trie->index2Length=newTop;
488     uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
489     return newBlock;
490 }
491 
492 static int32_t
getIndex2Block(UNewTrie2 * trie,UChar32 c,UBool forLSCP)493 getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
494     int32_t i1, i2;
495 
496     if(U_IS_LEAD(c) && forLSCP) {
497         return UTRIE2_LSCP_INDEX_2_OFFSET;
498     }
499 
500     i1=c>>UTRIE2_SHIFT_1;
501     i2=trie->index1[i1];
502     if(i2==trie->index2NullOffset) {
503         i2=allocIndex2Block(trie);
504         if(i2<0) {
505             return -1;  /* program error */
506         }
507         trie->index1[i1]=i2;
508     }
509     return i2;
510 }
511 
512 static int32_t
allocDataBlock(UNewTrie2 * trie,int32_t copyBlock)513 allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
514     int32_t newBlock, newTop;
515 
516     if(trie->firstFreeBlock!=0) {
517         /* get the first free block */
518         newBlock=trie->firstFreeBlock;
519         trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
520     } else {
521         /* get a new block from the high end */
522         newBlock=trie->dataLength;
523         newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
524         if(newTop>trie->dataCapacity) {
525             /* out of memory in the data array */
526             int32_t capacity;
527             uint32_t *data;
528 
529             if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
530                 capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
531             } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
532                 capacity=UNEWTRIE2_MAX_DATA_LENGTH;
533             } else {
534                 /*
535                  * Should never occur.
536                  * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
537                  * or the code writes more values than should be possible.
538                  */
539                 return -1;
540             }
541             data=(uint32_t *)uprv_malloc(capacity*4);
542             if(data==NULL) {
543                 return -1;
544             }
545             uprv_memcpy(data, trie->data, trie->dataLength*4);
546             uprv_free(trie->data);
547             trie->data=data;
548             trie->dataCapacity=capacity;
549         }
550         trie->dataLength=newTop;
551     }
552     uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
553     trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
554     return newBlock;
555 }
556 
557 /* call when the block's reference counter reaches 0 */
558 static void
releaseDataBlock(UNewTrie2 * trie,int32_t block)559 releaseDataBlock(UNewTrie2 *trie, int32_t block) {
560     /* put this block at the front of the free-block chain */
561     trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
562     trie->firstFreeBlock=block;
563 }
564 
565 static U_INLINE UBool
isWritableBlock(UNewTrie2 * trie,int32_t block)566 isWritableBlock(UNewTrie2 *trie, int32_t block) {
567     return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
568 }
569 
570 static U_INLINE void
setIndex2Entry(UNewTrie2 * trie,int32_t i2,int32_t block)571 setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
572     int32_t oldBlock;
573     ++trie->map[block>>UTRIE2_SHIFT_2];  /* increment first, in case block==oldBlock! */
574     oldBlock=trie->index2[i2];
575     if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
576         releaseDataBlock(trie, oldBlock);
577     }
578     trie->index2[i2]=block;
579 }
580 
581 /**
582  * No error checking for illegal arguments.
583  *
584  * @return -1 if no new data block available (out of memory in data array)
585  * @internal
586  */
587 static int32_t
getDataBlock(UNewTrie2 * trie,UChar32 c,UBool forLSCP)588 getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
589     int32_t i2, oldBlock, newBlock;
590 
591     i2=getIndex2Block(trie, c, forLSCP);
592     if(i2<0) {
593         return -1;  /* program error */
594     }
595 
596     i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
597     oldBlock=trie->index2[i2];
598     if(isWritableBlock(trie, oldBlock)) {
599         return oldBlock;
600     }
601 
602     /* allocate a new data block */
603     newBlock=allocDataBlock(trie, oldBlock);
604     if(newBlock<0) {
605         /* out of memory in the data array */
606         return -1;
607     }
608     setIndex2Entry(trie, i2, newBlock);
609     return newBlock;
610 }
611 
612 /**
613  * @return TRUE if the value was successfully set
614  */
615 static void
set32(UNewTrie2 * trie,UChar32 c,UBool forLSCP,uint32_t value,UErrorCode * pErrorCode)616 set32(UNewTrie2 *trie,
617       UChar32 c, UBool forLSCP, uint32_t value,
618       UErrorCode *pErrorCode) {
619     int32_t block;
620 
621     if(trie==NULL || trie->isCompacted) {
622         *pErrorCode=U_NO_WRITE_PERMISSION;
623         return;
624     }
625 
626     block=getDataBlock(trie, c, forLSCP);
627     if(block<0) {
628         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
629         return;
630     }
631 
632     trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
633 }
634 
635 U_CAPI void U_EXPORT2
utrie2_set32(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)636 utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
637     if(U_FAILURE(*pErrorCode)) {
638         return;
639     }
640     if((uint32_t)c>0x10ffff) {
641         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
642         return;
643     }
644     set32(trie->newTrie, c, TRUE, value, pErrorCode);
645 }
646 
647 U_CAPI void U_EXPORT2
utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 * trie,UChar32 c,uint32_t value,UErrorCode * pErrorCode)648 utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
649                                      UChar32 c, uint32_t value,
650                                      UErrorCode *pErrorCode) {
651     if(U_FAILURE(*pErrorCode)) {
652         return;
653     }
654     if(!U_IS_LEAD(c)) {
655         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
656         return;
657     }
658     set32(trie->newTrie, c, FALSE, value, pErrorCode);
659 }
660 
661 static void
writeBlock(uint32_t * block,uint32_t value)662 writeBlock(uint32_t *block, uint32_t value) {
663     uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
664     while(block<limit) {
665         *block++=value;
666     }
667 }
668 
669 /**
670  * initialValue is ignored if overwrite=TRUE
671  * @internal
672  */
673 static void
fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)674 fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
675           uint32_t value, uint32_t initialValue, UBool overwrite) {
676     uint32_t *pLimit;
677 
678     pLimit=block+limit;
679     block+=start;
680     if(overwrite) {
681         while(block<pLimit) {
682             *block++=value;
683         }
684     } else {
685         while(block<pLimit) {
686             if(*block==initialValue) {
687                 *block=value;
688             }
689             ++block;
690         }
691     }
692 }
693 
694 U_CAPI void U_EXPORT2
utrie2_setRange32(UTrie2 * trie,UChar32 start,UChar32 end,uint32_t value,UBool overwrite,UErrorCode * pErrorCode)695 utrie2_setRange32(UTrie2 *trie,
696                   UChar32 start, UChar32 end,
697                   uint32_t value, UBool overwrite,
698                   UErrorCode *pErrorCode) {
699     /*
700      * repeat value in [start..end]
701      * mark index values for repeat-data blocks by setting bit 31 of the index values
702      * fill around existing values if any, if(overwrite)
703      */
704     UNewTrie2 *newTrie;
705     int32_t block, rest, repeatBlock;
706     UChar32 limit;
707 
708     if(U_FAILURE(*pErrorCode)) {
709         return;
710     }
711     if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
712         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
713         return;
714     }
715     newTrie=trie->newTrie;
716     if(newTrie==NULL || newTrie->isCompacted) {
717         *pErrorCode=U_NO_WRITE_PERMISSION;
718         return;
719     }
720     if(!overwrite && value==newTrie->initialValue) {
721         return; /* nothing to do */
722     }
723 
724     limit=end+1;
725     if(start&UTRIE2_DATA_MASK) {
726         UChar32 nextStart;
727 
728         /* set partial block at [start..following block boundary[ */
729         block=getDataBlock(newTrie, start, TRUE);
730         if(block<0) {
731             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
732             return;
733         }
734 
735         nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
736         if(nextStart<=limit) {
737             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
738                       value, newTrie->initialValue, overwrite);
739             start=nextStart;
740         } else {
741             fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
742                       value, newTrie->initialValue, overwrite);
743             return;
744         }
745     }
746 
747     /* number of positions in the last, partial block */
748     rest=limit&UTRIE2_DATA_MASK;
749 
750     /* round down limit to a block boundary */
751     limit&=~UTRIE2_DATA_MASK;
752 
753     /* iterate over all-value blocks */
754     if(value==newTrie->initialValue) {
755         repeatBlock=newTrie->dataNullOffset;
756     } else {
757         repeatBlock=-1;
758     }
759 
760     while(start<limit) {
761         int32_t i2;
762         UBool setRepeatBlock=FALSE;
763 
764         if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
765             start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
766             continue;
767         }
768 
769         /* get index value */
770         i2=getIndex2Block(newTrie, start, TRUE);
771         if(i2<0) {
772             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
773             return;
774         }
775         i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
776         block=newTrie->index2[i2];
777         if(isWritableBlock(newTrie, block)) {
778             /* already allocated */
779             if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
780                 /*
781                  * We overwrite all values, and it's not a
782                  * protected (ASCII-linear or 2-byte UTF-8) block:
783                  * replace with the repeatBlock.
784                  */
785                 setRepeatBlock=TRUE;
786             } else {
787                 /* !overwrite, or protected block: just write the values into this block */
788                 fillBlock(newTrie->data+block,
789                           0, UTRIE2_DATA_BLOCK_LENGTH,
790                           value, newTrie->initialValue, overwrite);
791             }
792         } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
793             /*
794              * Set the repeatBlock instead of the null block or previous repeat block:
795              *
796              * If !isWritableBlock() then all entries in the block have the same value
797              * because it's the null block or a range block (the repeatBlock from a previous
798              * call to utrie2_setRange32()).
799              * No other blocks are used multiple times before compacting.
800              *
801              * The null block is the only non-writable block with the initialValue because
802              * of the repeatBlock initialization above. (If value==initialValue, then
803              * the repeatBlock will be the null data block.)
804              *
805              * We set our repeatBlock if the desired value differs from the block's value,
806              * and if we overwrite any data or if the data is all initial values
807              * (which is the same as the block being the null block, see above).
808              */
809             setRepeatBlock=TRUE;
810         }
811         if(setRepeatBlock) {
812             if(repeatBlock>=0) {
813                 setIndex2Entry(newTrie, i2, repeatBlock);
814             } else {
815                 /* create and set and fill the repeatBlock */
816                 repeatBlock=getDataBlock(newTrie, start, TRUE);
817                 if(repeatBlock<0) {
818                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
819                     return;
820                 }
821                 writeBlock(newTrie->data+repeatBlock, value);
822             }
823         }
824 
825         start+=UTRIE2_DATA_BLOCK_LENGTH;
826     }
827 
828     if(rest>0) {
829         /* set partial block at [last block boundary..limit[ */
830         block=getDataBlock(newTrie, start, TRUE);
831         if(block<0) {
832             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
833             return;
834         }
835 
836         fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
837     }
838 
839     return;
840 }
841 
842 /* compaction --------------------------------------------------------------- */
843 
844 static U_INLINE UBool
equal_int32(const int32_t * s,const int32_t * t,int32_t length)845 equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
846     while(length>0 && *s==*t) {
847         ++s;
848         ++t;
849         --length;
850     }
851     return (UBool)(length==0);
852 }
853 
854 static U_INLINE UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)855 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
856     while(length>0 && *s==*t) {
857         ++s;
858         ++t;
859         --length;
860     }
861     return (UBool)(length==0);
862 }
863 
864 static int32_t
findSameIndex2Block(const int32_t * idx,int32_t index2Length,int32_t otherBlock)865 findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
866     int32_t block;
867 
868     /* ensure that we do not even partially get past index2Length */
869     index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
870 
871     for(block=0; block<=index2Length; ++block) {
872         if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
873             return block;
874         }
875     }
876     return -1;
877 }
878 
879 static int32_t
findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t blockLength)880 findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
881     int32_t block;
882 
883     /* ensure that we do not even partially get past dataLength */
884     dataLength-=blockLength;
885 
886     for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
887         if(equal_uint32(data+block, data+otherBlock, blockLength)) {
888             return block;
889         }
890     }
891     return -1;
892 }
893 
894 /*
895  * Find the start of the last range in the trie by enumerating backward.
896  * Indexes for supplementary code points higher than this will be omitted.
897  */
898 static UChar32
findHighStart(UNewTrie2 * trie,uint32_t highValue)899 findHighStart(UNewTrie2 *trie, uint32_t highValue) {
900     const uint32_t *data32;
901 
902     uint32_t value, initialValue;
903     UChar32 c, prev;
904     int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
905 
906     data32=trie->data;
907     initialValue=trie->initialValue;
908 
909     index2NullOffset=trie->index2NullOffset;
910     nullBlock=trie->dataNullOffset;
911 
912     /* set variables for previous range */
913     if(highValue==initialValue) {
914         prevI2Block=index2NullOffset;
915         prevBlock=nullBlock;
916     } else {
917         prevI2Block=-1;
918         prevBlock=-1;
919     }
920     prev=0x110000;
921 
922     /* enumerate index-2 blocks */
923     i1=UNEWTRIE2_INDEX_1_LENGTH;
924     c=prev;
925     while(c>0) {
926         i2Block=trie->index1[--i1];
927         if(i2Block==prevI2Block) {
928             /* the index-2 block is the same as the previous one, and filled with highValue */
929             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
930             continue;
931         }
932         prevI2Block=i2Block;
933         if(i2Block==index2NullOffset) {
934             /* this is the null index-2 block */
935             if(highValue!=initialValue) {
936                 return c;
937             }
938             c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
939         } else {
940             /* enumerate data blocks for one index-2 block */
941             for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
942                 block=trie->index2[i2Block+ --i2];
943                 if(block==prevBlock) {
944                     /* the block is the same as the previous one, and filled with highValue */
945                     c-=UTRIE2_DATA_BLOCK_LENGTH;
946                     continue;
947                 }
948                 prevBlock=block;
949                 if(block==nullBlock) {
950                     /* this is the null data block */
951                     if(highValue!=initialValue) {
952                         return c;
953                     }
954                     c-=UTRIE2_DATA_BLOCK_LENGTH;
955                 } else {
956                     for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
957                         value=data32[block+ --j];
958                         if(value!=highValue) {
959                             return c;
960                         }
961                         --c;
962                     }
963                 }
964             }
965         }
966     }
967 
968     /* deliver last range */
969     return 0;
970 }
971 
972 /*
973  * Compact a build-time trie.
974  *
975  * The compaction
976  * - removes blocks that are identical with earlier ones
977  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
978  * - moves blocks in steps of the data granularity
979  * - moves and overlaps blocks that overlap with multiple values in the overlap region
980  *
981  * It does not
982  * - try to move and overlap blocks that are not already adjacent
983  */
984 static void
compactData(UNewTrie2 * trie)985 compactData(UNewTrie2 *trie) {
986     int32_t start, newStart, movedStart;
987     int32_t blockLength, overlap;
988     int32_t i, mapIndex, blockCount;
989 
990     /* do not compact linear-ASCII data */
991     newStart=UTRIE2_DATA_START_OFFSET;
992     for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
993         trie->map[i]=start;
994     }
995 
996     /*
997      * Start with a block length of 64 for 2-byte UTF-8,
998      * then switch to UTRIE2_DATA_BLOCK_LENGTH.
999      */
1000     blockLength=64;
1001     blockCount=blockLength>>UTRIE2_SHIFT_2;
1002     for(start=newStart; start<trie->dataLength;) {
1003         /*
1004          * start: index of first entry of current block
1005          * newStart: index where the current block is to be moved
1006          *           (right after current end of already-compacted data)
1007          */
1008         if(start==UNEWTRIE2_DATA_0800_OFFSET) {
1009             blockLength=UTRIE2_DATA_BLOCK_LENGTH;
1010             blockCount=1;
1011         }
1012 
1013         /* skip blocks that are not used */
1014         if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
1015             /* advance start to the next block */
1016             start+=blockLength;
1017 
1018             /* leave newStart with the previous block! */
1019             continue;
1020         }
1021 
1022         /* search for an identical block */
1023         if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
1024              >=0
1025         ) {
1026             /* found an identical block, set the other block's index value for the current block */
1027             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1028                 trie->map[mapIndex++]=movedStart;
1029                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1030             }
1031 
1032             /* advance start to the next block */
1033             start+=blockLength;
1034 
1035             /* leave newStart with the previous block! */
1036             continue;
1037         }
1038 
1039         /* see if the beginning of this block can be overlapped with the end of the previous block */
1040         /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
1041         for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
1042             overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
1043             overlap-=UTRIE2_DATA_GRANULARITY) {}
1044 
1045         if(overlap>0 || newStart<start) {
1046             /* some overlap, or just move the whole block */
1047             movedStart=newStart-overlap;
1048             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1049                 trie->map[mapIndex++]=movedStart;
1050                 movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1051             }
1052 
1053             /* move the non-overlapping indexes to their new positions */
1054             start+=overlap;
1055             for(i=blockLength-overlap; i>0; --i) {
1056                 trie->data[newStart++]=trie->data[start++];
1057             }
1058         } else /* no overlap && newStart==start */ {
1059             for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1060                 trie->map[mapIndex++]=start;
1061                 start+=UTRIE2_DATA_BLOCK_LENGTH;
1062             }
1063             newStart=start;
1064         }
1065     }
1066 
1067     /* now adjust the index-2 table */
1068     for(i=0; i<trie->index2Length; ++i) {
1069         if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
1070             /* Gap indexes are invalid (-1). Skip over the gap. */
1071             i+=UNEWTRIE2_INDEX_GAP_LENGTH;
1072         }
1073         trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
1074     }
1075     trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
1076 
1077     /* ensure dataLength alignment */
1078     while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1079         trie->data[newStart++]=trie->initialValue;
1080     }
1081 
1082 #ifdef UTRIE2_DEBUG
1083     /* we saved some space */
1084     printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n",
1085             (long)trie->dataLength, (long)newStart);
1086 #endif
1087 
1088     trie->dataLength=newStart;
1089 }
1090 
1091 static void
compactIndex2(UNewTrie2 * trie)1092 compactIndex2(UNewTrie2 *trie) {
1093     int32_t i, start, newStart, movedStart, overlap;
1094 
1095     /* do not compact linear-BMP index-2 blocks */
1096     newStart=UTRIE2_INDEX_2_BMP_LENGTH;
1097     for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
1098         trie->map[i]=start;
1099     }
1100 
1101     /* Reduce the index table gap to what will be needed at runtime. */
1102     newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
1103 
1104     for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
1105         /*
1106          * start: index of first entry of current block
1107          * newStart: index where the current block is to be moved
1108          *           (right after current end of already-compacted data)
1109          */
1110 
1111         /* search for an identical block */
1112         if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
1113              >=0
1114         ) {
1115             /* found an identical block, set the other block's index value for the current block */
1116             trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
1117 
1118             /* advance start to the next block */
1119             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1120 
1121             /* leave newStart with the previous block! */
1122             continue;
1123         }
1124 
1125         /* see if the beginning of this block can be overlapped with the end of the previous block */
1126         /* look for maximum overlap with the previous, adjacent block */
1127         for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
1128             overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
1129             --overlap) {}
1130 
1131         if(overlap>0 || newStart<start) {
1132             /* some overlap, or just move the whole block */
1133             trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
1134 
1135             /* move the non-overlapping indexes to their new positions */
1136             start+=overlap;
1137             for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
1138                 trie->index2[newStart++]=trie->index2[start++];
1139             }
1140         } else /* no overlap && newStart==start */ {
1141             trie->map[start>>UTRIE2_SHIFT_1_2]=start;
1142             start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1143             newStart=start;
1144         }
1145     }
1146 
1147     /* now adjust the index-1 table */
1148     for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
1149         trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
1150     }
1151     trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
1152 
1153     /*
1154      * Ensure data table alignment:
1155      * Needs to be granularity-aligned for 16-bit trie
1156      * (so that dataMove will be down-shiftable),
1157      * and 2-aligned for uint32_t data.
1158      */
1159     while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
1160         /* Arbitrary value: 0x3fffc not possible for real data. */
1161         trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
1162     }
1163 
1164 #ifdef UTRIE2_DEBUG
1165     /* we saved some space */
1166     printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n",
1167             (long)trie->index2Length, (long)newStart);
1168 #endif
1169 
1170     trie->index2Length=newStart;
1171 }
1172 
1173 static void
compactTrie(UTrie2 * trie,UErrorCode * pErrorCode)1174 compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
1175     UNewTrie2 *newTrie;
1176     UChar32 highStart, suppHighStart;
1177     uint32_t highValue;
1178 
1179     newTrie=trie->newTrie;
1180 
1181     /* find highStart and round it up */
1182     highValue=utrie2_get32(trie, 0x10ffff);
1183     highStart=findHighStart(newTrie, highValue);
1184     highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
1185     if(highStart==0x110000) {
1186         highValue=trie->errorValue;
1187     }
1188 
1189     /*
1190      * Set trie->highStart only after utrie2_get32(trie, highStart).
1191      * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
1192      */
1193     trie->highStart=newTrie->highStart=highStart;
1194 
1195 #ifdef UTRIE2_DEBUG
1196     printf("UTrie2: highStart U+%04lx  highValue 0x%lx  initialValue 0x%lx\n",
1197             (long)highStart, (long)highValue, (long)trie->initialValue);
1198 #endif
1199 
1200     if(highStart<0x110000) {
1201         /* Blank out [highStart..10ffff] to release associated data blocks. */
1202         suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
1203         utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
1204         if(U_FAILURE(*pErrorCode)) {
1205             return;
1206         }
1207     }
1208 
1209     compactData(newTrie);
1210     if(highStart>0x10000) {
1211         compactIndex2(newTrie);
1212 #ifdef UTRIE2_DEBUG
1213     } else {
1214         printf("UTrie2: highStart U+%04lx  count of 16-bit index-2 words %lu->%lu\n",
1215                 (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
1216 #endif
1217     }
1218 
1219     /*
1220      * Store the highValue in the data array and round up the dataLength.
1221      * Must be done after compactData() because that assumes that dataLength
1222      * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
1223      */
1224     newTrie->data[newTrie->dataLength++]=highValue;
1225     while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1226         newTrie->data[newTrie->dataLength++]=trie->initialValue;
1227     }
1228 
1229     newTrie->isCompacted=TRUE;
1230 }
1231 
1232 /* serialization ------------------------------------------------------------ */
1233 
1234 /**
1235  * Maximum length of the runtime index array.
1236  * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1237  * (The actual maximum length is lower,
1238  * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1239  */
1240 #define UTRIE2_MAX_INDEX_LENGTH 0xffff
1241 
1242 /**
1243  * Maximum length of the runtime data array.
1244  * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1245  * and by uint16_t UTrie2Header.shiftedDataLength.
1246  */
1247 #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
1248 
1249 /* Compact and internally serialize the trie. */
1250 U_CAPI void U_EXPORT2
utrie2_freeze(UTrie2 * trie,UTrie2ValueBits valueBits,UErrorCode * pErrorCode)1251 utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
1252     UNewTrie2 *newTrie;
1253     UTrie2Header *header;
1254     uint32_t *p;
1255     uint16_t *dest16;
1256     int32_t i, length;
1257     int32_t allIndexesLength;
1258     int32_t dataMove;  /* >0 if the data is moved to the end of the index array */
1259     UChar32 highStart;
1260 
1261     /* argument check */
1262     if(U_FAILURE(*pErrorCode)) {
1263         return;
1264     }
1265     if( trie==NULL ||
1266         valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
1267     ) {
1268         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1269         return;
1270     }
1271     newTrie=trie->newTrie;
1272     if(newTrie==NULL) {
1273         /* already frozen */
1274         UTrie2ValueBits frozenValueBits=
1275             trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
1276         if(valueBits!=frozenValueBits) {
1277             *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1278         }
1279         return;
1280     }
1281 
1282     /* compact if necessary */
1283     if(!newTrie->isCompacted) {
1284         compactTrie(trie, pErrorCode);
1285         if(U_FAILURE(*pErrorCode)) {
1286             return;
1287         }
1288     }
1289     highStart=trie->highStart;
1290 
1291     if(highStart<=0x10000) {
1292         allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1293     } else {
1294         allIndexesLength=newTrie->index2Length;
1295     }
1296     if(valueBits==UTRIE2_16_VALUE_BITS) {
1297         dataMove=allIndexesLength;
1298     } else {
1299         dataMove=0;
1300     }
1301 
1302     /* are indexLength and dataLength within limits? */
1303     if( /* for unshifted indexLength */
1304         allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1305         /* for unshifted dataNullOffset */
1306         (dataMove+newTrie->dataNullOffset)>0xffff ||
1307         /* for unshifted 2-byte UTF-8 index-2 values */
1308         (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1309         /* for shiftedDataLength */
1310         (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
1311     ) {
1312         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1313         return;
1314     }
1315 
1316     /* calculate the total serialized length */
1317     length=sizeof(UTrie2Header)+allIndexesLength*2;
1318     if(valueBits==UTRIE2_16_VALUE_BITS) {
1319         length+=newTrie->dataLength*2;
1320     } else {
1321         length+=newTrie->dataLength*4;
1322     }
1323 
1324     trie->memory=uprv_malloc(length);
1325     if(trie->memory==NULL) {
1326         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1327         return;
1328     }
1329     trie->length=length;
1330     trie->isMemoryOwned=TRUE;
1331 
1332     trie->indexLength=allIndexesLength;
1333     trie->dataLength=newTrie->dataLength;
1334     if(highStart<=0x10000) {
1335         trie->index2NullOffset=0xffff;
1336     } else {
1337         trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset;
1338     }
1339     trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
1340     trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
1341 
1342     /* set the header fields */
1343     header=(UTrie2Header *)trie->memory;
1344 
1345     header->signature=UTRIE2_SIG; /* "Tri2" */
1346     header->options=(uint16_t)valueBits;
1347 
1348     header->indexLength=(uint16_t)trie->indexLength;
1349     header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
1350     header->index2NullOffset=trie->index2NullOffset;
1351     header->dataNullOffset=trie->dataNullOffset;
1352     header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
1353 
1354     /* fill the index and data arrays */
1355     dest16=(uint16_t *)(header+1);
1356     trie->index=dest16;
1357 
1358     /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1359     p=(uint32_t *)newTrie->index2;
1360     for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
1361         *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1362     }
1363 
1364     /* write UTF-8 2-byte index-2 values, not right-shifted */
1365     for(i=0; i<(0xc2-0xc0); ++i) {                                  /* C0..C1 */
1366         *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1367     }
1368     for(; i<(0xe0-0xc0); ++i) {                                     /* C2..DF */
1369         *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
1370     }
1371 
1372     if(highStart>0x10000) {
1373         int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
1374         int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
1375 
1376         /* write 16-bit index-1 values for supplementary code points */
1377         p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1378         for(i=index1Length; i>0; --i) {
1379             *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1380         }
1381 
1382         /*
1383          * write the index-2 array values for supplementary code points,
1384          * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1385          */
1386         p=(uint32_t *)newTrie->index2+index2Offset;
1387         for(i=newTrie->index2Length-index2Offset; i>0; --i) {
1388             *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1389         }
1390     }
1391 
1392     /* write the 16/32-bit data array */
1393     switch(valueBits) {
1394     case UTRIE2_16_VALUE_BITS:
1395         /* write 16-bit data values */
1396         trie->data16=dest16;
1397         trie->data32=NULL;
1398         p=newTrie->data;
1399         for(i=newTrie->dataLength; i>0; --i) {
1400             *dest16++=(uint16_t)*p++;
1401         }
1402         break;
1403     case UTRIE2_32_VALUE_BITS:
1404         /* write 32-bit data values */
1405         trie->data16=NULL;
1406         trie->data32=(uint32_t *)dest16;
1407         uprv_memcpy(dest16, newTrie->data, newTrie->dataLength*4);
1408         break;
1409     default:
1410         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1411         return;
1412     }
1413 
1414     /* Delete the UNewTrie2. */
1415     uprv_free(newTrie->data);
1416     uprv_free(newTrie);
1417     trie->newTrie=NULL;
1418 }
1419 
1420 U_CAPI UBool U_EXPORT2
utrie2_isFrozen(const UTrie2 * trie)1421 utrie2_isFrozen(const UTrie2 *trie) {
1422     return (UBool)(trie->newTrie==NULL);
1423 }
1424 
1425 U_CAPI int32_t U_EXPORT2
utrie2_serialize(UTrie2 * trie,void * data,int32_t capacity,UErrorCode * pErrorCode)1426 utrie2_serialize(UTrie2 *trie,
1427                  void *data, int32_t capacity,
1428                  UErrorCode *pErrorCode) {
1429     /* argument check */
1430     if(U_FAILURE(*pErrorCode)) {
1431         return 0;
1432     }
1433 
1434     if( trie==NULL || trie->memory==NULL || trie->newTrie!=NULL ||
1435         capacity<0 || (capacity>0 && (data==NULL || (U_POINTER_MASK_LSB(data, 3)!=0)))
1436     ) {
1437         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1438         return 0;
1439     }
1440 
1441     if(capacity>=trie->length) {
1442         uprv_memcpy(data, trie->memory, trie->length);
1443     } else {
1444         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1445     }
1446     return trie->length;
1447 }
1448 
1449 /*
1450  * This is here to avoid a dependency from utrie2.cpp on utrie.c.
1451  * This file already depends on utrie.c.
1452  * Otherwise, this should be in utrie2.cpp right after utrie2_swap().
1453  */
1454 U_CAPI int32_t U_EXPORT2
utrie2_swapAnyVersion(const UDataSwapper * ds,const void * inData,int32_t length,void * outData,UErrorCode * pErrorCode)1455 utrie2_swapAnyVersion(const UDataSwapper *ds,
1456                       const void *inData, int32_t length, void *outData,
1457                       UErrorCode *pErrorCode) {
1458     if(U_SUCCESS(*pErrorCode)) {
1459         switch(utrie2_getVersion(inData, length, TRUE)) {
1460         case 1:
1461             return utrie_swap(ds, inData, length, outData, pErrorCode);
1462         case 2:
1463             return utrie2_swap(ds, inData, length, outData, pErrorCode);
1464         default:
1465             *pErrorCode=U_INVALID_FORMAT_ERROR;
1466             return 0;
1467         }
1468     }
1469     return 0;
1470 }
1471