• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2001-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  utrie.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2001oct20
14 *   created by: Markus W. Scherer
15 *
16 *   This is a common implementation of a "folded" trie.
17 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 *   Unicode code points (0..0x10ffff).
19 */
20 
21 #ifdef UTRIE_DEBUG
22 #   include <stdio.h>
23 #endif
24 
25 #include "unicode/utypes.h"
26 #include "cmemory.h"
27 #include "utrie.h"
28 
29 /* miscellaneous ------------------------------------------------------------ */
30 
31 #undef ABS
32 #define ABS(x) ((x)>=0 ? (x) : -(x))
33 
34 static inline UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)35 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
36     while(length>0 && *s==*t) {
37         ++s;
38         ++t;
39         --length;
40     }
41     return (UBool)(length==0);
42 }
43 
44 /* Building a trie ----------------------------------------------------------*/
45 
46 U_CAPI UNewTrie * U_EXPORT2
utrie_open(UNewTrie * fillIn,uint32_t * aliasData,int32_t maxDataLength,uint32_t initialValue,uint32_t leadUnitValue,UBool latin1Linear)47 utrie_open(UNewTrie *fillIn,
48            uint32_t *aliasData, int32_t maxDataLength,
49            uint32_t initialValue, uint32_t leadUnitValue,
50            UBool latin1Linear) {
51     UNewTrie *trie;
52     int32_t i, j;
53 
54     if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||
55         (latin1Linear && maxDataLength<1024)
56     ) {
57         return NULL;
58     }
59 
60     if(fillIn!=NULL) {
61         trie=fillIn;
62     } else {
63         trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));
64         if(trie==NULL) {
65             return NULL;
66         }
67     }
68     uprv_memset(trie, 0, sizeof(UNewTrie));
69     trie->isAllocated= (UBool)(fillIn==NULL);
70 
71     if(aliasData!=NULL) {
72         trie->data=aliasData;
73         trie->isDataAllocated=FALSE;
74     } else {
75         trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);
76         if(trie->data==NULL) {
77             uprv_free(trie);
78             return NULL;
79         }
80         trie->isDataAllocated=TRUE;
81     }
82 
83     /* preallocate and reset the first data block (block index 0) */
84     j=UTRIE_DATA_BLOCK_LENGTH;
85 
86     if(latin1Linear) {
87         /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
88         /* made sure above that maxDataLength>=1024 */
89 
90         /* set indexes to point to consecutive data blocks */
91         i=0;
92         do {
93             /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
94             trie->index[i++]=j;
95             j+=UTRIE_DATA_BLOCK_LENGTH;
96         } while(i<(256>>UTRIE_SHIFT));
97     }
98 
99     /* reset the initially allocated blocks to the initial value */
100     trie->dataLength=j;
101     while(j>0) {
102         trie->data[--j]=initialValue;
103     }
104 
105     trie->leadUnitValue=leadUnitValue;
106     trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
107     trie->dataCapacity=maxDataLength;
108     trie->isLatin1Linear=latin1Linear;
109     trie->isCompacted=FALSE;
110     return trie;
111 }
112 
113 U_CAPI UNewTrie * U_EXPORT2
utrie_clone(UNewTrie * fillIn,const UNewTrie * other,uint32_t * aliasData,int32_t aliasDataCapacity)114 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) {
115     UNewTrie *trie;
116     UBool isDataAllocated;
117 
118     /* do not clone if other is not valid or already compacted */
119     if(other==NULL || other->data==NULL || other->isCompacted) {
120         return NULL;
121     }
122 
123     /* clone data */
124     if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) {
125         isDataAllocated=FALSE;
126     } else {
127         aliasDataCapacity=other->dataCapacity;
128         aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);
129         if(aliasData==NULL) {
130             return NULL;
131         }
132         isDataAllocated=TRUE;
133     }
134 
135     trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
136                     other->data[0], other->leadUnitValue,
137                     other->isLatin1Linear);
138     if(trie==NULL) {
139         uprv_free(aliasData);
140     } else {
141         uprv_memcpy(trie->index, other->index, sizeof(trie->index));
142         uprv_memcpy(trie->data, other->data, other->dataLength*4);
143         trie->dataLength=other->dataLength;
144         trie->isDataAllocated=isDataAllocated;
145     }
146 
147     return trie;
148 }
149 
150 U_CAPI void U_EXPORT2
utrie_close(UNewTrie * trie)151 utrie_close(UNewTrie *trie) {
152     if(trie!=NULL) {
153         if(trie->isDataAllocated) {
154             uprv_free(trie->data);
155             trie->data=NULL;
156         }
157         if(trie->isAllocated) {
158             uprv_free(trie);
159         }
160     }
161 }
162 
163 U_CAPI uint32_t * U_EXPORT2
utrie_getData(UNewTrie * trie,int32_t * pLength)164 utrie_getData(UNewTrie *trie, int32_t *pLength) {
165     if(trie==NULL || pLength==NULL) {
166         return NULL;
167     }
168 
169     *pLength=trie->dataLength;
170     return trie->data;
171 }
172 
173 static int32_t
utrie_allocDataBlock(UNewTrie * trie)174 utrie_allocDataBlock(UNewTrie *trie) {
175     int32_t newBlock, newTop;
176 
177     newBlock=trie->dataLength;
178     newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
179     if(newTop>trie->dataCapacity) {
180         /* out of memory in the data array */
181         return -1;
182     }
183     trie->dataLength=newTop;
184     return newBlock;
185 }
186 
187 /**
188  * No error checking for illegal arguments.
189  *
190  * @return -1 if no new data block available (out of memory in data array)
191  * @internal
192  */
193 static int32_t
utrie_getDataBlock(UNewTrie * trie,UChar32 c)194 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
195     int32_t indexValue, newBlock;
196 
197     c>>=UTRIE_SHIFT;
198     indexValue=trie->index[c];
199     if(indexValue>0) {
200         return indexValue;
201     }
202 
203     /* allocate a new data block */
204     newBlock=utrie_allocDataBlock(trie);
205     if(newBlock<0) {
206         /* out of memory in the data array */
207         return -1;
208     }
209     trie->index[c]=newBlock;
210 
211     /* copy-on-write for a block from a setRange() */
212     uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);
213     return newBlock;
214 }
215 
216 /**
217  * @return TRUE if the value was successfully set
218  */
219 U_CAPI UBool U_EXPORT2
utrie_set32(UNewTrie * trie,UChar32 c,uint32_t value)220 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) {
221     int32_t block;
222 
223     /* valid, uncompacted trie and valid c? */
224     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
225         return FALSE;
226     }
227 
228     block=utrie_getDataBlock(trie, c);
229     if(block<0) {
230         return FALSE;
231     }
232 
233     trie->data[block+(c&UTRIE_MASK)]=value;
234     return TRUE;
235 }
236 
237 U_CAPI uint32_t U_EXPORT2
utrie_get32(UNewTrie * trie,UChar32 c,UBool * pInBlockZero)238 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) {
239     int32_t block;
240 
241     /* valid, uncompacted trie and valid c? */
242     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
243         if(pInBlockZero!=NULL) {
244             *pInBlockZero=TRUE;
245         }
246         return 0;
247     }
248 
249     block=trie->index[c>>UTRIE_SHIFT];
250     if(pInBlockZero!=NULL) {
251         *pInBlockZero= (UBool)(block==0);
252     }
253 
254     return trie->data[ABS(block)+(c&UTRIE_MASK)];
255 }
256 
257 /**
258  * @internal
259  */
260 static void
utrie_fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)261 utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
262                 uint32_t value, uint32_t initialValue, UBool overwrite) {
263     uint32_t *pLimit;
264 
265     pLimit=block+limit;
266     block+=start;
267     if(overwrite) {
268         while(block<pLimit) {
269             *block++=value;
270         }
271     } else {
272         while(block<pLimit) {
273             if(*block==initialValue) {
274                 *block=value;
275             }
276             ++block;
277         }
278     }
279 }
280 
281 U_CAPI UBool U_EXPORT2
utrie_setRange32(UNewTrie * trie,UChar32 start,UChar32 limit,uint32_t value,UBool overwrite)282 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) {
283     /*
284      * repeat value in [start..limit[
285      * mark index values for repeat-data blocks by setting bit 31 of the index values
286      * fill around existing values if any, if(overwrite)
287      */
288     uint32_t initialValue;
289     int32_t block, rest, repeatBlock;
290 
291     /* valid, uncompacted trie and valid indexes? */
292     if( trie==NULL || trie->isCompacted ||
293         (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit
294     ) {
295         return FALSE;
296     }
297     if(start==limit) {
298         return TRUE; /* nothing to do */
299     }
300 
301     initialValue=trie->data[0];
302     if(start&UTRIE_MASK) {
303         UChar32 nextStart;
304 
305         /* set partial block at [start..following block boundary[ */
306         block=utrie_getDataBlock(trie, start);
307         if(block<0) {
308             return FALSE;
309         }
310 
311         nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;
312         if(nextStart<=limit) {
313             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,
314                             value, initialValue, overwrite);
315             start=nextStart;
316         } else {
317             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,
318                             value, initialValue, overwrite);
319             return TRUE;
320         }
321     }
322 
323     /* number of positions in the last, partial block */
324     rest=limit&UTRIE_MASK;
325 
326     /* round down limit to a block boundary */
327     limit&=~UTRIE_MASK;
328 
329     /* iterate over all-value blocks */
330     if(value==initialValue) {
331         repeatBlock=0;
332     } else {
333         repeatBlock=-1;
334     }
335     while(start<limit) {
336         /* get index value */
337         block=trie->index[start>>UTRIE_SHIFT];
338         if(block>0) {
339             /* already allocated, fill in value */
340             utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);
341         } else if(trie->data[-block]!=value && (block==0 || overwrite)) {
342             /* set the repeatBlock instead of the current block 0 or range block */
343             if(repeatBlock>=0) {
344                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
345             } else {
346                 /* create and set and fill the repeatBlock */
347                 repeatBlock=utrie_getDataBlock(trie, start);
348                 if(repeatBlock<0) {
349                     return FALSE;
350                 }
351 
352                 /* set the negative block number to indicate that it is a repeat block */
353                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
354                 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE);
355             }
356         }
357 
358         start+=UTRIE_DATA_BLOCK_LENGTH;
359     }
360 
361     if(rest>0) {
362         /* set partial block at [last block boundary..limit[ */
363         block=utrie_getDataBlock(trie, start);
364         if(block<0) {
365             return FALSE;
366         }
367 
368         utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);
369     }
370 
371     return TRUE;
372 }
373 
374 static int32_t
_findSameIndexBlock(const int32_t * idx,int32_t indexLength,int32_t otherBlock)375 _findSameIndexBlock(const int32_t *idx, int32_t indexLength,
376                     int32_t otherBlock) {
377     int32_t block, i;
378 
379     for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
380         for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
381             if(idx[block+i]!=idx[otherBlock+i]) {
382                 break;
383             }
384         }
385         if(i==UTRIE_SURROGATE_BLOCK_COUNT) {
386             return block;
387         }
388     }
389     return indexLength;
390 }
391 
392 /*
393  * Fold the normalization data for supplementary code points into
394  * a compact area on top of the BMP-part of the trie index,
395  * with the lead surrogates indexing this compact area.
396  *
397  * Duplicate the index values for lead surrogates:
398  * From inside the BMP area, where some may be overridden with folded values,
399  * to just after the BMP area, where they can be retrieved for
400  * code point lookups.
401  */
402 static void
utrie_fold(UNewTrie * trie,UNewTrieGetFoldedValue * getFoldedValue,UErrorCode * pErrorCode)403 utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
404     int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
405     int32_t *idx;
406     uint32_t value;
407     UChar32 c;
408     int32_t indexLength, block;
409 #ifdef UTRIE_DEBUG
410     int countLeadCUWithData=0;
411 #endif
412 
413     idx=trie->index;
414 
415     /* copy the lead surrogate indexes into a temporary array */
416     uprv_memcpy(leadIndexes, idx+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
417 
418     /*
419      * set all values for lead surrogate code *units* to leadUnitValue
420      * so that, by default, runtime lookups will find no data for associated
421      * supplementary code points, unless there is data for such code points
422      * which will result in a non-zero folding value below that is set for
423      * the respective lead units
424      *
425      * the above saved the indexes for surrogate code *points*
426      * fill the indexes with simplified code from utrie_setRange32()
427      */
428     if(trie->leadUnitValue==trie->data[0]) {
429         block=0; /* leadUnitValue==initialValue, use all-initial-value block */
430     } else {
431         /* create and fill the repeatBlock */
432         block=utrie_allocDataBlock(trie);
433         if(block<0) {
434             /* data table overflow */
435             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
436             return;
437         }
438         utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);
439         block=-block; /* negative block number to indicate that it is a repeat block */
440     }
441     for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
442         trie->index[c]=block;
443     }
444 
445     /*
446      * Fold significant index values into the area just after the BMP indexes.
447      * In case the first lead surrogate has significant data,
448      * its index block must be used first (in which case the folding is a no-op).
449      * Later all folded index blocks are moved up one to insert the copied
450      * lead surrogate indexes.
451      */
452     indexLength=UTRIE_BMP_INDEX_LENGTH;
453 
454     /* search for any index (stage 1) entries for supplementary code points */
455     for(c=0x10000; c<0x110000;) {
456         if(idx[c>>UTRIE_SHIFT]!=0) {
457             /* there is data, treat the full block for a lead surrogate */
458             c&=~0x3ff;
459 
460 #ifdef UTRIE_DEBUG
461             ++countLeadCUWithData;
462             /* printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10))); */
463 #endif
464 
465             /* is there an identical index block? */
466             block=_findSameIndexBlock(idx, indexLength, c>>UTRIE_SHIFT);
467 
468             /*
469              * get a folded value for [c..c+0x400[ and,
470              * if different from the value for the lead surrogate code point,
471              * set it for the lead surrogate code unit
472              */
473             value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
474             if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) {
475                 if(!utrie_set32(trie, U16_LEAD(c), value)) {
476                     /* data table overflow */
477                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
478                     return;
479                 }
480 
481                 /* if we did not find an identical index block... */
482                 if(block==indexLength) {
483                     /* move the actual index (stage 1) entries from the supplementary position to the new one */
484                     uprv_memmove(idx+indexLength,
485                                  idx+(c>>UTRIE_SHIFT),
486                                  4*UTRIE_SURROGATE_BLOCK_COUNT);
487                     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
488                 }
489             }
490             c+=0x400;
491         } else {
492             c+=UTRIE_DATA_BLOCK_LENGTH;
493         }
494     }
495 #ifdef UTRIE_DEBUG
496     if(countLeadCUWithData>0) {
497         printf("supplementary data for %d lead surrogates\n", countLeadCUWithData);
498     }
499 #endif
500 
501     /*
502      * index array overflow?
503      * This is to guarantee that a folding offset is of the form
504      * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
505      * If the index is too large, then n>=1024 and more than 10 bits are necessary.
506      *
507      * In fact, it can only ever become n==1024 with completely unfoldable data and
508      * the additional block of duplicated values for lead surrogates.
509      */
510     if(indexLength>=UTRIE_MAX_INDEX_LENGTH) {
511         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
512         return;
513     }
514 
515     /*
516      * make space for the lead surrogate index block and
517      * insert it between the BMP indexes and the folded ones
518      */
519     uprv_memmove(idx+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
520                  idx+UTRIE_BMP_INDEX_LENGTH,
521                  4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
522     uprv_memcpy(idx+UTRIE_BMP_INDEX_LENGTH,
523                 leadIndexes,
524                 4*UTRIE_SURROGATE_BLOCK_COUNT);
525     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
526 
527 #ifdef UTRIE_DEBUG
528     printf("trie index count: BMP %ld  all Unicode %ld  folded %ld\n",
529            UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);
530 #endif
531 
532     trie->indexLength=indexLength;
533 }
534 
535 /*
536  * Set a value in the trie index map to indicate which data block
537  * is referenced and which one is not.
538  * utrie_compact() will remove data blocks that are not used at all.
539  * Set
540  * - 0 if it is used
541  * - -1 if it is not used
542  */
543 static void
_findUnusedBlocks(UNewTrie * trie)544 _findUnusedBlocks(UNewTrie *trie) {
545     int32_t i;
546 
547     /* fill the entire map with "not used" */
548     uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);
549 
550     /* mark each block that _is_ used with 0 */
551     for(i=0; i<trie->indexLength; ++i) {
552         trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;
553     }
554 
555     /* never move the all-initial-value block 0 */
556     trie->map[0]=0;
557 }
558 
559 static int32_t
_findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t step)560 _findSameDataBlock(const uint32_t *data, int32_t dataLength,
561                    int32_t otherBlock, int32_t step) {
562     int32_t block;
563 
564     /* ensure that we do not even partially get past dataLength */
565     dataLength-=UTRIE_DATA_BLOCK_LENGTH;
566 
567     for(block=0; block<=dataLength; block+=step) {
568         if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) {
569             return block;
570         }
571     }
572     return -1;
573 }
574 
575 /*
576  * Compact a folded build-time trie.
577  *
578  * The compaction
579  * - removes blocks that are identical with earlier ones
580  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
581  * - moves blocks in steps of the data granularity
582  * - moves and overlaps blocks that overlap with multiple values in the overlap region
583  *
584  * It does not
585  * - try to move and overlap blocks that are not already adjacent
586  */
587 static void
utrie_compact(UNewTrie * trie,UBool overlap,UErrorCode * pErrorCode)588 utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
589     int32_t i, start, newStart, overlapStart;
590 
591     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
592         return;
593     }
594 
595     /* valid, uncompacted trie? */
596     if(trie==NULL) {
597         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
598         return;
599     }
600     if(trie->isCompacted) {
601         return; /* nothing left to do */
602     }
603 
604     /* compaction */
605 
606     /* initialize the index map with "block is used/unused" flags */
607     _findUnusedBlocks(trie);
608 
609     /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
610     if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
611         overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;
612     } else {
613         overlapStart=UTRIE_DATA_BLOCK_LENGTH;
614     }
615 
616     newStart=UTRIE_DATA_BLOCK_LENGTH;
617     for(start=newStart; start<trie->dataLength;) {
618         /*
619          * start: index of first entry of current block
620          * newStart: index where the current block is to be moved
621          *           (right after current end of already-compacted data)
622          */
623 
624         /* skip blocks that are not used */
625         if(trie->map[start>>UTRIE_SHIFT]<0) {
626             /* advance start to the next block */
627             start+=UTRIE_DATA_BLOCK_LENGTH;
628 
629             /* leave newStart with the previous block! */
630             continue;
631         }
632 
633         /* search for an identical block */
634         if( start>=overlapStart &&
635             (i=_findSameDataBlock(trie->data, newStart, start,
636                             overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
637              >=0
638         ) {
639             /* found an identical block, set the other block's index value for the current block */
640             trie->map[start>>UTRIE_SHIFT]=i;
641 
642             /* advance start to the next block */
643             start+=UTRIE_DATA_BLOCK_LENGTH;
644 
645             /* leave newStart with the previous block! */
646             continue;
647         }
648 
649         /* see if the beginning of this block can be overlapped with the end of the previous block */
650         if(overlap && start>=overlapStart) {
651             /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
652             for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;
653                 i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);
654                 i-=UTRIE_DATA_GRANULARITY) {}
655         } else {
656             i=0;
657         }
658 
659         if(i>0) {
660             /* some overlap */
661             trie->map[start>>UTRIE_SHIFT]=newStart-i;
662 
663             /* move the non-overlapping indexes to their new positions */
664             start+=i;
665             for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) {
666                 trie->data[newStart++]=trie->data[start++];
667             }
668         } else if(newStart<start) {
669             /* no overlap, just move the indexes to their new positions */
670             trie->map[start>>UTRIE_SHIFT]=newStart;
671             for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) {
672                 trie->data[newStart++]=trie->data[start++];
673             }
674         } else /* no overlap && newStart==start */ {
675             trie->map[start>>UTRIE_SHIFT]=start;
676             newStart+=UTRIE_DATA_BLOCK_LENGTH;
677             start=newStart;
678         }
679     }
680 
681     /* now adjust the index (stage 1) table */
682     for(i=0; i<trie->indexLength; ++i) {
683         trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];
684     }
685 
686 #ifdef UTRIE_DEBUG
687     /* we saved some space */
688     printf("compacting trie: count of 32-bit words %lu->%lu\n",
689             (long)trie->dataLength, (long)newStart);
690 #endif
691 
692     trie->dataLength=newStart;
693 }
694 
695 /* serialization ------------------------------------------------------------ */
696 
697 /*
698  * Default function for the folding value:
699  * Just store the offset (16 bits) if there is any non-initial-value entry.
700  *
701  * The offset parameter is never 0.
702  * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
703  * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
704  * which fits into 16-bit trie values;
705  * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
706  *
707  * Theoretically, it would be safer for all possible UTRIE_SHIFT including
708  * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
709  * which would always result in a value of 0x40..0x43f
710  * (start/end 1k blocks of supplementary Unicode code points).
711  * However, this would be uglier, and would not work for some existing
712  * binary data file formats.
713  *
714  * Also, we do not plan to change UTRIE_SHIFT because it would change binary
715  * data file formats, and we would probably not make it smaller because of
716  * the then even larger BMP index length even for empty tries.
717  */
718 static uint32_t U_CALLCONV
defaultGetFoldedValue(UNewTrie * trie,UChar32 start,int32_t offset)719 defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
720     uint32_t value, initialValue;
721     UChar32 limit;
722     UBool inBlockZero;
723 
724     initialValue=trie->data[0];
725     limit=start+0x400;
726     while(start<limit) {
727         value=utrie_get32(trie, start, &inBlockZero);
728         if(inBlockZero) {
729             start+=UTRIE_DATA_BLOCK_LENGTH;
730         } else if(value!=initialValue) {
731             return (uint32_t)offset;
732         } else {
733             ++start;
734         }
735     }
736     return 0;
737 }
738 
739 U_CAPI int32_t U_EXPORT2
utrie_serialize(UNewTrie * trie,void * dt,int32_t capacity,UNewTrieGetFoldedValue * getFoldedValue,UBool reduceTo16Bits,UErrorCode * pErrorCode)740 utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,
741                 UNewTrieGetFoldedValue *getFoldedValue,
742                 UBool reduceTo16Bits,
743                 UErrorCode *pErrorCode) {
744     UTrieHeader *header;
745     uint32_t *p;
746     uint16_t *dest16;
747     int32_t i, length;
748     uint8_t* data = NULL;
749 
750     /* argument check */
751     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
752         return 0;
753     }
754 
755     if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) {
756         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
757         return 0;
758     }
759     if(getFoldedValue==NULL) {
760         getFoldedValue=defaultGetFoldedValue;
761     }
762 
763     data = (uint8_t*)dt;
764     /* fold and compact if necessary, also checks that indexLength is within limits */
765     if(!trie->isCompacted) {
766         /* compact once without overlap to improve folding */
767         utrie_compact(trie, FALSE, pErrorCode);
768 
769         /* fold the supplementary part of the index array */
770         utrie_fold(trie, getFoldedValue, pErrorCode);
771 
772         /* compact again with overlap for minimum data array length */
773         utrie_compact(trie, TRUE, pErrorCode);
774 
775         trie->isCompacted=TRUE;
776         if(U_FAILURE(*pErrorCode)) {
777             return 0;
778         }
779     }
780 
781     /* is dataLength within limits? */
782     if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) {
783         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
784     }
785 
786     length=sizeof(UTrieHeader)+2*trie->indexLength;
787     if(reduceTo16Bits) {
788         length+=2*trie->dataLength;
789     } else {
790         length+=4*trie->dataLength;
791     }
792 
793     if(length>capacity) {
794         return length; /* preflighting */
795     }
796 
797 #ifdef UTRIE_DEBUG
798     printf("**UTrieLengths(serialize)** index:%6ld  data:%6ld  serialized:%6ld\n",
799            (long)trie->indexLength, (long)trie->dataLength, (long)length);
800 #endif
801 
802     /* set the header fields */
803     header=(UTrieHeader *)data;
804     data+=sizeof(UTrieHeader);
805 
806     header->signature=0x54726965; /* "Trie" */
807     header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);
808 
809     if(!reduceTo16Bits) {
810         header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;
811     }
812     if(trie->isLatin1Linear) {
813         header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;
814     }
815 
816     header->indexLength=trie->indexLength;
817     header->dataLength=trie->dataLength;
818 
819     /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
820     if(reduceTo16Bits) {
821         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
822         p=(uint32_t *)trie->index;
823         dest16=(uint16_t *)data;
824         for(i=trie->indexLength; i>0; --i) {
825             *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);
826         }
827 
828         /* write 16-bit data values */
829         p=trie->data;
830         for(i=trie->dataLength; i>0; --i) {
831             *dest16++=(uint16_t)*p++;
832         }
833     } else {
834         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
835         p=(uint32_t *)trie->index;
836         dest16=(uint16_t *)data;
837         for(i=trie->indexLength; i>0; --i) {
838             *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);
839         }
840 
841         /* write 32-bit data values */
842         uprv_memcpy(dest16, trie->data, 4*trie->dataLength);
843     }
844 
845     return length;
846 }
847 
848 /* inverse to defaultGetFoldedValue() */
849 U_CAPI int32_t U_EXPORT2
utrie_defaultGetFoldingOffset(uint32_t data)850 utrie_defaultGetFoldingOffset(uint32_t data) {
851     return (int32_t)data;
852 }
853 
854 U_CAPI int32_t U_EXPORT2
utrie_unserialize(UTrie * trie,const void * data,int32_t length,UErrorCode * pErrorCode)855 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
856     const UTrieHeader *header;
857     const uint16_t *p16;
858     uint32_t options;
859 
860     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
861         return -1;
862     }
863 
864     /* enough data for a trie header? */
865     if(length<(int32_t)sizeof(UTrieHeader)) {
866         *pErrorCode=U_INVALID_FORMAT_ERROR;
867         return -1;
868     }
869 
870     /* check the signature */
871     header=(const UTrieHeader *)data;
872     if(header->signature!=0x54726965) {
873         *pErrorCode=U_INVALID_FORMAT_ERROR;
874         return -1;
875     }
876 
877     /* get the options and check the shift values */
878     options=header->options;
879     if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
880         ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT
881     ) {
882         *pErrorCode=U_INVALID_FORMAT_ERROR;
883         return -1;
884     }
885     trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0);
886 
887     /* get the length values */
888     trie->indexLength=header->indexLength;
889     trie->dataLength=header->dataLength;
890 
891     length-=(int32_t)sizeof(UTrieHeader);
892 
893     /* enough data for the index? */
894     if(length<2*trie->indexLength) {
895         *pErrorCode=U_INVALID_FORMAT_ERROR;
896         return -1;
897     }
898     p16=(const uint16_t *)(header+1);
899     trie->index=p16;
900     p16+=trie->indexLength;
901     length-=2*trie->indexLength;
902 
903     /* get the data */
904     if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) {
905         if(length<4*trie->dataLength) {
906             *pErrorCode=U_INVALID_FORMAT_ERROR;
907             return -1;
908         }
909         trie->data32=(const uint32_t *)p16;
910         trie->initialValue=trie->data32[0];
911         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;
912     } else {
913         if(length<2*trie->dataLength) {
914             *pErrorCode=U_INVALID_FORMAT_ERROR;
915             return -1;
916         }
917 
918         /* the "data16" data is used via the index pointer */
919         trie->data32=NULL;
920         trie->initialValue=trie->index[trie->indexLength];
921         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
922     }
923 
924     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
925 
926     return length;
927 }
928 
929 U_CAPI int32_t U_EXPORT2
utrie_unserializeDummy(UTrie * trie,void * data,int32_t length,uint32_t initialValue,uint32_t leadUnitValue,UBool make16BitTrie,UErrorCode * pErrorCode)930 utrie_unserializeDummy(UTrie *trie,
931                        void *data, int32_t length,
932                        uint32_t initialValue, uint32_t leadUnitValue,
933                        UBool make16BitTrie,
934                        UErrorCode *pErrorCode) {
935     uint16_t *p16;
936     int32_t actualLength, latin1Length, i, limit;
937     uint16_t block;
938 
939     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
940         return -1;
941     }
942 
943     /* calculate the actual size of the dummy trie data */
944 
945     /* max(Latin-1, block 0) */
946     latin1Length= 256; /*UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;*/
947 
948     trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;
949     trie->dataLength=latin1Length;
950     if(leadUnitValue!=initialValue) {
951         trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;
952     }
953 
954     actualLength=trie->indexLength*2;
955     if(make16BitTrie) {
956         actualLength+=trie->dataLength*2;
957     } else {
958         actualLength+=trie->dataLength*4;
959     }
960 
961     /* enough space for the dummy trie? */
962     if(length<actualLength) {
963         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
964         return actualLength;
965     }
966 
967     trie->isLatin1Linear=TRUE;
968     trie->initialValue=initialValue;
969 
970     /* fill the index and data arrays */
971     p16=(uint16_t *)data;
972     trie->index=p16;
973 
974     if(make16BitTrie) {
975         /* indexes to block 0 */
976         block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);
977         limit=trie->indexLength;
978         for(i=0; i<limit; ++i) {
979             p16[i]=block;
980         }
981 
982         if(leadUnitValue!=initialValue) {
983             /* indexes for lead surrogate code units to the block after Latin-1 */
984             block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
985             i=0xd800>>UTRIE_SHIFT;
986             limit=0xdc00>>UTRIE_SHIFT;
987             for(; i<limit; ++i) {
988                 p16[i]=block;
989             }
990         }
991 
992         trie->data32=NULL;
993 
994         /* Latin-1 data */
995         p16+=trie->indexLength;
996         for(i=0; i<latin1Length; ++i) {
997             p16[i]=(uint16_t)initialValue;
998         }
999 
1000         /* data for lead surrogate code units */
1001         if(leadUnitValue!=initialValue) {
1002             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
1003             for(/* i=latin1Length */; i<limit; ++i) {
1004                 p16[i]=(uint16_t)leadUnitValue;
1005             }
1006         }
1007     } else {
1008         uint32_t *p32;
1009 
1010         /* indexes to block 0 */
1011         uprv_memset(p16, 0, trie->indexLength*2);
1012 
1013         if(leadUnitValue!=initialValue) {
1014             /* indexes for lead surrogate code units to the block after Latin-1 */
1015             block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
1016             i=0xd800>>UTRIE_SHIFT;
1017             limit=0xdc00>>UTRIE_SHIFT;
1018             for(; i<limit; ++i) {
1019                 p16[i]=block;
1020             }
1021         }
1022 
1023         trie->data32=p32=(uint32_t *)(p16+trie->indexLength);
1024 
1025         /* Latin-1 data */
1026         for(i=0; i<latin1Length; ++i) {
1027             p32[i]=initialValue;
1028         }
1029 
1030         /* data for lead surrogate code units */
1031         if(leadUnitValue!=initialValue) {
1032             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
1033             for(/* i=latin1Length */; i<limit; ++i) {
1034                 p32[i]=leadUnitValue;
1035             }
1036         }
1037     }
1038 
1039     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
1040 
1041     return actualLength;
1042 }
1043 
1044 /* enumeration -------------------------------------------------------------- */
1045 
1046 /* default UTrieEnumValue() returns the input value itself */
1047 static uint32_t U_CALLCONV
enumSameValue(const void *,uint32_t value)1048 enumSameValue(const void * /*context*/, uint32_t value) {
1049     return value;
1050 }
1051 
1052 /**
1053  * Enumerate all ranges of code points with the same relevant values.
1054  * The values are transformed from the raw trie entries by the enumValue function.
1055  */
1056 U_CAPI void U_EXPORT2
utrie_enum(const UTrie * trie,UTrieEnumValue * enumValue,UTrieEnumRange * enumRange,const void * context)1057 utrie_enum(const UTrie *trie,
1058            UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
1059     const uint32_t *data32;
1060     const uint16_t *idx;
1061 
1062     uint32_t value, prevValue, initialValue;
1063     UChar32 c, prev;
1064     int32_t l, i, j, block, prevBlock, nullBlock, offset;
1065 
1066     /* check arguments */
1067     if(trie==NULL || trie->index==NULL || enumRange==NULL) {
1068         return;
1069     }
1070     if(enumValue==NULL) {
1071         enumValue=enumSameValue;
1072     }
1073 
1074     idx=trie->index;
1075     data32=trie->data32;
1076 
1077     /* get the enumeration value that corresponds to an initial-value trie data entry */
1078     initialValue=enumValue(context, trie->initialValue);
1079 
1080     if(data32==NULL) {
1081         nullBlock=trie->indexLength;
1082     } else {
1083         nullBlock=0;
1084     }
1085 
1086     /* set variables for previous range */
1087     prevBlock=nullBlock;
1088     prev=0;
1089     prevValue=initialValue;
1090 
1091     /* enumerate BMP - the main loop enumerates data blocks */
1092     for(i=0, c=0; c<=0xffff; ++i) {
1093         if(c==0xd800) {
1094             /* skip lead surrogate code _units_, go to lead surr. code _points_ */
1095             i=UTRIE_BMP_INDEX_LENGTH;
1096         } else if(c==0xdc00) {
1097             /* go back to regular BMP code points */
1098             i=c>>UTRIE_SHIFT;
1099         }
1100 
1101         block=idx[i]<<UTRIE_INDEX_SHIFT;
1102         if(block==prevBlock) {
1103             /* the block is the same as the previous one, and filled with value */
1104             c+=UTRIE_DATA_BLOCK_LENGTH;
1105         } else if(block==nullBlock) {
1106             /* this is the all-initial-value block */
1107             if(prevValue!=initialValue) {
1108                 if(prev<c) {
1109                     if(!enumRange(context, prev, c, prevValue)) {
1110                         return;
1111                     }
1112                 }
1113                 prevBlock=nullBlock;
1114                 prev=c;
1115                 prevValue=initialValue;
1116             }
1117             c+=UTRIE_DATA_BLOCK_LENGTH;
1118         } else {
1119             prevBlock=block;
1120             for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
1121                 value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
1122                 if(value!=prevValue) {
1123                     if(prev<c) {
1124                         if(!enumRange(context, prev, c, prevValue)) {
1125                             return;
1126                         }
1127                     }
1128                     if(j>0) {
1129                         /* the block is not filled with all the same value */
1130                         prevBlock=-1;
1131                     }
1132                     prev=c;
1133                     prevValue=value;
1134                 }
1135                 ++c;
1136             }
1137         }
1138     }
1139 
1140     /* enumerate supplementary code points */
1141     for(l=0xd800; l<0xdc00;) {
1142         /* lead surrogate access */
1143         offset=idx[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
1144         if(offset==nullBlock) {
1145             /* no entries for a whole block of lead surrogates */
1146             if(prevValue!=initialValue) {
1147                 if(prev<c) {
1148                     if(!enumRange(context, prev, c, prevValue)) {
1149                         return;
1150                     }
1151                 }
1152                 prevBlock=nullBlock;
1153                 prev=c;
1154                 prevValue=initialValue;
1155             }
1156 
1157             l+=UTRIE_DATA_BLOCK_LENGTH;
1158             c+=UTRIE_DATA_BLOCK_LENGTH<<10;
1159             continue;
1160         }
1161 
1162         value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : idx[offset+(l&UTRIE_MASK)];
1163 
1164         /* enumerate trail surrogates for this lead surrogate */
1165         offset=trie->getFoldingOffset(value);
1166         if(offset<=0) {
1167             /* no data for this lead surrogate */
1168             if(prevValue!=initialValue) {
1169                 if(prev<c) {
1170                     if(!enumRange(context, prev, c, prevValue)) {
1171                         return;
1172                     }
1173                 }
1174                 prevBlock=nullBlock;
1175                 prev=c;
1176                 prevValue=initialValue;
1177             }
1178 
1179             /* nothing else to do for the supplementary code points for this lead surrogate */
1180             c+=0x400;
1181         } else {
1182             /* enumerate code points for this lead surrogate */
1183             i=offset;
1184             offset+=UTRIE_SURROGATE_BLOCK_COUNT;
1185             do {
1186                 /* copy of most of the body of the BMP loop */
1187                 block=idx[i]<<UTRIE_INDEX_SHIFT;
1188                 if(block==prevBlock) {
1189                     /* the block is the same as the previous one, and filled with value */
1190                     c+=UTRIE_DATA_BLOCK_LENGTH;
1191                 } else if(block==nullBlock) {
1192                     /* this is the all-initial-value block */
1193                     if(prevValue!=initialValue) {
1194                         if(prev<c) {
1195                             if(!enumRange(context, prev, c, prevValue)) {
1196                                 return;
1197                             }
1198                         }
1199                         prevBlock=nullBlock;
1200                         prev=c;
1201                         prevValue=initialValue;
1202                     }
1203                     c+=UTRIE_DATA_BLOCK_LENGTH;
1204                 } else {
1205                     prevBlock=block;
1206                     for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
1207                         value=enumValue(context, data32!=NULL ? data32[block+j] : idx[block+j]);
1208                         if(value!=prevValue) {
1209                             if(prev<c) {
1210                                 if(!enumRange(context, prev, c, prevValue)) {
1211                                     return;
1212                                 }
1213                             }
1214                             if(j>0) {
1215                                 /* the block is not filled with all the same value */
1216                                 prevBlock=-1;
1217                             }
1218                             prev=c;
1219                             prevValue=value;
1220                         }
1221                         ++c;
1222                     }
1223                 }
1224             } while(++i<offset);
1225         }
1226 
1227         ++l;
1228     }
1229 
1230     /* deliver last range */
1231     enumRange(context, prev, c, prevValue);
1232 }
1233