• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2001-2006, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  utrie.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2001oct20
14 *   created by: Markus W. Scherer
15 *
16 *   This is a common implementation of a "folded" trie.
17 *   It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 *   Unicode code points (0..0x10ffff).
19 */
20 
21 #ifdef UTRIE_DEBUG
22 #   include <stdio.h>
23 #endif
24 
25 #include "unicode/utypes.h"
26 #include "cmemory.h"
27 #include "utrie.h"
28 
29 /* miscellaneous ------------------------------------------------------------ */
30 
31 #undef ABS
32 #define ABS(x) ((x)>=0 ? (x) : -(x))
33 
34 static U_INLINE UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)35 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
36     while(length>0 && *s==*t) {
37         ++s;
38         ++t;
39         --length;
40     }
41     return (UBool)(length==0);
42 }
43 
44 /* Building a trie ----------------------------------------------------------*/
45 
46 U_CAPI UNewTrie * U_EXPORT2
utrie_open(UNewTrie * fillIn,uint32_t * aliasData,int32_t maxDataLength,uint32_t initialValue,uint32_t leadUnitValue,UBool latin1Linear)47 utrie_open(UNewTrie *fillIn,
48            uint32_t *aliasData, int32_t maxDataLength,
49            uint32_t initialValue, uint32_t leadUnitValue,
50            UBool latin1Linear) {
51     UNewTrie *trie;
52     int32_t i, j;
53 
54     if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||
55         (latin1Linear && maxDataLength<1024)
56     ) {
57         return NULL;
58     }
59 
60     if(fillIn!=NULL) {
61         trie=fillIn;
62     } else {
63         trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));
64         if(trie==NULL) {
65             return NULL;
66         }
67     }
68     uprv_memset(trie, 0, sizeof(UNewTrie));
69     trie->isAllocated= (UBool)(fillIn==NULL);
70 
71     if(aliasData!=NULL) {
72         trie->data=aliasData;
73         trie->isDataAllocated=FALSE;
74     } else {
75         trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);
76         if(trie->data==NULL) {
77             uprv_free(trie);
78             return NULL;
79         }
80         trie->isDataAllocated=TRUE;
81     }
82 
83     /* preallocate and reset the first data block (block index 0) */
84     j=UTRIE_DATA_BLOCK_LENGTH;
85 
86     if(latin1Linear) {
87         /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
88         /* made sure above that maxDataLength>=1024 */
89 
90         /* set indexes to point to consecutive data blocks */
91         i=0;
92         do {
93             /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
94             trie->index[i++]=j;
95             j+=UTRIE_DATA_BLOCK_LENGTH;
96         } while(i<(256>>UTRIE_SHIFT));
97     }
98 
99     /* reset the initially allocated blocks to the initial value */
100     trie->dataLength=j;
101     while(j>0) {
102         trie->data[--j]=initialValue;
103     }
104 
105     trie->leadUnitValue=leadUnitValue;
106     trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
107     trie->dataCapacity=maxDataLength;
108     trie->isLatin1Linear=latin1Linear;
109     trie->isCompacted=FALSE;
110     return trie;
111 }
112 
113 U_CAPI UNewTrie * U_EXPORT2
utrie_clone(UNewTrie * fillIn,const UNewTrie * other,uint32_t * aliasData,int32_t aliasDataCapacity)114 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) {
115     UNewTrie *trie;
116     UBool isDataAllocated;
117 
118     /* do not clone if other is not valid or already compacted */
119     if(other==NULL || other->data==NULL || other->isCompacted) {
120         return NULL;
121     }
122 
123     /* clone data */
124     if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) {
125         isDataAllocated=FALSE;
126     } else {
127         aliasDataCapacity=other->dataCapacity;
128         aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);
129         if(aliasData==NULL) {
130             return NULL;
131         }
132         isDataAllocated=TRUE;
133     }
134 
135     trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
136                     other->data[0], other->leadUnitValue,
137                     other->isLatin1Linear);
138     if(trie==NULL) {
139         uprv_free(aliasData);
140     } else {
141         uprv_memcpy(trie->index, other->index, sizeof(trie->index));
142         uprv_memcpy(trie->data, other->data, other->dataLength*4);
143         trie->dataLength=other->dataLength;
144         trie->isDataAllocated=isDataAllocated;
145     }
146 
147     return trie;
148 }
149 
150 U_CAPI void U_EXPORT2
utrie_close(UNewTrie * trie)151 utrie_close(UNewTrie *trie) {
152     if(trie!=NULL) {
153         if(trie->isDataAllocated) {
154             uprv_free(trie->data);
155             trie->data=NULL;
156         }
157         if(trie->isAllocated) {
158             uprv_free(trie);
159         }
160     }
161 }
162 
163 U_CAPI uint32_t * U_EXPORT2
utrie_getData(UNewTrie * trie,int32_t * pLength)164 utrie_getData(UNewTrie *trie, int32_t *pLength) {
165     if(trie==NULL || pLength==NULL) {
166         return NULL;
167     }
168 
169     *pLength=trie->dataLength;
170     return trie->data;
171 }
172 
173 static int32_t
utrie_allocDataBlock(UNewTrie * trie)174 utrie_allocDataBlock(UNewTrie *trie) {
175     int32_t newBlock, newTop;
176 
177     newBlock=trie->dataLength;
178     newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
179     if(newTop>trie->dataCapacity) {
180         /* out of memory in the data array */
181         return -1;
182     }
183     trie->dataLength=newTop;
184     return newBlock;
185 }
186 
187 /**
188  * No error checking for illegal arguments.
189  *
190  * @return -1 if no new data block available (out of memory in data array)
191  * @internal
192  */
193 static int32_t
utrie_getDataBlock(UNewTrie * trie,UChar32 c)194 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
195     int32_t indexValue, newBlock;
196 
197     c>>=UTRIE_SHIFT;
198     indexValue=trie->index[c];
199     if(indexValue>0) {
200         return indexValue;
201     }
202 
203     /* allocate a new data block */
204     newBlock=utrie_allocDataBlock(trie);
205     if(newBlock<0) {
206         /* out of memory in the data array */
207         return -1;
208     }
209     trie->index[c]=newBlock;
210 
211     /* copy-on-write for a block from a setRange() */
212     uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);
213     return newBlock;
214 }
215 
216 /**
217  * @return TRUE if the value was successfully set
218  */
219 U_CAPI UBool U_EXPORT2
utrie_set32(UNewTrie * trie,UChar32 c,uint32_t value)220 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) {
221     int32_t block;
222 
223     /* valid, uncompacted trie and valid c? */
224     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
225         return FALSE;
226     }
227 
228     block=utrie_getDataBlock(trie, c);
229     if(block<0) {
230         return FALSE;
231     }
232 
233     trie->data[block+(c&UTRIE_MASK)]=value;
234     return TRUE;
235 }
236 
237 U_CAPI uint32_t U_EXPORT2
utrie_get32(UNewTrie * trie,UChar32 c,UBool * pInBlockZero)238 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) {
239     int32_t block;
240 
241     /* valid, uncompacted trie and valid c? */
242     if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
243         if(pInBlockZero!=NULL) {
244             *pInBlockZero=TRUE;
245         }
246         return 0;
247     }
248 
249     block=trie->index[c>>UTRIE_SHIFT];
250     if(pInBlockZero!=NULL) {
251         *pInBlockZero= (UBool)(block==0);
252     }
253 
254     return trie->data[ABS(block)+(c&UTRIE_MASK)];
255 }
256 
257 /**
258  * @internal
259  */
260 static void
utrie_fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)261 utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
262                 uint32_t value, uint32_t initialValue, UBool overwrite) {
263     uint32_t *pLimit;
264 
265     pLimit=block+limit;
266     block+=start;
267     if(overwrite) {
268         while(block<pLimit) {
269             *block++=value;
270         }
271     } else {
272         while(block<pLimit) {
273             if(*block==initialValue) {
274                 *block=value;
275             }
276             ++block;
277         }
278     }
279 }
280 
281 U_CAPI UBool U_EXPORT2
utrie_setRange32(UNewTrie * trie,UChar32 start,UChar32 limit,uint32_t value,UBool overwrite)282 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) {
283     /*
284      * repeat value in [start..limit[
285      * mark index values for repeat-data blocks by setting bit 31 of the index values
286      * fill around existing values if any, if(overwrite)
287      */
288     uint32_t initialValue;
289     int32_t block, rest, repeatBlock;
290 
291     /* valid, uncompacted trie and valid indexes? */
292     if( trie==NULL || trie->isCompacted ||
293         (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit
294     ) {
295         return FALSE;
296     }
297     if(start==limit) {
298         return TRUE; /* nothing to do */
299     }
300 
301     initialValue=trie->data[0];
302     if(start&UTRIE_MASK) {
303         UChar32 nextStart;
304 
305         /* set partial block at [start..following block boundary[ */
306         block=utrie_getDataBlock(trie, start);
307         if(block<0) {
308             return FALSE;
309         }
310 
311         nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;
312         if(nextStart<=limit) {
313             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,
314                             value, initialValue, overwrite);
315             start=nextStart;
316         } else {
317             utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,
318                             value, initialValue, overwrite);
319             return TRUE;
320         }
321     }
322 
323     /* number of positions in the last, partial block */
324     rest=limit&UTRIE_MASK;
325 
326     /* round down limit to a block boundary */
327     limit&=~UTRIE_MASK;
328 
329     /* iterate over all-value blocks */
330     if(value==initialValue) {
331         repeatBlock=0;
332     } else {
333         repeatBlock=-1;
334     }
335     while(start<limit) {
336         /* get index value */
337         block=trie->index[start>>UTRIE_SHIFT];
338         if(block>0) {
339             /* already allocated, fill in value */
340             utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);
341         } else if(trie->data[-block]!=value && (block==0 || overwrite)) {
342             /* set the repeatBlock instead of the current block 0 or range block */
343             if(repeatBlock>=0) {
344                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
345             } else {
346                 /* create and set and fill the repeatBlock */
347                 repeatBlock=utrie_getDataBlock(trie, start);
348                 if(repeatBlock<0) {
349                     return FALSE;
350                 }
351 
352                 /* set the negative block number to indicate that it is a repeat block */
353                 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
354                 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE);
355             }
356         }
357 
358         start+=UTRIE_DATA_BLOCK_LENGTH;
359     }
360 
361     if(rest>0) {
362         /* set partial block at [last block boundary..limit[ */
363         block=utrie_getDataBlock(trie, start);
364         if(block<0) {
365             return FALSE;
366         }
367 
368         utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);
369     }
370 
371     return TRUE;
372 }
373 
374 static int32_t
_findSameIndexBlock(const int32_t * index,int32_t indexLength,int32_t otherBlock)375 _findSameIndexBlock(const int32_t *index, int32_t indexLength,
376                     int32_t otherBlock) {
377     int32_t block, i;
378 
379     for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
380         for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
381             if(index[block+i]!=index[otherBlock+i]) {
382                 break;
383             }
384         }
385         if(i==UTRIE_SURROGATE_BLOCK_COUNT) {
386             return block;
387         }
388     }
389     return indexLength;
390 }
391 
392 /*
393  * Fold the normalization data for supplementary code points into
394  * a compact area on top of the BMP-part of the trie index,
395  * with the lead surrogates indexing this compact area.
396  *
397  * Duplicate the index values for lead surrogates:
398  * From inside the BMP area, where some may be overridden with folded values,
399  * to just after the BMP area, where they can be retrieved for
400  * code point lookups.
401  */
402 static void
utrie_fold(UNewTrie * trie,UNewTrieGetFoldedValue * getFoldedValue,UErrorCode * pErrorCode)403 utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
404     int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
405     int32_t *index;
406     uint32_t value;
407     UChar32 c;
408     int32_t indexLength, block;
409 
410     index=trie->index;
411 
412     /* copy the lead surrogate indexes into a temporary array */
413     uprv_memcpy(leadIndexes, index+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
414 
415     /*
416      * set all values for lead surrogate code *units* to leadUnitValue
417      * so that, by default, runtime lookups will find no data for associated
418      * supplementary code points, unless there is data for such code points
419      * which will result in a non-zero folding value below that is set for
420      * the respective lead units
421      *
422      * the above saved the indexes for surrogate code *points*
423      * fill the indexes with simplified code from utrie_setRange32()
424      */
425     if(trie->leadUnitValue==trie->data[0]) {
426         block=0; /* leadUnitValue==initialValue, use all-initial-value block */
427     } else {
428         /* create and fill the repeatBlock */
429         block=utrie_allocDataBlock(trie);
430         if(block<0) {
431             /* data table overflow */
432             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
433             return;
434         }
435         utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);
436         block=-block; /* negative block number to indicate that it is a repeat block */
437     }
438     for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
439         trie->index[c]=block;
440     }
441 
442     /*
443      * Fold significant index values into the area just after the BMP indexes.
444      * In case the first lead surrogate has significant data,
445      * its index block must be used first (in which case the folding is a no-op).
446      * Later all folded index blocks are moved up one to insert the copied
447      * lead surrogate indexes.
448      */
449     indexLength=UTRIE_BMP_INDEX_LENGTH;
450 
451     /* search for any index (stage 1) entries for supplementary code points */
452     for(c=0x10000; c<0x110000;) {
453         if(index[c>>UTRIE_SHIFT]!=0) {
454             /* there is data, treat the full block for a lead surrogate */
455             c&=~0x3ff;
456 
457 #ifdef UTRIE_DEBUG
458             printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10)));
459 #endif
460 
461             /* is there an identical index block? */
462             block=_findSameIndexBlock(index, indexLength, c>>UTRIE_SHIFT);
463 
464             /*
465              * get a folded value for [c..c+0x400[ and,
466              * if different from the value for the lead surrogate code point,
467              * set it for the lead surrogate code unit
468              */
469             value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
470             if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) {
471                 if(!utrie_set32(trie, U16_LEAD(c), value)) {
472                     /* data table overflow */
473                     *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
474                     return;
475                 }
476 
477                 /* if we did not find an identical index block... */
478                 if(block==indexLength) {
479                     /* move the actual index (stage 1) entries from the supplementary position to the new one */
480                     uprv_memmove(index+indexLength,
481                                  index+(c>>UTRIE_SHIFT),
482                                  4*UTRIE_SURROGATE_BLOCK_COUNT);
483                     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
484                 }
485             }
486             c+=0x400;
487         } else {
488             c+=UTRIE_DATA_BLOCK_LENGTH;
489         }
490     }
491 
492     /*
493      * index array overflow?
494      * This is to guarantee that a folding offset is of the form
495      * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
496      * If the index is too large, then n>=1024 and more than 10 bits are necessary.
497      *
498      * In fact, it can only ever become n==1024 with completely unfoldable data and
499      * the additional block of duplicated values for lead surrogates.
500      */
501     if(indexLength>=UTRIE_MAX_INDEX_LENGTH) {
502         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
503         return;
504     }
505 
506     /*
507      * make space for the lead surrogate index block and
508      * insert it between the BMP indexes and the folded ones
509      */
510     uprv_memmove(index+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
511                  index+UTRIE_BMP_INDEX_LENGTH,
512                  4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
513     uprv_memcpy(index+UTRIE_BMP_INDEX_LENGTH,
514                 leadIndexes,
515                 4*UTRIE_SURROGATE_BLOCK_COUNT);
516     indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
517 
518 #ifdef UTRIE_DEBUG
519     printf("trie index count: BMP %ld  all Unicode %ld  folded %ld\n",
520            UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);
521 #endif
522 
523     trie->indexLength=indexLength;
524 }
525 
526 /*
527  * Set a value in the trie index map to indicate which data block
528  * is referenced and which one is not.
529  * utrie_compact() will remove data blocks that are not used at all.
530  * Set
531  * - 0 if it is used
532  * - -1 if it is not used
533  */
534 static void
_findUnusedBlocks(UNewTrie * trie)535 _findUnusedBlocks(UNewTrie *trie) {
536     int32_t i;
537 
538     /* fill the entire map with "not used" */
539     uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);
540 
541     /* mark each block that _is_ used with 0 */
542     for(i=0; i<trie->indexLength; ++i) {
543         trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;
544     }
545 
546     /* never move the all-initial-value block 0 */
547     trie->map[0]=0;
548 }
549 
550 static int32_t
_findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t step)551 _findSameDataBlock(const uint32_t *data, int32_t dataLength,
552                    int32_t otherBlock, int32_t step) {
553     int32_t block;
554 
555     /* ensure that we do not even partially get past dataLength */
556     dataLength-=UTRIE_DATA_BLOCK_LENGTH;
557 
558     for(block=0; block<=dataLength; block+=step) {
559         if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) {
560             return block;
561         }
562     }
563     return -1;
564 }
565 
566 /*
567  * Compact a folded build-time trie.
568  *
569  * The compaction
570  * - removes blocks that are identical with earlier ones
571  * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
572  * - moves blocks in steps of the data granularity
573  * - moves and overlaps blocks that overlap with multiple values in the overlap region
574  *
575  * It does not
576  * - try to move and overlap blocks that are not already adjacent
577  */
578 static void
utrie_compact(UNewTrie * trie,UBool overlap,UErrorCode * pErrorCode)579 utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
580     int32_t i, start, newStart, overlapStart;
581 
582     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
583         return;
584     }
585 
586     /* valid, uncompacted trie? */
587     if(trie==NULL) {
588         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
589         return;
590     }
591     if(trie->isCompacted) {
592         return; /* nothing left to do */
593     }
594 
595     /* compaction */
596 
597     /* initialize the index map with "block is used/unused" flags */
598     _findUnusedBlocks(trie);
599 
600     /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
601     if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
602         overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;
603     } else {
604         overlapStart=UTRIE_DATA_BLOCK_LENGTH;
605     }
606 
607     newStart=UTRIE_DATA_BLOCK_LENGTH;
608     for(start=newStart; start<trie->dataLength;) {
609         /*
610          * start: index of first entry of current block
611          * newStart: index where the current block is to be moved
612          *           (right after current end of already-compacted data)
613          */
614 
615         /* skip blocks that are not used */
616         if(trie->map[start>>UTRIE_SHIFT]<0) {
617             /* advance start to the next block */
618             start+=UTRIE_DATA_BLOCK_LENGTH;
619 
620             /* leave newStart with the previous block! */
621             continue;
622         }
623 
624         /* search for an identical block */
625         if( start>=overlapStart &&
626             (i=_findSameDataBlock(trie->data, newStart, start,
627                             overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
628              >=0
629         ) {
630             /* found an identical block, set the other block's index value for the current block */
631             trie->map[start>>UTRIE_SHIFT]=i;
632 
633             /* advance start to the next block */
634             start+=UTRIE_DATA_BLOCK_LENGTH;
635 
636             /* leave newStart with the previous block! */
637             continue;
638         }
639 
640         /* see if the beginning of this block can be overlapped with the end of the previous block */
641         if(overlap && start>=overlapStart) {
642             /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
643             for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;
644                 i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);
645                 i-=UTRIE_DATA_GRANULARITY) {}
646         } else {
647             i=0;
648         }
649 
650         if(i>0) {
651             /* some overlap */
652             trie->map[start>>UTRIE_SHIFT]=newStart-i;
653 
654             /* move the non-overlapping indexes to their new positions */
655             start+=i;
656             for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) {
657                 trie->data[newStart++]=trie->data[start++];
658             }
659         } else if(newStart<start) {
660             /* no overlap, just move the indexes to their new positions */
661             trie->map[start>>UTRIE_SHIFT]=newStart;
662             for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) {
663                 trie->data[newStart++]=trie->data[start++];
664             }
665         } else /* no overlap && newStart==start */ {
666             trie->map[start>>UTRIE_SHIFT]=start;
667             newStart+=UTRIE_DATA_BLOCK_LENGTH;
668             start=newStart;
669         }
670     }
671 
672     /* now adjust the index (stage 1) table */
673     for(i=0; i<trie->indexLength; ++i) {
674         trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];
675     }
676 
677 #ifdef UTRIE_DEBUG
678     /* we saved some space */
679     printf("compacting trie: count of 32-bit words %lu->%lu\n",
680             (long)trie->dataLength, (long)newStart);
681 #endif
682 
683     trie->dataLength=newStart;
684 }
685 
686 /* serialization ------------------------------------------------------------ */
687 
688 /*
689  * Default function for the folding value:
690  * Just store the offset (16 bits) if there is any non-initial-value entry.
691  *
692  * The offset parameter is never 0.
693  * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
694  * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
695  * which fits into 16-bit trie values;
696  * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
697  *
698  * Theoretically, it would be safer for all possible UTRIE_SHIFT including
699  * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
700  * which would always result in a value of 0x40..0x43f
701  * (start/end 1k blocks of supplementary Unicode code points).
702  * However, this would be uglier, and would not work for some existing
703  * binary data file formats.
704  *
705  * Also, we do not plan to change UTRIE_SHIFT because it would change binary
706  * data file formats, and we would probably not make it smaller because of
707  * the then even larger BMP index length even for empty tries.
708  */
709 static uint32_t U_CALLCONV
defaultGetFoldedValue(UNewTrie * trie,UChar32 start,int32_t offset)710 defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
711     uint32_t value, initialValue;
712     UChar32 limit;
713     UBool inBlockZero;
714 
715     initialValue=trie->data[0];
716     limit=start+0x400;
717     while(start<limit) {
718         value=utrie_get32(trie, start, &inBlockZero);
719         if(inBlockZero) {
720             start+=UTRIE_DATA_BLOCK_LENGTH;
721         } else if(value!=initialValue) {
722             return (uint32_t)offset;
723         } else {
724             ++start;
725         }
726     }
727     return 0;
728 }
729 
730 U_CAPI int32_t U_EXPORT2
utrie_serialize(UNewTrie * trie,void * dt,int32_t capacity,UNewTrieGetFoldedValue * getFoldedValue,UBool reduceTo16Bits,UErrorCode * pErrorCode)731 utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,
732                 UNewTrieGetFoldedValue *getFoldedValue,
733                 UBool reduceTo16Bits,
734                 UErrorCode *pErrorCode) {
735     UTrieHeader *header;
736     uint32_t *p;
737     uint16_t *dest16;
738     int32_t i, length;
739     uint8_t* data = NULL;
740 
741     /* argument check */
742     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
743         return 0;
744     }
745 
746     if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) {
747         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
748         return 0;
749     }
750     if(getFoldedValue==NULL) {
751         getFoldedValue=defaultGetFoldedValue;
752     }
753 
754     data = (uint8_t*)dt;
755     /* fold and compact if necessary, also checks that indexLength is within limits */
756     if(!trie->isCompacted) {
757         /* compact once without overlap to improve folding */
758         utrie_compact(trie, FALSE, pErrorCode);
759 
760         /* fold the supplementary part of the index array */
761         utrie_fold(trie, getFoldedValue, pErrorCode);
762 
763         /* compact again with overlap for minimum data array length */
764         utrie_compact(trie, TRUE, pErrorCode);
765 
766         trie->isCompacted=TRUE;
767         if(U_FAILURE(*pErrorCode)) {
768             return 0;
769         }
770     }
771 
772     /* is dataLength within limits? */
773     if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) {
774         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
775     }
776 
777     length=sizeof(UTrieHeader)+2*trie->indexLength;
778     if(reduceTo16Bits) {
779         length+=2*trie->dataLength;
780     } else {
781         length+=4*trie->dataLength;
782     }
783 
784     if(length>capacity) {
785         return length; /* preflighting */
786     }
787 
788     /* set the header fields */
789     header=(UTrieHeader *)data;
790     data+=sizeof(UTrieHeader);
791 
792     header->signature=0x54726965; /* "Trie" */
793     header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);
794 
795     if(!reduceTo16Bits) {
796         header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;
797     }
798     if(trie->isLatin1Linear) {
799         header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;
800     }
801 
802     header->indexLength=trie->indexLength;
803     header->dataLength=trie->dataLength;
804 
805     /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
806     if(reduceTo16Bits) {
807         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
808         p=(uint32_t *)trie->index;
809         dest16=(uint16_t *)data;
810         for(i=trie->indexLength; i>0; --i) {
811             *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);
812         }
813 
814         /* write 16-bit data values */
815         p=trie->data;
816         for(i=trie->dataLength; i>0; --i) {
817             *dest16++=(uint16_t)*p++;
818         }
819     } else {
820         /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
821         p=(uint32_t *)trie->index;
822         dest16=(uint16_t *)data;
823         for(i=trie->indexLength; i>0; --i) {
824             *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);
825         }
826 
827         /* write 32-bit data values */
828         uprv_memcpy(dest16, trie->data, 4*trie->dataLength);
829     }
830 
831     return length;
832 }
833 
834 /* inverse to defaultGetFoldedValue() */
835 U_CAPI int32_t U_EXPORT2
utrie_defaultGetFoldingOffset(uint32_t data)836 utrie_defaultGetFoldingOffset(uint32_t data) {
837     return (int32_t)data;
838 }
839 
840 U_CAPI int32_t U_EXPORT2
utrie_unserialize(UTrie * trie,const void * data,int32_t length,UErrorCode * pErrorCode)841 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
842     const UTrieHeader *header;
843     const uint16_t *p16;
844     uint32_t options;
845 
846     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
847         return -1;
848     }
849 
850     /* enough data for a trie header? */
851     if(length<sizeof(UTrieHeader)) {
852         *pErrorCode=U_INVALID_FORMAT_ERROR;
853         return -1;
854     }
855 
856     /* check the signature */
857     header=(const UTrieHeader *)data;
858     if(header->signature!=0x54726965) {
859         *pErrorCode=U_INVALID_FORMAT_ERROR;
860         return -1;
861     }
862 
863     /* get the options and check the shift values */
864     options=header->options;
865     if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
866         ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT
867     ) {
868         *pErrorCode=U_INVALID_FORMAT_ERROR;
869         return -1;
870     }
871     trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0);
872 
873     /* get the length values */
874     trie->indexLength=header->indexLength;
875     trie->dataLength=header->dataLength;
876 
877     length-=(int32_t)sizeof(UTrieHeader);
878 
879     /* enough data for the index? */
880     if(length<2*trie->indexLength) {
881         *pErrorCode=U_INVALID_FORMAT_ERROR;
882         return -1;
883     }
884     p16=(const uint16_t *)(header+1);
885     trie->index=p16;
886     p16+=trie->indexLength;
887     length-=2*trie->indexLength;
888 
889     /* get the data */
890     if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) {
891         if(length<4*trie->dataLength) {
892             *pErrorCode=U_INVALID_FORMAT_ERROR;
893             return -1;
894         }
895         trie->data32=(const uint32_t *)p16;
896         trie->initialValue=trie->data32[0];
897         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;
898     } else {
899         if(length<2*trie->dataLength) {
900             *pErrorCode=U_INVALID_FORMAT_ERROR;
901             return -1;
902         }
903 
904         /* the "data16" data is used via the index pointer */
905         trie->data32=NULL;
906         trie->initialValue=trie->index[trie->indexLength];
907         length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
908     }
909 
910     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
911 
912     return length;
913 }
914 
915 U_CAPI int32_t U_EXPORT2
utrie_unserializeDummy(UTrie * trie,void * data,int32_t length,uint32_t initialValue,uint32_t leadUnitValue,UBool make16BitTrie,UErrorCode * pErrorCode)916 utrie_unserializeDummy(UTrie *trie,
917                        void *data, int32_t length,
918                        uint32_t initialValue, uint32_t leadUnitValue,
919                        UBool make16BitTrie,
920                        UErrorCode *pErrorCode) {
921     uint16_t *p16;
922     int32_t actualLength, latin1Length, i, limit;
923     uint16_t block;
924 
925     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
926         return -1;
927     }
928 
929     /* calculate the actual size of the dummy trie data */
930 
931     /* max(Latin-1, block 0) */
932     latin1Length= UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;
933 
934     trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;
935     trie->dataLength=latin1Length;
936     if(leadUnitValue!=initialValue) {
937         trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;
938     }
939 
940     actualLength=trie->indexLength*2;
941     if(make16BitTrie) {
942         actualLength+=trie->dataLength*2;
943     } else {
944         actualLength+=trie->dataLength*4;
945     }
946 
947     /* enough space for the dummy trie? */
948     if(length<actualLength) {
949         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
950         return actualLength;
951     }
952 
953     trie->isLatin1Linear=TRUE;
954     trie->initialValue=initialValue;
955 
956     /* fill the index and data arrays */
957     p16=(uint16_t *)data;
958     trie->index=p16;
959 
960     if(make16BitTrie) {
961         /* indexes to block 0 */
962         block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);
963         limit=trie->indexLength;
964         for(i=0; i<limit; ++i) {
965             p16[i]=block;
966         }
967 
968         if(leadUnitValue!=initialValue) {
969             /* indexes for lead surrogate code units to the block after Latin-1 */
970             block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
971             i=0xd800>>UTRIE_SHIFT;
972             limit=0xdc00>>UTRIE_SHIFT;
973             for(; i<limit; ++i) {
974                 p16[i]=block;
975             }
976         }
977 
978         trie->data32=NULL;
979 
980         /* Latin-1 data */
981         p16+=trie->indexLength;
982         for(i=0; i<latin1Length; ++i) {
983             p16[i]=(uint16_t)initialValue;
984         }
985 
986         /* data for lead surrogate code units */
987         if(leadUnitValue!=initialValue) {
988             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
989             for(/* i=latin1Length */; i<limit; ++i) {
990                 p16[i]=(uint16_t)leadUnitValue;
991             }
992         }
993     } else {
994         uint32_t *p32;
995 
996         /* indexes to block 0 */
997         uprv_memset(p16, 0, trie->indexLength*2);
998 
999         if(leadUnitValue!=initialValue) {
1000             /* indexes for lead surrogate code units to the block after Latin-1 */
1001             block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
1002             i=0xd800>>UTRIE_SHIFT;
1003             limit=0xdc00>>UTRIE_SHIFT;
1004             for(; i<limit; ++i) {
1005                 p16[i]=block;
1006             }
1007         }
1008 
1009         trie->data32=p32=(uint32_t *)(p16+trie->indexLength);
1010 
1011         /* Latin-1 data */
1012         for(i=0; i<latin1Length; ++i) {
1013             p32[i]=initialValue;
1014         }
1015 
1016         /* data for lead surrogate code units */
1017         if(leadUnitValue!=initialValue) {
1018             limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
1019             for(/* i=latin1Length */; i<limit; ++i) {
1020                 p32[i]=leadUnitValue;
1021             }
1022         }
1023     }
1024 
1025     trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
1026 
1027     return actualLength;
1028 }
1029 
1030 /* enumeration -------------------------------------------------------------- */
1031 
1032 /* default UTrieEnumValue() returns the input value itself */
1033 static uint32_t U_CALLCONV
enumSameValue(const void * context,uint32_t value)1034 enumSameValue(const void *context, uint32_t value) {
1035     return value;
1036 }
1037 
1038 /**
1039  * Enumerate all ranges of code points with the same relevant values.
1040  * The values are transformed from the raw trie entries by the enumValue function.
1041  */
1042 U_CAPI void U_EXPORT2
utrie_enum(const UTrie * trie,UTrieEnumValue * enumValue,UTrieEnumRange * enumRange,const void * context)1043 utrie_enum(const UTrie *trie,
1044            UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
1045     const uint32_t *data32;
1046     const uint16_t *index;
1047 
1048     uint32_t value, prevValue, initialValue;
1049     UChar32 c, prev;
1050     int32_t l, i, j, block, prevBlock, offset;
1051 
1052     /* check arguments */
1053     if(trie==NULL || trie->index==NULL || enumRange==NULL) {
1054         return;
1055     }
1056     if(enumValue==NULL) {
1057         enumValue=enumSameValue;
1058     }
1059 
1060     index=trie->index;
1061     data32=trie->data32;
1062 
1063     /* get the enumeration value that corresponds to an initial-value trie data entry */
1064     initialValue=enumValue(context, trie->initialValue);
1065 
1066     /* set variables for previous range */
1067     prevBlock=0;
1068     prev=0;
1069     prevValue=initialValue;
1070 
1071     /* enumerate BMP - the main loop enumerates data blocks */
1072     for(i=0, c=0; c<=0xffff; ++i) {
1073         if(c==0xd800) {
1074             /* skip lead surrogate code _units_, go to lead surr. code _points_ */
1075             i=UTRIE_BMP_INDEX_LENGTH;
1076         } else if(c==0xdc00) {
1077             /* go back to regular BMP code points */
1078             i=c>>UTRIE_SHIFT;
1079         }
1080 
1081         block=index[i]<<UTRIE_INDEX_SHIFT;
1082         if(block==prevBlock) {
1083             /* the block is the same as the previous one, and filled with value */
1084             c+=UTRIE_DATA_BLOCK_LENGTH;
1085         } else if(block==0) {
1086             /* this is the all-initial-value block */
1087             if(prevValue!=initialValue) {
1088                 if(prev<c) {
1089                     if(!enumRange(context, prev, c, prevValue)) {
1090                         return;
1091                     }
1092                 }
1093                 prevBlock=0;
1094                 prev=c;
1095                 prevValue=initialValue;
1096             }
1097             c+=UTRIE_DATA_BLOCK_LENGTH;
1098         } else {
1099             prevBlock=block;
1100             for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
1101                 value=enumValue(context, data32!=NULL ? data32[block+j] : index[block+j]);
1102                 if(value!=prevValue) {
1103                     if(prev<c) {
1104                         if(!enumRange(context, prev, c, prevValue)) {
1105                             return;
1106                         }
1107                     }
1108                     if(j>0) {
1109                         prevBlock=-1;
1110                     }
1111                     prev=c;
1112                     prevValue=value;
1113                 }
1114                 ++c;
1115             }
1116         }
1117     }
1118 
1119     /* enumerate supplementary code points */
1120     for(l=0xd800; l<0xdc00;) {
1121         /* lead surrogate access */
1122         offset=index[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
1123         if(offset==(data32!=NULL ? 0 : trie->indexLength)) {
1124             /* no entries for a whole block of lead surrogates */
1125             if(prevValue!=initialValue) {
1126                 if(prev<c) {
1127                     if(!enumRange(context, prev, c, prevValue)) {
1128                         return;
1129                     }
1130                 }
1131                 prevBlock=0;
1132                 prev=c;
1133                 prevValue=initialValue;
1134             }
1135 
1136             l+=UTRIE_DATA_BLOCK_LENGTH;
1137             c+=UTRIE_DATA_BLOCK_LENGTH<<10;
1138             continue;
1139         }
1140 
1141         value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : index[offset+(l&UTRIE_MASK)];
1142 
1143         /* enumerate trail surrogates for this lead surrogate */
1144         offset=trie->getFoldingOffset(value);
1145         if(offset<=0) {
1146             /* no data for this lead surrogate */
1147             if(prevValue!=initialValue) {
1148                 if(prev<c) {
1149                     if(!enumRange(context, prev, c, prevValue)) {
1150                         return;
1151                     }
1152                 }
1153                 prevBlock=0;
1154                 prev=c;
1155                 prevValue=initialValue;
1156             }
1157 
1158             /* nothing else to do for the supplementary code points for this lead surrogate */
1159             c+=0x400;
1160         } else {
1161             /* enumerate code points for this lead surrogate */
1162             i=offset;
1163             offset+=UTRIE_SURROGATE_BLOCK_COUNT;
1164             do {
1165                 /* copy of most of the body of the BMP loop */
1166                 block=index[i]<<UTRIE_INDEX_SHIFT;
1167                 if(block==prevBlock) {
1168                     /* the block is the same as the previous one, and filled with value */
1169                     c+=UTRIE_DATA_BLOCK_LENGTH;
1170                 } else if(block==0) {
1171                     /* this is the all-initial-value block */
1172                     if(prevValue!=initialValue) {
1173                         if(prev<c) {
1174                             if(!enumRange(context, prev, c, prevValue)) {
1175                                 return;
1176                             }
1177                         }
1178                         prevBlock=0;
1179                         prev=c;
1180                         prevValue=initialValue;
1181                     }
1182                     c+=UTRIE_DATA_BLOCK_LENGTH;
1183                 } else {
1184                     prevBlock=block;
1185                     for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
1186                         value=enumValue(context, data32!=NULL ? data32[block+j] : index[block+j]);
1187                         if(value!=prevValue) {
1188                             if(prev<c) {
1189                                 if(!enumRange(context, prev, c, prevValue)) {
1190                                     return;
1191                                 }
1192                             }
1193                             if(j>0) {
1194                                 prevBlock=-1;
1195                             }
1196                             prev=c;
1197                             prevValue=value;
1198                         }
1199                         ++c;
1200                     }
1201                 }
1202             } while(++i<offset);
1203         }
1204 
1205         ++l;
1206     }
1207 
1208     /* deliver last range */
1209     enumRange(context, prev, c, prevValue);
1210 }
1211