1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2001-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: utrie.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2001oct20
14 * created by: Markus W. Scherer
15 *
16 * This is a common implementation of a "folded" trie.
17 * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
18 * Unicode code points (0..0x10ffff).
19 */
20
21 #ifdef UTRIE_DEBUG
22 # include <stdio.h>
23 #endif
24
25 #include "unicode/utypes.h"
26 #include "cmemory.h"
27 #include "utrie.h"
28
29 /* miscellaneous ------------------------------------------------------------ */
30
31 #undef ABS
32 #define ABS(x) ((x)>=0 ? (x) : -(x))
33
34 static U_INLINE UBool
equal_uint32(const uint32_t * s,const uint32_t * t,int32_t length)35 equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
36 while(length>0 && *s==*t) {
37 ++s;
38 ++t;
39 --length;
40 }
41 return (UBool)(length==0);
42 }
43
44 /* Building a trie ----------------------------------------------------------*/
45
46 U_CAPI UNewTrie * U_EXPORT2
utrie_open(UNewTrie * fillIn,uint32_t * aliasData,int32_t maxDataLength,uint32_t initialValue,uint32_t leadUnitValue,UBool latin1Linear)47 utrie_open(UNewTrie *fillIn,
48 uint32_t *aliasData, int32_t maxDataLength,
49 uint32_t initialValue, uint32_t leadUnitValue,
50 UBool latin1Linear) {
51 UNewTrie *trie;
52 int32_t i, j;
53
54 if( maxDataLength<UTRIE_DATA_BLOCK_LENGTH ||
55 (latin1Linear && maxDataLength<1024)
56 ) {
57 return NULL;
58 }
59
60 if(fillIn!=NULL) {
61 trie=fillIn;
62 } else {
63 trie=(UNewTrie *)uprv_malloc(sizeof(UNewTrie));
64 if(trie==NULL) {
65 return NULL;
66 }
67 }
68 uprv_memset(trie, 0, sizeof(UNewTrie));
69 trie->isAllocated= (UBool)(fillIn==NULL);
70
71 if(aliasData!=NULL) {
72 trie->data=aliasData;
73 trie->isDataAllocated=FALSE;
74 } else {
75 trie->data=(uint32_t *)uprv_malloc(maxDataLength*4);
76 if(trie->data==NULL) {
77 uprv_free(trie);
78 return NULL;
79 }
80 trie->isDataAllocated=TRUE;
81 }
82
83 /* preallocate and reset the first data block (block index 0) */
84 j=UTRIE_DATA_BLOCK_LENGTH;
85
86 if(latin1Linear) {
87 /* preallocate and reset the first block (number 0) and Latin-1 (U+0000..U+00ff) after that */
88 /* made sure above that maxDataLength>=1024 */
89
90 /* set indexes to point to consecutive data blocks */
91 i=0;
92 do {
93 /* do this at least for trie->index[0] even if that block is only partly used for Latin-1 */
94 trie->index[i++]=j;
95 j+=UTRIE_DATA_BLOCK_LENGTH;
96 } while(i<(256>>UTRIE_SHIFT));
97 }
98
99 /* reset the initially allocated blocks to the initial value */
100 trie->dataLength=j;
101 while(j>0) {
102 trie->data[--j]=initialValue;
103 }
104
105 trie->leadUnitValue=leadUnitValue;
106 trie->indexLength=UTRIE_MAX_INDEX_LENGTH;
107 trie->dataCapacity=maxDataLength;
108 trie->isLatin1Linear=latin1Linear;
109 trie->isCompacted=FALSE;
110 return trie;
111 }
112
113 U_CAPI UNewTrie * U_EXPORT2
utrie_clone(UNewTrie * fillIn,const UNewTrie * other,uint32_t * aliasData,int32_t aliasDataCapacity)114 utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataCapacity) {
115 UNewTrie *trie;
116 UBool isDataAllocated;
117
118 /* do not clone if other is not valid or already compacted */
119 if(other==NULL || other->data==NULL || other->isCompacted) {
120 return NULL;
121 }
122
123 /* clone data */
124 if(aliasData!=NULL && aliasDataCapacity>=other->dataCapacity) {
125 isDataAllocated=FALSE;
126 } else {
127 aliasDataCapacity=other->dataCapacity;
128 aliasData=(uint32_t *)uprv_malloc(other->dataCapacity*4);
129 if(aliasData==NULL) {
130 return NULL;
131 }
132 isDataAllocated=TRUE;
133 }
134
135 trie=utrie_open(fillIn, aliasData, aliasDataCapacity,
136 other->data[0], other->leadUnitValue,
137 other->isLatin1Linear);
138 if(trie==NULL) {
139 uprv_free(aliasData);
140 } else {
141 uprv_memcpy(trie->index, other->index, sizeof(trie->index));
142 uprv_memcpy(trie->data, other->data, other->dataLength*4);
143 trie->dataLength=other->dataLength;
144 trie->isDataAllocated=isDataAllocated;
145 }
146
147 return trie;
148 }
149
150 U_CAPI void U_EXPORT2
utrie_close(UNewTrie * trie)151 utrie_close(UNewTrie *trie) {
152 if(trie!=NULL) {
153 if(trie->isDataAllocated) {
154 uprv_free(trie->data);
155 trie->data=NULL;
156 }
157 if(trie->isAllocated) {
158 uprv_free(trie);
159 }
160 }
161 }
162
163 U_CAPI uint32_t * U_EXPORT2
utrie_getData(UNewTrie * trie,int32_t * pLength)164 utrie_getData(UNewTrie *trie, int32_t *pLength) {
165 if(trie==NULL || pLength==NULL) {
166 return NULL;
167 }
168
169 *pLength=trie->dataLength;
170 return trie->data;
171 }
172
173 static int32_t
utrie_allocDataBlock(UNewTrie * trie)174 utrie_allocDataBlock(UNewTrie *trie) {
175 int32_t newBlock, newTop;
176
177 newBlock=trie->dataLength;
178 newTop=newBlock+UTRIE_DATA_BLOCK_LENGTH;
179 if(newTop>trie->dataCapacity) {
180 /* out of memory in the data array */
181 return -1;
182 }
183 trie->dataLength=newTop;
184 return newBlock;
185 }
186
187 /**
188 * No error checking for illegal arguments.
189 *
190 * @return -1 if no new data block available (out of memory in data array)
191 * @internal
192 */
193 static int32_t
utrie_getDataBlock(UNewTrie * trie,UChar32 c)194 utrie_getDataBlock(UNewTrie *trie, UChar32 c) {
195 int32_t indexValue, newBlock;
196
197 c>>=UTRIE_SHIFT;
198 indexValue=trie->index[c];
199 if(indexValue>0) {
200 return indexValue;
201 }
202
203 /* allocate a new data block */
204 newBlock=utrie_allocDataBlock(trie);
205 if(newBlock<0) {
206 /* out of memory in the data array */
207 return -1;
208 }
209 trie->index[c]=newBlock;
210
211 /* copy-on-write for a block from a setRange() */
212 uprv_memcpy(trie->data+newBlock, trie->data-indexValue, 4*UTRIE_DATA_BLOCK_LENGTH);
213 return newBlock;
214 }
215
216 /**
217 * @return TRUE if the value was successfully set
218 */
219 U_CAPI UBool U_EXPORT2
utrie_set32(UNewTrie * trie,UChar32 c,uint32_t value)220 utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value) {
221 int32_t block;
222
223 /* valid, uncompacted trie and valid c? */
224 if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
225 return FALSE;
226 }
227
228 block=utrie_getDataBlock(trie, c);
229 if(block<0) {
230 return FALSE;
231 }
232
233 trie->data[block+(c&UTRIE_MASK)]=value;
234 return TRUE;
235 }
236
237 U_CAPI uint32_t U_EXPORT2
utrie_get32(UNewTrie * trie,UChar32 c,UBool * pInBlockZero)238 utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero) {
239 int32_t block;
240
241 /* valid, uncompacted trie and valid c? */
242 if(trie==NULL || trie->isCompacted || (uint32_t)c>0x10ffff) {
243 if(pInBlockZero!=NULL) {
244 *pInBlockZero=TRUE;
245 }
246 return 0;
247 }
248
249 block=trie->index[c>>UTRIE_SHIFT];
250 if(pInBlockZero!=NULL) {
251 *pInBlockZero= (UBool)(block==0);
252 }
253
254 return trie->data[ABS(block)+(c&UTRIE_MASK)];
255 }
256
257 /**
258 * @internal
259 */
260 static void
utrie_fillBlock(uint32_t * block,UChar32 start,UChar32 limit,uint32_t value,uint32_t initialValue,UBool overwrite)261 utrie_fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
262 uint32_t value, uint32_t initialValue, UBool overwrite) {
263 uint32_t *pLimit;
264
265 pLimit=block+limit;
266 block+=start;
267 if(overwrite) {
268 while(block<pLimit) {
269 *block++=value;
270 }
271 } else {
272 while(block<pLimit) {
273 if(*block==initialValue) {
274 *block=value;
275 }
276 ++block;
277 }
278 }
279 }
280
281 U_CAPI UBool U_EXPORT2
utrie_setRange32(UNewTrie * trie,UChar32 start,UChar32 limit,uint32_t value,UBool overwrite)282 utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite) {
283 /*
284 * repeat value in [start..limit[
285 * mark index values for repeat-data blocks by setting bit 31 of the index values
286 * fill around existing values if any, if(overwrite)
287 */
288 uint32_t initialValue;
289 int32_t block, rest, repeatBlock;
290
291 /* valid, uncompacted trie and valid indexes? */
292 if( trie==NULL || trie->isCompacted ||
293 (uint32_t)start>0x10ffff || (uint32_t)limit>0x110000 || start>limit
294 ) {
295 return FALSE;
296 }
297 if(start==limit) {
298 return TRUE; /* nothing to do */
299 }
300
301 initialValue=trie->data[0];
302 if(start&UTRIE_MASK) {
303 UChar32 nextStart;
304
305 /* set partial block at [start..following block boundary[ */
306 block=utrie_getDataBlock(trie, start);
307 if(block<0) {
308 return FALSE;
309 }
310
311 nextStart=(start+UTRIE_DATA_BLOCK_LENGTH)&~UTRIE_MASK;
312 if(nextStart<=limit) {
313 utrie_fillBlock(trie->data+block, start&UTRIE_MASK, UTRIE_DATA_BLOCK_LENGTH,
314 value, initialValue, overwrite);
315 start=nextStart;
316 } else {
317 utrie_fillBlock(trie->data+block, start&UTRIE_MASK, limit&UTRIE_MASK,
318 value, initialValue, overwrite);
319 return TRUE;
320 }
321 }
322
323 /* number of positions in the last, partial block */
324 rest=limit&UTRIE_MASK;
325
326 /* round down limit to a block boundary */
327 limit&=~UTRIE_MASK;
328
329 /* iterate over all-value blocks */
330 if(value==initialValue) {
331 repeatBlock=0;
332 } else {
333 repeatBlock=-1;
334 }
335 while(start<limit) {
336 /* get index value */
337 block=trie->index[start>>UTRIE_SHIFT];
338 if(block>0) {
339 /* already allocated, fill in value */
340 utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, overwrite);
341 } else if(trie->data[-block]!=value && (block==0 || overwrite)) {
342 /* set the repeatBlock instead of the current block 0 or range block */
343 if(repeatBlock>=0) {
344 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
345 } else {
346 /* create and set and fill the repeatBlock */
347 repeatBlock=utrie_getDataBlock(trie, start);
348 if(repeatBlock<0) {
349 return FALSE;
350 }
351
352 /* set the negative block number to indicate that it is a repeat block */
353 trie->index[start>>UTRIE_SHIFT]=-repeatBlock;
354 utrie_fillBlock(trie->data+repeatBlock, 0, UTRIE_DATA_BLOCK_LENGTH, value, initialValue, TRUE);
355 }
356 }
357
358 start+=UTRIE_DATA_BLOCK_LENGTH;
359 }
360
361 if(rest>0) {
362 /* set partial block at [last block boundary..limit[ */
363 block=utrie_getDataBlock(trie, start);
364 if(block<0) {
365 return FALSE;
366 }
367
368 utrie_fillBlock(trie->data+block, 0, rest, value, initialValue, overwrite);
369 }
370
371 return TRUE;
372 }
373
374 static int32_t
_findSameIndexBlock(const int32_t * index,int32_t indexLength,int32_t otherBlock)375 _findSameIndexBlock(const int32_t *index, int32_t indexLength,
376 int32_t otherBlock) {
377 int32_t block, i;
378
379 for(block=UTRIE_BMP_INDEX_LENGTH; block<indexLength; block+=UTRIE_SURROGATE_BLOCK_COUNT) {
380 for(i=0; i<UTRIE_SURROGATE_BLOCK_COUNT; ++i) {
381 if(index[block+i]!=index[otherBlock+i]) {
382 break;
383 }
384 }
385 if(i==UTRIE_SURROGATE_BLOCK_COUNT) {
386 return block;
387 }
388 }
389 return indexLength;
390 }
391
392 /*
393 * Fold the normalization data for supplementary code points into
394 * a compact area on top of the BMP-part of the trie index,
395 * with the lead surrogates indexing this compact area.
396 *
397 * Duplicate the index values for lead surrogates:
398 * From inside the BMP area, where some may be overridden with folded values,
399 * to just after the BMP area, where they can be retrieved for
400 * code point lookups.
401 */
402 static void
utrie_fold(UNewTrie * trie,UNewTrieGetFoldedValue * getFoldedValue,UErrorCode * pErrorCode)403 utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *pErrorCode) {
404 int32_t leadIndexes[UTRIE_SURROGATE_BLOCK_COUNT];
405 int32_t *index;
406 uint32_t value;
407 UChar32 c;
408 int32_t indexLength, block;
409
410 index=trie->index;
411
412 /* copy the lead surrogate indexes into a temporary array */
413 uprv_memcpy(leadIndexes, index+(0xd800>>UTRIE_SHIFT), 4*UTRIE_SURROGATE_BLOCK_COUNT);
414
415 /*
416 * set all values for lead surrogate code *units* to leadUnitValue
417 * so that, by default, runtime lookups will find no data for associated
418 * supplementary code points, unless there is data for such code points
419 * which will result in a non-zero folding value below that is set for
420 * the respective lead units
421 *
422 * the above saved the indexes for surrogate code *points*
423 * fill the indexes with simplified code from utrie_setRange32()
424 */
425 if(trie->leadUnitValue==trie->data[0]) {
426 block=0; /* leadUnitValue==initialValue, use all-initial-value block */
427 } else {
428 /* create and fill the repeatBlock */
429 block=utrie_allocDataBlock(trie);
430 if(block<0) {
431 /* data table overflow */
432 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
433 return;
434 }
435 utrie_fillBlock(trie->data+block, 0, UTRIE_DATA_BLOCK_LENGTH, trie->leadUnitValue, trie->data[0], TRUE);
436 block=-block; /* negative block number to indicate that it is a repeat block */
437 }
438 for(c=(0xd800>>UTRIE_SHIFT); c<(0xdc00>>UTRIE_SHIFT); ++c) {
439 trie->index[c]=block;
440 }
441
442 /*
443 * Fold significant index values into the area just after the BMP indexes.
444 * In case the first lead surrogate has significant data,
445 * its index block must be used first (in which case the folding is a no-op).
446 * Later all folded index blocks are moved up one to insert the copied
447 * lead surrogate indexes.
448 */
449 indexLength=UTRIE_BMP_INDEX_LENGTH;
450
451 /* search for any index (stage 1) entries for supplementary code points */
452 for(c=0x10000; c<0x110000;) {
453 if(index[c>>UTRIE_SHIFT]!=0) {
454 /* there is data, treat the full block for a lead surrogate */
455 c&=~0x3ff;
456
457 #ifdef UTRIE_DEBUG
458 printf("supplementary data for lead surrogate U+%04lx\n", (long)(0xd7c0+(c>>10)));
459 #endif
460
461 /* is there an identical index block? */
462 block=_findSameIndexBlock(index, indexLength, c>>UTRIE_SHIFT);
463
464 /*
465 * get a folded value for [c..c+0x400[ and,
466 * if different from the value for the lead surrogate code point,
467 * set it for the lead surrogate code unit
468 */
469 value=getFoldedValue(trie, c, block+UTRIE_SURROGATE_BLOCK_COUNT);
470 if(value!=utrie_get32(trie, U16_LEAD(c), NULL)) {
471 if(!utrie_set32(trie, U16_LEAD(c), value)) {
472 /* data table overflow */
473 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
474 return;
475 }
476
477 /* if we did not find an identical index block... */
478 if(block==indexLength) {
479 /* move the actual index (stage 1) entries from the supplementary position to the new one */
480 uprv_memmove(index+indexLength,
481 index+(c>>UTRIE_SHIFT),
482 4*UTRIE_SURROGATE_BLOCK_COUNT);
483 indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
484 }
485 }
486 c+=0x400;
487 } else {
488 c+=UTRIE_DATA_BLOCK_LENGTH;
489 }
490 }
491
492 /*
493 * index array overflow?
494 * This is to guarantee that a folding offset is of the form
495 * UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
496 * If the index is too large, then n>=1024 and more than 10 bits are necessary.
497 *
498 * In fact, it can only ever become n==1024 with completely unfoldable data and
499 * the additional block of duplicated values for lead surrogates.
500 */
501 if(indexLength>=UTRIE_MAX_INDEX_LENGTH) {
502 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
503 return;
504 }
505
506 /*
507 * make space for the lead surrogate index block and
508 * insert it between the BMP indexes and the folded ones
509 */
510 uprv_memmove(index+UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT,
511 index+UTRIE_BMP_INDEX_LENGTH,
512 4*(indexLength-UTRIE_BMP_INDEX_LENGTH));
513 uprv_memcpy(index+UTRIE_BMP_INDEX_LENGTH,
514 leadIndexes,
515 4*UTRIE_SURROGATE_BLOCK_COUNT);
516 indexLength+=UTRIE_SURROGATE_BLOCK_COUNT;
517
518 #ifdef UTRIE_DEBUG
519 printf("trie index count: BMP %ld all Unicode %ld folded %ld\n",
520 UTRIE_BMP_INDEX_LENGTH, (long)UTRIE_MAX_INDEX_LENGTH, indexLength);
521 #endif
522
523 trie->indexLength=indexLength;
524 }
525
526 /*
527 * Set a value in the trie index map to indicate which data block
528 * is referenced and which one is not.
529 * utrie_compact() will remove data blocks that are not used at all.
530 * Set
531 * - 0 if it is used
532 * - -1 if it is not used
533 */
534 static void
_findUnusedBlocks(UNewTrie * trie)535 _findUnusedBlocks(UNewTrie *trie) {
536 int32_t i;
537
538 /* fill the entire map with "not used" */
539 uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);
540
541 /* mark each block that _is_ used with 0 */
542 for(i=0; i<trie->indexLength; ++i) {
543 trie->map[ABS(trie->index[i])>>UTRIE_SHIFT]=0;
544 }
545
546 /* never move the all-initial-value block 0 */
547 trie->map[0]=0;
548 }
549
550 static int32_t
_findSameDataBlock(const uint32_t * data,int32_t dataLength,int32_t otherBlock,int32_t step)551 _findSameDataBlock(const uint32_t *data, int32_t dataLength,
552 int32_t otherBlock, int32_t step) {
553 int32_t block;
554
555 /* ensure that we do not even partially get past dataLength */
556 dataLength-=UTRIE_DATA_BLOCK_LENGTH;
557
558 for(block=0; block<=dataLength; block+=step) {
559 if(equal_uint32(data+block, data+otherBlock, UTRIE_DATA_BLOCK_LENGTH)) {
560 return block;
561 }
562 }
563 return -1;
564 }
565
566 /*
567 * Compact a folded build-time trie.
568 *
569 * The compaction
570 * - removes blocks that are identical with earlier ones
571 * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
572 * - moves blocks in steps of the data granularity
573 * - moves and overlaps blocks that overlap with multiple values in the overlap region
574 *
575 * It does not
576 * - try to move and overlap blocks that are not already adjacent
577 */
578 static void
utrie_compact(UNewTrie * trie,UBool overlap,UErrorCode * pErrorCode)579 utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
580 int32_t i, start, newStart, overlapStart;
581
582 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
583 return;
584 }
585
586 /* valid, uncompacted trie? */
587 if(trie==NULL) {
588 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
589 return;
590 }
591 if(trie->isCompacted) {
592 return; /* nothing left to do */
593 }
594
595 /* compaction */
596
597 /* initialize the index map with "block is used/unused" flags */
598 _findUnusedBlocks(trie);
599
600 /* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
601 if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
602 overlapStart=UTRIE_DATA_BLOCK_LENGTH+256;
603 } else {
604 overlapStart=UTRIE_DATA_BLOCK_LENGTH;
605 }
606
607 newStart=UTRIE_DATA_BLOCK_LENGTH;
608 for(start=newStart; start<trie->dataLength;) {
609 /*
610 * start: index of first entry of current block
611 * newStart: index where the current block is to be moved
612 * (right after current end of already-compacted data)
613 */
614
615 /* skip blocks that are not used */
616 if(trie->map[start>>UTRIE_SHIFT]<0) {
617 /* advance start to the next block */
618 start+=UTRIE_DATA_BLOCK_LENGTH;
619
620 /* leave newStart with the previous block! */
621 continue;
622 }
623
624 /* search for an identical block */
625 if( start>=overlapStart &&
626 (i=_findSameDataBlock(trie->data, newStart, start,
627 overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
628 >=0
629 ) {
630 /* found an identical block, set the other block's index value for the current block */
631 trie->map[start>>UTRIE_SHIFT]=i;
632
633 /* advance start to the next block */
634 start+=UTRIE_DATA_BLOCK_LENGTH;
635
636 /* leave newStart with the previous block! */
637 continue;
638 }
639
640 /* see if the beginning of this block can be overlapped with the end of the previous block */
641 if(overlap && start>=overlapStart) {
642 /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
643 for(i=UTRIE_DATA_BLOCK_LENGTH-UTRIE_DATA_GRANULARITY;
644 i>0 && !equal_uint32(trie->data+(newStart-i), trie->data+start, i);
645 i-=UTRIE_DATA_GRANULARITY) {}
646 } else {
647 i=0;
648 }
649
650 if(i>0) {
651 /* some overlap */
652 trie->map[start>>UTRIE_SHIFT]=newStart-i;
653
654 /* move the non-overlapping indexes to their new positions */
655 start+=i;
656 for(i=UTRIE_DATA_BLOCK_LENGTH-i; i>0; --i) {
657 trie->data[newStart++]=trie->data[start++];
658 }
659 } else if(newStart<start) {
660 /* no overlap, just move the indexes to their new positions */
661 trie->map[start>>UTRIE_SHIFT]=newStart;
662 for(i=UTRIE_DATA_BLOCK_LENGTH; i>0; --i) {
663 trie->data[newStart++]=trie->data[start++];
664 }
665 } else /* no overlap && newStart==start */ {
666 trie->map[start>>UTRIE_SHIFT]=start;
667 newStart+=UTRIE_DATA_BLOCK_LENGTH;
668 start=newStart;
669 }
670 }
671
672 /* now adjust the index (stage 1) table */
673 for(i=0; i<trie->indexLength; ++i) {
674 trie->index[i]=trie->map[ABS(trie->index[i])>>UTRIE_SHIFT];
675 }
676
677 #ifdef UTRIE_DEBUG
678 /* we saved some space */
679 printf("compacting trie: count of 32-bit words %lu->%lu\n",
680 (long)trie->dataLength, (long)newStart);
681 #endif
682
683 trie->dataLength=newStart;
684 }
685
686 /* serialization ------------------------------------------------------------ */
687
688 /*
689 * Default function for the folding value:
690 * Just store the offset (16 bits) if there is any non-initial-value entry.
691 *
692 * The offset parameter is never 0.
693 * Returning the offset itself is safe for UTRIE_SHIFT>=5 because
694 * for UTRIE_SHIFT==5 the maximum index length is UTRIE_MAX_INDEX_LENGTH==0x8800
695 * which fits into 16-bit trie values;
696 * for higher UTRIE_SHIFT, UTRIE_MAX_INDEX_LENGTH decreases.
697 *
698 * Theoretically, it would be safer for all possible UTRIE_SHIFT including
699 * those of 4 and lower to return offset>>UTRIE_SURROGATE_BLOCK_BITS
700 * which would always result in a value of 0x40..0x43f
701 * (start/end 1k blocks of supplementary Unicode code points).
702 * However, this would be uglier, and would not work for some existing
703 * binary data file formats.
704 *
705 * Also, we do not plan to change UTRIE_SHIFT because it would change binary
706 * data file formats, and we would probably not make it smaller because of
707 * the then even larger BMP index length even for empty tries.
708 */
709 static uint32_t U_CALLCONV
defaultGetFoldedValue(UNewTrie * trie,UChar32 start,int32_t offset)710 defaultGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
711 uint32_t value, initialValue;
712 UChar32 limit;
713 UBool inBlockZero;
714
715 initialValue=trie->data[0];
716 limit=start+0x400;
717 while(start<limit) {
718 value=utrie_get32(trie, start, &inBlockZero);
719 if(inBlockZero) {
720 start+=UTRIE_DATA_BLOCK_LENGTH;
721 } else if(value!=initialValue) {
722 return (uint32_t)offset;
723 } else {
724 ++start;
725 }
726 }
727 return 0;
728 }
729
730 U_CAPI int32_t U_EXPORT2
utrie_serialize(UNewTrie * trie,void * dt,int32_t capacity,UNewTrieGetFoldedValue * getFoldedValue,UBool reduceTo16Bits,UErrorCode * pErrorCode)731 utrie_serialize(UNewTrie *trie, void *dt, int32_t capacity,
732 UNewTrieGetFoldedValue *getFoldedValue,
733 UBool reduceTo16Bits,
734 UErrorCode *pErrorCode) {
735 UTrieHeader *header;
736 uint32_t *p;
737 uint16_t *dest16;
738 int32_t i, length;
739 uint8_t* data = NULL;
740
741 /* argument check */
742 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
743 return 0;
744 }
745
746 if(trie==NULL || capacity<0 || (capacity>0 && dt==NULL)) {
747 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
748 return 0;
749 }
750 if(getFoldedValue==NULL) {
751 getFoldedValue=defaultGetFoldedValue;
752 }
753
754 data = (uint8_t*)dt;
755 /* fold and compact if necessary, also checks that indexLength is within limits */
756 if(!trie->isCompacted) {
757 /* compact once without overlap to improve folding */
758 utrie_compact(trie, FALSE, pErrorCode);
759
760 /* fold the supplementary part of the index array */
761 utrie_fold(trie, getFoldedValue, pErrorCode);
762
763 /* compact again with overlap for minimum data array length */
764 utrie_compact(trie, TRUE, pErrorCode);
765
766 trie->isCompacted=TRUE;
767 if(U_FAILURE(*pErrorCode)) {
768 return 0;
769 }
770 }
771
772 /* is dataLength within limits? */
773 if( (reduceTo16Bits ? (trie->dataLength+trie->indexLength) : trie->dataLength) >= UTRIE_MAX_DATA_LENGTH) {
774 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
775 }
776
777 length=sizeof(UTrieHeader)+2*trie->indexLength;
778 if(reduceTo16Bits) {
779 length+=2*trie->dataLength;
780 } else {
781 length+=4*trie->dataLength;
782 }
783
784 if(length>capacity) {
785 return length; /* preflighting */
786 }
787
788 /* set the header fields */
789 header=(UTrieHeader *)data;
790 data+=sizeof(UTrieHeader);
791
792 header->signature=0x54726965; /* "Trie" */
793 header->options=UTRIE_SHIFT | (UTRIE_INDEX_SHIFT<<UTRIE_OPTIONS_INDEX_SHIFT);
794
795 if(!reduceTo16Bits) {
796 header->options|=UTRIE_OPTIONS_DATA_IS_32_BIT;
797 }
798 if(trie->isLatin1Linear) {
799 header->options|=UTRIE_OPTIONS_LATIN1_IS_LINEAR;
800 }
801
802 header->indexLength=trie->indexLength;
803 header->dataLength=trie->dataLength;
804
805 /* write the index (stage 1) array and the 16/32-bit data (stage 2) array */
806 if(reduceTo16Bits) {
807 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT, after adding indexLength */
808 p=(uint32_t *)trie->index;
809 dest16=(uint16_t *)data;
810 for(i=trie->indexLength; i>0; --i) {
811 *dest16++=(uint16_t)((*p++ + trie->indexLength)>>UTRIE_INDEX_SHIFT);
812 }
813
814 /* write 16-bit data values */
815 p=trie->data;
816 for(i=trie->dataLength; i>0; --i) {
817 *dest16++=(uint16_t)*p++;
818 }
819 } else {
820 /* write 16-bit index values shifted right by UTRIE_INDEX_SHIFT */
821 p=(uint32_t *)trie->index;
822 dest16=(uint16_t *)data;
823 for(i=trie->indexLength; i>0; --i) {
824 *dest16++=(uint16_t)(*p++ >> UTRIE_INDEX_SHIFT);
825 }
826
827 /* write 32-bit data values */
828 uprv_memcpy(dest16, trie->data, 4*trie->dataLength);
829 }
830
831 return length;
832 }
833
834 /* inverse to defaultGetFoldedValue() */
835 U_CAPI int32_t U_EXPORT2
utrie_defaultGetFoldingOffset(uint32_t data)836 utrie_defaultGetFoldingOffset(uint32_t data) {
837 return (int32_t)data;
838 }
839
840 U_CAPI int32_t U_EXPORT2
utrie_unserialize(UTrie * trie,const void * data,int32_t length,UErrorCode * pErrorCode)841 utrie_unserialize(UTrie *trie, const void *data, int32_t length, UErrorCode *pErrorCode) {
842 const UTrieHeader *header;
843 const uint16_t *p16;
844 uint32_t options;
845
846 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
847 return -1;
848 }
849
850 /* enough data for a trie header? */
851 if(length<sizeof(UTrieHeader)) {
852 *pErrorCode=U_INVALID_FORMAT_ERROR;
853 return -1;
854 }
855
856 /* check the signature */
857 header=(const UTrieHeader *)data;
858 if(header->signature!=0x54726965) {
859 *pErrorCode=U_INVALID_FORMAT_ERROR;
860 return -1;
861 }
862
863 /* get the options and check the shift values */
864 options=header->options;
865 if( (options&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_SHIFT ||
866 ((options>>UTRIE_OPTIONS_INDEX_SHIFT)&UTRIE_OPTIONS_SHIFT_MASK)!=UTRIE_INDEX_SHIFT
867 ) {
868 *pErrorCode=U_INVALID_FORMAT_ERROR;
869 return -1;
870 }
871 trie->isLatin1Linear= (UBool)((options&UTRIE_OPTIONS_LATIN1_IS_LINEAR)!=0);
872
873 /* get the length values */
874 trie->indexLength=header->indexLength;
875 trie->dataLength=header->dataLength;
876
877 length-=(int32_t)sizeof(UTrieHeader);
878
879 /* enough data for the index? */
880 if(length<2*trie->indexLength) {
881 *pErrorCode=U_INVALID_FORMAT_ERROR;
882 return -1;
883 }
884 p16=(const uint16_t *)(header+1);
885 trie->index=p16;
886 p16+=trie->indexLength;
887 length-=2*trie->indexLength;
888
889 /* get the data */
890 if(options&UTRIE_OPTIONS_DATA_IS_32_BIT) {
891 if(length<4*trie->dataLength) {
892 *pErrorCode=U_INVALID_FORMAT_ERROR;
893 return -1;
894 }
895 trie->data32=(const uint32_t *)p16;
896 trie->initialValue=trie->data32[0];
897 length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+4*trie->dataLength;
898 } else {
899 if(length<2*trie->dataLength) {
900 *pErrorCode=U_INVALID_FORMAT_ERROR;
901 return -1;
902 }
903
904 /* the "data16" data is used via the index pointer */
905 trie->data32=NULL;
906 trie->initialValue=trie->index[trie->indexLength];
907 length=(int32_t)sizeof(UTrieHeader)+2*trie->indexLength+2*trie->dataLength;
908 }
909
910 trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
911
912 return length;
913 }
914
915 U_CAPI int32_t U_EXPORT2
utrie_unserializeDummy(UTrie * trie,void * data,int32_t length,uint32_t initialValue,uint32_t leadUnitValue,UBool make16BitTrie,UErrorCode * pErrorCode)916 utrie_unserializeDummy(UTrie *trie,
917 void *data, int32_t length,
918 uint32_t initialValue, uint32_t leadUnitValue,
919 UBool make16BitTrie,
920 UErrorCode *pErrorCode) {
921 uint16_t *p16;
922 int32_t actualLength, latin1Length, i, limit;
923 uint16_t block;
924
925 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
926 return -1;
927 }
928
929 /* calculate the actual size of the dummy trie data */
930
931 /* max(Latin-1, block 0) */
932 latin1Length= UTRIE_SHIFT<=8 ? 256 : UTRIE_DATA_BLOCK_LENGTH;
933
934 trie->indexLength=UTRIE_BMP_INDEX_LENGTH+UTRIE_SURROGATE_BLOCK_COUNT;
935 trie->dataLength=latin1Length;
936 if(leadUnitValue!=initialValue) {
937 trie->dataLength+=UTRIE_DATA_BLOCK_LENGTH;
938 }
939
940 actualLength=trie->indexLength*2;
941 if(make16BitTrie) {
942 actualLength+=trie->dataLength*2;
943 } else {
944 actualLength+=trie->dataLength*4;
945 }
946
947 /* enough space for the dummy trie? */
948 if(length<actualLength) {
949 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
950 return actualLength;
951 }
952
953 trie->isLatin1Linear=TRUE;
954 trie->initialValue=initialValue;
955
956 /* fill the index and data arrays */
957 p16=(uint16_t *)data;
958 trie->index=p16;
959
960 if(make16BitTrie) {
961 /* indexes to block 0 */
962 block=(uint16_t)(trie->indexLength>>UTRIE_INDEX_SHIFT);
963 limit=trie->indexLength;
964 for(i=0; i<limit; ++i) {
965 p16[i]=block;
966 }
967
968 if(leadUnitValue!=initialValue) {
969 /* indexes for lead surrogate code units to the block after Latin-1 */
970 block+=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
971 i=0xd800>>UTRIE_SHIFT;
972 limit=0xdc00>>UTRIE_SHIFT;
973 for(; i<limit; ++i) {
974 p16[i]=block;
975 }
976 }
977
978 trie->data32=NULL;
979
980 /* Latin-1 data */
981 p16+=trie->indexLength;
982 for(i=0; i<latin1Length; ++i) {
983 p16[i]=(uint16_t)initialValue;
984 }
985
986 /* data for lead surrogate code units */
987 if(leadUnitValue!=initialValue) {
988 limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
989 for(/* i=latin1Length */; i<limit; ++i) {
990 p16[i]=(uint16_t)leadUnitValue;
991 }
992 }
993 } else {
994 uint32_t *p32;
995
996 /* indexes to block 0 */
997 uprv_memset(p16, 0, trie->indexLength*2);
998
999 if(leadUnitValue!=initialValue) {
1000 /* indexes for lead surrogate code units to the block after Latin-1 */
1001 block=(uint16_t)(latin1Length>>UTRIE_INDEX_SHIFT);
1002 i=0xd800>>UTRIE_SHIFT;
1003 limit=0xdc00>>UTRIE_SHIFT;
1004 for(; i<limit; ++i) {
1005 p16[i]=block;
1006 }
1007 }
1008
1009 trie->data32=p32=(uint32_t *)(p16+trie->indexLength);
1010
1011 /* Latin-1 data */
1012 for(i=0; i<latin1Length; ++i) {
1013 p32[i]=initialValue;
1014 }
1015
1016 /* data for lead surrogate code units */
1017 if(leadUnitValue!=initialValue) {
1018 limit=latin1Length+UTRIE_DATA_BLOCK_LENGTH;
1019 for(/* i=latin1Length */; i<limit; ++i) {
1020 p32[i]=leadUnitValue;
1021 }
1022 }
1023 }
1024
1025 trie->getFoldingOffset=utrie_defaultGetFoldingOffset;
1026
1027 return actualLength;
1028 }
1029
1030 /* enumeration -------------------------------------------------------------- */
1031
1032 /* default UTrieEnumValue() returns the input value itself */
1033 static uint32_t U_CALLCONV
enumSameValue(const void * context,uint32_t value)1034 enumSameValue(const void *context, uint32_t value) {
1035 return value;
1036 }
1037
1038 /**
1039 * Enumerate all ranges of code points with the same relevant values.
1040 * The values are transformed from the raw trie entries by the enumValue function.
1041 */
1042 U_CAPI void U_EXPORT2
utrie_enum(const UTrie * trie,UTrieEnumValue * enumValue,UTrieEnumRange * enumRange,const void * context)1043 utrie_enum(const UTrie *trie,
1044 UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context) {
1045 const uint32_t *data32;
1046 const uint16_t *index;
1047
1048 uint32_t value, prevValue, initialValue;
1049 UChar32 c, prev;
1050 int32_t l, i, j, block, prevBlock, offset;
1051
1052 /* check arguments */
1053 if(trie==NULL || trie->index==NULL || enumRange==NULL) {
1054 return;
1055 }
1056 if(enumValue==NULL) {
1057 enumValue=enumSameValue;
1058 }
1059
1060 index=trie->index;
1061 data32=trie->data32;
1062
1063 /* get the enumeration value that corresponds to an initial-value trie data entry */
1064 initialValue=enumValue(context, trie->initialValue);
1065
1066 /* set variables for previous range */
1067 prevBlock=0;
1068 prev=0;
1069 prevValue=initialValue;
1070
1071 /* enumerate BMP - the main loop enumerates data blocks */
1072 for(i=0, c=0; c<=0xffff; ++i) {
1073 if(c==0xd800) {
1074 /* skip lead surrogate code _units_, go to lead surr. code _points_ */
1075 i=UTRIE_BMP_INDEX_LENGTH;
1076 } else if(c==0xdc00) {
1077 /* go back to regular BMP code points */
1078 i=c>>UTRIE_SHIFT;
1079 }
1080
1081 block=index[i]<<UTRIE_INDEX_SHIFT;
1082 if(block==prevBlock) {
1083 /* the block is the same as the previous one, and filled with value */
1084 c+=UTRIE_DATA_BLOCK_LENGTH;
1085 } else if(block==0) {
1086 /* this is the all-initial-value block */
1087 if(prevValue!=initialValue) {
1088 if(prev<c) {
1089 if(!enumRange(context, prev, c, prevValue)) {
1090 return;
1091 }
1092 }
1093 prevBlock=0;
1094 prev=c;
1095 prevValue=initialValue;
1096 }
1097 c+=UTRIE_DATA_BLOCK_LENGTH;
1098 } else {
1099 prevBlock=block;
1100 for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
1101 value=enumValue(context, data32!=NULL ? data32[block+j] : index[block+j]);
1102 if(value!=prevValue) {
1103 if(prev<c) {
1104 if(!enumRange(context, prev, c, prevValue)) {
1105 return;
1106 }
1107 }
1108 if(j>0) {
1109 prevBlock=-1;
1110 }
1111 prev=c;
1112 prevValue=value;
1113 }
1114 ++c;
1115 }
1116 }
1117 }
1118
1119 /* enumerate supplementary code points */
1120 for(l=0xd800; l<0xdc00;) {
1121 /* lead surrogate access */
1122 offset=index[l>>UTRIE_SHIFT]<<UTRIE_INDEX_SHIFT;
1123 if(offset==(data32!=NULL ? 0 : trie->indexLength)) {
1124 /* no entries for a whole block of lead surrogates */
1125 if(prevValue!=initialValue) {
1126 if(prev<c) {
1127 if(!enumRange(context, prev, c, prevValue)) {
1128 return;
1129 }
1130 }
1131 prevBlock=0;
1132 prev=c;
1133 prevValue=initialValue;
1134 }
1135
1136 l+=UTRIE_DATA_BLOCK_LENGTH;
1137 c+=UTRIE_DATA_BLOCK_LENGTH<<10;
1138 continue;
1139 }
1140
1141 value= data32!=NULL ? data32[offset+(l&UTRIE_MASK)] : index[offset+(l&UTRIE_MASK)];
1142
1143 /* enumerate trail surrogates for this lead surrogate */
1144 offset=trie->getFoldingOffset(value);
1145 if(offset<=0) {
1146 /* no data for this lead surrogate */
1147 if(prevValue!=initialValue) {
1148 if(prev<c) {
1149 if(!enumRange(context, prev, c, prevValue)) {
1150 return;
1151 }
1152 }
1153 prevBlock=0;
1154 prev=c;
1155 prevValue=initialValue;
1156 }
1157
1158 /* nothing else to do for the supplementary code points for this lead surrogate */
1159 c+=0x400;
1160 } else {
1161 /* enumerate code points for this lead surrogate */
1162 i=offset;
1163 offset+=UTRIE_SURROGATE_BLOCK_COUNT;
1164 do {
1165 /* copy of most of the body of the BMP loop */
1166 block=index[i]<<UTRIE_INDEX_SHIFT;
1167 if(block==prevBlock) {
1168 /* the block is the same as the previous one, and filled with value */
1169 c+=UTRIE_DATA_BLOCK_LENGTH;
1170 } else if(block==0) {
1171 /* this is the all-initial-value block */
1172 if(prevValue!=initialValue) {
1173 if(prev<c) {
1174 if(!enumRange(context, prev, c, prevValue)) {
1175 return;
1176 }
1177 }
1178 prevBlock=0;
1179 prev=c;
1180 prevValue=initialValue;
1181 }
1182 c+=UTRIE_DATA_BLOCK_LENGTH;
1183 } else {
1184 prevBlock=block;
1185 for(j=0; j<UTRIE_DATA_BLOCK_LENGTH; ++j) {
1186 value=enumValue(context, data32!=NULL ? data32[block+j] : index[block+j]);
1187 if(value!=prevValue) {
1188 if(prev<c) {
1189 if(!enumRange(context, prev, c, prevValue)) {
1190 return;
1191 }
1192 }
1193 if(j>0) {
1194 prevBlock=-1;
1195 }
1196 prev=c;
1197 prevValue=value;
1198 }
1199 ++c;
1200 }
1201 }
1202 } while(++i<offset);
1203 }
1204
1205 ++l;
1206 }
1207
1208 /* deliver last range */
1209 enumRange(context, prev, c, prevValue);
1210 }
1211