• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2001-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ucol_bld.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created 02/22/2001
14 *   created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_COLLATION
23 
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "normalizer2impl.h"
29 #include "ucol_bld.h"
30 #include "ucol_elm.h"
31 #include "ucol_cnt.h"
32 #include "ucln_in.h"
33 #include "umutex.h"
34 #include "cmemory.h"
35 
36 static const InverseUCATableHeader* _staticInvUCA = NULL;
37 static UDataMemory* invUCA_DATA_MEM = NULL;
38 
39 U_CDECL_BEGIN
40 static UBool U_CALLCONV
isAcceptableInvUCA(void *,const char *,const char *,const UDataInfo * pInfo)41 isAcceptableInvUCA(void * /*context*/,
42                    const char * /*type*/, const char * /*name*/,
43                    const UDataInfo *pInfo)
44 {
45     /* context, type & name are intentionally not used */
46     if( pInfo->size>=20 &&
47         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
48         pInfo->charsetFamily==U_CHARSET_FAMILY &&
49         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
50         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
51         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
52         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
53         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
54         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
55         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
56         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
57         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
58         )
59     {
60         UVersionInfo UCDVersion;
61         u_getUnicodeVersion(UCDVersion);
62         return (pInfo->dataVersion[0]==UCDVersion[0] &&
63             pInfo->dataVersion[1]==UCDVersion[1]);
64             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
65             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
66             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
67     } else {
68         return FALSE;
69     }
70 }
71 U_CDECL_END
72 
73 /*
74 * Takes two CEs (lead and continuation) and
75 * compares them as CEs should be compared:
76 * primary vs. primary, secondary vs. secondary
77 * tertiary vs. tertiary
78 */
compareCEs(uint32_t source0,uint32_t source1,uint32_t target0,uint32_t target1)79 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
80     uint32_t s1 = source0, s2, t1 = target0, t2;
81     if(isContinuation(source1)) {
82         s2 = source1;
83     } else {
84         s2 = 0;
85     }
86     if(isContinuation(target1)) {
87         t2 = target1;
88     } else {
89         t2 = 0;
90     }
91 
92     uint32_t s = 0, t = 0;
93     if(s1 == t1 && s2 == t2) {
94         return 0;
95     }
96     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
97     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
98     if(s < t) {
99         return -1;
100     } else if(s > t) {
101         return 1;
102     } else {
103         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
104         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
105         if(s < t) {
106             return -1;
107         } else if(s > t) {
108             return 1;
109         } else {
110             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
111             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
112             if(s < t) {
113                 return -1;
114             } else {
115                 return 1;
116             }
117         }
118     }
119 }
120 
121 static
ucol_inv_findCE(const UColTokenParser * src,uint32_t CE,uint32_t SecondCE)122 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
123     uint32_t bottom = 0, top = src->invUCA->tableSize;
124     uint32_t i = 0;
125     uint32_t first = 0, second = 0;
126     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
127     int32_t res = 0;
128 
129     while(bottom < top-1) {
130         i = (top+bottom)/2;
131         first = *(CETable+3*i);
132         second = *(CETable+3*i+1);
133         res = compareCEs(first, second, CE, SecondCE);
134         if(res > 0) {
135             top = i;
136         } else if(res < 0) {
137             bottom = i;
138         } else {
139             break;
140         }
141     }
142 
143     /* weiv:                                                  */
144     /* in searching for elements, I have removed the failure  */
145     /* The reason for this is that the builder does not rely  */
146     /* on search mechanism telling it that it didn't find an  */
147     /* element. However, indirect positioning relies on being */
148     /* able to find the elements around any CE, even if it is */
149     /* not defined in the UCA. */
150     return i;
151     /*
152     if((first == CE && second == SecondCE)) {
153     return i;
154     } else {
155     return -1;
156     }
157     */
158 }
159 
160 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
161     0xFFFF0000,
162     0xFFFFFF00,
163     0xFFFFFFFF
164 };
165 
ucol_inv_getNextCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * nextCE,uint32_t * nextContCE,uint32_t strength)166 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
167                                             uint32_t CE, uint32_t contCE,
168                                             uint32_t *nextCE, uint32_t *nextContCE,
169                                             uint32_t strength)
170 {
171     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
172     int32_t iCE;
173 
174     iCE = ucol_inv_findCE(src, CE, contCE);
175 
176     if(iCE<0) {
177         *nextCE = UCOL_NOT_FOUND;
178         return -1;
179     }
180 
181     CE &= strengthMask[strength];
182     contCE &= strengthMask[strength];
183 
184     *nextCE = CE;
185     *nextContCE = contCE;
186 
187     while((*nextCE  & strengthMask[strength]) == CE
188         && (*nextContCE  & strengthMask[strength]) == contCE)
189     {
190         *nextCE = (*(CETable+3*(++iCE)));
191         *nextContCE = (*(CETable+3*(iCE)+1));
192     }
193 
194     return iCE;
195 }
196 
ucol_inv_getPrevCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * prevCE,uint32_t * prevContCE,uint32_t strength)197 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
198                                             uint32_t CE, uint32_t contCE,
199                                             uint32_t *prevCE, uint32_t *prevContCE,
200                                             uint32_t strength)
201 {
202     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
203     int32_t iCE;
204 
205     iCE = ucol_inv_findCE(src, CE, contCE);
206 
207     if(iCE<0) {
208         *prevCE = UCOL_NOT_FOUND;
209         return -1;
210     }
211 
212     CE &= strengthMask[strength];
213     contCE &= strengthMask[strength];
214 
215     *prevCE = CE;
216     *prevContCE = contCE;
217 
218     while((*prevCE  & strengthMask[strength]) == CE
219         && (*prevContCE  & strengthMask[strength])== contCE
220         && iCE > 0) /* this condition should prevent falling off the edge of the world */
221     {
222         /* here, we end up in a singularity - zero */
223         *prevCE = (*(CETable+3*(--iCE)));
224         *prevContCE = (*(CETable+3*(iCE)+1));
225     }
226 
227     return iCE;
228 }
229 
ucol_getCEStrengthDifference(uint32_t CE,uint32_t contCE,uint32_t prevCE,uint32_t prevContCE)230 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
231                                                        uint32_t prevCE, uint32_t prevContCE)
232 {
233     if(prevCE == CE && prevContCE == contCE) {
234         return UCOL_IDENTICAL;
235     }
236     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
237         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
238     {
239         return UCOL_PRIMARY;
240     }
241     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
242         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
243     {
244         return UCOL_SECONDARY;
245     }
246     return UCOL_TERTIARY;
247 }
248 
249 
250 /*static
251 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
252 
253     uint32_t CE = lh->baseCE;
254     uint32_t SecondCE = lh->baseContCE;
255 
256     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
257     uint32_t previousCE, previousContCE;
258     int32_t iCE;
259 
260     iCE = ucol_inv_findCE(src, CE, SecondCE);
261 
262     if(iCE<0) {
263         return -1;
264     }
265 
266     CE &= strengthMask[strength];
267     SecondCE &= strengthMask[strength];
268 
269     previousCE = CE;
270     previousContCE = SecondCE;
271 
272     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
273         previousCE = (*(CETable+3*(--iCE)));
274         previousContCE = (*(CETable+3*(iCE)+1));
275     }
276     lh->previousCE = previousCE;
277     lh->previousContCE = previousContCE;
278 
279     return iCE;
280 }*/
281 
282 static
ucol_inv_getNext(UColTokenParser * src,UColTokListHeader * lh,uint32_t strength)283 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
284     uint32_t CE = lh->baseCE;
285     uint32_t SecondCE = lh->baseContCE;
286 
287     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
288     uint32_t nextCE, nextContCE;
289     int32_t iCE;
290 
291     iCE = ucol_inv_findCE(src, CE, SecondCE);
292 
293     if(iCE<0) {
294         return -1;
295     }
296 
297     CE &= strengthMask[strength];
298     SecondCE &= strengthMask[strength];
299 
300     nextCE = CE;
301     nextContCE = SecondCE;
302 
303     while((nextCE  & strengthMask[strength]) == CE
304         && (nextContCE  & strengthMask[strength]) == SecondCE)
305     {
306         nextCE = (*(CETable+3*(++iCE)));
307         nextContCE = (*(CETable+3*(iCE)+1));
308     }
309 
310     lh->nextCE = nextCE;
311     lh->nextContCE = nextContCE;
312 
313     return iCE;
314 }
315 
ucol_inv_getGapPositions(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)316 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
317     /* reset all the gaps */
318     int32_t i = 0;
319     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
320     uint32_t st = 0;
321     uint32_t t1, t2;
322     int32_t pos;
323 
324     UColToken *tok = lh->first;
325     uint32_t tokStrength = tok->strength;
326 
327     for(i = 0; i<3; i++) {
328         lh->gapsHi[3*i] = 0;
329         lh->gapsHi[3*i+1] = 0;
330         lh->gapsHi[3*i+2] = 0;
331         lh->gapsLo[3*i] = 0;
332         lh->gapsLo[3*i+1] = 0;
333         lh->gapsLo[3*i+2] = 0;
334         lh->numStr[i] = 0;
335         lh->fStrToken[i] = NULL;
336         lh->lStrToken[i] = NULL;
337         lh->pos[i] = -1;
338     }
339 
340     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
341 
342     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
343         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
344         lh->pos[0] = 0;
345         t1 = lh->baseCE;
346         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
347         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
348         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
349         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
350         uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
351         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
352 
353         t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
354         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
355 
356         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
357         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
358         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
359     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
360         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
361         lh->pos[0] = 0;
362         t1 = lh->baseCE;
363         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
364         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
365         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
366         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
367         t1 = lh->nextCE;
368         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
369         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
370         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
371         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
372     } else {
373         for(;;) {
374             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
375                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
376                     lh->fStrToken[tokStrength] = tok;
377                 } else { /* The CE must be implicit, since it's not in the table */
378                     /* Error */
379                     *status = U_INTERNAL_PROGRAM_ERROR;
380                 }
381             }
382 
383             while(tok != NULL && tok->strength >= tokStrength) {
384                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
385                     lh->lStrToken[tokStrength] = tok;
386                 }
387                 tok = tok->next;
388             }
389             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
390                 /* check if previous interval is the same and merge the intervals if it is so */
391                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
392                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
393                     lh->fStrToken[tokStrength+1] = NULL;
394                     lh->lStrToken[tokStrength+1] = NULL;
395                     lh->pos[tokStrength+1] = -1;
396                 }
397             }
398             if(tok != NULL) {
399                 tokStrength = tok->strength;
400             } else {
401                 break;
402             }
403         }
404         for(st = 0; st < 3; st++) {
405             if((pos = lh->pos[st]) >= 0) {
406                 t1 = *(CETable+3*(pos));
407                 t2 = *(CETable+3*(pos)+1);
408                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
409                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
410                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
411                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
412                 //pos--;
413                 //t1 = *(CETable+3*(pos));
414                 //t2 = *(CETable+3*(pos)+1);
415                 t1 = lh->baseCE;
416                 t2 = lh->baseContCE;
417                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
418                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
419                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
420             }
421         }
422     }
423 }
424 
425 
426 #define ucol_countBytes(value, noOfBytes)   \
427 {                               \
428     uint32_t mask = 0xFFFFFFFF;   \
429     (noOfBytes) = 0;              \
430     while(mask != 0) {            \
431     if(((value) & mask) != 0) { \
432     (noOfBytes)++;            \
433     }                           \
434     mask >>= 8;                 \
435     }                             \
436 }
437 
ucol_getNextGenerated(ucolCEGenerator * g,UErrorCode * status)438 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
439     if(U_SUCCESS(*status)) {
440         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
441     }
442     return g->current;
443 }
444 
ucol_getSimpleCEGenerator(ucolCEGenerator * g,UColToken * tok,uint32_t strength,UErrorCode * status)445 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
446     /* TODO: rename to enum names */
447     uint32_t high, low, count=1;
448     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
449 
450     if(strength == UCOL_SECONDARY) {
451         low = UCOL_COMMON_TOP2<<24;
452         high = 0xFFFFFFFF;
453         count = 0xFF - UCOL_COMMON_TOP2;
454     } else {
455         low = UCOL_BYTE_COMMON << 24; //0x05000000;
456         high = 0x40000000;
457         count = 0x40 - UCOL_BYTE_COMMON;
458     }
459 
460     if(tok->next != NULL && tok->next->strength == strength) {
461         count = tok->next->toInsert;
462     }
463 
464     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
465     g->current = UCOL_BYTE_COMMON<<24;
466 
467     if(g->noOfRanges == 0) {
468         *status = U_INTERNAL_PROGRAM_ERROR;
469     }
470     return g->current;
471 }
472 
ucol_getCEGenerator(ucolCEGenerator * g,uint32_t * lows,uint32_t * highs,UColToken * tok,uint32_t fStrength,UErrorCode * status)473 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
474     uint32_t strength = tok->strength;
475     uint32_t low = lows[fStrength*3+strength];
476     uint32_t high = highs[fStrength*3+strength];
477     uint32_t maxByte = 0;
478     if(strength == UCOL_TERTIARY) {
479         maxByte = 0x3F;
480     } else if(strength == UCOL_PRIMARY) {
481         maxByte = 0xFE;
482     } else {
483         maxByte = 0xFF;
484     }
485 
486     uint32_t count = tok->toInsert;
487 
488     if(low >= high && strength > UCOL_PRIMARY) {
489         int32_t s = strength;
490         for(;;) {
491             s--;
492             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
493                 if(strength == UCOL_SECONDARY) {
494                     if (low < UCOL_COMMON_TOP2<<24 ) {
495                        // Override if low range is less than UCOL_COMMON_TOP2.
496 		        low = UCOL_COMMON_TOP2<<24;
497                     }
498                     high = 0xFFFFFFFF;
499                 } else {
500                     // Override if low range is less than UCOL_COMMON_BOT3.
501 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
502                         low = UCOL_COMMON_BOT3<<24;
503 		    }
504                     high = 0x40000000;
505                 }
506                 break;
507             }
508             if(s<0) {
509                 *status = U_INTERNAL_PROGRAM_ERROR;
510                 return 0;
511             }
512         }
513     }
514 
515     if(low == 0) {
516         low = 0x01000000;
517     }
518 
519     if(strength == UCOL_SECONDARY) { /* similar as simple */
520         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
521             low = UCOL_COMMON_TOP2<<24;
522         }
523         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
524             high = UCOL_COMMON_TOP2<<24;
525         }
526         if(low < (UCOL_COMMON_BOT2<<24)) {
527             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
528             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
529             //g->current = UCOL_COMMON_BOT2<<24;
530             return g->current;
531         }
532     }
533 
534     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
535     if(g->noOfRanges == 0) {
536         *status = U_INTERNAL_PROGRAM_ERROR;
537     }
538     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
539     return g->current;
540 }
541 
542 static
u_toLargeKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)543 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
544     uint32_t i = 0;
545     UChar c;
546 
547     if(U_FAILURE(*status)) {
548         return 0;
549     }
550 
551     if(sourceLen > resLen) {
552         *status = U_MEMORY_ALLOCATION_ERROR;
553         return 0;
554     }
555 
556     for(i = 0; i < sourceLen; i++) {
557         c = source[i];
558         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
559             switch(c - 0x3000) {
560             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
561             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
562                 c++;
563                 break;
564             case 0xF5:
565                 c = 0x30AB;
566                 break;
567             case 0xF6:
568                 c = 0x30B1;
569                 break;
570             }
571         }
572         resBuf[i] = c;
573     }
574     return sourceLen;
575 }
576 
577 static
u_toSmallKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)578 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
579     uint32_t i = 0;
580     UChar c;
581 
582     if(U_FAILURE(*status)) {
583         return 0;
584     }
585 
586     if(sourceLen > resLen) {
587         *status = U_MEMORY_ALLOCATION_ERROR;
588         return 0;
589     }
590 
591     for(i = 0; i < sourceLen; i++) {
592         c = source[i];
593         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
594             switch(c - 0x3000) {
595             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
596             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
597                 c--;
598                 break;
599             case 0xAB:
600                 c = 0x30F5;
601                 break;
602             case 0xB1:
603                 c = 0x30F6;
604                 break;
605             }
606         }
607         resBuf[i] = c;
608     }
609     return sourceLen;
610 }
611 
612 static
ucol_uprv_getCaseBits(const UCollator * UCA,const UChar * src,uint32_t len,UErrorCode * status)613 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
614     uint32_t i = 0;
615     UChar n[128];
616     uint32_t nLen = 0;
617     uint32_t uCount = 0, lCount = 0;
618 
619     collIterate s;
620     uint32_t order = 0;
621 
622     if(U_FAILURE(*status)) {
623         return UCOL_LOWER_CASE;
624     }
625 
626     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
627     if(U_SUCCESS(*status)) {
628         for(i = 0; i < nLen; i++) {
629             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
630             order = ucol_getNextCE(UCA, &s, status);
631             if(isContinuation(order)) {
632                 *status = U_INTERNAL_PROGRAM_ERROR;
633                 return UCOL_LOWER_CASE;
634             }
635             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
636                 uCount++;
637             } else {
638                 if(u_islower(n[i])) {
639                     lCount++;
640                 } else if(U_SUCCESS(*status)) {
641                     UChar sk[1], lk[1];
642                     u_toSmallKana(&n[i], 1, sk, 1, status);
643                     u_toLargeKana(&n[i], 1, lk, 1, status);
644                     if(sk[0] == n[i] && lk[0] != n[i]) {
645                         lCount++;
646                     }
647                 }
648             }
649         }
650     }
651 
652     if(uCount != 0 && lCount != 0) {
653         return UCOL_MIXED_CASE;
654     } else if(uCount != 0) {
655         return UCOL_UPPER_CASE;
656     } else {
657         return UCOL_LOWER_CASE;
658     }
659 }
660 
661 
ucol_doCE(UColTokenParser * src,uint32_t * CEparts,UColToken * tok,UErrorCode * status)662 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
663     /* this one makes the table and stuff */
664     uint32_t noOfBytes[3];
665     uint32_t i;
666 
667     for(i = 0; i<3; i++) {
668         ucol_countBytes(CEparts[i], noOfBytes[i]);
669     }
670 
671     /* Here we have to pack CEs from parts */
672 
673     uint32_t CEi = 0;
674     uint32_t value = 0;
675 
676     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
677         if(CEi > 0) {
678             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
679         } else {
680             value = 0;
681         }
682 
683         if(2*CEi<noOfBytes[0]) {
684             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
685         }
686         if(CEi<noOfBytes[1]) {
687             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
688         }
689         if(CEi<noOfBytes[2]) {
690             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
691         }
692         tok->CEs[CEi] = value;
693         CEi++;
694     }
695     if(CEi == 0) { /* totally ignorable */
696         tok->noOfCEs = 1;
697         tok->CEs[0] = 0;
698     } else { /* there is at least something */
699         tok->noOfCEs = CEi;
700     }
701 
702 
703     // we want to set case bits here and now, not later.
704     // Case bits handling
705     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
706         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
707         int32_t cSize = (tok->source & 0xFF000000) >> 24;
708         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
709 
710         if(cSize > 1) {
711             // Do it manually
712             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
713         } else {
714             // Copy it from the UCA
715             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
716             tok->CEs[0] |= (caseCE & 0xC0);
717         }
718     }
719 
720 #if UCOL_DEBUG==2
721     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
722     for(i = 0; i<tok->noOfCEs; i++) {
723         fprintf(stderr, "%08X ", tok->CEs[i]);
724     }
725     fprintf(stderr, "\n");
726 #endif
727 }
728 
ucol_initBuffers(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)729 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
730     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
731     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
732 
733     UColToken *tok = lh->last;
734     uint32_t t[UCOL_STRENGTH_LIMIT];
735 
736     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
737 
738     tok->toInsert = 1;
739     t[tok->strength] = 1;
740 
741     while(tok->previous != NULL) {
742         if(tok->previous->strength < tok->strength) { /* going up */
743             t[tok->strength] = 0;
744             t[tok->previous->strength]++;
745         } else if(tok->previous->strength > tok->strength) { /* going down */
746             t[tok->previous->strength] = 1;
747         } else {
748             t[tok->strength]++;
749         }
750         tok=tok->previous;
751         tok->toInsert = t[tok->strength];
752     }
753 
754     tok->toInsert = t[tok->strength];
755     ucol_inv_getGapPositions(src, lh, status);
756 
757 #if UCOL_DEBUG
758     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
759     int32_t j = 2;
760     for(j = 2; j >= 0; j--) {
761         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
762         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
763     }
764     tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
765 
766     do {
767         fprintf(stderr,"%i", tok->strength);
768         tok = tok->next;
769     } while(tok != NULL);
770     fprintf(stderr, "\n");
771 
772     tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
773 
774     do {
775         fprintf(stderr,"%i", tok->toInsert);
776         tok = tok->next;
777     } while(tok != NULL);
778 #endif
779 
780     tok = lh->first;
781     uint32_t fStrength = UCOL_IDENTICAL;
782     uint32_t initStrength = UCOL_IDENTICAL;
783 
784 
785     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
786     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
787     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
788 
789     while (tok != NULL && U_SUCCESS(*status)) {
790         fStrength = tok->strength;
791         if(fStrength < initStrength) {
792             initStrength = fStrength;
793             if(lh->pos[fStrength] == -1) {
794                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
795                     fStrength--;
796                 }
797                 if(lh->pos[fStrength] == -1) {
798                     *status = U_INTERNAL_PROGRAM_ERROR;
799                     return;
800                 }
801             }
802             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
803                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
804                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
805                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
806                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
807             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
808                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
809                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
810                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
811                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
812             } else { /* primaries */
813                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
814                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
815                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
816                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
817             }
818         } else {
819             if(tok->strength == UCOL_TERTIARY) {
820                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
821             } else if(tok->strength == UCOL_SECONDARY) {
822                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
823                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
824             } else if(tok->strength == UCOL_PRIMARY) {
825                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
826                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
827                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
828             }
829         }
830         ucol_doCE(src, CEparts, tok, status);
831         tok = tok->next;
832     }
833 }
834 
ucol_createElements(UColTokenParser * src,tempUCATable * t,UColTokListHeader * lh,UErrorCode * status)835 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
836     UCAElements el;
837     UColToken *tok = lh->first;
838     UColToken *expt = NULL;
839     uint32_t i = 0, j = 0;
840     UChar32 fcdHighStart;
841     const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
842 
843     while(tok != NULL && U_SUCCESS(*status)) {
844         /* first, check if there are any expansions */
845         /* if there are expansions, we need to do a little bit more processing */
846         /* since parts of expansion can be tailored, while others are not */
847         if(tok->expansion != 0) {
848             uint32_t len = tok->expansion >> 24;
849             uint32_t currentSequenceLen = len;
850             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
851             //uint32_t exp = currentSequenceLen | expOffset;
852             UColToken exp;
853             exp.source = currentSequenceLen | expOffset;
854             exp.rulesToParse = src->source;
855 
856             while(len > 0) {
857                 currentSequenceLen = len;
858                 while(currentSequenceLen > 0) {
859                     exp.source = (currentSequenceLen << 24) | expOffset;
860                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
861                         uint32_t noOfCEsToCopy = expt->noOfCEs;
862                         for(j = 0; j<noOfCEsToCopy; j++) {
863                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
864                         }
865                         tok->noOfExpCEs += noOfCEsToCopy;
866                         // Smart people never try to add codepoints and CEs.
867                         // For some odd reason, it won't work.
868                         expOffset += currentSequenceLen; //noOfCEsToCopy;
869                         len -= currentSequenceLen; //noOfCEsToCopy;
870                         break;
871                     } else {
872                         currentSequenceLen--;
873                     }
874                 }
875                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
876                     /* will have to get one from UCA */
877                     /* first, get the UChars from the rules */
878                     /* then pick CEs out until there is no more and stuff them into expansion */
879                     collIterate s;
880                     uint32_t order = 0;
881                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
882 
883                     for(;;) {
884                         order = ucol_getNextCE(src->UCA, &s, status);
885                         if(order == UCOL_NO_MORE_CES) {
886                             break;
887                         }
888                         tok->expCEs[tok->noOfExpCEs++] = order;
889                     }
890                     expOffset++;
891                     len--;
892                 }
893             }
894         } else {
895             tok->noOfExpCEs = 0;
896         }
897 
898         /* set the ucaelement with obtained values */
899         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
900         /* copy CEs */
901         for(i = 0; i<tok->noOfCEs; i++) {
902             el.CEs[i] = tok->CEs[i];
903         }
904         for(i = 0; i<tok->noOfExpCEs; i++) {
905             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
906         }
907 
908         /* copy UChars */
909         // We kept prefix and source kind of together, as it is a kind of a contraction.
910         // However, now we have to slice the prefix off the main thing -
911         el.prefix = el.prefixChars;
912         el.cPoints = el.uchars;
913         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
914             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
915             // decomposed elements to the unsaf table.
916             el.prefixSize = tok->prefix>>24;
917             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
918 
919             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
920             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
921         } else {
922             el.prefixSize = 0;
923             *el.prefix = 0;
924 
925             el.cSize = (tok->source >> 24);
926             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
927         }
928         if(src->UCA != NULL) {
929             for(i = 0; i<el.cSize; i++) {
930                 if(UCOL_ISJAMO(el.cPoints[i])) {
931                     t->image->jamoSpecial = TRUE;
932                 }
933             }
934             if (!src->buildCCTabFlag && el.cSize > 0) {
935                 // Check the trailing canonical combining class (tccc) of the last character.
936                 const UChar *s = el.cPoints + el.cSize;
937                 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
938                 if ((fcd & 0xff) != 0) {
939                     src->buildCCTabFlag = TRUE;
940                 }
941             }
942         }
943 
944         /* and then, add it */
945 #if UCOL_DEBUG==2
946         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
947 #endif
948         uprv_uca_addAnElement(t, &el, status);
949 
950 #if UCOL_DEBUG_DUPLICATES
951         if(*status != U_ZERO_ERROR) {
952             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
953             *status = U_ZERO_ERROR;
954         }
955 #endif
956 
957         tok = tok->next;
958     }
959 }
960 
961 U_CDECL_BEGIN
962 static UBool U_CALLCONV
_processUCACompleteIgnorables(const void * context,UChar32 start,UChar32 limit,uint32_t value)963 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
964     UErrorCode status = U_ZERO_ERROR;
965     tempUCATable *t = (tempUCATable *)context;
966     if(value == 0) {
967         while(start < limit) {
968             uint32_t CE = utrie_get32(t->mapping, start, NULL);
969             if(CE == UCOL_NOT_FOUND) {
970                 UCAElements el;
971                 el.isThai = FALSE;
972                 el.prefixSize = 0;
973                 el.prefixChars[0] = 0;
974                 el.prefix = el.prefixChars;
975                 el.cPoints = el.uchars;
976 
977                 el.cSize = 0;
978                 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
979 
980                 el.noOfCEs = 1;
981                 el.CEs[0] = 0;
982                 uprv_uca_addAnElement(t, &el, &status);
983 
984             }
985             start++;
986         }
987     }
988     if(U_FAILURE(status)) {
989         return FALSE;
990     } else {
991         return TRUE;
992     }
993 }
994 U_CDECL_END
995 
996 static void
ucol_uprv_bld_copyRangeFromUCA(UColTokenParser * src,tempUCATable * t,UChar32 start,UChar32 end,UErrorCode * status)997 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
998                                UChar32 start, UChar32 end,
999                                UErrorCode *status)
1000 {
1001     //UChar decomp[256];
1002     uint32_t CE = UCOL_NOT_FOUND;
1003     UChar32 u = 0;
1004     UCAElements el;
1005     el.isThai = FALSE;
1006     el.prefixSize = 0;
1007     el.prefixChars[0] = 0;
1008     collIterate colIt;
1009 
1010     if(U_SUCCESS(*status)) {
1011         for(u = start; u<=end; u++) {
1012             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1013                 /* this test is for contractions that are missing the starting element. */
1014                 || ((isCntTableElement(CE)) &&
1015                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1016                 )
1017             {
1018                 el.cSize = 0;
1019                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1020                 //decomp[0] = (UChar)u;
1021                 //el.uchars[0] = (UChar)u;
1022                 el.cPoints = el.uchars;
1023                 //el.cSize = 1;
1024                 el.noOfCEs = 0;
1025                 el.prefix = el.prefixChars;
1026                 el.prefixSize = 0;
1027                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1028                 // We actually want to check whether this element is a special
1029                 // If it is an implicit element (hangul, CJK - we want to copy the
1030                 // special, not the resolved CEs) - for hangul, copying resolved
1031                 // would just make things the same (there is an expansion and it
1032                 // takes approximately the same amount of time to resolve as
1033                 // falling back to the UCA).
1034                 /*
1035                 UTRIE_GET32(src->UCA->mapping, u, CE);
1036                 tag = getCETag(CE);
1037                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1038                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1039                 || tag == LEAD_SURROGATE_TAG) {
1040                 el.CEs[el.noOfCEs++] = CE;
1041                 } else {
1042                 */
1043                 // It turns out that it does not make sense to keep implicits
1044                 // unresolved. The cost of resolving them is big enough so that
1045                 // it doesn't make any difference whether we have to go to the UCA
1046                 // or not.
1047                 {
1048                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1049                     while(CE != UCOL_NO_MORE_CES) {
1050                         CE = ucol_getNextCE(src->UCA, &colIt, status);
1051                         if(CE != UCOL_NO_MORE_CES) {
1052                             el.CEs[el.noOfCEs++] = CE;
1053                         }
1054                     }
1055                 }
1056                 uprv_uca_addAnElement(t, &el, status);
1057             }
1058         }
1059     }
1060 }
1061 
ucol_assembleTailoringTable(UColTokenParser * src,UErrorCode * status)1062 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1063     U_NAMESPACE_USE
1064 
1065     uint32_t i = 0;
1066     if(U_FAILURE(*status)) {
1067         return NULL;
1068     }
1069     /*
1070     2.  Eliminate the negative lists by doing the following for each non-null negative list:
1071     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1072     create new ListHeader X
1073     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1074     first item you add, based on the stronger strength levels of the two lists.
1075     */
1076     /*
1077     3.  For each ListHeader with a non-null positive list:
1078     */
1079     /*
1080     o   Find all character strings with CEs between the baseCE and the
1081     next/previous CE, at the strength of the first token. Add these to the
1082     tailoring.
1083     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1084     tailoring has & x < z...
1085     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1086     */
1087     /* It is possible that this part should be done even while constructing list */
1088     /* The problem is that it is unknown what is going to be the strongest weight */
1089     /* So we might as well do it here */
1090 
1091     /*
1092     o   Allocate CEs for each token in the list, based on the total number N of the
1093     largest level difference, and the gap G between baseCE and nextCE at that
1094     level. The relation * between the last item and nextCE is the same as the
1095     strongest strength.
1096     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1097     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1098     Then fit b and c into the secondary gap between a and d, then fit q
1099     into the tertiary gap between b and c.
1100 
1101     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1102     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1103     Then fit q into the tertiary gap between b and c.
1104     o   When incrementing primary values, we will not cross high byte
1105     boundaries except where there is only a single-byte primary. That is to
1106     ensure that the script reordering will continue to work.
1107     */
1108     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1109     /* test for NULL */
1110     if (image == NULL) {
1111         *status = U_MEMORY_ALLOCATION_ERROR;
1112         return NULL;
1113     }
1114     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1115 
1116     for(i = 0; i<src->resultLen; i++) {
1117         /* now we need to generate the CEs */
1118         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1119         /* According to strength                                                          */
1120         if(U_SUCCESS(*status)) {
1121             if(src->lh[i].first) { // if there are any elements
1122                 // due to the way parser works, subsequent tailorings
1123                 // may remove all the elements from a sequence, therefore
1124                 // leaving an empty tailoring sequence.
1125                 ucol_initBuffers(src, &src->lh[i], status);
1126             }
1127         }
1128         if(U_FAILURE(*status)) {
1129             uprv_free(image);
1130             return NULL;
1131         }
1132     }
1133 
1134     if(src->varTop != NULL) { /* stuff the variable top value */
1135         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1136         /* remove it from the list */
1137         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1138             src->varTop->listHeader->first = src->varTop->next;
1139         }
1140         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1141             src->varTop->listHeader->last = src->varTop->previous;
1142         }
1143         if(src->varTop->next != NULL) {
1144             src->varTop->next->previous = src->varTop->previous;
1145         }
1146         if(src->varTop->previous != NULL) {
1147             src->varTop->previous->next = src->varTop->next;
1148         }
1149     }
1150 
1151 
1152     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1153     if(U_FAILURE(*status)) {
1154         uprv_free(image);
1155         return NULL;
1156     }
1157 
1158 
1159     /* After this, we have assigned CE values to all regular CEs      */
1160     /* now we will go through list once more and resolve expansions,  */
1161     /* make UCAElements structs and add them to table                 */
1162     for(i = 0; i<src->resultLen; i++) {
1163         /* now we need to generate the CEs */
1164         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1165         /* According to strength                                                          */
1166         if(U_SUCCESS(*status)) {
1167             ucol_createElements(src, t, &src->lh[i], status);
1168         }
1169     }
1170 
1171     UCAElements el;
1172     el.isThai = FALSE;
1173     el.prefixSize = 0;
1174     el.prefixChars[0] = 0;
1175 
1176     /* add latin-1 stuff */
1177     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1178 
1179     /* add stuff for copying */
1180     if(src->copySet != NULL) {
1181         int32_t i = 0;
1182         UnicodeSet *set = (UnicodeSet *)src->copySet;
1183         for(i = 0; i < set->getRangeCount(); i++) {
1184             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1185         }
1186     }
1187 
1188     if(U_SUCCESS(*status)) {
1189         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1190 
1191         uint32_t tailoredCE = UCOL_NOT_FOUND;
1192         //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1193         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1194         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1195         // Check for null pointer
1196         if (ucaEl == NULL) {
1197         	*status = U_MEMORY_ALLOCATION_ERROR;
1198         	return NULL;
1199         }
1200         while(*conts != 0) {
1201             /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1202             tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1203             if(tailoredCE != UCOL_NOT_FOUND) {
1204                 UBool needToAdd = TRUE;
1205                 if(isCntTableElement(tailoredCE)) {
1206                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1207                         needToAdd = FALSE;
1208                     }
1209                 }
1210                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1211                     UCAElements elm;
1212                     elm.cPoints = el.uchars;
1213                     elm.noOfCEs = 0;
1214                     elm.uchars[0] = *conts;
1215                     elm.uchars[1] = 0;
1216                     elm.cSize = 1;
1217                     elm.prefixChars[0] = *(conts+2);
1218                     elm.isThai = FALSE;
1219                     elm.prefix = elm.prefixChars;
1220                     elm.prefixSize = 1;
1221                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1222                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1223                         needToAdd = TRUE;
1224                     }
1225                 }
1226                 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1227                     needToAdd = FALSE;
1228                 }
1229 
1230                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1231                     if (*(conts+1) != 0) {  // contractions
1232                         el.prefix = el.prefixChars;
1233                         el.prefixSize = 0;
1234                         el.cPoints = el.uchars;
1235                         el.noOfCEs = 0;
1236                         el.uchars[0] = *conts;
1237                         el.uchars[1] = *(conts+1);
1238                         if(*(conts+2)!=0) {
1239                             el.uchars[2] = *(conts+2);
1240                             el.cSize = 3;
1241                         } else {
1242                             el.cSize = 2;
1243                         }
1244                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
1245                     }
1246                     else { // pre-context character
1247                         UChar str[4] = { 0 };
1248                         int32_t len=0;
1249                         int32_t preKeyLen=0;
1250 
1251                         el.cPoints = el.uchars;
1252                         el.noOfCEs = 0;
1253                         el.uchars[0] = *conts;
1254                         el.uchars[1] = 0;
1255                         el.cSize = 1;
1256                         el.prefixChars[0] = *(conts+2);
1257                         el.prefix = el.prefixChars;
1258                         el.prefixSize = 1;
1259                         if (el.prefixChars[0]!=0) {
1260                             // get CE of prefix character first
1261                             str[0]=el.prefixChars[0];
1262                             str[1]=0;
1263                             ucol_setText(ucaEl, str, 1, status);
1264                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1265                                     != UCOL_NULLORDER) {
1266                                 preKeyLen++;  // count number of keys for prefix character
1267                             }
1268                             str[len++] = el.prefixChars[0];
1269                         }
1270 
1271                         str[len++] = el.uchars[0];
1272                         str[len]=0;
1273                         ucol_setText(ucaEl, str, len, status);
1274                         // Skip the keys for prefix character, then copy the rest to el.
1275                         while ((preKeyLen-->0) &&
1276                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1277                             continue;
1278                         }
1279 
1280                     }
1281                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1282                         el.noOfCEs++;
1283                     }
1284                     uprv_uca_addAnElement(t, &el, status);
1285                 }
1286 
1287             } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1288                 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1289             }
1290             conts+=3;
1291         }
1292         ucol_closeElements(ucaEl);
1293     }
1294 
1295     // Add completely ignorable elements
1296     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1297 
1298     // add tailoring characters related canonical closures
1299     uprv_uca_canonicalClosure(t, src, status);
1300 
1301     /* still need to produce compatibility closure */
1302 
1303     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1304 
1305     uprv_uca_closeTempTable(t);
1306     uprv_free(image);
1307 
1308     return myData;
1309 }
1310 
1311 U_CDECL_BEGIN
1312 static UBool U_CALLCONV
ucol_bld_cleanup(void)1313 ucol_bld_cleanup(void)
1314 {
1315     udata_close(invUCA_DATA_MEM);
1316     invUCA_DATA_MEM = NULL;
1317     _staticInvUCA = NULL;
1318     return TRUE;
1319 }
1320 U_CDECL_END
1321 
1322 U_CAPI const InverseUCATableHeader * U_EXPORT2
ucol_initInverseUCA(UErrorCode * status)1323 ucol_initInverseUCA(UErrorCode *status)
1324 {
1325     if(U_FAILURE(*status)) return NULL;
1326 
1327     UBool needsInit;
1328     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1329 
1330     if(needsInit) {
1331         InverseUCATableHeader *newInvUCA = NULL;
1332         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1333 
1334         if(U_FAILURE(*status)) {
1335             if (result) {
1336                 udata_close(result);
1337             }
1338             // This is not needed, as we are talking about
1339             // memory we got from UData
1340             //uprv_free(newInvUCA);
1341         }
1342 
1343         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1344             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1345             UCollator *UCA = ucol_initUCA(status);
1346             // UCA versions of UCA and inverse UCA should match
1347             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1348                 *status = U_INVALID_FORMAT_ERROR;
1349                 udata_close(result);
1350                 return NULL;
1351             }
1352 
1353             umtx_lock(NULL);
1354             if(_staticInvUCA == NULL) {
1355                 invUCA_DATA_MEM = result;
1356                 _staticInvUCA = newInvUCA;
1357                 result = NULL;
1358                 newInvUCA = NULL;
1359             }
1360             umtx_unlock(NULL);
1361 
1362             if(newInvUCA != NULL) {
1363                 udata_close(result);
1364                 // This is not needed, as we are talking about
1365                 // memory we got from UData
1366                 //uprv_free(newInvUCA);
1367             }
1368             else {
1369                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1370             }
1371         }
1372     }
1373     return _staticInvUCA;
1374 }
1375 
1376 #endif /* #if !UCONFIG_NO_COLLATION */
1377