• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2001-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ucol_bld.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created 02/22/2001
14 *   created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_COLLATION
23 
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utf16.h"
31 #include "normalizer2impl.h"
32 #include "ucol_bld.h"
33 #include "ucol_elm.h"
34 #include "ucol_cnt.h"
35 #include "ucln_in.h"
36 #include "umutex.h"
37 #include "cmemory.h"
38 #include "cstring.h"
39 
40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
41 
42 static const InverseUCATableHeader* _staticInvUCA = NULL;
43 static UDataMemory* invUCA_DATA_MEM = NULL;
44 
45 U_CDECL_BEGIN
46 static UBool U_CALLCONV
isAcceptableInvUCA(void *,const char *,const char *,const UDataInfo * pInfo)47 isAcceptableInvUCA(void * /*context*/,
48                    const char * /*type*/, const char * /*name*/,
49                    const UDataInfo *pInfo)
50 {
51     /* context, type & name are intentionally not used */
52     if( pInfo->size>=20 &&
53         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
54         pInfo->charsetFamily==U_CHARSET_FAMILY &&
55         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
56         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
57         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
58         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
59         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
60         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
61         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
62         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
63         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
64         )
65     {
66         UVersionInfo UCDVersion;
67         u_getUnicodeVersion(UCDVersion);
68         return (pInfo->dataVersion[0]==UCDVersion[0] &&
69             pInfo->dataVersion[1]==UCDVersion[1]);
70             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
71             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
72             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
73     } else {
74         return FALSE;
75     }
76 }
77 U_CDECL_END
78 
79 /*
80 * Takes two CEs (lead and continuation) and
81 * compares them as CEs should be compared:
82 * primary vs. primary, secondary vs. secondary
83 * tertiary vs. tertiary
84 */
compareCEs(uint32_t source0,uint32_t source1,uint32_t target0,uint32_t target1)85 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
86     uint32_t s1 = source0, s2, t1 = target0, t2;
87     if(isContinuation(source1)) {
88         s2 = source1;
89     } else {
90         s2 = 0;
91     }
92     if(isContinuation(target1)) {
93         t2 = target1;
94     } else {
95         t2 = 0;
96     }
97 
98     uint32_t s = 0, t = 0;
99     if(s1 == t1 && s2 == t2) {
100         return 0;
101     }
102     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
103     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
104     if(s < t) {
105         return -1;
106     } else if(s > t) {
107         return 1;
108     } else {
109         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
110         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
111         if(s < t) {
112             return -1;
113         } else if(s > t) {
114             return 1;
115         } else {
116             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
117             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
118             if(s < t) {
119                 return -1;
120             } else {
121                 return 1;
122             }
123         }
124     }
125 }
126 
127 static
ucol_inv_findCE(const UColTokenParser * src,uint32_t CE,uint32_t SecondCE)128 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
129     uint32_t bottom = 0, top = src->invUCA->tableSize;
130     uint32_t i = 0;
131     uint32_t first = 0, second = 0;
132     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
133     int32_t res = 0;
134 
135     while(bottom < top-1) {
136         i = (top+bottom)/2;
137         first = *(CETable+3*i);
138         second = *(CETable+3*i+1);
139         res = compareCEs(first, second, CE, SecondCE);
140         if(res > 0) {
141             top = i;
142         } else if(res < 0) {
143             bottom = i;
144         } else {
145             break;
146         }
147     }
148 
149     /* weiv:                                                  */
150     /* in searching for elements, I have removed the failure  */
151     /* The reason for this is that the builder does not rely  */
152     /* on search mechanism telling it that it didn't find an  */
153     /* element. However, indirect positioning relies on being */
154     /* able to find the elements around any CE, even if it is */
155     /* not defined in the UCA. */
156     return i;
157     /*
158     if((first == CE && second == SecondCE)) {
159     return i;
160     } else {
161     return -1;
162     }
163     */
164 }
165 
166 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
167     0xFFFF0000,
168     0xFFFFFF00,
169     0xFFFFFFFF
170 };
171 
ucol_inv_getNextCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * nextCE,uint32_t * nextContCE,uint32_t strength)172 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
173                                             uint32_t CE, uint32_t contCE,
174                                             uint32_t *nextCE, uint32_t *nextContCE,
175                                             uint32_t strength)
176 {
177     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
178     int32_t iCE;
179 
180     iCE = ucol_inv_findCE(src, CE, contCE);
181 
182     if(iCE<0) {
183         *nextCE = UCOL_NOT_FOUND;
184         return -1;
185     }
186 
187     CE &= strengthMask[strength];
188     contCE &= strengthMask[strength];
189 
190     *nextCE = CE;
191     *nextContCE = contCE;
192 
193     while((*nextCE  & strengthMask[strength]) == CE
194         && (*nextContCE  & strengthMask[strength]) == contCE)
195     {
196         *nextCE = (*(CETable+3*(++iCE)));
197         *nextContCE = (*(CETable+3*(iCE)+1));
198     }
199 
200     return iCE;
201 }
202 
ucol_inv_getPrevCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * prevCE,uint32_t * prevContCE,uint32_t strength)203 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
204                                             uint32_t CE, uint32_t contCE,
205                                             uint32_t *prevCE, uint32_t *prevContCE,
206                                             uint32_t strength)
207 {
208     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
209     int32_t iCE;
210 
211     iCE = ucol_inv_findCE(src, CE, contCE);
212 
213     if(iCE<0) {
214         *prevCE = UCOL_NOT_FOUND;
215         return -1;
216     }
217 
218     CE &= strengthMask[strength];
219     contCE &= strengthMask[strength];
220 
221     *prevCE = CE;
222     *prevContCE = contCE;
223 
224     while((*prevCE  & strengthMask[strength]) == CE
225         && (*prevContCE  & strengthMask[strength])== contCE
226         && iCE > 0) /* this condition should prevent falling off the edge of the world */
227     {
228         /* here, we end up in a singularity - zero */
229         *prevCE = (*(CETable+3*(--iCE)));
230         *prevContCE = (*(CETable+3*(iCE)+1));
231     }
232 
233     return iCE;
234 }
235 
ucol_getCEStrengthDifference(uint32_t CE,uint32_t contCE,uint32_t prevCE,uint32_t prevContCE)236 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
237                                                        uint32_t prevCE, uint32_t prevContCE)
238 {
239     if(prevCE == CE && prevContCE == contCE) {
240         return UCOL_IDENTICAL;
241     }
242     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
243         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
244     {
245         return UCOL_PRIMARY;
246     }
247     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
248         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
249     {
250         return UCOL_SECONDARY;
251     }
252     return UCOL_TERTIARY;
253 }
254 
255 
256 /*static
257 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
258 
259     uint32_t CE = lh->baseCE;
260     uint32_t SecondCE = lh->baseContCE;
261 
262     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
263     uint32_t previousCE, previousContCE;
264     int32_t iCE;
265 
266     iCE = ucol_inv_findCE(src, CE, SecondCE);
267 
268     if(iCE<0) {
269         return -1;
270     }
271 
272     CE &= strengthMask[strength];
273     SecondCE &= strengthMask[strength];
274 
275     previousCE = CE;
276     previousContCE = SecondCE;
277 
278     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
279         previousCE = (*(CETable+3*(--iCE)));
280         previousContCE = (*(CETable+3*(iCE)+1));
281     }
282     lh->previousCE = previousCE;
283     lh->previousContCE = previousContCE;
284 
285     return iCE;
286 }*/
287 
288 static
ucol_inv_getNext(UColTokenParser * src,UColTokListHeader * lh,uint32_t strength)289 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
290     uint32_t CE = lh->baseCE;
291     uint32_t SecondCE = lh->baseContCE;
292 
293     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
294     uint32_t nextCE, nextContCE;
295     int32_t iCE;
296 
297     iCE = ucol_inv_findCE(src, CE, SecondCE);
298 
299     if(iCE<0) {
300         return -1;
301     }
302 
303     CE &= strengthMask[strength];
304     SecondCE &= strengthMask[strength];
305 
306     nextCE = CE;
307     nextContCE = SecondCE;
308 
309     while((nextCE  & strengthMask[strength]) == CE
310         && (nextContCE  & strengthMask[strength]) == SecondCE)
311     {
312         nextCE = (*(CETable+3*(++iCE)));
313         nextContCE = (*(CETable+3*(iCE)+1));
314     }
315 
316     lh->nextCE = nextCE;
317     lh->nextContCE = nextContCE;
318 
319     return iCE;
320 }
321 
ucol_inv_getGapPositions(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)322 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
323     /* reset all the gaps */
324     int32_t i = 0;
325     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
326     uint32_t st = 0;
327     uint32_t t1, t2;
328     int32_t pos;
329 
330     UColToken *tok = lh->first;
331     uint32_t tokStrength = tok->strength;
332 
333     for(i = 0; i<3; i++) {
334         lh->gapsHi[3*i] = 0;
335         lh->gapsHi[3*i+1] = 0;
336         lh->gapsHi[3*i+2] = 0;
337         lh->gapsLo[3*i] = 0;
338         lh->gapsLo[3*i+1] = 0;
339         lh->gapsLo[3*i+2] = 0;
340         lh->numStr[i] = 0;
341         lh->fStrToken[i] = NULL;
342         lh->lStrToken[i] = NULL;
343         lh->pos[i] = -1;
344     }
345 
346     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
347 
348     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
349         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
350         lh->pos[0] = 0;
351         t1 = lh->baseCE;
352         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
353         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
354         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
355         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
356         uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
357         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
358 
359         t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
360         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
361 
362         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
363         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
364         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
365     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
366         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
367         lh->pos[0] = 0;
368         t1 = lh->baseCE;
369         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
370         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
371         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
372         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
373         t1 = lh->nextCE;
374         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
375         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
376         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
377         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
378     } else {
379         for(;;) {
380             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
381                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
382                     lh->fStrToken[tokStrength] = tok;
383                 } else { /* The CE must be implicit, since it's not in the table */
384                     /* Error */
385                     *status = U_INTERNAL_PROGRAM_ERROR;
386                 }
387             }
388 
389             while(tok != NULL && tok->strength >= tokStrength) {
390                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
391                     lh->lStrToken[tokStrength] = tok;
392                 }
393                 tok = tok->next;
394             }
395             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
396                 /* check if previous interval is the same and merge the intervals if it is so */
397                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
398                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
399                     lh->fStrToken[tokStrength+1] = NULL;
400                     lh->lStrToken[tokStrength+1] = NULL;
401                     lh->pos[tokStrength+1] = -1;
402                 }
403             }
404             if(tok != NULL) {
405                 tokStrength = tok->strength;
406             } else {
407                 break;
408             }
409         }
410         for(st = 0; st < 3; st++) {
411             if((pos = lh->pos[st]) >= 0) {
412                 t1 = *(CETable+3*(pos));
413                 t2 = *(CETable+3*(pos)+1);
414                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
415                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
416                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
417                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
418                 //pos--;
419                 //t1 = *(CETable+3*(pos));
420                 //t2 = *(CETable+3*(pos)+1);
421                 t1 = lh->baseCE;
422                 t2 = lh->baseContCE;
423                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
424                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
425                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
426             }
427         }
428     }
429 }
430 
431 
/*
 * Stores in noOfBytes how many of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF select a non-zero part of value, i.e. the number
 * of bytes, counted from the most significant one, up to and including the
 * lowest non-zero byte (0 for value == 0).
 *
 * value is evaluated several times, so it must not have side effects.
 * Wrapped in do { } while(0) so the macro behaves as a single statement,
 * including inside an unbraced if/else.
 */
#define ucol_countBytes(value, noOfBytes)               \
do {                                                    \
    uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;          \
    (noOfBytes) = 0;                                    \
    while(ucolCountBytesMask_ != 0) {                   \
        if(((value) & ucolCountBytesMask_) != 0) {      \
            (noOfBytes)++;                              \
        }                                               \
        ucolCountBytesMask_ >>= 8;                      \
    }                                                   \
} while(0)
443 
ucol_getNextGenerated(ucolCEGenerator * g,UErrorCode * status)444 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
445     if(U_SUCCESS(*status)) {
446         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
447     }
448     return g->current;
449 }
450 
ucol_getSimpleCEGenerator(ucolCEGenerator * g,UColToken * tok,uint32_t strength,UErrorCode * status)451 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
452     /* TODO: rename to enum names */
453     uint32_t high, low, count=1;
454     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
455 
456     if(strength == UCOL_SECONDARY) {
457         low = UCOL_COMMON_TOP2<<24;
458         high = 0xFFFFFFFF;
459         count = 0xFF - UCOL_COMMON_TOP2;
460     } else {
461         low = UCOL_BYTE_COMMON << 24; //0x05000000;
462         high = 0x40000000;
463         count = 0x40 - UCOL_BYTE_COMMON;
464     }
465 
466     if(tok->next != NULL && tok->next->strength == strength) {
467         count = tok->next->toInsert;
468     }
469 
470     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
471     g->current = UCOL_BYTE_COMMON<<24;
472 
473     if(g->noOfRanges == 0) {
474         *status = U_INTERNAL_PROGRAM_ERROR;
475     }
476     return g->current;
477 }
478 
ucol_getCEGenerator(ucolCEGenerator * g,uint32_t * lows,uint32_t * highs,UColToken * tok,uint32_t fStrength,UErrorCode * status)479 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
480     uint32_t strength = tok->strength;
481     uint32_t low = lows[fStrength*3+strength];
482     uint32_t high = highs[fStrength*3+strength];
483     uint32_t maxByte = 0;
484     if(strength == UCOL_TERTIARY) {
485         maxByte = 0x3F;
486     } else if(strength == UCOL_PRIMARY) {
487         maxByte = 0xFE;
488     } else {
489         maxByte = 0xFF;
490     }
491 
492     uint32_t count = tok->toInsert;
493 
494     if(low >= high && strength > UCOL_PRIMARY) {
495         int32_t s = strength;
496         for(;;) {
497             s--;
498             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
499                 if(strength == UCOL_SECONDARY) {
500                     if (low < UCOL_COMMON_TOP2<<24 ) {
501                        // Override if low range is less than UCOL_COMMON_TOP2.
502 		        low = UCOL_COMMON_TOP2<<24;
503                     }
504                     high = 0xFFFFFFFF;
505                 } else {
506                     // Override if low range is less than UCOL_COMMON_BOT3.
507 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
508                         low = UCOL_COMMON_BOT3<<24;
509 		    }
510                     high = 0x40000000;
511                 }
512                 break;
513             }
514             if(s<0) {
515                 *status = U_INTERNAL_PROGRAM_ERROR;
516                 return 0;
517             }
518         }
519     }
520 
521     if(low < 0x02000000) {
522         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
523         // See http://site.icu-project.org/design/collation/bytes
524         low = 0x02000000;
525     }
526 
527     if(strength == UCOL_SECONDARY) { /* similar as simple */
528         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
529             low = UCOL_COMMON_TOP2<<24;
530         }
531         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
532             high = UCOL_COMMON_TOP2<<24;
533         }
534         if(low < (UCOL_COMMON_BOT2<<24)) {
535             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
536             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
537             //g->current = UCOL_COMMON_BOT2<<24;
538             return g->current;
539         }
540     }
541 
542     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
543     if(g->noOfRanges == 0) {
544         *status = U_INTERNAL_PROGRAM_ERROR;
545     }
546     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
547     return g->current;
548 }
549 
550 static
u_toLargeKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)551 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
552     uint32_t i = 0;
553     UChar c;
554 
555     if(U_FAILURE(*status)) {
556         return 0;
557     }
558 
559     if(sourceLen > resLen) {
560         *status = U_MEMORY_ALLOCATION_ERROR;
561         return 0;
562     }
563 
564     for(i = 0; i < sourceLen; i++) {
565         c = source[i];
566         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
567             switch(c - 0x3000) {
568             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
569             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
570                 c++;
571                 break;
572             case 0xF5:
573                 c = 0x30AB;
574                 break;
575             case 0xF6:
576                 c = 0x30B1;
577                 break;
578             }
579         }
580         resBuf[i] = c;
581     }
582     return sourceLen;
583 }
584 
585 static
u_toSmallKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)586 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
587     uint32_t i = 0;
588     UChar c;
589 
590     if(U_FAILURE(*status)) {
591         return 0;
592     }
593 
594     if(sourceLen > resLen) {
595         *status = U_MEMORY_ALLOCATION_ERROR;
596         return 0;
597     }
598 
599     for(i = 0; i < sourceLen; i++) {
600         c = source[i];
601         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
602             switch(c - 0x3000) {
603             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
604             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
605                 c--;
606                 break;
607             case 0xAB:
608                 c = 0x30F5;
609                 break;
610             case 0xB1:
611                 c = 0x30F6;
612                 break;
613             }
614         }
615         resBuf[i] = c;
616     }
617     return sourceLen;
618 }
619 
620 U_NAMESPACE_BEGIN
621 
622 static
ucol_uprv_getCaseBits(const UCollator * UCA,const UChar * src,uint32_t len,UErrorCode * status)623 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
624     uint32_t i = 0;
625     UChar n[128];
626     uint32_t nLen = 0;
627     uint32_t uCount = 0, lCount = 0;
628 
629     collIterate s;
630     uint32_t order = 0;
631 
632     if(U_FAILURE(*status)) {
633         return UCOL_LOWER_CASE;
634     }
635 
636     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
637     if(U_SUCCESS(*status)) {
638         for(i = 0; i < nLen; i++) {
639             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
640             order = ucol_getNextCE(UCA, &s, status);
641             if(isContinuation(order)) {
642                 *status = U_INTERNAL_PROGRAM_ERROR;
643                 return UCOL_LOWER_CASE;
644             }
645             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
646                 uCount++;
647             } else {
648                 if(u_islower(n[i])) {
649                     lCount++;
650                 } else if(U_SUCCESS(*status)) {
651                     UChar sk[1], lk[1];
652                     u_toSmallKana(&n[i], 1, sk, 1, status);
653                     u_toLargeKana(&n[i], 1, lk, 1, status);
654                     if(sk[0] == n[i] && lk[0] != n[i]) {
655                         lCount++;
656                     }
657                 }
658             }
659         }
660     }
661 
662     if(uCount != 0 && lCount != 0) {
663         return UCOL_MIXED_CASE;
664     } else if(uCount != 0) {
665         return UCOL_UPPER_CASE;
666     } else {
667         return UCOL_LOWER_CASE;
668     }
669 }
670 
671 
ucol_doCE(UColTokenParser * src,uint32_t * CEparts,UColToken * tok,UErrorCode * status)672 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
673     /* this one makes the table and stuff */
674     uint32_t noOfBytes[3];
675     uint32_t i;
676 
677     for(i = 0; i<3; i++) {
678         ucol_countBytes(CEparts[i], noOfBytes[i]);
679     }
680 
681     /* Here we have to pack CEs from parts */
682 
683     uint32_t CEi = 0;
684     uint32_t value = 0;
685 
686     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
687         if(CEi > 0) {
688             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
689         } else {
690             value = 0;
691         }
692 
693         if(2*CEi<noOfBytes[0]) {
694             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
695         }
696         if(CEi<noOfBytes[1]) {
697             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
698         }
699         if(CEi<noOfBytes[2]) {
700             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
701         }
702         tok->CEs[CEi] = value;
703         CEi++;
704     }
705     if(CEi == 0) { /* totally ignorable */
706         tok->noOfCEs = 1;
707         tok->CEs[0] = 0;
708     } else { /* there is at least something */
709         tok->noOfCEs = CEi;
710     }
711 
712 
713     // we want to set case bits here and now, not later.
714     // Case bits handling
715     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
716         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
717         int32_t cSize = (tok->source & 0xFF000000) >> 24;
718         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
719 
720         if(cSize > 1) {
721             // Do it manually
722             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
723         } else {
724             // Copy it from the UCA
725             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
726             tok->CEs[0] |= (caseCE & 0xC0);
727         }
728     }
729 
730 #if UCOL_DEBUG==2
731     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
732     for(i = 0; i<tok->noOfCEs; i++) {
733         fprintf(stderr, "%08X ", tok->CEs[i]);
734     }
735     fprintf(stderr, "\n");
736 #endif
737 }
738 
ucol_initBuffers(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)739 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
740     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
741     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
742 
743     UColToken *tok = lh->last;
744     uint32_t t[UCOL_STRENGTH_LIMIT];
745 
746     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
747 
748     /* must initialize ranges to avoid memory check warnings */
749     for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
750         uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
751     }
752 
753     tok->toInsert = 1;
754     t[tok->strength] = 1;
755 
756     while(tok->previous != NULL) {
757         if(tok->previous->strength < tok->strength) { /* going up */
758             t[tok->strength] = 0;
759             t[tok->previous->strength]++;
760         } else if(tok->previous->strength > tok->strength) { /* going down */
761             t[tok->previous->strength] = 1;
762         } else {
763             t[tok->strength]++;
764         }
765         tok=tok->previous;
766         tok->toInsert = t[tok->strength];
767     }
768 
769     tok->toInsert = t[tok->strength];
770     ucol_inv_getGapPositions(src, lh, status);
771 
772 #if UCOL_DEBUG
773     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
774     int32_t j = 2;
775     for(j = 2; j >= 0; j--) {
776         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
777         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
778     }
779     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
780 
781     do {
782         fprintf(stderr,"%i", tok->strength);
783         tok = tok->next;
784     } while(tok != NULL);
785     fprintf(stderr, "\n");
786 
787     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
788 
789     do {
790         fprintf(stderr,"%i", tok->toInsert);
791         tok = tok->next;
792     } while(tok != NULL);
793 #endif
794 
795     tok = lh->first;
796     uint32_t fStrength = UCOL_IDENTICAL;
797     uint32_t initStrength = UCOL_IDENTICAL;
798 
799 
800     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
801     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
802     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
803 
804     while (tok != NULL && U_SUCCESS(*status)) {
805         fStrength = tok->strength;
806         if(fStrength < initStrength) {
807             initStrength = fStrength;
808             if(lh->pos[fStrength] == -1) {
809                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
810                     fStrength--;
811                 }
812                 if(lh->pos[fStrength] == -1) {
813                     *status = U_INTERNAL_PROGRAM_ERROR;
814                     return;
815                 }
816             }
817             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
818                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
819                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
820                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
821                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
822             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
823                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
824                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
825                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
826                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
827             } else { /* primaries */
828                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
829                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
830                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
831                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
832             }
833         } else {
834             if(tok->strength == UCOL_TERTIARY) {
835                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
836             } else if(tok->strength == UCOL_SECONDARY) {
837                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
838                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
839             } else if(tok->strength == UCOL_PRIMARY) {
840                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
841                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
842                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
843             }
844         }
845         ucol_doCE(src, CEparts, tok, status);
846         tok = tok->next;
847     }
848 }
849 
ucol_createElements(UColTokenParser * src,tempUCATable * t,UColTokListHeader * lh,UErrorCode * status)850 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
851     UCAElements el;
852     UColToken *tok = lh->first;
853     UColToken *expt = NULL;
854     uint32_t i = 0, j = 0;
855     const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
856 
857     while(tok != NULL && U_SUCCESS(*status)) {
858         /* first, check if there are any expansions */
859         /* if there are expansions, we need to do a little bit more processing */
860         /* since parts of expansion can be tailored, while others are not */
861         if(tok->expansion != 0) {
862             uint32_t len = tok->expansion >> 24;
863             uint32_t currentSequenceLen = len;
864             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
865             //uint32_t exp = currentSequenceLen | expOffset;
866             UColToken exp;
867             exp.source = currentSequenceLen | expOffset;
868             exp.rulesToParseHdl = &(src->source);
869 
870             while(len > 0) {
871                 currentSequenceLen = len;
872                 while(currentSequenceLen > 0) {
873                     exp.source = (currentSequenceLen << 24) | expOffset;
874                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
875                         uint32_t noOfCEsToCopy = expt->noOfCEs;
876                         for(j = 0; j<noOfCEsToCopy; j++) {
877                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
878                         }
879                         tok->noOfExpCEs += noOfCEsToCopy;
880                         // Smart people never try to add codepoints and CEs.
881                         // For some odd reason, it won't work.
882                         expOffset += currentSequenceLen; //noOfCEsToCopy;
883                         len -= currentSequenceLen; //noOfCEsToCopy;
884                         break;
885                     } else {
886                         currentSequenceLen--;
887                     }
888                 }
889                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
890                     /* will have to get one from UCA */
891                     /* first, get the UChars from the rules */
892                     /* then pick CEs out until there is no more and stuff them into expansion */
893                     collIterate s;
894                     uint32_t order = 0;
895                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
896 
897                     for(;;) {
898                         order = ucol_getNextCE(src->UCA, &s, status);
899                         if(order == UCOL_NO_MORE_CES) {
900                             break;
901                         }
902                         tok->expCEs[tok->noOfExpCEs++] = order;
903                     }
904                     expOffset++;
905                     len--;
906                 }
907             }
908         } else {
909             tok->noOfExpCEs = 0;
910         }
911 
912         /* set the ucaelement with obtained values */
913         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
914         /* copy CEs */
915         for(i = 0; i<tok->noOfCEs; i++) {
916             el.CEs[i] = tok->CEs[i];
917         }
918         for(i = 0; i<tok->noOfExpCEs; i++) {
919             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
920         }
921 
922         /* copy UChars */
923         // We kept prefix and source kind of together, as it is a kind of a contraction.
924         // However, now we have to slice the prefix off the main thing -
925         el.prefix = el.prefixChars;
926         el.cPoints = el.uchars;
927         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
928             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
929             // decomposed elements to the unsaf table.
930             el.prefixSize = tok->prefix>>24;
931             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
932 
933             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
934             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
935         } else {
936             el.prefixSize = 0;
937             *el.prefix = 0;
938 
939             el.cSize = (tok->source >> 24);
940             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
941         }
942         if(src->UCA != NULL) {
943             for(i = 0; i<el.cSize; i++) {
944                 if(UCOL_ISJAMO(el.cPoints[i])) {
945                     t->image->jamoSpecial = TRUE;
946                 }
947             }
948             if (!src->buildCCTabFlag && el.cSize > 0) {
949                 // Check the trailing canonical combining class (tccc) of the last character.
950                 const UChar *s = el.cPoints + el.cSize;
951                 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
952                 if ((fcd & 0xff) != 0) {
953                     src->buildCCTabFlag = TRUE;
954                 }
955             }
956         }
957 
958         /* and then, add it */
959 #if UCOL_DEBUG==2
960         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
961 #endif
962         uprv_uca_addAnElement(t, &el, status);
963 
964 #if UCOL_DEBUG_DUPLICATES
965         if(*status != U_ZERO_ERROR) {
966             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
967             *status = U_ZERO_ERROR;
968         }
969 #endif
970 
971         tok = tok->next;
972     }
973 }
974 
975 U_CDECL_BEGIN
976 static UBool U_CALLCONV
_processUCACompleteIgnorables(const void * context,UChar32 start,UChar32 limit,uint32_t value)977 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
978     UErrorCode status = U_ZERO_ERROR;
979     tempUCATable *t = (tempUCATable *)context;
980     if(value == 0) {
981         while(start < limit) {
982             uint32_t CE = utrie_get32(t->mapping, start, NULL);
983             if(CE == UCOL_NOT_FOUND) {
984                 UCAElements el;
985                 el.isThai = FALSE;
986                 el.prefixSize = 0;
987                 el.prefixChars[0] = 0;
988                 el.prefix = el.prefixChars;
989                 el.cPoints = el.uchars;
990 
991                 el.cSize = 0;
992                 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
993 
994                 el.noOfCEs = 1;
995                 el.CEs[0] = 0;
996                 uprv_uca_addAnElement(t, &el, &status);
997 
998             }
999             start++;
1000         }
1001     }
1002     if(U_FAILURE(status)) {
1003         return FALSE;
1004     } else {
1005         return TRUE;
1006     }
1007 }
1008 U_CDECL_END
1009 
1010 static void
ucol_uprv_bld_copyRangeFromUCA(UColTokenParser * src,tempUCATable * t,UChar32 start,UChar32 end,UErrorCode * status)1011 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1012                                UChar32 start, UChar32 end,
1013                                UErrorCode *status)
1014 {
1015     //UChar decomp[256];
1016     uint32_t CE = UCOL_NOT_FOUND;
1017     UChar32 u = 0;
1018     UCAElements el;
1019     el.isThai = FALSE;
1020     el.prefixSize = 0;
1021     el.prefixChars[0] = 0;
1022     collIterate colIt;
1023 
1024     if(U_SUCCESS(*status)) {
1025         for(u = start; u<=end; u++) {
1026             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1027                 /* this test is for contractions that are missing the starting element. */
1028                 || ((isCntTableElement(CE)) &&
1029                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1030                 )
1031             {
1032                 el.cSize = 0;
1033                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1034                 //decomp[0] = (UChar)u;
1035                 //el.uchars[0] = (UChar)u;
1036                 el.cPoints = el.uchars;
1037                 //el.cSize = 1;
1038                 el.noOfCEs = 0;
1039                 el.prefix = el.prefixChars;
1040                 el.prefixSize = 0;
1041                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1042                 // We actually want to check whether this element is a special
1043                 // If it is an implicit element (hangul, CJK - we want to copy the
1044                 // special, not the resolved CEs) - for hangul, copying resolved
1045                 // would just make things the same (there is an expansion and it
1046                 // takes approximately the same amount of time to resolve as
1047                 // falling back to the UCA).
1048                 /*
1049                 UTRIE_GET32(src->UCA->mapping, u, CE);
1050                 tag = getCETag(CE);
1051                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1052                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1053                 || tag == LEAD_SURROGATE_TAG) {
1054                 el.CEs[el.noOfCEs++] = CE;
1055                 } else {
1056                 */
1057                 // It turns out that it does not make sense to keep implicits
1058                 // unresolved. The cost of resolving them is big enough so that
1059                 // it doesn't make any difference whether we have to go to the UCA
1060                 // or not.
1061                 {
1062                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1063                     while(CE != UCOL_NO_MORE_CES) {
1064                         CE = ucol_getNextCE(src->UCA, &colIt, status);
1065                         if(CE != UCOL_NO_MORE_CES) {
1066                             el.CEs[el.noOfCEs++] = CE;
1067                         }
1068                     }
1069                 }
1070                 uprv_uca_addAnElement(t, &el, status);
1071             }
1072         }
1073     }
1074 }
1075 
1076 U_NAMESPACE_END
1077 
1078 U_CFUNC UCATableHeader *
ucol_assembleTailoringTable(UColTokenParser * src,UErrorCode * status)1079 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1080     U_NAMESPACE_USE
1081 
1082     uint32_t i = 0;
1083     if(U_FAILURE(*status)) {
1084         return NULL;
1085     }
1086     /*
1087     2.  Eliminate the negative lists by doing the following for each non-null negative list:
1088     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1089     create new ListHeader X
1090     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1091     first item you add, based on the stronger strength levels of the two lists.
1092     */
1093     /*
1094     3.  For each ListHeader with a non-null positive list:
1095     */
1096     /*
1097     o   Find all character strings with CEs between the baseCE and the
1098     next/previous CE, at the strength of the first token. Add these to the
1099     tailoring.
1100     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1101     tailoring has & x < z...
1102     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1103     */
1104     /* It is possible that this part should be done even while constructing list */
1105     /* The problem is that it is unknown what is going to be the strongest weight */
1106     /* So we might as well do it here */
1107 
1108     /*
1109     o   Allocate CEs for each token in the list, based on the total number N of the
1110     largest level difference, and the gap G between baseCE and nextCE at that
1111     level. The relation * between the last item and nextCE is the same as the
1112     strongest strength.
1113     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1114     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1115     Then fit b and c into the secondary gap between a and d, then fit q
1116     into the tertiary gap between b and c.
1117 
1118     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1119     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1120     Then fit q into the tertiary gap between b and c.
1121     o   When incrementing primary values, we will not cross high byte
1122     boundaries except where there is only a single-byte primary. That is to
1123     ensure that the script reordering will continue to work.
1124     */
1125     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1126     /* test for NULL */
1127     if (image == NULL) {
1128         *status = U_MEMORY_ALLOCATION_ERROR;
1129         return NULL;
1130     }
1131     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1132 
1133     for(i = 0; i<src->resultLen; i++) {
1134         /* now we need to generate the CEs */
1135         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1136         /* According to strength                                                          */
1137         if(U_SUCCESS(*status)) {
1138             if(src->lh[i].first) { // if there are any elements
1139                 // due to the way parser works, subsequent tailorings
1140                 // may remove all the elements from a sequence, therefore
1141                 // leaving an empty tailoring sequence.
1142                 ucol_initBuffers(src, &src->lh[i], status);
1143             }
1144         }
1145         if(U_FAILURE(*status)) {
1146             uprv_free(image);
1147             return NULL;
1148         }
1149     }
1150 
1151     if(src->varTop != NULL) { /* stuff the variable top value */
1152         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1153         /* remove it from the list */
1154         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1155             src->varTop->listHeader->first = src->varTop->next;
1156         }
1157         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1158             src->varTop->listHeader->last = src->varTop->previous;
1159         }
1160         if(src->varTop->next != NULL) {
1161             src->varTop->next->previous = src->varTop->previous;
1162         }
1163         if(src->varTop->previous != NULL) {
1164             src->varTop->previous->next = src->varTop->next;
1165         }
1166     }
1167 
1168 
1169     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1170     if(U_FAILURE(*status)) {
1171         uprv_free(image);
1172         return NULL;
1173     }
1174 
1175 
1176     /* After this, we have assigned CE values to all regular CEs      */
1177     /* now we will go through list once more and resolve expansions,  */
1178     /* make UCAElements structs and add them to table                 */
1179     for(i = 0; i<src->resultLen; i++) {
1180         /* now we need to generate the CEs */
1181         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1182         /* According to strength                                                          */
1183         if(U_SUCCESS(*status)) {
1184             ucol_createElements(src, t, &src->lh[i], status);
1185         }
1186     }
1187 
1188     UCAElements el;
1189     el.isThai = FALSE;
1190     el.prefixSize = 0;
1191     el.prefixChars[0] = 0;
1192 
1193     /* add latin-1 stuff */
1194     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1195 
1196     /* add stuff for copying */
1197     if(src->copySet != NULL) {
1198         int32_t i = 0;
1199         UnicodeSet *set = (UnicodeSet *)src->copySet;
1200         for(i = 0; i < set->getRangeCount(); i++) {
1201             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1202         }
1203     }
1204 
1205     if(U_SUCCESS(*status)) {
1206         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1207 
1208         uint32_t tailoredCE = UCOL_NOT_FOUND;
1209         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1210         int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1211         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1212         // Check for null pointer
1213         if (ucaEl == NULL) {
1214             *status = U_MEMORY_ALLOCATION_ERROR;
1215             return NULL;
1216         }
1217         while(*conts != 0) {
1218             // A continuation is NUL-terminated and NUL-padded
1219             // except if it has the maximum length.
1220             int32_t contractionLength = maxUCAContractionLength;
1221             while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1222                 --contractionLength;
1223             }
1224             UChar32 first;
1225             int32_t firstLength = 0;
1226             U16_NEXT(conts, firstLength, contractionLength, first);
1227             tailoredCE = utrie_get32(t->mapping, first, NULL);
1228             if(tailoredCE != UCOL_NOT_FOUND) {
1229                 UBool needToAdd = TRUE;
1230                 if(isCntTableElement(tailoredCE)) {
1231                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1232                         needToAdd = FALSE;
1233                     }
1234                 }
1235                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1236                     UCAElements elm;
1237                     elm.cPoints = el.uchars;
1238                     elm.noOfCEs = 0;
1239                     elm.uchars[0] = *conts;
1240                     elm.uchars[1] = 0;
1241                     elm.cSize = 1;
1242                     elm.prefixChars[0] = *(conts+2);
1243                     elm.isThai = FALSE;
1244                     elm.prefix = elm.prefixChars;
1245                     elm.prefixSize = 1;
1246                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1247                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1248                         needToAdd = TRUE;
1249                     }
1250                 }
1251                 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1252                     needToAdd = FALSE;
1253                 }
1254 
1255                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1256                     if (*(conts+1) != 0) {  // contractions
1257                         el.prefix = el.prefixChars;
1258                         el.prefixSize = 0;
1259                         el.cPoints = el.uchars;
1260                         el.noOfCEs = 0;
1261                         u_memcpy(el.uchars, conts, contractionLength);
1262                         el.cSize = contractionLength;
1263                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
1264                     }
1265                     else { // pre-context character
1266                         UChar str[4] = { 0 };
1267                         int32_t len=0;
1268                         int32_t preKeyLen=0;
1269 
1270                         el.cPoints = el.uchars;
1271                         el.noOfCEs = 0;
1272                         el.uchars[0] = *conts;
1273                         el.uchars[1] = 0;
1274                         el.cSize = 1;
1275                         el.prefixChars[0] = *(conts+2);
1276                         el.prefix = el.prefixChars;
1277                         el.prefixSize = 1;
1278                         if (el.prefixChars[0]!=0) {
1279                             // get CE of prefix character first
1280                             str[0]=el.prefixChars[0];
1281                             str[1]=0;
1282                             ucol_setText(ucaEl, str, 1, status);
1283                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1284                                     != UCOL_NULLORDER) {
1285                                 preKeyLen++;  // count number of keys for prefix character
1286                             }
1287                             str[len++] = el.prefixChars[0];
1288                         }
1289 
1290                         str[len++] = el.uchars[0];
1291                         str[len]=0;
1292                         ucol_setText(ucaEl, str, len, status);
1293                         // Skip the keys for prefix character, then copy the rest to el.
1294                         while ((preKeyLen-->0) &&
1295                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1296                             continue;
1297                         }
1298 
1299                     }
1300                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1301                         el.noOfCEs++;
1302                     }
1303                     uprv_uca_addAnElement(t, &el, status);
1304                 }
1305 
1306             } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1307                 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1308             }
1309             conts+=maxUCAContractionLength;
1310         }
1311         ucol_closeElements(ucaEl);
1312     }
1313 
1314     // Add completely ignorable elements
1315     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1316 
1317     // add tailoring characters related canonical closures
1318     uprv_uca_canonicalClosure(t, src, NULL, status);
1319 
1320     /* still need to produce compatibility closure */
1321 
1322     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1323 
1324     uprv_uca_closeTempTable(t);
1325     uprv_free(image);
1326 
1327     return myData;
1328 }
1329 
1330 U_CDECL_BEGIN
1331 static UBool U_CALLCONV
ucol_bld_cleanup(void)1332 ucol_bld_cleanup(void)
1333 {
1334     udata_close(invUCA_DATA_MEM);
1335     invUCA_DATA_MEM = NULL;
1336     _staticInvUCA = NULL;
1337     return TRUE;
1338 }
1339 U_CDECL_END
1340 
1341 U_CAPI const InverseUCATableHeader * U_EXPORT2
ucol_initInverseUCA(UErrorCode * status)1342 ucol_initInverseUCA(UErrorCode *status)
1343 {
1344     if(U_FAILURE(*status)) return NULL;
1345 
1346     UBool needsInit;
1347     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1348 
1349     if(needsInit) {
1350         InverseUCATableHeader *newInvUCA = NULL;
1351         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1352 
1353         if(U_FAILURE(*status)) {
1354             if (result) {
1355                 udata_close(result);
1356             }
1357             // This is not needed, as we are talking about
1358             // memory we got from UData
1359             //uprv_free(newInvUCA);
1360         }
1361 
1362         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1363             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1364             UCollator *UCA = ucol_initUCA(status);
1365             // UCA versions of UCA and inverse UCA should match
1366             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1367                 *status = U_INVALID_FORMAT_ERROR;
1368                 udata_close(result);
1369                 return NULL;
1370             }
1371 
1372             umtx_lock(NULL);
1373             if(_staticInvUCA == NULL) {
1374                 invUCA_DATA_MEM = result;
1375                 _staticInvUCA = newInvUCA;
1376                 result = NULL;
1377                 newInvUCA = NULL;
1378             }
1379             umtx_unlock(NULL);
1380 
1381             if(newInvUCA != NULL) {
1382                 udata_close(result);
1383                 // This is not needed, as we are talking about
1384                 // memory we got from UData
1385                 //uprv_free(newInvUCA);
1386             }
1387             else {
1388                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1389             }
1390         }
1391     }
1392     return _staticInvUCA;
1393 }
1394 
/* This is the data that is used for non-script reordering codes. The entries _must_ be kept
 * in the order in which they are applied as defaults, and in sync with the UColReorderCode enum.
 */
1398 static const char * const ReorderingTokenNames[] = {
1399     "SPACE",
1400     "PUNCT",
1401     "SYMBOL",
1402     "CURRENCY",
1403     "DIGIT"
1404 };
1405 
toUpper(const char * src,char * dst,uint32_t length)1406 static void toUpper(const char* src, char* dst, uint32_t length) {
1407    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1408        *dst = uprv_toupper(*src);
1409    }
1410    *dst = '\0';
1411 }
1412 
1413 U_INTERNAL int32_t U_EXPORT2
ucol_findReorderingEntry(const char * name)1414 ucol_findReorderingEntry(const char* name) {
1415     char buffer[32];
1416     toUpper(name, buffer, 32);
1417     for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
1418         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1419             return entry + UCOL_REORDER_CODE_FIRST;
1420         }
1421     }
1422     return USCRIPT_INVALID_CODE;
1423 }
1424 
1425 #endif /* #if !UCONFIG_NO_COLLATION */
1426