• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2001-2013, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ucol_bld.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created 02/22/2001
14 *   created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19 
20 #include "unicode/utypes.h"
21 
22 #if !UCONFIG_NO_COLLATION
23 
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utf16.h"
31 #include "normalizer2impl.h"
32 #include "uassert.h"
33 #include "ucol_bld.h"
34 #include "ucol_elm.h"
35 #include "ucol_cnt.h"
36 #include "ucln_in.h"
37 #include "umutex.h"
38 #include "cmemory.h"
39 #include "cstring.h"
40 
41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
42 
43 static const InverseUCATableHeader* _staticInvUCA = NULL;
44 static UDataMemory* invUCA_DATA_MEM = NULL;
45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;
46 
47 U_CDECL_BEGIN
48 static UBool U_CALLCONV
isAcceptableInvUCA(void *,const char *,const char *,const UDataInfo * pInfo)49 isAcceptableInvUCA(void * /*context*/,
50                    const char * /*type*/, const char * /*name*/,
51                    const UDataInfo *pInfo)
52 {
53     /* context, type & name are intentionally not used */
54     if( pInfo->size>=20 &&
55         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
56         pInfo->charsetFamily==U_CHARSET_FAMILY &&
57         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
58         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
59         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
60         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
61         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
62         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
63         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
64         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
65         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
66         )
67     {
68         // TODO: Check that the invuca data version (pInfo->dataVersion)
69         // matches the ucadata version.
70         return TRUE;
71     } else {
72         return FALSE;
73     }
74 }
75 U_CDECL_END
76 
77 /*
78 * Takes two CEs (lead and continuation) and
79 * compares them as CEs should be compared:
80 * primary vs. primary, secondary vs. secondary
81 * tertiary vs. tertiary
82 */
compareCEs(uint32_t source0,uint32_t source1,uint32_t target0,uint32_t target1)83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
84     uint32_t s1 = source0, s2, t1 = target0, t2;
85     if(isContinuation(source1)) {
86         s2 = source1;
87     } else {
88         s2 = 0;
89     }
90     if(isContinuation(target1)) {
91         t2 = target1;
92     } else {
93         t2 = 0;
94     }
95 
96     uint32_t s = 0, t = 0;
97     if(s1 == t1 && s2 == t2) {
98         return 0;
99     }
100     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
101     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
102     if(s < t) {
103         return -1;
104     } else if(s > t) {
105         return 1;
106     } else {
107         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
108         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
109         if(s < t) {
110             return -1;
111         } else if(s > t) {
112             return 1;
113         } else {
114             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
115             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
116             if(s < t) {
117                 return -1;
118             } else {
119                 return 1;
120             }
121         }
122     }
123 }
124 
125 static
ucol_inv_findCE(const UColTokenParser * src,uint32_t CE,uint32_t SecondCE)126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
127     uint32_t bottom = 0, top = src->invUCA->tableSize;
128     uint32_t i = 0;
129     uint32_t first = 0, second = 0;
130     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
131     int32_t res = 0;
132 
133     while(bottom < top-1) {
134         i = (top+bottom)/2;
135         first = *(CETable+3*i);
136         second = *(CETable+3*i+1);
137         res = compareCEs(first, second, CE, SecondCE);
138         if(res > 0) {
139             top = i;
140         } else if(res < 0) {
141             bottom = i;
142         } else {
143             break;
144         }
145     }
146 
147     /* weiv:                                                  */
148     /* in searching for elements, I have removed the failure  */
149     /* The reason for this is that the builder does not rely  */
150     /* on search mechanism telling it that it didn't find an  */
151     /* element. However, indirect positioning relies on being */
152     /* able to find the elements around any CE, even if it is */
153     /* not defined in the UCA. */
154     return i;
155     /*
156     if((first == CE && second == SecondCE)) {
157     return i;
158     } else {
159     return -1;
160     }
161     */
162 }
163 
164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
165     0xFFFF0000,
166     0xFFFFFF00,
167     0xFFFFFFFF
168 };
169 
ucol_inv_getNextCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * nextCE,uint32_t * nextContCE,uint32_t strength)170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
171                                             uint32_t CE, uint32_t contCE,
172                                             uint32_t *nextCE, uint32_t *nextContCE,
173                                             uint32_t strength)
174 {
175     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
176     int32_t iCE;
177 
178     iCE = ucol_inv_findCE(src, CE, contCE);
179 
180     if(iCE<0) {
181         *nextCE = UCOL_NOT_FOUND;
182         return -1;
183     }
184 
185     CE &= strengthMask[strength];
186     contCE &= strengthMask[strength];
187 
188     *nextCE = CE;
189     *nextContCE = contCE;
190 
191     while((*nextCE  & strengthMask[strength]) == CE
192         && (*nextContCE  & strengthMask[strength]) == contCE)
193     {
194         *nextCE = (*(CETable+3*(++iCE)));
195         *nextContCE = (*(CETable+3*(iCE)+1));
196     }
197 
198     return iCE;
199 }
200 
ucol_inv_getPrevCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * prevCE,uint32_t * prevContCE,uint32_t strength)201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
202                                             uint32_t CE, uint32_t contCE,
203                                             uint32_t *prevCE, uint32_t *prevContCE,
204                                             uint32_t strength)
205 {
206     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
207     int32_t iCE;
208 
209     iCE = ucol_inv_findCE(src, CE, contCE);
210 
211     if(iCE<0) {
212         *prevCE = UCOL_NOT_FOUND;
213         return -1;
214     }
215 
216     CE &= strengthMask[strength];
217     contCE &= strengthMask[strength];
218 
219     *prevCE = CE;
220     *prevContCE = contCE;
221 
222     while((*prevCE  & strengthMask[strength]) == CE
223         && (*prevContCE  & strengthMask[strength])== contCE
224         && iCE > 0) /* this condition should prevent falling off the edge of the world */
225     {
226         /* here, we end up in a singularity - zero */
227         *prevCE = (*(CETable+3*(--iCE)));
228         *prevContCE = (*(CETable+3*(iCE)+1));
229     }
230 
231     return iCE;
232 }
233 
ucol_getCEStrengthDifference(uint32_t CE,uint32_t contCE,uint32_t prevCE,uint32_t prevContCE)234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
235                                                        uint32_t prevCE, uint32_t prevContCE)
236 {
237     if(prevCE == CE && prevContCE == contCE) {
238         return UCOL_IDENTICAL;
239     }
240     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
241         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
242     {
243         return UCOL_PRIMARY;
244     }
245     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
246         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
247     {
248         return UCOL_SECONDARY;
249     }
250     return UCOL_TERTIARY;
251 }
252 
253 
254 /*static
255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
256 
257     uint32_t CE = lh->baseCE;
258     uint32_t SecondCE = lh->baseContCE;
259 
260     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
261     uint32_t previousCE, previousContCE;
262     int32_t iCE;
263 
264     iCE = ucol_inv_findCE(src, CE, SecondCE);
265 
266     if(iCE<0) {
267         return -1;
268     }
269 
270     CE &= strengthMask[strength];
271     SecondCE &= strengthMask[strength];
272 
273     previousCE = CE;
274     previousContCE = SecondCE;
275 
276     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
277         previousCE = (*(CETable+3*(--iCE)));
278         previousContCE = (*(CETable+3*(iCE)+1));
279     }
280     lh->previousCE = previousCE;
281     lh->previousContCE = previousContCE;
282 
283     return iCE;
284 }*/
285 
286 static
ucol_inv_getNext(UColTokenParser * src,UColTokListHeader * lh,uint32_t strength)287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
288     uint32_t CE = lh->baseCE;
289     uint32_t SecondCE = lh->baseContCE;
290 
291     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
292     uint32_t nextCE, nextContCE;
293     int32_t iCE;
294 
295     iCE = ucol_inv_findCE(src, CE, SecondCE);
296 
297     if(iCE<0) {
298         return -1;
299     }
300 
301     CE &= strengthMask[strength];
302     SecondCE &= strengthMask[strength];
303 
304     nextCE = CE;
305     nextContCE = SecondCE;
306 
307     while((nextCE  & strengthMask[strength]) == CE
308         && (nextContCE  & strengthMask[strength]) == SecondCE)
309     {
310         nextCE = (*(CETable+3*(++iCE)));
311         nextContCE = (*(CETable+3*(iCE)+1));
312     }
313 
314     lh->nextCE = nextCE;
315     lh->nextContCE = nextContCE;
316 
317     return iCE;
318 }
319 
ucol_inv_getGapPositions(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
321     /* reset all the gaps */
322     int32_t i = 0;
323     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
324     uint32_t st = 0;
325     uint32_t t1, t2;
326     int32_t pos;
327 
328     UColToken *tok = lh->first;
329     uint32_t tokStrength = tok->strength;
330 
331     for(i = 0; i<3; i++) {
332         lh->gapsHi[3*i] = 0;
333         lh->gapsHi[3*i+1] = 0;
334         lh->gapsHi[3*i+2] = 0;
335         lh->gapsLo[3*i] = 0;
336         lh->gapsLo[3*i+1] = 0;
337         lh->gapsLo[3*i+2] = 0;
338         lh->numStr[i] = 0;
339         lh->fStrToken[i] = NULL;
340         lh->lStrToken[i] = NULL;
341         lh->pos[i] = -1;
342     }
343 
344     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
345 
346     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
347         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
348         lh->pos[0] = 0;
349         t1 = lh->baseCE;
350         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
351         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
352         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
353         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
354         uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
355         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
356 
357         t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
358         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
359 
360         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
361         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
362         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
363     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
364         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
365         lh->pos[0] = 0;
366         t1 = lh->baseCE;
367         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
368         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
369         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
370         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
371         t1 = lh->nextCE;
372         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
373         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
374         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
375         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
376     } else {
377         for(;;) {
378             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
379                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
380                     lh->fStrToken[tokStrength] = tok;
381                 } else { /* The CE must be implicit, since it's not in the table */
382                     /* Error */
383                     *status = U_INTERNAL_PROGRAM_ERROR;
384                 }
385             }
386 
387             while(tok != NULL && tok->strength >= tokStrength) {
388                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
389                     lh->lStrToken[tokStrength] = tok;
390                 }
391                 tok = tok->next;
392             }
393             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
394                 /* check if previous interval is the same and merge the intervals if it is so */
395                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
396                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
397                     lh->fStrToken[tokStrength+1] = NULL;
398                     lh->lStrToken[tokStrength+1] = NULL;
399                     lh->pos[tokStrength+1] = -1;
400                 }
401             }
402             if(tok != NULL) {
403                 tokStrength = tok->strength;
404             } else {
405                 break;
406             }
407         }
408         for(st = 0; st < 3; st++) {
409             if((pos = lh->pos[st]) >= 0) {
410                 t1 = *(CETable+3*(pos));
411                 t2 = *(CETable+3*(pos)+1);
412                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
413                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
414                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
415                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
416                 //pos--;
417                 //t1 = *(CETable+3*(pos));
418                 //t2 = *(CETable+3*(pos)+1);
419                 t1 = lh->baseCE;
420                 t2 = lh->baseContCE;
421                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
422                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
423                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
424             }
425         }
426     }
427 }
428 
429 
430 #define ucol_countBytes(value, noOfBytes)   \
431 {                               \
432     uint32_t mask = 0xFFFFFFFF;   \
433     (noOfBytes) = 0;              \
434     while(mask != 0) {            \
435     if(((value) & mask) != 0) { \
436     (noOfBytes)++;            \
437     }                           \
438     mask >>= 8;                 \
439     }                             \
440 }
441 
ucol_getNextGenerated(ucolCEGenerator * g,UErrorCode * status)442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
443     if(U_SUCCESS(*status)) {
444         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
445     }
446     return g->current;
447 }
448 
ucol_getSimpleCEGenerator(ucolCEGenerator * g,UColToken * tok,uint32_t strength,UErrorCode * status)449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
450     /* TODO: rename to enum names */
451     uint32_t high, low, count=1;
452     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
453 
454     if(strength == UCOL_SECONDARY) {
455         low = UCOL_COMMON_TOP2<<24;
456         high = 0xFFFFFFFF;
457         count = 0xFF - UCOL_COMMON_TOP2;
458     } else {
459         low = UCOL_BYTE_COMMON << 24; //0x05000000;
460         high = 0x40000000;
461         count = 0x40 - UCOL_BYTE_COMMON;
462     }
463 
464     if(tok->next != NULL && tok->next->strength == strength) {
465         count = tok->next->toInsert;
466     }
467 
468     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
469     g->current = UCOL_BYTE_COMMON<<24;
470 
471     if(g->noOfRanges == 0) {
472         *status = U_INTERNAL_PROGRAM_ERROR;
473     }
474     return g->current;
475 }
476 
ucol_getCEGenerator(ucolCEGenerator * g,uint32_t * lows,uint32_t * highs,UColToken * tok,uint32_t fStrength,UErrorCode * status)477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
478     uint32_t strength = tok->strength;
479     uint32_t low = lows[fStrength*3+strength];
480     uint32_t high = highs[fStrength*3+strength];
481     uint32_t maxByte = 0;
482     if(strength == UCOL_TERTIARY) {
483         maxByte = 0x3F;
484     } else if(strength == UCOL_PRIMARY) {
485         maxByte = 0xFE;
486     } else {
487         maxByte = 0xFF;
488     }
489 
490     uint32_t count = tok->toInsert;
491 
492     if(low >= high && strength > UCOL_PRIMARY) {
493         int32_t s = strength;
494         for(;;) {
495             s--;
496             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
497                 if(strength == UCOL_SECONDARY) {
498                     if (low < UCOL_COMMON_TOP2<<24 ) {
499                        // Override if low range is less than UCOL_COMMON_TOP2.
500 		        low = UCOL_COMMON_TOP2<<24;
501                     }
502                     high = 0xFFFFFFFF;
503                 } else {
504                     // Override if low range is less than UCOL_COMMON_BOT3.
505 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
506                         low = UCOL_COMMON_BOT3<<24;
507 		    }
508                     high = 0x40000000;
509                 }
510                 break;
511             }
512             if(s<0) {
513                 *status = U_INTERNAL_PROGRAM_ERROR;
514                 return 0;
515             }
516         }
517     }
518 
519     if(low < 0x02000000) {
520         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
521         // See http://site.icu-project.org/design/collation/bytes
522         low = 0x02000000;
523     }
524 
525     if(strength == UCOL_SECONDARY) { /* similar as simple */
526         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
527             low = UCOL_COMMON_TOP2<<24;
528         }
529         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
530             high = UCOL_COMMON_TOP2<<24;
531         }
532         if(low < (UCOL_COMMON_BOT2<<24)) {
533             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
534             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
535             //g->current = UCOL_COMMON_BOT2<<24;
536             return g->current;
537         }
538     }
539 
540     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
541     if(g->noOfRanges == 0) {
542         *status = U_INTERNAL_PROGRAM_ERROR;
543     }
544     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
545     return g->current;
546 }
547 
548 static
u_toLargeKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
550     uint32_t i = 0;
551     UChar c;
552 
553     if(U_FAILURE(*status)) {
554         return 0;
555     }
556 
557     if(sourceLen > resLen) {
558         *status = U_MEMORY_ALLOCATION_ERROR;
559         return 0;
560     }
561 
562     for(i = 0; i < sourceLen; i++) {
563         c = source[i];
564         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
565             switch(c - 0x3000) {
566             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
567             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
568                 c++;
569                 break;
570             case 0xF5:
571                 c = 0x30AB;
572                 break;
573             case 0xF6:
574                 c = 0x30B1;
575                 break;
576             }
577         }
578         resBuf[i] = c;
579     }
580     return sourceLen;
581 }
582 
583 static
u_toSmallKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
585     uint32_t i = 0;
586     UChar c;
587 
588     if(U_FAILURE(*status)) {
589         return 0;
590     }
591 
592     if(sourceLen > resLen) {
593         *status = U_MEMORY_ALLOCATION_ERROR;
594         return 0;
595     }
596 
597     for(i = 0; i < sourceLen; i++) {
598         c = source[i];
599         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
600             switch(c - 0x3000) {
601             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
602             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
603                 c--;
604                 break;
605             case 0xAB:
606                 c = 0x30F5;
607                 break;
608             case 0xB1:
609                 c = 0x30F6;
610                 break;
611             }
612         }
613         resBuf[i] = c;
614     }
615     return sourceLen;
616 }
617 
618 U_NAMESPACE_BEGIN
619 
620 static
ucol_uprv_getCaseBits(const UCollator * UCA,const UChar * src,uint32_t len,UErrorCode * status)621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
622     uint32_t i = 0;
623     UChar n[128];
624     uint32_t nLen = 0;
625     uint32_t uCount = 0, lCount = 0;
626 
627     collIterate s;
628     uint32_t order = 0;
629 
630     if(U_FAILURE(*status)) {
631         return UCOL_LOWER_CASE;
632     }
633 
634     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
635     if(U_SUCCESS(*status)) {
636         for(i = 0; i < nLen; i++) {
637             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
638             order = ucol_getNextCE(UCA, &s, status);
639             if(isContinuation(order)) {
640                 *status = U_INTERNAL_PROGRAM_ERROR;
641                 return UCOL_LOWER_CASE;
642             }
643             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
644                 uCount++;
645             } else {
646                 if(u_islower(n[i])) {
647                     lCount++;
648                 } else if(U_SUCCESS(*status)) {
649                     UChar sk[1], lk[1];
650                     u_toSmallKana(&n[i], 1, sk, 1, status);
651                     u_toLargeKana(&n[i], 1, lk, 1, status);
652                     if(sk[0] == n[i] && lk[0] != n[i]) {
653                         lCount++;
654                     }
655                 }
656             }
657         }
658     }
659 
660     if(uCount != 0 && lCount != 0) {
661         return UCOL_MIXED_CASE;
662     } else if(uCount != 0) {
663         return UCOL_UPPER_CASE;
664     } else {
665         return UCOL_LOWER_CASE;
666     }
667 }
668 
669 
ucol_doCE(UColTokenParser * src,uint32_t * CEparts,UColToken * tok,UErrorCode * status)670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
671     /* this one makes the table and stuff */
672     uint32_t noOfBytes[3];
673     uint32_t i;
674 
675     for(i = 0; i<3; i++) {
676         ucol_countBytes(CEparts[i], noOfBytes[i]);
677     }
678 
679     /* Here we have to pack CEs from parts */
680 
681     uint32_t CEi = 0;
682     uint32_t value = 0;
683 
684     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
685         if(CEi > 0) {
686             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
687         } else {
688             value = 0;
689         }
690 
691         if(2*CEi<noOfBytes[0]) {
692             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
693         }
694         if(CEi<noOfBytes[1]) {
695             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
696         }
697         if(CEi<noOfBytes[2]) {
698             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
699         }
700         tok->CEs[CEi] = value;
701         CEi++;
702     }
703     if(CEi == 0) { /* totally ignorable */
704         tok->noOfCEs = 1;
705         tok->CEs[0] = 0;
706     } else { /* there is at least something */
707         tok->noOfCEs = CEi;
708     }
709 
710 
711     // we want to set case bits here and now, not later.
712     // Case bits handling
713     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
714         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
715         int32_t cSize = (tok->source & 0xFF000000) >> 24;
716         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
717 
718         if(cSize > 1) {
719             // Do it manually
720             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
721         } else {
722             // Copy it from the UCA
723             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
724             tok->CEs[0] |= (caseCE & 0xC0);
725         }
726     }
727 
728 #if UCOL_DEBUG==2
729     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
730     for(i = 0; i<tok->noOfCEs; i++) {
731         fprintf(stderr, "%08X ", tok->CEs[i]);
732     }
733     fprintf(stderr, "\n");
734 #endif
735 }
736 
ucol_initBuffers(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
738     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
739     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
740 
741     UColToken *tok = lh->last;
742     uint32_t t[UCOL_STRENGTH_LIMIT];
743 
744     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
745 
746     /* must initialize ranges to avoid memory check warnings */
747     for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
748         uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
749     }
750 
751     tok->toInsert = 1;
752     t[tok->strength] = 1;
753 
754     while(tok->previous != NULL) {
755         if(tok->previous->strength < tok->strength) { /* going up */
756             t[tok->strength] = 0;
757             t[tok->previous->strength]++;
758         } else if(tok->previous->strength > tok->strength) { /* going down */
759             t[tok->previous->strength] = 1;
760         } else {
761             t[tok->strength]++;
762         }
763         tok=tok->previous;
764         tok->toInsert = t[tok->strength];
765     }
766 
767     tok->toInsert = t[tok->strength];
768     ucol_inv_getGapPositions(src, lh, status);
769 
770 #if UCOL_DEBUG
771     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
772     int32_t j = 2;
773     for(j = 2; j >= 0; j--) {
774         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
775         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
776     }
777     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
778 
779     do {
780         fprintf(stderr,"%i", tok->strength);
781         tok = tok->next;
782     } while(tok != NULL);
783     fprintf(stderr, "\n");
784 
785     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
786 
787     do {
788         fprintf(stderr,"%i", tok->toInsert);
789         tok = tok->next;
790     } while(tok != NULL);
791 #endif
792 
793     tok = lh->first;
794     uint32_t fStrength = UCOL_IDENTICAL;
795     uint32_t initStrength = UCOL_IDENTICAL;
796 
797 
798     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
799     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
800     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
801 
802     while (tok != NULL && U_SUCCESS(*status)) {
803         fStrength = tok->strength;
804         if(fStrength < initStrength) {
805             initStrength = fStrength;
806             if(lh->pos[fStrength] == -1) {
807                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
808                     fStrength--;
809                 }
810                 if(lh->pos[fStrength] == -1) {
811                     *status = U_INTERNAL_PROGRAM_ERROR;
812                     return;
813                 }
814             }
815             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
816                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
817                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
818                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
819                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
820             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
821                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
822                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
823                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
824                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
825             } else { /* primaries */
826                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
827                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
828                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
829                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
830             }
831         } else {
832             if(tok->strength == UCOL_TERTIARY) {
833                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
834             } else if(tok->strength == UCOL_SECONDARY) {
835                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
836                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
837             } else if(tok->strength == UCOL_PRIMARY) {
838                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
839                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
840                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
841             }
842         }
843         ucol_doCE(src, CEparts, tok, status);
844         tok = tok->next;
845     }
846 }
847 
ucol_createElements(UColTokenParser * src,tempUCATable * t,UColTokListHeader * lh,UErrorCode * status)848 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
849     UCAElements el;
850     UColToken *tok = lh->first;
851     UColToken *expt = NULL;
852     uint32_t i = 0, j = 0;
853     const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
854 
855     while(tok != NULL && U_SUCCESS(*status)) {
856         /* first, check if there are any expansions */
857         /* if there are expansions, we need to do a little bit more processing */
858         /* since parts of expansion can be tailored, while others are not */
859         if(tok->expansion != 0) {
860             uint32_t len = tok->expansion >> 24;
861             uint32_t currentSequenceLen = len;
862             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
863             //uint32_t exp = currentSequenceLen | expOffset;
864             UColToken exp;
865             exp.source = currentSequenceLen | expOffset;
866             exp.rulesToParseHdl = &(src->source);
867 
868             while(len > 0) {
869                 currentSequenceLen = len;
870                 while(currentSequenceLen > 0) {
871                     exp.source = (currentSequenceLen << 24) | expOffset;
872                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
873                         uint32_t noOfCEsToCopy = expt->noOfCEs;
874                         for(j = 0; j<noOfCEsToCopy; j++) {
875                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
876                         }
877                         tok->noOfExpCEs += noOfCEsToCopy;
878                         // Smart people never try to add codepoints and CEs.
879                         // For some odd reason, it won't work.
880                         expOffset += currentSequenceLen; //noOfCEsToCopy;
881                         len -= currentSequenceLen; //noOfCEsToCopy;
882                         break;
883                     } else {
884                         currentSequenceLen--;
885                     }
886                 }
887                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
888                     /* will have to get one from UCA */
889                     /* first, get the UChars from the rules */
890                     /* then pick CEs out until there is no more and stuff them into expansion */
891                     collIterate s;
892                     uint32_t order = 0;
893                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
894 
895                     for(;;) {
896                         order = ucol_getNextCE(src->UCA, &s, status);
897                         if(order == UCOL_NO_MORE_CES) {
898                             break;
899                         }
900                         tok->expCEs[tok->noOfExpCEs++] = order;
901                     }
902                     expOffset++;
903                     len--;
904                 }
905             }
906         } else {
907             tok->noOfExpCEs = 0;
908         }
909 
910         /* set the ucaelement with obtained values */
911         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
912         /* copy CEs */
913         for(i = 0; i<tok->noOfCEs; i++) {
914             el.CEs[i] = tok->CEs[i];
915         }
916         for(i = 0; i<tok->noOfExpCEs; i++) {
917             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
918         }
919 
920         /* copy UChars */
921         // We kept prefix and source kind of together, as it is a kind of a contraction.
922         // However, now we have to slice the prefix off the main thing -
923         el.prefix = el.prefixChars;
924         el.cPoints = el.uchars;
925         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
926             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
927             // decomposed elements to the unsaf table.
928             el.prefixSize = tok->prefix>>24;
929             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
930 
931             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
932             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
933         } else {
934             el.prefixSize = 0;
935             *el.prefix = 0;
936 
937             el.cSize = (tok->source >> 24);
938             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
939         }
940         if(src->UCA != NULL) {
941             for(i = 0; i<el.cSize; i++) {
942                 if(UCOL_ISJAMO(el.cPoints[i])) {
943                     t->image->jamoSpecial = TRUE;
944                 }
945             }
946             if (!src->buildCCTabFlag && el.cSize > 0) {
947                 // Check the trailing canonical combining class (tccc) of the last character.
948                 const UChar *s = el.cPoints + el.cSize;
949                 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
950                 if ((fcd & 0xff) != 0) {
951                     src->buildCCTabFlag = TRUE;
952                 }
953             }
954         }
955 
956         /* and then, add it */
957 #if UCOL_DEBUG==2
958         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
959 #endif
960         uprv_uca_addAnElement(t, &el, status);
961 
962 #if UCOL_DEBUG_DUPLICATES
963         if(*status != U_ZERO_ERROR) {
964             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
965             *status = U_ZERO_ERROR;
966         }
967 #endif
968 
969         tok = tok->next;
970     }
971 }
972 
973 U_CDECL_BEGIN
974 static UBool U_CALLCONV
_processUCACompleteIgnorables(const void * context,UChar32 start,UChar32 limit,uint32_t value)975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
976     UErrorCode status = U_ZERO_ERROR;
977     tempUCATable *t = (tempUCATable *)context;
978     if(value == 0) {
979         while(start < limit) {
980             uint32_t CE = utrie_get32(t->mapping, start, NULL);
981             if(CE == UCOL_NOT_FOUND) {
982                 UCAElements el;
983                 el.isThai = FALSE;
984                 el.prefixSize = 0;
985                 el.prefixChars[0] = 0;
986                 el.prefix = el.prefixChars;
987                 el.cPoints = el.uchars;
988 
989                 el.cSize = 0;
990                 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
991 
992                 el.noOfCEs = 1;
993                 el.CEs[0] = 0;
994                 uprv_uca_addAnElement(t, &el, &status);
995 
996             }
997             start++;
998         }
999     }
1000     if(U_FAILURE(status)) {
1001         return FALSE;
1002     } else {
1003         return TRUE;
1004     }
1005 }
1006 U_CDECL_END
1007 
1008 static void
ucol_uprv_bld_copyRangeFromUCA(UColTokenParser * src,tempUCATable * t,UChar32 start,UChar32 end,UErrorCode * status)1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1010                                UChar32 start, UChar32 end,
1011                                UErrorCode *status)
1012 {
1013     //UChar decomp[256];
1014     uint32_t CE = UCOL_NOT_FOUND;
1015     UChar32 u = 0;
1016     UCAElements el;
1017     el.isThai = FALSE;
1018     el.prefixSize = 0;
1019     el.prefixChars[0] = 0;
1020     collIterate colIt;
1021 
1022     if(U_SUCCESS(*status)) {
1023         for(u = start; u<=end; u++) {
1024             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1025                 /* this test is for contractions that are missing the starting element. */
1026                 || ((isCntTableElement(CE)) &&
1027                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1028                 )
1029             {
1030                 el.cSize = 0;
1031                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1032                 //decomp[0] = (UChar)u;
1033                 //el.uchars[0] = (UChar)u;
1034                 el.cPoints = el.uchars;
1035                 //el.cSize = 1;
1036                 el.noOfCEs = 0;
1037                 el.prefix = el.prefixChars;
1038                 el.prefixSize = 0;
1039                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1040                 // We actually want to check whether this element is a special
1041                 // If it is an implicit element (hangul, CJK - we want to copy the
1042                 // special, not the resolved CEs) - for hangul, copying resolved
1043                 // would just make things the same (there is an expansion and it
1044                 // takes approximately the same amount of time to resolve as
1045                 // falling back to the UCA).
1046                 /*
1047                 UTRIE_GET32(src->UCA->mapping, u, CE);
1048                 tag = getCETag(CE);
1049                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1050                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1051                 || tag == LEAD_SURROGATE_TAG) {
1052                 el.CEs[el.noOfCEs++] = CE;
1053                 } else {
1054                 */
1055                 // It turns out that it does not make sense to keep implicits
1056                 // unresolved. The cost of resolving them is big enough so that
1057                 // it doesn't make any difference whether we have to go to the UCA
1058                 // or not.
1059                 {
1060                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1061                     while(CE != UCOL_NO_MORE_CES) {
1062                         CE = ucol_getNextCE(src->UCA, &colIt, status);
1063                         if(CE != UCOL_NO_MORE_CES) {
1064                             el.CEs[el.noOfCEs++] = CE;
1065                         }
1066                     }
1067                 }
1068                 uprv_uca_addAnElement(t, &el, status);
1069             }
1070         }
1071     }
1072 }
1073 
1074 U_NAMESPACE_END
1075 
1076 U_CFUNC UCATableHeader *
ucol_assembleTailoringTable(UColTokenParser * src,UErrorCode * status)1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1078     U_NAMESPACE_USE
1079 
1080     uint32_t i = 0;
1081     if(U_FAILURE(*status)) {
1082         return NULL;
1083     }
1084     /*
1085     2.  Eliminate the negative lists by doing the following for each non-null negative list:
1086     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1087     create new ListHeader X
1088     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1089     first item you add, based on the stronger strength levels of the two lists.
1090     */
1091     /*
1092     3.  For each ListHeader with a non-null positive list:
1093     */
1094     /*
1095     o   Find all character strings with CEs between the baseCE and the
1096     next/previous CE, at the strength of the first token. Add these to the
1097     tailoring.
1098     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1099     tailoring has & x < z...
1100     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1101     */
1102     /* It is possible that this part should be done even while constructing list */
1103     /* The problem is that it is unknown what is going to be the strongest weight */
1104     /* So we might as well do it here */
1105 
1106     /*
1107     o   Allocate CEs for each token in the list, based on the total number N of the
1108     largest level difference, and the gap G between baseCE and nextCE at that
1109     level. The relation * between the last item and nextCE is the same as the
1110     strongest strength.
1111     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1112     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1113     Then fit b and c into the secondary gap between a and d, then fit q
1114     into the tertiary gap between b and c.
1115 
1116     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1117     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1118     Then fit q into the tertiary gap between b and c.
1119     o   When incrementing primary values, we will not cross high byte
1120     boundaries except where there is only a single-byte primary. That is to
1121     ensure that the script reordering will continue to work.
1122     */
1123     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1124     /* test for NULL */
1125     if (image == NULL) {
1126         *status = U_MEMORY_ALLOCATION_ERROR;
1127         return NULL;
1128     }
1129     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1130 
1131     for(i = 0; i<src->resultLen; i++) {
1132         /* now we need to generate the CEs */
1133         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1134         /* According to strength                                                          */
1135         if(U_SUCCESS(*status)) {
1136             if(src->lh[i].first) { // if there are any elements
1137                 // due to the way parser works, subsequent tailorings
1138                 // may remove all the elements from a sequence, therefore
1139                 // leaving an empty tailoring sequence.
1140                 ucol_initBuffers(src, &src->lh[i], status);
1141             }
1142         }
1143         if(U_FAILURE(*status)) {
1144             uprv_free(image);
1145             return NULL;
1146         }
1147     }
1148 
1149     if(src->varTop != NULL) { /* stuff the variable top value */
1150         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1151         /* remove it from the list */
1152         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1153             src->varTop->listHeader->first = src->varTop->next;
1154         }
1155         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1156             src->varTop->listHeader->last = src->varTop->previous;
1157         }
1158         if(src->varTop->next != NULL) {
1159             src->varTop->next->previous = src->varTop->previous;
1160         }
1161         if(src->varTop->previous != NULL) {
1162             src->varTop->previous->next = src->varTop->next;
1163         }
1164     }
1165 
1166 
1167     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1168     if(U_FAILURE(*status)) {
1169         uprv_free(image);
1170         return NULL;
1171     }
1172 
1173 
1174     /* After this, we have assigned CE values to all regular CEs      */
1175     /* now we will go through list once more and resolve expansions,  */
1176     /* make UCAElements structs and add them to table                 */
1177     for(i = 0; i<src->resultLen; i++) {
1178         /* now we need to generate the CEs */
1179         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1180         /* According to strength                                                          */
1181         if(U_SUCCESS(*status)) {
1182             ucol_createElements(src, t, &src->lh[i], status);
1183         }
1184     }
1185 
1186     UCAElements el;
1187     el.isThai = FALSE;
1188     el.prefixSize = 0;
1189     el.prefixChars[0] = 0;
1190 
1191     /* add latin-1 stuff */
1192     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1193 
1194     /* add stuff for copying */
1195     if(src->copySet != NULL) {
1196         int32_t i = 0;
1197         UnicodeSet *set = (UnicodeSet *)src->copySet;
1198         for(i = 0; i < set->getRangeCount(); i++) {
1199             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1200         }
1201     }
1202 
1203     if(U_SUCCESS(*status)) {
1204         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1205 
1206         uint32_t tailoredCE = UCOL_NOT_FOUND;
1207         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1208         int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1209         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1210         // Check for null pointer
1211         if (ucaEl == NULL) {
1212             *status = U_MEMORY_ALLOCATION_ERROR;
1213             return NULL;
1214         }
1215         while(*conts != 0) {
1216             // A continuation is NUL-terminated and NUL-padded
1217             // except if it has the maximum length.
1218             int32_t contractionLength = maxUCAContractionLength;
1219             while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1220                 --contractionLength;
1221             }
1222             UChar32 first;
1223             int32_t firstLength = 0;
1224             U16_NEXT(conts, firstLength, contractionLength, first);
1225             tailoredCE = utrie_get32(t->mapping, first, NULL);
1226             if(tailoredCE != UCOL_NOT_FOUND) {
1227                 UBool needToAdd = TRUE;
1228                 if(isCntTableElement(tailoredCE)) {
1229                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1230                         needToAdd = FALSE;
1231                     }
1232                 }
1233                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1234                     UCAElements elm;
1235                     elm.cPoints = el.uchars;
1236                     elm.noOfCEs = 0;
1237                     elm.uchars[0] = *conts;
1238                     elm.uchars[1] = 0;
1239                     elm.cSize = 1;
1240                     elm.prefixChars[0] = *(conts+2);
1241                     elm.isThai = FALSE;
1242                     elm.prefix = elm.prefixChars;
1243                     elm.prefixSize = 1;
1244                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1245                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1246                         needToAdd = TRUE;
1247                     }
1248                 }
1249                 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1250                     needToAdd = FALSE;
1251                 }
1252 
1253                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1254                     if (*(conts+1) != 0) {  // contractions
1255                         el.prefix = el.prefixChars;
1256                         el.prefixSize = 0;
1257                         el.cPoints = el.uchars;
1258                         el.noOfCEs = 0;
1259                         u_memcpy(el.uchars, conts, contractionLength);
1260                         el.cSize = contractionLength;
1261                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
1262                     }
1263                     else { // pre-context character
1264                         UChar str[4] = { 0 };
1265                         int32_t len=0;
1266                         int32_t preKeyLen=0;
1267 
1268                         el.cPoints = el.uchars;
1269                         el.noOfCEs = 0;
1270                         el.uchars[0] = *conts;
1271                         el.uchars[1] = 0;
1272                         el.cSize = 1;
1273                         el.prefixChars[0] = *(conts+2);
1274                         el.prefix = el.prefixChars;
1275                         el.prefixSize = 1;
1276                         if (el.prefixChars[0]!=0) {
1277                             // get CE of prefix character first
1278                             str[0]=el.prefixChars[0];
1279                             str[1]=0;
1280                             ucol_setText(ucaEl, str, 1, status);
1281                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1282                                     != UCOL_NULLORDER) {
1283                                 preKeyLen++;  // count number of keys for prefix character
1284                             }
1285                             str[len++] = el.prefixChars[0];
1286                         }
1287 
1288                         str[len++] = el.uchars[0];
1289                         str[len]=0;
1290                         ucol_setText(ucaEl, str, len, status);
1291                         // Skip the keys for prefix character, then copy the rest to el.
1292                         while ((preKeyLen-->0) &&
1293                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1294                             continue;
1295                         }
1296 
1297                     }
1298                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1299                         el.noOfCEs++;
1300                     }
1301                     uprv_uca_addAnElement(t, &el, status);
1302                 }
1303 
1304             } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1305                 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1306             }
1307             conts+=maxUCAContractionLength;
1308         }
1309         ucol_closeElements(ucaEl);
1310     }
1311 
1312     // Add completely ignorable elements
1313     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1314 
1315     // add tailoring characters related canonical closures
1316     uprv_uca_canonicalClosure(t, src, NULL, status);
1317 
1318     /* still need to produce compatibility closure */
1319 
1320     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1321 
1322     uprv_uca_closeTempTable(t);
1323     uprv_free(image);
1324 
1325     return myData;
1326 }
1327 
1328 U_CDECL_BEGIN
1329 static UBool U_CALLCONV
ucol_bld_cleanup(void)1330 ucol_bld_cleanup(void)
1331 {
1332     udata_close(invUCA_DATA_MEM);
1333     invUCA_DATA_MEM = NULL;
1334     _staticInvUCA = NULL;
1335     gStaticInvUCAInitOnce.reset();
1336     return TRUE;
1337 }
1338 U_CDECL_END
1339 
initInverseUCA(UErrorCode & status)1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) {
1341     U_ASSERT(invUCA_DATA_MEM == NULL);
1342     U_ASSERT(_staticInvUCA == NULL);
1343     ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1344     InverseUCATableHeader *newInvUCA = NULL;
1345     UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status);
1346 
1347     if(U_FAILURE(status)) {
1348         if (result) {
1349             udata_close(result);
1350         }
1351         // This is not needed, as we are talking about
1352         // memory we got from UData
1353         //uprv_free(newInvUCA);
1354         return;
1355     }
1356 
1357     if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1358         newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1359         UCollator *UCA = ucol_initUCA(&status);
1360         // UCA versions of UCA and inverse UCA should match
1361         if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1362             status = U_INVALID_FORMAT_ERROR;
1363             udata_close(result);
1364             return;
1365         }
1366 
1367         invUCA_DATA_MEM = result;
1368         _staticInvUCA = newInvUCA;
1369     }
1370 }
1371 
1372 
1373 U_CAPI const InverseUCATableHeader * U_EXPORT2
ucol_initInverseUCA(UErrorCode * status)1374 ucol_initInverseUCA(UErrorCode *status)
1375 {
1376     umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);
1377     return _staticInvUCA;
1378 }
1379 
1380 /* This is the data that is used for non-script reordering codes. These _must_ be kept
1381  * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
1382  */
1383 static const char * const ReorderingTokenNames[] = {
1384     "SPACE",
1385     "PUNCT",
1386     "SYMBOL",
1387     "CURRENCY",
1388     "DIGIT"
1389 };
1390 
toUpper(const char * src,char * dst,uint32_t length)1391 static void toUpper(const char* src, char* dst, uint32_t length) {
1392    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1393        *dst = uprv_toupper(*src);
1394    }
1395    *dst = '\0';
1396 }
1397 
1398 U_INTERNAL int32_t U_EXPORT2
ucol_findReorderingEntry(const char * name)1399 ucol_findReorderingEntry(const char* name) {
1400     char buffer[32];
1401     toUpper(name, buffer, 32);
1402     for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
1403         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1404             return entry + UCOL_REORDER_CODE_FIRST;
1405         }
1406     }
1407     return USCRIPT_INVALID_CODE;
1408 }
1409 
1410 #endif /* #if !UCONFIG_NO_COLLATION */
1411