1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "normalizer2impl.h"
29 #include "ucol_bld.h"
30 #include "ucol_elm.h"
31 #include "ucol_cnt.h"
32 #include "ucln_in.h"
33 #include "umutex.h"
34 #include "cmemory.h"
35
36 static const InverseUCATableHeader* _staticInvUCA = NULL;
37 static UDataMemory* invUCA_DATA_MEM = NULL;
38
39 U_CDECL_BEGIN
40 static UBool U_CALLCONV
isAcceptableInvUCA(void *,const char *,const char *,const UDataInfo * pInfo)41 isAcceptableInvUCA(void * /*context*/,
42 const char * /*type*/, const char * /*name*/,
43 const UDataInfo *pInfo)
44 {
45 /* context, type & name are intentionally not used */
46 if( pInfo->size>=20 &&
47 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
48 pInfo->charsetFamily==U_CHARSET_FAMILY &&
49 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
50 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
51 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
52 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
53 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
54 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
55 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
56 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
57 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
58 )
59 {
60 UVersionInfo UCDVersion;
61 u_getUnicodeVersion(UCDVersion);
62 return (pInfo->dataVersion[0]==UCDVersion[0] &&
63 pInfo->dataVersion[1]==UCDVersion[1]);
64 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
65 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
66 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
67 } else {
68 return FALSE;
69 }
70 }
71 U_CDECL_END
72
73 /*
74 * Takes two CEs (lead and continuation) and
75 * compares them as CEs should be compared:
76 * primary vs. primary, secondary vs. secondary
77 * tertiary vs. tertiary
78 */
compareCEs(uint32_t source0,uint32_t source1,uint32_t target0,uint32_t target1)79 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
80 uint32_t s1 = source0, s2, t1 = target0, t2;
81 if(isContinuation(source1)) {
82 s2 = source1;
83 } else {
84 s2 = 0;
85 }
86 if(isContinuation(target1)) {
87 t2 = target1;
88 } else {
89 t2 = 0;
90 }
91
92 uint32_t s = 0, t = 0;
93 if(s1 == t1 && s2 == t2) {
94 return 0;
95 }
96 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
97 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
98 if(s < t) {
99 return -1;
100 } else if(s > t) {
101 return 1;
102 } else {
103 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
104 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
105 if(s < t) {
106 return -1;
107 } else if(s > t) {
108 return 1;
109 } else {
110 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
111 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
112 if(s < t) {
113 return -1;
114 } else {
115 return 1;
116 }
117 }
118 }
119 }
120
121 static
ucol_inv_findCE(const UColTokenParser * src,uint32_t CE,uint32_t SecondCE)122 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
123 uint32_t bottom = 0, top = src->invUCA->tableSize;
124 uint32_t i = 0;
125 uint32_t first = 0, second = 0;
126 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
127 int32_t res = 0;
128
129 while(bottom < top-1) {
130 i = (top+bottom)/2;
131 first = *(CETable+3*i);
132 second = *(CETable+3*i+1);
133 res = compareCEs(first, second, CE, SecondCE);
134 if(res > 0) {
135 top = i;
136 } else if(res < 0) {
137 bottom = i;
138 } else {
139 break;
140 }
141 }
142
143 /* weiv: */
144 /* in searching for elements, I have removed the failure */
145 /* The reason for this is that the builder does not rely */
146 /* on search mechanism telling it that it didn't find an */
147 /* element. However, indirect positioning relies on being */
148 /* able to find the elements around any CE, even if it is */
149 /* not defined in the UCA. */
150 return i;
151 /*
152 if((first == CE && second == SecondCE)) {
153 return i;
154 } else {
155 return -1;
156 }
157 */
158 }
159
160 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
161 0xFFFF0000,
162 0xFFFFFF00,
163 0xFFFFFFFF
164 };
165
ucol_inv_getNextCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * nextCE,uint32_t * nextContCE,uint32_t strength)166 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
167 uint32_t CE, uint32_t contCE,
168 uint32_t *nextCE, uint32_t *nextContCE,
169 uint32_t strength)
170 {
171 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
172 int32_t iCE;
173
174 iCE = ucol_inv_findCE(src, CE, contCE);
175
176 if(iCE<0) {
177 *nextCE = UCOL_NOT_FOUND;
178 return -1;
179 }
180
181 CE &= strengthMask[strength];
182 contCE &= strengthMask[strength];
183
184 *nextCE = CE;
185 *nextContCE = contCE;
186
187 while((*nextCE & strengthMask[strength]) == CE
188 && (*nextContCE & strengthMask[strength]) == contCE)
189 {
190 *nextCE = (*(CETable+3*(++iCE)));
191 *nextContCE = (*(CETable+3*(iCE)+1));
192 }
193
194 return iCE;
195 }
196
ucol_inv_getPrevCE(const UColTokenParser * src,uint32_t CE,uint32_t contCE,uint32_t * prevCE,uint32_t * prevContCE,uint32_t strength)197 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
198 uint32_t CE, uint32_t contCE,
199 uint32_t *prevCE, uint32_t *prevContCE,
200 uint32_t strength)
201 {
202 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
203 int32_t iCE;
204
205 iCE = ucol_inv_findCE(src, CE, contCE);
206
207 if(iCE<0) {
208 *prevCE = UCOL_NOT_FOUND;
209 return -1;
210 }
211
212 CE &= strengthMask[strength];
213 contCE &= strengthMask[strength];
214
215 *prevCE = CE;
216 *prevContCE = contCE;
217
218 while((*prevCE & strengthMask[strength]) == CE
219 && (*prevContCE & strengthMask[strength])== contCE
220 && iCE > 0) /* this condition should prevent falling off the edge of the world */
221 {
222 /* here, we end up in a singularity - zero */
223 *prevCE = (*(CETable+3*(--iCE)));
224 *prevContCE = (*(CETable+3*(iCE)+1));
225 }
226
227 return iCE;
228 }
229
ucol_getCEStrengthDifference(uint32_t CE,uint32_t contCE,uint32_t prevCE,uint32_t prevContCE)230 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
231 uint32_t prevCE, uint32_t prevContCE)
232 {
233 if(prevCE == CE && prevContCE == contCE) {
234 return UCOL_IDENTICAL;
235 }
236 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
237 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
238 {
239 return UCOL_PRIMARY;
240 }
241 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
242 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
243 {
244 return UCOL_SECONDARY;
245 }
246 return UCOL_TERTIARY;
247 }
248
249
250 /*static
251 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
252
253 uint32_t CE = lh->baseCE;
254 uint32_t SecondCE = lh->baseContCE;
255
256 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
257 uint32_t previousCE, previousContCE;
258 int32_t iCE;
259
260 iCE = ucol_inv_findCE(src, CE, SecondCE);
261
262 if(iCE<0) {
263 return -1;
264 }
265
266 CE &= strengthMask[strength];
267 SecondCE &= strengthMask[strength];
268
269 previousCE = CE;
270 previousContCE = SecondCE;
271
272 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
273 previousCE = (*(CETable+3*(--iCE)));
274 previousContCE = (*(CETable+3*(iCE)+1));
275 }
276 lh->previousCE = previousCE;
277 lh->previousContCE = previousContCE;
278
279 return iCE;
280 }*/
281
282 static
ucol_inv_getNext(UColTokenParser * src,UColTokListHeader * lh,uint32_t strength)283 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
284 uint32_t CE = lh->baseCE;
285 uint32_t SecondCE = lh->baseContCE;
286
287 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
288 uint32_t nextCE, nextContCE;
289 int32_t iCE;
290
291 iCE = ucol_inv_findCE(src, CE, SecondCE);
292
293 if(iCE<0) {
294 return -1;
295 }
296
297 CE &= strengthMask[strength];
298 SecondCE &= strengthMask[strength];
299
300 nextCE = CE;
301 nextContCE = SecondCE;
302
303 while((nextCE & strengthMask[strength]) == CE
304 && (nextContCE & strengthMask[strength]) == SecondCE)
305 {
306 nextCE = (*(CETable+3*(++iCE)));
307 nextContCE = (*(CETable+3*(iCE)+1));
308 }
309
310 lh->nextCE = nextCE;
311 lh->nextContCE = nextContCE;
312
313 return iCE;
314 }
315
ucol_inv_getGapPositions(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)316 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
317 /* reset all the gaps */
318 int32_t i = 0;
319 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
320 uint32_t st = 0;
321 uint32_t t1, t2;
322 int32_t pos;
323
324 UColToken *tok = lh->first;
325 uint32_t tokStrength = tok->strength;
326
327 for(i = 0; i<3; i++) {
328 lh->gapsHi[3*i] = 0;
329 lh->gapsHi[3*i+1] = 0;
330 lh->gapsHi[3*i+2] = 0;
331 lh->gapsLo[3*i] = 0;
332 lh->gapsLo[3*i+1] = 0;
333 lh->gapsLo[3*i+2] = 0;
334 lh->numStr[i] = 0;
335 lh->fStrToken[i] = NULL;
336 lh->lStrToken[i] = NULL;
337 lh->pos[i] = -1;
338 }
339
340 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
341
342 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
343 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
344 lh->pos[0] = 0;
345 t1 = lh->baseCE;
346 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
347 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
348 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
349 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
350 uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
351 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
352
353 t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
354 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
355
356 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
357 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
358 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
359 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
360 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
361 lh->pos[0] = 0;
362 t1 = lh->baseCE;
363 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
364 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
365 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
366 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
367 t1 = lh->nextCE;
368 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
369 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
370 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
371 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
372 } else {
373 for(;;) {
374 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
375 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
376 lh->fStrToken[tokStrength] = tok;
377 } else { /* The CE must be implicit, since it's not in the table */
378 /* Error */
379 *status = U_INTERNAL_PROGRAM_ERROR;
380 }
381 }
382
383 while(tok != NULL && tok->strength >= tokStrength) {
384 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
385 lh->lStrToken[tokStrength] = tok;
386 }
387 tok = tok->next;
388 }
389 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
390 /* check if previous interval is the same and merge the intervals if it is so */
391 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
392 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
393 lh->fStrToken[tokStrength+1] = NULL;
394 lh->lStrToken[tokStrength+1] = NULL;
395 lh->pos[tokStrength+1] = -1;
396 }
397 }
398 if(tok != NULL) {
399 tokStrength = tok->strength;
400 } else {
401 break;
402 }
403 }
404 for(st = 0; st < 3; st++) {
405 if((pos = lh->pos[st]) >= 0) {
406 t1 = *(CETable+3*(pos));
407 t2 = *(CETable+3*(pos)+1);
408 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
409 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
410 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
411 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
412 //pos--;
413 //t1 = *(CETable+3*(pos));
414 //t2 = *(CETable+3*(pos)+1);
415 t1 = lh->baseCE;
416 t2 = lh->baseContCE;
417 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
418 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
419 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
420 }
421 }
422 }
423 }
424
425
426 #define ucol_countBytes(value, noOfBytes) \
427 { \
428 uint32_t mask = 0xFFFFFFFF; \
429 (noOfBytes) = 0; \
430 while(mask != 0) { \
431 if(((value) & mask) != 0) { \
432 (noOfBytes)++; \
433 } \
434 mask >>= 8; \
435 } \
436 }
437
ucol_getNextGenerated(ucolCEGenerator * g,UErrorCode * status)438 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
439 if(U_SUCCESS(*status)) {
440 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
441 }
442 return g->current;
443 }
444
ucol_getSimpleCEGenerator(ucolCEGenerator * g,UColToken * tok,uint32_t strength,UErrorCode * status)445 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
446 /* TODO: rename to enum names */
447 uint32_t high, low, count=1;
448 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
449
450 if(strength == UCOL_SECONDARY) {
451 low = UCOL_COMMON_TOP2<<24;
452 high = 0xFFFFFFFF;
453 count = 0xFF - UCOL_COMMON_TOP2;
454 } else {
455 low = UCOL_BYTE_COMMON << 24; //0x05000000;
456 high = 0x40000000;
457 count = 0x40 - UCOL_BYTE_COMMON;
458 }
459
460 if(tok->next != NULL && tok->next->strength == strength) {
461 count = tok->next->toInsert;
462 }
463
464 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
465 g->current = UCOL_BYTE_COMMON<<24;
466
467 if(g->noOfRanges == 0) {
468 *status = U_INTERNAL_PROGRAM_ERROR;
469 }
470 return g->current;
471 }
472
ucol_getCEGenerator(ucolCEGenerator * g,uint32_t * lows,uint32_t * highs,UColToken * tok,uint32_t fStrength,UErrorCode * status)473 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
474 uint32_t strength = tok->strength;
475 uint32_t low = lows[fStrength*3+strength];
476 uint32_t high = highs[fStrength*3+strength];
477 uint32_t maxByte = 0;
478 if(strength == UCOL_TERTIARY) {
479 maxByte = 0x3F;
480 } else if(strength == UCOL_PRIMARY) {
481 maxByte = 0xFE;
482 } else {
483 maxByte = 0xFF;
484 }
485
486 uint32_t count = tok->toInsert;
487
488 if(low >= high && strength > UCOL_PRIMARY) {
489 int32_t s = strength;
490 for(;;) {
491 s--;
492 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
493 if(strength == UCOL_SECONDARY) {
494 if (low < UCOL_COMMON_TOP2<<24 ) {
495 // Override if low range is less than UCOL_COMMON_TOP2.
496 low = UCOL_COMMON_TOP2<<24;
497 }
498 high = 0xFFFFFFFF;
499 } else {
500 // Override if low range is less than UCOL_COMMON_BOT3.
501 if ( low < UCOL_COMMON_BOT3<<24 ) {
502 low = UCOL_COMMON_BOT3<<24;
503 }
504 high = 0x40000000;
505 }
506 break;
507 }
508 if(s<0) {
509 *status = U_INTERNAL_PROGRAM_ERROR;
510 return 0;
511 }
512 }
513 }
514
515 if(low == 0) {
516 low = 0x01000000;
517 }
518
519 if(strength == UCOL_SECONDARY) { /* similar as simple */
520 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
521 low = UCOL_COMMON_TOP2<<24;
522 }
523 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
524 high = UCOL_COMMON_TOP2<<24;
525 }
526 if(low < (UCOL_COMMON_BOT2<<24)) {
527 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
528 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
529 //g->current = UCOL_COMMON_BOT2<<24;
530 return g->current;
531 }
532 }
533
534 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
535 if(g->noOfRanges == 0) {
536 *status = U_INTERNAL_PROGRAM_ERROR;
537 }
538 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
539 return g->current;
540 }
541
542 static
u_toLargeKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)543 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
544 uint32_t i = 0;
545 UChar c;
546
547 if(U_FAILURE(*status)) {
548 return 0;
549 }
550
551 if(sourceLen > resLen) {
552 *status = U_MEMORY_ALLOCATION_ERROR;
553 return 0;
554 }
555
556 for(i = 0; i < sourceLen; i++) {
557 c = source[i];
558 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
559 switch(c - 0x3000) {
560 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
561 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
562 c++;
563 break;
564 case 0xF5:
565 c = 0x30AB;
566 break;
567 case 0xF6:
568 c = 0x30B1;
569 break;
570 }
571 }
572 resBuf[i] = c;
573 }
574 return sourceLen;
575 }
576
577 static
u_toSmallKana(const UChar * source,const uint32_t sourceLen,UChar * resBuf,const uint32_t resLen,UErrorCode * status)578 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
579 uint32_t i = 0;
580 UChar c;
581
582 if(U_FAILURE(*status)) {
583 return 0;
584 }
585
586 if(sourceLen > resLen) {
587 *status = U_MEMORY_ALLOCATION_ERROR;
588 return 0;
589 }
590
591 for(i = 0; i < sourceLen; i++) {
592 c = source[i];
593 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
594 switch(c - 0x3000) {
595 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
596 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
597 c--;
598 break;
599 case 0xAB:
600 c = 0x30F5;
601 break;
602 case 0xB1:
603 c = 0x30F6;
604 break;
605 }
606 }
607 resBuf[i] = c;
608 }
609 return sourceLen;
610 }
611
612 static
ucol_uprv_getCaseBits(const UCollator * UCA,const UChar * src,uint32_t len,UErrorCode * status)613 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
614 uint32_t i = 0;
615 UChar n[128];
616 uint32_t nLen = 0;
617 uint32_t uCount = 0, lCount = 0;
618
619 collIterate s;
620 uint32_t order = 0;
621
622 if(U_FAILURE(*status)) {
623 return UCOL_LOWER_CASE;
624 }
625
626 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
627 if(U_SUCCESS(*status)) {
628 for(i = 0; i < nLen; i++) {
629 uprv_init_collIterate(UCA, &n[i], 1, &s, status);
630 order = ucol_getNextCE(UCA, &s, status);
631 if(isContinuation(order)) {
632 *status = U_INTERNAL_PROGRAM_ERROR;
633 return UCOL_LOWER_CASE;
634 }
635 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
636 uCount++;
637 } else {
638 if(u_islower(n[i])) {
639 lCount++;
640 } else if(U_SUCCESS(*status)) {
641 UChar sk[1], lk[1];
642 u_toSmallKana(&n[i], 1, sk, 1, status);
643 u_toLargeKana(&n[i], 1, lk, 1, status);
644 if(sk[0] == n[i] && lk[0] != n[i]) {
645 lCount++;
646 }
647 }
648 }
649 }
650 }
651
652 if(uCount != 0 && lCount != 0) {
653 return UCOL_MIXED_CASE;
654 } else if(uCount != 0) {
655 return UCOL_UPPER_CASE;
656 } else {
657 return UCOL_LOWER_CASE;
658 }
659 }
660
661
ucol_doCE(UColTokenParser * src,uint32_t * CEparts,UColToken * tok,UErrorCode * status)662 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
663 /* this one makes the table and stuff */
664 uint32_t noOfBytes[3];
665 uint32_t i;
666
667 for(i = 0; i<3; i++) {
668 ucol_countBytes(CEparts[i], noOfBytes[i]);
669 }
670
671 /* Here we have to pack CEs from parts */
672
673 uint32_t CEi = 0;
674 uint32_t value = 0;
675
676 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
677 if(CEi > 0) {
678 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
679 } else {
680 value = 0;
681 }
682
683 if(2*CEi<noOfBytes[0]) {
684 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
685 }
686 if(CEi<noOfBytes[1]) {
687 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
688 }
689 if(CEi<noOfBytes[2]) {
690 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
691 }
692 tok->CEs[CEi] = value;
693 CEi++;
694 }
695 if(CEi == 0) { /* totally ignorable */
696 tok->noOfCEs = 1;
697 tok->CEs[0] = 0;
698 } else { /* there is at least something */
699 tok->noOfCEs = CEi;
700 }
701
702
703 // we want to set case bits here and now, not later.
704 // Case bits handling
705 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
706 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
707 int32_t cSize = (tok->source & 0xFF000000) >> 24;
708 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
709
710 if(cSize > 1) {
711 // Do it manually
712 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
713 } else {
714 // Copy it from the UCA
715 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
716 tok->CEs[0] |= (caseCE & 0xC0);
717 }
718 }
719
720 #if UCOL_DEBUG==2
721 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
722 for(i = 0; i<tok->noOfCEs; i++) {
723 fprintf(stderr, "%08X ", tok->CEs[i]);
724 }
725 fprintf(stderr, "\n");
726 #endif
727 }
728
ucol_initBuffers(UColTokenParser * src,UColTokListHeader * lh,UErrorCode * status)729 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
730 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
731 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
732
733 UColToken *tok = lh->last;
734 uint32_t t[UCOL_STRENGTH_LIMIT];
735
736 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
737
738 tok->toInsert = 1;
739 t[tok->strength] = 1;
740
741 while(tok->previous != NULL) {
742 if(tok->previous->strength < tok->strength) { /* going up */
743 t[tok->strength] = 0;
744 t[tok->previous->strength]++;
745 } else if(tok->previous->strength > tok->strength) { /* going down */
746 t[tok->previous->strength] = 1;
747 } else {
748 t[tok->strength]++;
749 }
750 tok=tok->previous;
751 tok->toInsert = t[tok->strength];
752 }
753
754 tok->toInsert = t[tok->strength];
755 ucol_inv_getGapPositions(src, lh, status);
756
757 #if UCOL_DEBUG
758 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
759 int32_t j = 2;
760 for(j = 2; j >= 0; j--) {
761 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
762 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
763 }
764 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
765
766 do {
767 fprintf(stderr,"%i", tok->strength);
768 tok = tok->next;
769 } while(tok != NULL);
770 fprintf(stderr, "\n");
771
772 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
773
774 do {
775 fprintf(stderr,"%i", tok->toInsert);
776 tok = tok->next;
777 } while(tok != NULL);
778 #endif
779
780 tok = lh->first;
781 uint32_t fStrength = UCOL_IDENTICAL;
782 uint32_t initStrength = UCOL_IDENTICAL;
783
784
785 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
786 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
787 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
788
789 while (tok != NULL && U_SUCCESS(*status)) {
790 fStrength = tok->strength;
791 if(fStrength < initStrength) {
792 initStrength = fStrength;
793 if(lh->pos[fStrength] == -1) {
794 while(lh->pos[fStrength] == -1 && fStrength > 0) {
795 fStrength--;
796 }
797 if(lh->pos[fStrength] == -1) {
798 *status = U_INTERNAL_PROGRAM_ERROR;
799 return;
800 }
801 }
802 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
803 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
804 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
805 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
806 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
807 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
808 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
809 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
810 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
811 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
812 } else { /* primaries */
813 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
814 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
815 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
816 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
817 }
818 } else {
819 if(tok->strength == UCOL_TERTIARY) {
820 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
821 } else if(tok->strength == UCOL_SECONDARY) {
822 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
824 } else if(tok->strength == UCOL_PRIMARY) {
825 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
826 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
827 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
828 }
829 }
830 ucol_doCE(src, CEparts, tok, status);
831 tok = tok->next;
832 }
833 }
834
ucol_createElements(UColTokenParser * src,tempUCATable * t,UColTokListHeader * lh,UErrorCode * status)835 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
836 UCAElements el;
837 UColToken *tok = lh->first;
838 UColToken *expt = NULL;
839 uint32_t i = 0, j = 0;
840 UChar32 fcdHighStart;
841 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
842
843 while(tok != NULL && U_SUCCESS(*status)) {
844 /* first, check if there are any expansions */
845 /* if there are expansions, we need to do a little bit more processing */
846 /* since parts of expansion can be tailored, while others are not */
847 if(tok->expansion != 0) {
848 uint32_t len = tok->expansion >> 24;
849 uint32_t currentSequenceLen = len;
850 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
851 //uint32_t exp = currentSequenceLen | expOffset;
852 UColToken exp;
853 exp.source = currentSequenceLen | expOffset;
854 exp.rulesToParse = src->source;
855
856 while(len > 0) {
857 currentSequenceLen = len;
858 while(currentSequenceLen > 0) {
859 exp.source = (currentSequenceLen << 24) | expOffset;
860 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
861 uint32_t noOfCEsToCopy = expt->noOfCEs;
862 for(j = 0; j<noOfCEsToCopy; j++) {
863 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
864 }
865 tok->noOfExpCEs += noOfCEsToCopy;
866 // Smart people never try to add codepoints and CEs.
867 // For some odd reason, it won't work.
868 expOffset += currentSequenceLen; //noOfCEsToCopy;
869 len -= currentSequenceLen; //noOfCEsToCopy;
870 break;
871 } else {
872 currentSequenceLen--;
873 }
874 }
875 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
876 /* will have to get one from UCA */
877 /* first, get the UChars from the rules */
878 /* then pick CEs out until there is no more and stuff them into expansion */
879 collIterate s;
880 uint32_t order = 0;
881 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
882
883 for(;;) {
884 order = ucol_getNextCE(src->UCA, &s, status);
885 if(order == UCOL_NO_MORE_CES) {
886 break;
887 }
888 tok->expCEs[tok->noOfExpCEs++] = order;
889 }
890 expOffset++;
891 len--;
892 }
893 }
894 } else {
895 tok->noOfExpCEs = 0;
896 }
897
898 /* set the ucaelement with obtained values */
899 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
900 /* copy CEs */
901 for(i = 0; i<tok->noOfCEs; i++) {
902 el.CEs[i] = tok->CEs[i];
903 }
904 for(i = 0; i<tok->noOfExpCEs; i++) {
905 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
906 }
907
908 /* copy UChars */
909 // We kept prefix and source kind of together, as it is a kind of a contraction.
910 // However, now we have to slice the prefix off the main thing -
911 el.prefix = el.prefixChars;
912 el.cPoints = el.uchars;
913 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
914 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
915 // decomposed elements to the unsaf table.
916 el.prefixSize = tok->prefix>>24;
917 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
918
919 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
920 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
921 } else {
922 el.prefixSize = 0;
923 *el.prefix = 0;
924
925 el.cSize = (tok->source >> 24);
926 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
927 }
928 if(src->UCA != NULL) {
929 for(i = 0; i<el.cSize; i++) {
930 if(UCOL_ISJAMO(el.cPoints[i])) {
931 t->image->jamoSpecial = TRUE;
932 }
933 }
934 if (!src->buildCCTabFlag && el.cSize > 0) {
935 // Check the trailing canonical combining class (tccc) of the last character.
936 const UChar *s = el.cPoints + el.cSize;
937 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
938 if ((fcd & 0xff) != 0) {
939 src->buildCCTabFlag = TRUE;
940 }
941 }
942 }
943
944 /* and then, add it */
945 #if UCOL_DEBUG==2
946 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
947 #endif
948 uprv_uca_addAnElement(t, &el, status);
949
950 #if UCOL_DEBUG_DUPLICATES
951 if(*status != U_ZERO_ERROR) {
952 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
953 *status = U_ZERO_ERROR;
954 }
955 #endif
956
957 tok = tok->next;
958 }
959 }
960
961 U_CDECL_BEGIN
962 static UBool U_CALLCONV
_processUCACompleteIgnorables(const void * context,UChar32 start,UChar32 limit,uint32_t value)963 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
964 UErrorCode status = U_ZERO_ERROR;
965 tempUCATable *t = (tempUCATable *)context;
966 if(value == 0) {
967 while(start < limit) {
968 uint32_t CE = utrie_get32(t->mapping, start, NULL);
969 if(CE == UCOL_NOT_FOUND) {
970 UCAElements el;
971 el.isThai = FALSE;
972 el.prefixSize = 0;
973 el.prefixChars[0] = 0;
974 el.prefix = el.prefixChars;
975 el.cPoints = el.uchars;
976
977 el.cSize = 0;
978 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
979
980 el.noOfCEs = 1;
981 el.CEs[0] = 0;
982 uprv_uca_addAnElement(t, &el, &status);
983
984 }
985 start++;
986 }
987 }
988 if(U_FAILURE(status)) {
989 return FALSE;
990 } else {
991 return TRUE;
992 }
993 }
994 U_CDECL_END
995
996 static void
ucol_uprv_bld_copyRangeFromUCA(UColTokenParser * src,tempUCATable * t,UChar32 start,UChar32 end,UErrorCode * status)997 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
998 UChar32 start, UChar32 end,
999 UErrorCode *status)
1000 {
1001 //UChar decomp[256];
1002 uint32_t CE = UCOL_NOT_FOUND;
1003 UChar32 u = 0;
1004 UCAElements el;
1005 el.isThai = FALSE;
1006 el.prefixSize = 0;
1007 el.prefixChars[0] = 0;
1008 collIterate colIt;
1009
1010 if(U_SUCCESS(*status)) {
1011 for(u = start; u<=end; u++) {
1012 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1013 /* this test is for contractions that are missing the starting element. */
1014 || ((isCntTableElement(CE)) &&
1015 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1016 )
1017 {
1018 el.cSize = 0;
1019 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1020 //decomp[0] = (UChar)u;
1021 //el.uchars[0] = (UChar)u;
1022 el.cPoints = el.uchars;
1023 //el.cSize = 1;
1024 el.noOfCEs = 0;
1025 el.prefix = el.prefixChars;
1026 el.prefixSize = 0;
1027 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1028 // We actually want to check whether this element is a special
1029 // If it is an implicit element (hangul, CJK - we want to copy the
1030 // special, not the resolved CEs) - for hangul, copying resolved
1031 // would just make things the same (there is an expansion and it
1032 // takes approximately the same amount of time to resolve as
1033 // falling back to the UCA).
1034 /*
1035 UTRIE_GET32(src->UCA->mapping, u, CE);
1036 tag = getCETag(CE);
1037 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1038 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1039 || tag == LEAD_SURROGATE_TAG) {
1040 el.CEs[el.noOfCEs++] = CE;
1041 } else {
1042 */
1043 // It turns out that it does not make sense to keep implicits
1044 // unresolved. The cost of resolving them is big enough so that
1045 // it doesn't make any difference whether we have to go to the UCA
1046 // or not.
1047 {
1048 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1049 while(CE != UCOL_NO_MORE_CES) {
1050 CE = ucol_getNextCE(src->UCA, &colIt, status);
1051 if(CE != UCOL_NO_MORE_CES) {
1052 el.CEs[el.noOfCEs++] = CE;
1053 }
1054 }
1055 }
1056 uprv_uca_addAnElement(t, &el, status);
1057 }
1058 }
1059 }
1060 }
1061
ucol_assembleTailoringTable(UColTokenParser * src,UErrorCode * status)1062 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1063 U_NAMESPACE_USE
1064
1065 uint32_t i = 0;
1066 if(U_FAILURE(*status)) {
1067 return NULL;
1068 }
1069 /*
1070 2. Eliminate the negative lists by doing the following for each non-null negative list:
1071 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1072 create new ListHeader X
1073 o reverse the list, add to the end of X's positive list. Reset the strength of the
1074 first item you add, based on the stronger strength levels of the two lists.
1075 */
1076 /*
1077 3. For each ListHeader with a non-null positive list:
1078 */
1079 /*
1080 o Find all character strings with CEs between the baseCE and the
1081 next/previous CE, at the strength of the first token. Add these to the
1082 tailoring.
1083 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1084 tailoring has & x < z...
1085 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1086 */
1087 /* It is possible that this part should be done even while constructing list */
1088 /* The problem is that it is unknown what is going to be the strongest weight */
1089 /* So we might as well do it here */
1090
1091 /*
1092 o Allocate CEs for each token in the list, based on the total number N of the
1093 largest level difference, and the gap G between baseCE and nextCE at that
1094 level. The relation * between the last item and nextCE is the same as the
1095 strongest strength.
1096 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1097 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1098 Then fit b and c into the secondary gap between a and d, then fit q
1099 into the tertiary gap between b and c.
1100
1101 o Example: baseCE << b <<< q << c * nextCE(X,2)
1102 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1103 Then fit q into the tertiary gap between b and c.
1104 o When incrementing primary values, we will not cross high byte
1105 boundaries except where there is only a single-byte primary. That is to
1106 ensure that the script reordering will continue to work.
1107 */
1108 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1109 /* test for NULL */
1110 if (image == NULL) {
1111 *status = U_MEMORY_ALLOCATION_ERROR;
1112 return NULL;
1113 }
1114 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1115
1116 for(i = 0; i<src->resultLen; i++) {
1117 /* now we need to generate the CEs */
1118 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1119 /* According to strength */
1120 if(U_SUCCESS(*status)) {
1121 if(src->lh[i].first) { // if there are any elements
1122 // due to the way parser works, subsequent tailorings
1123 // may remove all the elements from a sequence, therefore
1124 // leaving an empty tailoring sequence.
1125 ucol_initBuffers(src, &src->lh[i], status);
1126 }
1127 }
1128 if(U_FAILURE(*status)) {
1129 uprv_free(image);
1130 return NULL;
1131 }
1132 }
1133
1134 if(src->varTop != NULL) { /* stuff the variable top value */
1135 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1136 /* remove it from the list */
1137 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1138 src->varTop->listHeader->first = src->varTop->next;
1139 }
1140 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1141 src->varTop->listHeader->last = src->varTop->previous;
1142 }
1143 if(src->varTop->next != NULL) {
1144 src->varTop->next->previous = src->varTop->previous;
1145 }
1146 if(src->varTop->previous != NULL) {
1147 src->varTop->previous->next = src->varTop->next;
1148 }
1149 }
1150
1151
1152 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1153 if(U_FAILURE(*status)) {
1154 uprv_free(image);
1155 return NULL;
1156 }
1157
1158
1159 /* After this, we have assigned CE values to all regular CEs */
1160 /* now we will go through list once more and resolve expansions, */
1161 /* make UCAElements structs and add them to table */
1162 for(i = 0; i<src->resultLen; i++) {
1163 /* now we need to generate the CEs */
1164 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1165 /* According to strength */
1166 if(U_SUCCESS(*status)) {
1167 ucol_createElements(src, t, &src->lh[i], status);
1168 }
1169 }
1170
1171 UCAElements el;
1172 el.isThai = FALSE;
1173 el.prefixSize = 0;
1174 el.prefixChars[0] = 0;
1175
1176 /* add latin-1 stuff */
1177 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1178
1179 /* add stuff for copying */
1180 if(src->copySet != NULL) {
1181 int32_t i = 0;
1182 UnicodeSet *set = (UnicodeSet *)src->copySet;
1183 for(i = 0; i < set->getRangeCount(); i++) {
1184 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1185 }
1186 }
1187
1188 if(U_SUCCESS(*status)) {
1189 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1190
1191 uint32_t tailoredCE = UCOL_NOT_FOUND;
1192 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1193 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1194 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1195 // Check for null pointer
1196 if (ucaEl == NULL) {
1197 *status = U_MEMORY_ALLOCATION_ERROR;
1198 return NULL;
1199 }
1200 while(*conts != 0) {
1201 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1202 tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1203 if(tailoredCE != UCOL_NOT_FOUND) {
1204 UBool needToAdd = TRUE;
1205 if(isCntTableElement(tailoredCE)) {
1206 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1207 needToAdd = FALSE;
1208 }
1209 }
1210 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1211 UCAElements elm;
1212 elm.cPoints = el.uchars;
1213 elm.noOfCEs = 0;
1214 elm.uchars[0] = *conts;
1215 elm.uchars[1] = 0;
1216 elm.cSize = 1;
1217 elm.prefixChars[0] = *(conts+2);
1218 elm.isThai = FALSE;
1219 elm.prefix = elm.prefixChars;
1220 elm.prefixSize = 1;
1221 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1222 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1223 needToAdd = TRUE;
1224 }
1225 }
1226 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1227 needToAdd = FALSE;
1228 }
1229
1230 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1231 if (*(conts+1) != 0) { // contractions
1232 el.prefix = el.prefixChars;
1233 el.prefixSize = 0;
1234 el.cPoints = el.uchars;
1235 el.noOfCEs = 0;
1236 el.uchars[0] = *conts;
1237 el.uchars[1] = *(conts+1);
1238 if(*(conts+2)!=0) {
1239 el.uchars[2] = *(conts+2);
1240 el.cSize = 3;
1241 } else {
1242 el.cSize = 2;
1243 }
1244 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1245 }
1246 else { // pre-context character
1247 UChar str[4] = { 0 };
1248 int32_t len=0;
1249 int32_t preKeyLen=0;
1250
1251 el.cPoints = el.uchars;
1252 el.noOfCEs = 0;
1253 el.uchars[0] = *conts;
1254 el.uchars[1] = 0;
1255 el.cSize = 1;
1256 el.prefixChars[0] = *(conts+2);
1257 el.prefix = el.prefixChars;
1258 el.prefixSize = 1;
1259 if (el.prefixChars[0]!=0) {
1260 // get CE of prefix character first
1261 str[0]=el.prefixChars[0];
1262 str[1]=0;
1263 ucol_setText(ucaEl, str, 1, status);
1264 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1265 != UCOL_NULLORDER) {
1266 preKeyLen++; // count number of keys for prefix character
1267 }
1268 str[len++] = el.prefixChars[0];
1269 }
1270
1271 str[len++] = el.uchars[0];
1272 str[len]=0;
1273 ucol_setText(ucaEl, str, len, status);
1274 // Skip the keys for prefix character, then copy the rest to el.
1275 while ((preKeyLen-->0) &&
1276 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1277 continue;
1278 }
1279
1280 }
1281 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1282 el.noOfCEs++;
1283 }
1284 uprv_uca_addAnElement(t, &el, status);
1285 }
1286
1287 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1288 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1289 }
1290 conts+=3;
1291 }
1292 ucol_closeElements(ucaEl);
1293 }
1294
1295 // Add completely ignorable elements
1296 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1297
1298 // add tailoring characters related canonical closures
1299 uprv_uca_canonicalClosure(t, src, status);
1300
1301 /* still need to produce compatibility closure */
1302
1303 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1304
1305 uprv_uca_closeTempTable(t);
1306 uprv_free(image);
1307
1308 return myData;
1309 }
1310
1311 U_CDECL_BEGIN
1312 static UBool U_CALLCONV
ucol_bld_cleanup(void)1313 ucol_bld_cleanup(void)
1314 {
1315 udata_close(invUCA_DATA_MEM);
1316 invUCA_DATA_MEM = NULL;
1317 _staticInvUCA = NULL;
1318 return TRUE;
1319 }
1320 U_CDECL_END
1321
1322 U_CAPI const InverseUCATableHeader * U_EXPORT2
ucol_initInverseUCA(UErrorCode * status)1323 ucol_initInverseUCA(UErrorCode *status)
1324 {
1325 if(U_FAILURE(*status)) return NULL;
1326
1327 UBool needsInit;
1328 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1329
1330 if(needsInit) {
1331 InverseUCATableHeader *newInvUCA = NULL;
1332 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1333
1334 if(U_FAILURE(*status)) {
1335 if (result) {
1336 udata_close(result);
1337 }
1338 // This is not needed, as we are talking about
1339 // memory we got from UData
1340 //uprv_free(newInvUCA);
1341 }
1342
1343 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1344 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1345 UCollator *UCA = ucol_initUCA(status);
1346 // UCA versions of UCA and inverse UCA should match
1347 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1348 *status = U_INVALID_FORMAT_ERROR;
1349 udata_close(result);
1350 return NULL;
1351 }
1352
1353 umtx_lock(NULL);
1354 if(_staticInvUCA == NULL) {
1355 invUCA_DATA_MEM = result;
1356 _staticInvUCA = newInvUCA;
1357 result = NULL;
1358 newInvUCA = NULL;
1359 }
1360 umtx_unlock(NULL);
1361
1362 if(newInvUCA != NULL) {
1363 udata_close(result);
1364 // This is not needed, as we are talking about
1365 // memory we got from UData
1366 //uprv_free(newInvUCA);
1367 }
1368 else {
1369 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1370 }
1371 }
1372 }
1373 return _staticInvUCA;
1374 }
1375
1376 #endif /* #if !UCONFIG_NO_COLLATION */
1377