1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_tok.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module reads a tailoring rule string and produces a list of
17 * tokens that will be turned into collation elements
18 *
19 */
20
21 #include "unicode/utypes.h"
22
23 #if !UCONFIG_NO_COLLATION
24
25 #include "unicode/ustring.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28
29 #include "ucol_tok.h"
30 #include "cmemory.h"
31 #include "../common/util.h"
32
33 U_CDECL_BEGIN
34 static int32_t U_CALLCONV
uhash_hashTokens(const UHashTok k)35 uhash_hashTokens(const UHashTok k)
36 {
37 int32_t hash = 0;
38 //uint32_t key = (uint32_t)k.integer;
39 UColToken *key = (UColToken *)k.pointer;
40 if (key != 0) {
41 //int32_t len = (key & 0xFF000000)>>24;
42 int32_t len = (key->source & 0xFF000000)>>24;
43 int32_t inc = ((len - 32) / 32) + 1;
44
45 //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
46 const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
47 const UChar *limit = p + len;
48
49 while (p<limit) {
50 hash = (hash * 37) + *p;
51 p += inc;
52 }
53 }
54 return hash;
55 }
56
57 static UBool U_CALLCONV
uhash_compareTokens(const UHashTok key1,const UHashTok key2)58 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
59 {
60 //uint32_t p1 = (uint32_t) key1.integer;
61 //uint32_t p2 = (uint32_t) key2.integer;
62 UColToken *p1 = (UColToken *)key1.pointer;
63 UColToken *p2 = (UColToken *)key2.pointer;
64 const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
65 const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
66 uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
67 uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
68 const UChar *end = s1+s1L-1;
69
70 if (p1 == p2) {
71 return TRUE;
72 }
73 if (p1->source == 0 || p2->source == 0) {
74 return FALSE;
75 }
76 if(s1L != s2L) {
77 return FALSE;
78 }
79 if(p1->source == p2->source) {
80 return TRUE;
81 }
82 while((s1 < end) && *s1 == *s2) {
83 ++s1;
84 ++s2;
85 }
86 if(*s1 == *s2) {
87 return TRUE;
88 } else {
89 return FALSE;
90 }
91 }
92 U_CDECL_END
93
94 /*static inline void U_CALLCONV
95 uhash_freeBlockWrapper(void *obj) {
96 uhash_freeBlock(obj);
97 }*/
98
99
/* CE boundaries recorded for one indirect (symbolic) reset position. */
typedef struct {
    uint32_t startCE;       /* CE of the position itself                       */
    uint32_t startContCE;   /* continuation of startCE; 0 when there is none   */
    uint32_t limitCE;       /* upper-bound CE; 0 when no limit was supplied    */
    uint32_t limitContCE;   /* continuation of limitCE; 0 when no limit given  */
} indirectBoundaries;

/*
 * These values are used for finding CE values for indirect positioning.
 * Indirect positioning is a mechanism for allowing resets on symbolic values.
 * It only works for resets; indirect names themselves cannot be tailored.
 *
 * An indirect name can define either an anchor point or a range.  An anchor
 * point behaves in exactly the same way as a code point in a reset would,
 * except that it cannot be tailored.  A range (currently only the [top] range
 * is known) explicitly sets the upper bound for generated CEs, thus allowing
 * better control over how many CEs can be squeezed into the range without a
 * performance penalty.  In that respect, [top] is used for tailoring locales
 * that use CJK characters.
 *
 * Other indirect values are currently a pure convenience: they can be used to
 * ensure that CEs are always positioned in the same place relative to a point
 * with known properties (e.g. first primary ignorable).
 */
static indirectBoundaries ucolIndirectBoundaries[15];
122 /*
123 static indirectBoundaries ucolIndirectBoundaries[11] = {
124 { UCOL_RESET_TOP_VALUE, 0,
125 UCOL_NEXT_TOP_VALUE, 0 },
126 { UCOL_FIRST_PRIMARY_IGNORABLE, 0,
127 0, 0 },
128 { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
129 0, 0 },
130 { UCOL_FIRST_SECONDARY_IGNORABLE, 0,
131 0, 0 },
132 { UCOL_LAST_SECONDARY_IGNORABLE, 0,
133 0, 0 },
134 { UCOL_FIRST_TERTIARY_IGNORABLE, 0,
135 0, 0 },
136 { UCOL_LAST_TERTIARY_IGNORABLE, 0,
137 0, 0 },
138 { UCOL_FIRST_VARIABLE, 0,
139 0, 0 },
140 { UCOL_LAST_VARIABLE, 0,
141 0, 0 },
142 { UCOL_FIRST_NON_VARIABLE, 0,
143 0, 0 },
144 { UCOL_LAST_NON_VARIABLE, 0,
145 0, 0 },
146 };
147 */
148
setIndirectBoundaries(uint32_t indexR,uint32_t * start,uint32_t * end)149 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
150
151 // Set values for the top - TODO: once we have values for all the indirects, we are going
152 // to initalize here.
153 ucolIndirectBoundaries[indexR].startCE = start[0];
154 ucolIndirectBoundaries[indexR].startContCE = start[1];
155 if(end) {
156 ucolIndirectBoundaries[indexR].limitCE = end[0];
157 ucolIndirectBoundaries[indexR].limitContCE = end[1];
158 } else {
159 ucolIndirectBoundaries[indexR].limitCE = 0;
160 ucolIndirectBoundaries[indexR].limitContCE = 0;
161 }
162 }
163
164
165 static inline
syntaxError(const UChar * rules,int32_t pos,int32_t rulesLen,UParseError * parseError)166 void syntaxError(const UChar* rules,
167 int32_t pos,
168 int32_t rulesLen,
169 UParseError* parseError)
170 {
171 parseError->offset = pos;
172 parseError->line = 0 ; /* we are not using line numbers */
173
174 // for pre-context
175 int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
176 int32_t stop = pos;
177
178 u_memcpy(parseError->preContext,rules+start,stop-start);
179 //null terminate the buffer
180 parseError->preContext[stop-start] = 0;
181
182 //for post-context
183 start = pos+1;
184 stop = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
185 rulesLen;
186
187 if(start < stop) {
188 u_memcpy(parseError->postContext,rules+start,stop-start);
189 //null terminate the buffer
190 parseError->postContext[stop-start]= 0;
191 } else {
192 parseError->postContext[0] = 0;
193 }
194 }
195
196 static
ucol_uprv_tok_setOptionInImage(UColOptionSet * opts,UColAttribute attrib,UColAttributeValue value)197 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
198 switch(attrib) {
199 case UCOL_HIRAGANA_QUATERNARY_MODE:
200 opts->hiraganaQ = value;
201 break;
202 case UCOL_FRENCH_COLLATION:
203 opts->frenchCollation = value;
204 break;
205 case UCOL_ALTERNATE_HANDLING:
206 opts->alternateHandling = value;
207 break;
208 case UCOL_CASE_FIRST:
209 opts->caseFirst = value;
210 break;
211 case UCOL_CASE_LEVEL:
212 opts->caseLevel = value;
213 break;
214 case UCOL_NORMALIZATION_MODE:
215 opts->normalizationMode = value;
216 break;
217 case UCOL_STRENGTH:
218 opts->strength = value;
219 break;
220 case UCOL_NUMERIC_COLLATION:
221 opts->numericCollation = value;
222 break;
223 case UCOL_ATTRIBUTE_COUNT:
224 default:
225 break;
226 }
227 }
228
229 #define UTOK_OPTION_COUNT 20
230
231 static UBool didInit = FALSE;
232 /* we can be strict, or we can be lenient */
233 /* I'd surely be lenient with the option arguments */
234 /* maybe even with options */
235 U_STRING_DECL(suboption_00, "non-ignorable", 13);
236 U_STRING_DECL(suboption_01, "shifted", 7);
237
238 U_STRING_DECL(suboption_02, "lower", 5);
239 U_STRING_DECL(suboption_03, "upper", 5);
240 U_STRING_DECL(suboption_04, "off", 3);
241 U_STRING_DECL(suboption_05, "on", 2);
242 U_STRING_DECL(suboption_06, "1", 1);
243 U_STRING_DECL(suboption_07, "2", 1);
244 U_STRING_DECL(suboption_08, "3", 1);
245 U_STRING_DECL(suboption_09, "4", 1);
246 U_STRING_DECL(suboption_10, "I", 1);
247
248 U_STRING_DECL(suboption_11, "primary", 7);
249 U_STRING_DECL(suboption_12, "secondary", 9);
250 U_STRING_DECL(suboption_13, "tertiary", 8);
251 U_STRING_DECL(suboption_14, "variable", 8);
252 U_STRING_DECL(suboption_15, "regular", 7);
253 U_STRING_DECL(suboption_16, "implicit", 8);
254 U_STRING_DECL(suboption_17, "trailing", 8);
255
256
257 U_STRING_DECL(option_00, "undefined", 9);
258 U_STRING_DECL(option_01, "rearrange", 9);
259 U_STRING_DECL(option_02, "alternate", 9);
260 U_STRING_DECL(option_03, "backwards", 9);
261 U_STRING_DECL(option_04, "variable top", 12);
262 U_STRING_DECL(option_05, "top", 3);
263 U_STRING_DECL(option_06, "normalization", 13);
264 U_STRING_DECL(option_07, "caseLevel", 9);
265 U_STRING_DECL(option_08, "caseFirst", 9);
266 U_STRING_DECL(option_09, "scriptOrder", 11);
267 U_STRING_DECL(option_10, "charsetname", 11);
268 U_STRING_DECL(option_11, "charset", 7);
269 U_STRING_DECL(option_12, "before", 6);
270 U_STRING_DECL(option_13, "hiraganaQ", 9);
271 U_STRING_DECL(option_14, "strength", 8);
272 U_STRING_DECL(option_15, "first", 5);
273 U_STRING_DECL(option_16, "last", 4);
274 U_STRING_DECL(option_17, "optimize", 8);
275 U_STRING_DECL(option_18, "suppressContractions", 20);
276 U_STRING_DECL(option_19, "numericOrdering", 15);
277
278
279 /*
280 [last variable] last variable value
281 [last primary ignorable] largest CE for primary ignorable
282 [last secondary ignorable] largest CE for secondary ignorable
283 [last tertiary ignorable] largest CE for tertiary ignorable
284 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
285 */
286
287
288 static const ucolTokSuboption alternateSub[2] = {
289 {suboption_00, 13, UCOL_NON_IGNORABLE},
290 {suboption_01, 7, UCOL_SHIFTED}
291 };
292
293 static const ucolTokSuboption caseFirstSub[3] = {
294 {suboption_02, 5, UCOL_LOWER_FIRST},
295 {suboption_03, 5, UCOL_UPPER_FIRST},
296 {suboption_04, 3, UCOL_OFF},
297 };
298
299 static const ucolTokSuboption onOffSub[2] = {
300 {suboption_04, 3, UCOL_OFF},
301 {suboption_05, 2, UCOL_ON}
302 };
303
304 static const ucolTokSuboption frenchSub[1] = {
305 {suboption_07, 1, UCOL_ON}
306 };
307
308 static const ucolTokSuboption beforeSub[3] = {
309 {suboption_06, 1, UCOL_PRIMARY},
310 {suboption_07, 1, UCOL_SECONDARY},
311 {suboption_08, 1, UCOL_TERTIARY}
312 };
313
314 static const ucolTokSuboption strengthSub[5] = {
315 {suboption_06, 1, UCOL_PRIMARY},
316 {suboption_07, 1, UCOL_SECONDARY},
317 {suboption_08, 1, UCOL_TERTIARY},
318 {suboption_09, 1, UCOL_QUATERNARY},
319 {suboption_10, 1, UCOL_IDENTICAL},
320 };
321
322 static const ucolTokSuboption firstLastSub[7] = {
323 {suboption_11, 7, UCOL_PRIMARY},
324 {suboption_12, 9, UCOL_PRIMARY},
325 {suboption_13, 8, UCOL_PRIMARY},
326 {suboption_14, 8, UCOL_PRIMARY},
327 {suboption_15, 7, UCOL_PRIMARY},
328 {suboption_16, 8, UCOL_PRIMARY},
329 {suboption_17, 8, UCOL_PRIMARY},
330 };
331
/*
 * Indexes into rulesOptions.  The order of the enumerators must match the
 * order of the entries in that table; the trailing comments give the index.
 * Options up to OPTION_NORMAL_OPTIONS_LIMIT are the ones that
 * ucol_uprv_tok_readAndSetOption stores directly in the option set.
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,                              /*  0 */
    OPTION_FRENCH_COLLATION,                                    /*  1 */
    OPTION_CASE_LEVEL,                                          /*  2 */
    OPTION_CASE_FIRST,                                          /*  3 */
    OPTION_NORMALIZATION_MODE,                                  /*  4 */
    OPTION_HIRAGANA_QUATERNARY,                                 /*  5 */
    OPTION_STRENGTH,                                            /*  6 */
    OPTION_NUMERIC_COLLATION,                                   /*  7 */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,     /*  7 (alias) */
    OPTION_VARIABLE_TOP,                                        /*  8 */
    OPTION_REARRANGE,                                           /*  9 */
    OPTION_BEFORE,                                              /* 10 */
    OPTION_TOP,                                                 /* 11 */
    OPTION_FIRST,                                               /* 12 */
    OPTION_LAST,                                                /* 13 */
    OPTION_OPTIMIZE,                                            /* 14 */
    OPTION_SUPPRESS_CONTRACTIONS,                               /* 15 */
    OPTION_UNDEFINED,                                           /* 16 */
    OPTION_SCRIPT_ORDER,                                        /* 17 */
    OPTION_CHARSET_NAME,                                        /* 18 */
    OPTION_CHARSET                                              /* 19 */
};
355
356 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
357 /*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
358 /*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
359 /*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
360 /*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
361 /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
362 /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
363 /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
364 /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
365 /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
366 /*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
367 /*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
368 /*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
369 /*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
370 /*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
371 /*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
372 /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
373 /*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
374 /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
375 /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
376 /*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */
377 };
378
379 static
u_strncmpNoCase(const UChar * s1,const UChar * s2,int32_t n)380 int32_t u_strncmpNoCase(const UChar *s1,
381 const UChar *s2,
382 int32_t n)
383 {
384 if(n > 0) {
385 int32_t rc;
386 for(;;) {
387 rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
388 if(rc != 0 || *s1 == 0 || --n == 0) {
389 return rc;
390 }
391 ++s1;
392 ++s2;
393 }
394 }
395 return 0;
396 }
397
398 static
ucol_uprv_tok_initData()399 void ucol_uprv_tok_initData() {
400 if(!didInit) {
401 U_STRING_INIT(suboption_00, "non-ignorable", 13);
402 U_STRING_INIT(suboption_01, "shifted", 7);
403
404 U_STRING_INIT(suboption_02, "lower", 5);
405 U_STRING_INIT(suboption_03, "upper", 5);
406 U_STRING_INIT(suboption_04, "off", 3);
407 U_STRING_INIT(suboption_05, "on", 2);
408
409 U_STRING_INIT(suboption_06, "1", 1);
410 U_STRING_INIT(suboption_07, "2", 1);
411 U_STRING_INIT(suboption_08, "3", 1);
412 U_STRING_INIT(suboption_09, "4", 1);
413 U_STRING_INIT(suboption_10, "I", 1);
414
415 U_STRING_INIT(suboption_11, "primary", 7);
416 U_STRING_INIT(suboption_12, "secondary", 9);
417 U_STRING_INIT(suboption_13, "tertiary", 8);
418 U_STRING_INIT(suboption_14, "variable", 8);
419 U_STRING_INIT(suboption_15, "regular", 7);
420 U_STRING_INIT(suboption_16, "implicit", 8);
421 U_STRING_INIT(suboption_17, "trailing", 8);
422
423
424 U_STRING_INIT(option_00, "undefined", 9);
425 U_STRING_INIT(option_01, "rearrange", 9);
426 U_STRING_INIT(option_02, "alternate", 9);
427 U_STRING_INIT(option_03, "backwards", 9);
428 U_STRING_INIT(option_04, "variable top", 12);
429 U_STRING_INIT(option_05, "top", 3);
430 U_STRING_INIT(option_06, "normalization", 13);
431 U_STRING_INIT(option_07, "caseLevel", 9);
432 U_STRING_INIT(option_08, "caseFirst", 9);
433 U_STRING_INIT(option_09, "scriptOrder", 11);
434 U_STRING_INIT(option_10, "charsetname", 11);
435 U_STRING_INIT(option_11, "charset", 7);
436 U_STRING_INIT(option_12, "before", 6);
437 U_STRING_INIT(option_13, "hiraganaQ", 9);
438 U_STRING_INIT(option_14, "strength", 8);
439 U_STRING_INIT(option_15, "first", 5);
440 U_STRING_INIT(option_16, "last", 4);
441 U_STRING_INIT(option_17, "optimize", 8);
442 U_STRING_INIT(option_18, "suppressContractions", 20);
443 U_STRING_INIT(option_19, "numericOrdering", 15);
444 didInit = TRUE;
445 }
446 }
447
448
449 // This function reads basic options to set in the runtime collator
450 // used by data driven tests. Should not support build time options
451 U_CAPI const UChar * U_EXPORT2
ucol_tok_getNextArgument(const UChar * start,const UChar * end,UColAttribute * attrib,UColAttributeValue * value,UErrorCode * status)452 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
453 UColAttribute *attrib, UColAttributeValue *value,
454 UErrorCode *status)
455 {
456 uint32_t i = 0;
457 int32_t j=0;
458 UBool foundOption = FALSE;
459 const UChar *optionArg = NULL;
460
461 ucol_uprv_tok_initData();
462
463 while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
464 start++;
465 }
466 if(start >= end) {
467 return NULL;
468 }
469 /* skip opening '[' */
470 if(*start == 0x005b) {
471 start++;
472 } else {
473 *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
474 return NULL;
475 }
476
477 while(i < UTOK_OPTION_COUNT) {
478 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
479 foundOption = TRUE;
480 if(end - start > rulesOptions[i].optionLen) {
481 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
482 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
483 optionArg++;
484 }
485 }
486 break;
487 }
488 i++;
489 }
490
491 if(!foundOption) {
492 *status = U_ILLEGAL_ARGUMENT_ERROR;
493 return NULL;
494 }
495
496 if(optionArg) {
497 for(j = 0; j<rulesOptions[i].subSize; j++) {
498 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
499 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
500 *attrib = rulesOptions[i].attr;
501 *value = rulesOptions[i].subopts[j].attrVal;
502 optionArg += rulesOptions[i].subopts[j].subLen;
503 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
504 optionArg++;
505 }
506 if(*optionArg == 0x005d) {
507 optionArg++;
508 return optionArg;
509 } else {
510 *status = U_ILLEGAL_ARGUMENT_ERROR;
511 return NULL;
512 }
513 }
514 }
515 }
516 *status = U_ILLEGAL_ARGUMENT_ERROR;
517 return NULL;
518 }
519
520 static
ucol_uprv_tok_readAndSetUnicodeSet(const UChar * start,const UChar * end,UErrorCode * status)521 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
522 while(*start != 0x005b) { /* advance while we find the first '[' */
523 start++;
524 }
525 // now we need to get a balanced set of '[]'. The problem is that a set can have
526 // many, and *end point to the first closing '['
527 int32_t noOpenBraces = 1;
528 int32_t current = 1; // skip the opening brace
529 while(start+current < end && noOpenBraces != 0) {
530 if(start[current] == 0x005b) {
531 noOpenBraces++;
532 } else if(start[current] == 0x005D) { // closing brace
533 noOpenBraces--;
534 }
535 current++;
536 }
537
538 if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
539 *status = U_ILLEGAL_ARGUMENT_ERROR;
540 return NULL;
541 }
542 return uset_openPattern(start, current, status);
543 }
544
545 static
ucol_uprv_tok_readOption(const UChar * start,const UChar * end,const UChar ** optionArg)546 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
547 int32_t i = 0;
548 ucol_uprv_tok_initData();
549
550 while(u_isWhitespace(*start)) { /* eat whitespace */
551 start++;
552 }
553 while(i < UTOK_OPTION_COUNT) {
554 if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
555 if(end - start > rulesOptions[i].optionLen) {
556 *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
557 while(u_isWhitespace(**optionArg)) { /* eat whitespace */
558 (*optionArg)++;
559 }
560 }
561 break;
562 }
563 i++;
564 }
565 if(i == UTOK_OPTION_COUNT) {
566 i = -1; // didn't find an option
567 }
568 return i;
569 }
570
571
572 // reads and conforms to various options in rules
573 // end is the position of the first closing ']'
574 // However, some of the options take an UnicodeSet definition
575 // which needs to duplicate the closing ']'
576 // for example: '[copy [\uAC00-\uD7FF]]'
577 // These options will move end to the second ']' and the
578 // caller will set the current to it.
579 static
ucol_uprv_tok_readAndSetOption(UColTokenParser * src,UErrorCode * status)580 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
581 const UChar* start = src->current;
582 int32_t i = 0;
583 int32_t j=0;
584 const UChar *optionArg = NULL;
585
586 uint8_t result = 0;
587
588 start++; /*skip opening '['*/
589 i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
590 if(optionArg) {
591 src->current = optionArg;
592 }
593
594 if(i < 0) {
595 *status = U_ILLEGAL_ARGUMENT_ERROR;
596 } else {
597 int32_t noOpenBraces = 1;
598 switch(i) {
599 case OPTION_ALTERNATE_HANDLING:
600 case OPTION_FRENCH_COLLATION:
601 case OPTION_CASE_LEVEL:
602 case OPTION_CASE_FIRST:
603 case OPTION_NORMALIZATION_MODE:
604 case OPTION_HIRAGANA_QUATERNARY:
605 case OPTION_STRENGTH:
606 case OPTION_NUMERIC_COLLATION:
607 if(optionArg) {
608 for(j = 0; j<rulesOptions[i].subSize; j++) {
609 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
610 ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
611 result = UCOL_TOK_SUCCESS;
612 }
613 }
614 }
615 if(result == 0) {
616 *status = U_ILLEGAL_ARGUMENT_ERROR;
617 }
618 break;
619 case OPTION_VARIABLE_TOP:
620 result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
621 break;
622 case OPTION_REARRANGE:
623 result = UCOL_TOK_SUCCESS;
624 break;
625 case OPTION_BEFORE:
626 if(optionArg) {
627 for(j = 0; j<rulesOptions[i].subSize; j++) {
628 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
629 result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
630 }
631 }
632 }
633 if(result == 0) {
634 *status = U_ILLEGAL_ARGUMENT_ERROR;
635 }
636 break;
637 case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
638 /* index to this array will be src->parsedToken.indirectIndex*/
639 src->parsedToken.indirectIndex = 0;
640 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
641 break;
642 case OPTION_FIRST:
643 case OPTION_LAST: /* first, last */
644 for(j = 0; j<rulesOptions[i].subSize; j++) {
645 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
646 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
647 // element of indirect boundaries is reserved for top.
648 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
649 result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
650 }
651 }
652 if(result == 0) {
653 *status = U_ILLEGAL_ARGUMENT_ERROR;
654 }
655 break;
656 case OPTION_OPTIMIZE:
657 case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
658 // we need to move end here
659 src->current++; // skip opening brace
660 while(src->current < src->end && noOpenBraces != 0) {
661 if(*src->current == 0x005b) {
662 noOpenBraces++;
663 } else if(*src->current == 0x005D) { // closing brace
664 noOpenBraces--;
665 }
666 src->current++;
667 }
668 result = UCOL_TOK_SUCCESS;
669 break;
670 default:
671 *status = U_UNSUPPORTED_ERROR;
672 break;
673 }
674 }
675 src->current = u_memchr(src->current, 0x005d, src->end-src->current);
676 return result;
677 }
678
679
ucol_tok_addToExtraCurrent(UColTokenParser * src,const UChar * stuff,int32_t len,UErrorCode * status)680 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
681 if(src->extraCurrent+len >= src->extraEnd) {
682 /* reallocate */
683 UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
684 if(newSrc != NULL) {
685 src->current = newSrc + (src->current - src->source);
686 src->extraCurrent = newSrc + (src->extraCurrent - src->source);
687 src->end = newSrc + (src->end - src->source);
688 src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
689 src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
690 src->source = newSrc;
691 } else {
692 *status = U_MEMORY_ALLOCATION_ERROR;
693 }
694 }
695 if(len == 1) {
696 *src->extraCurrent++ = *stuff;
697 } else {
698 uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
699 src->extraCurrent += len;
700 }
701
702
703 }
704
ucol_tok_doSetTop(UColTokenParser * src,UErrorCode * status)705 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
706 /*
707 top = TRUE;
708 */
709 UChar buff[5];
710 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
711 buff[0] = 0xFFFE;
712 buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
713 buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
714 if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
715 src->parsedToken.charsLen = 3;
716 ucol_tok_addToExtraCurrent(src, buff, 3, status);
717 } else {
718 buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
719 buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
720 src->parsedToken.charsLen = 5;
721 ucol_tok_addToExtraCurrent(src, buff, 5, status);
722 }
723 return TRUE;
724 }
725
isCharNewLine(UChar c)726 static UBool isCharNewLine(UChar c){
727 switch(c){
728 case 0x000A: /* LF */
729 case 0x000D: /* CR */
730 case 0x000C: /* FF */
731 case 0x0085: /* NEL */
732 case 0x2028: /* LS */
733 case 0x2029: /* PS */
734 return TRUE;
735 default:
736 return FALSE;
737 }
738 }
739
740 U_CAPI const UChar* U_EXPORT2
ucol_tok_parseNextToken(UColTokenParser * src,UBool startOfRules,UParseError * parseError,UErrorCode * status)741 ucol_tok_parseNextToken(UColTokenParser *src,
742 UBool startOfRules,
743 UParseError *parseError,
744 UErrorCode *status)
745 {
746 /* parsing part */
747 UBool variableTop = FALSE;
748 UBool top = FALSE;
749 UBool inChars = TRUE;
750 UBool inQuote = FALSE;
751 UBool wasInQuote = FALSE;
752 uint8_t before = 0;
753 UBool isEscaped = FALSE;
754 // TODO: replace these variables with src->parsedToken counterparts
755 // no need to use them anymore since we have src->parsedToken.
756 // Ideally, token parser would be a nice class... Once, when I have
757 // more time (around 2020 probably).
758 uint32_t newExtensionLen = 0;
759 uint32_t extensionOffset = 0;
760 uint32_t newStrength = UCOL_TOK_UNSET;
761 UChar buff[10];
762
763 src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
764 src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
765 src->parsedToken.indirectIndex = 0;
766
767 while (src->current < src->end) {
768 UChar ch = *(src->current);
769
770 if (inQuote) {
771 if (ch == 0x0027/*'\''*/) {
772 inQuote = FALSE;
773 } else {
774 if ((src->parsedToken.charsLen == 0) || inChars) {
775 if(src->parsedToken.charsLen == 0) {
776 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
777 }
778 src->parsedToken.charsLen++;
779 } else {
780 if(newExtensionLen == 0) {
781 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
782 }
783 newExtensionLen++;
784 }
785 }
786 }else if(isEscaped){
787 isEscaped =FALSE;
788 if (newStrength == UCOL_TOK_UNSET) {
789 *status = U_INVALID_FORMAT_ERROR;
790 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
791 return NULL;
792 // enabling rules to start with non-tokens a < b
793 // newStrength = UCOL_TOK_RESET;
794 }
795 if(ch != 0x0000 && src->current != src->end) {
796 if (inChars) {
797 if(src->parsedToken.charsLen == 0) {
798 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
799 }
800 src->parsedToken.charsLen++;
801 } else {
802 if(newExtensionLen == 0) {
803 extensionOffset = (uint32_t)(src->current - src->source);
804 }
805 newExtensionLen++;
806 }
807 }
808 }else {
809 if(!uprv_isRuleWhiteSpace(ch)) {
810 /* Sets the strength for this entry */
811 switch (ch) {
812 case 0x003D/*'='*/ :
813 if (newStrength != UCOL_TOK_UNSET) {
814 goto EndOfLoop;
815 }
816
817 /* if we start with strength, we'll reset to top */
818 if(startOfRules == TRUE) {
819 src->parsedToken.indirectIndex = 5;
820 top = ucol_tok_doSetTop(src, status);
821 newStrength = UCOL_TOK_RESET;
822 goto EndOfLoop;
823 }
824 newStrength = UCOL_IDENTICAL;
825 break;
826
827 case 0x002C/*','*/:
828 if (newStrength != UCOL_TOK_UNSET) {
829 goto EndOfLoop;
830 }
831
832 /* if we start with strength, we'll reset to top */
833 if(startOfRules == TRUE) {
834 src->parsedToken.indirectIndex = 5;
835 top = ucol_tok_doSetTop(src, status);
836 newStrength = UCOL_TOK_RESET;
837 goto EndOfLoop;
838 }
839 newStrength = UCOL_TERTIARY;
840 break;
841
842 case 0x003B/*';'*/:
843 if (newStrength != UCOL_TOK_UNSET) {
844 goto EndOfLoop;
845 }
846
847 /* if we start with strength, we'll reset to top */
848 if(startOfRules == TRUE) {
849 src->parsedToken.indirectIndex = 5;
850 top = ucol_tok_doSetTop(src, status);
851 newStrength = UCOL_TOK_RESET;
852 goto EndOfLoop;
853 }
854 newStrength = UCOL_SECONDARY;
855 break;
856
857 case 0x003C/*'<'*/:
858 if (newStrength != UCOL_TOK_UNSET) {
859 goto EndOfLoop;
860 }
861
862 /* if we start with strength, we'll reset to top */
863 if(startOfRules == TRUE) {
864 src->parsedToken.indirectIndex = 5;
865 top = ucol_tok_doSetTop(src, status);
866 newStrength = UCOL_TOK_RESET;
867 goto EndOfLoop;
868 }
869 /* before this, do a scan to verify whether this is */
870 /* another strength */
871 if(*(src->current+1) == 0x003C) {
872 src->current++;
873 if(*(src->current+1) == 0x003C) {
874 src->current++; /* three in a row! */
875 newStrength = UCOL_TERTIARY;
876 } else { /* two in a row */
877 newStrength = UCOL_SECONDARY;
878 }
879 } else { /* just one */
880 newStrength = UCOL_PRIMARY;
881 }
882 break;
883
884 case 0x0026/*'&'*/:
885 if (newStrength != UCOL_TOK_UNSET) {
886 /**/
887 goto EndOfLoop;
888 }
889
890 newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
891 break;
892
893 case 0x005b/*'['*/:
894 /* options - read an option, analyze it */
895 if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
896 uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
897 if(U_SUCCESS(*status)) {
898 if(result & UCOL_TOK_TOP) {
899 if(newStrength == UCOL_TOK_RESET) {
900 top = ucol_tok_doSetTop(src, status);
901 if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
902 src->parsedToken.charsLen+=2;
903 buff[0] = 0x002d;
904 buff[1] = before;
905 ucol_tok_addToExtraCurrent(src, buff, 2, status);
906 }
907
908 src->current++;
909 goto EndOfLoop;
910 } else {
911 *status = U_INVALID_FORMAT_ERROR;
912 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
913 }
914 } else if(result & UCOL_TOK_VARIABLE_TOP) {
915 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
916 variableTop = TRUE;
917 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
918 src->parsedToken.charsLen = 1;
919 buff[0] = 0xFFFF;
920 ucol_tok_addToExtraCurrent(src, buff, 1, status);
921 src->current++;
922 goto EndOfLoop;
923 } else {
924 *status = U_INVALID_FORMAT_ERROR;
925 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
926 }
927 } else if (result & UCOL_TOK_BEFORE){
928 if(newStrength == UCOL_TOK_RESET) {
929 before = result & UCOL_TOK_BEFORE;
930 } else {
931 *status = U_INVALID_FORMAT_ERROR;
932 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
933
934 }
935 }
936 } else {
937 *status = U_INVALID_FORMAT_ERROR;
938 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
939 return NULL;
940 }
941 }
942 break;
943 case 0x0021/*! skip java thai modifier reordering*/:
944 break;
945 case 0x002F/*'/'*/:
946 wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
947 inChars = FALSE; /* we're now processing expansion */
948 break;
949 case 0x005C /* back slash for escaped chars */:
950 isEscaped = TRUE;
951 break;
952 /* found a quote, we're gonna start copying */
953 case 0x0027/*'\''*/:
954 if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
955 *status = U_INVALID_FORMAT_ERROR;
956 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
957 return NULL;
958 // enabling rules to start with a non-token character a < b
959 // newStrength = UCOL_TOK_RESET;
960 }
961
962 inQuote = TRUE;
963
964 if(inChars) { /* we're doing characters */
965 if(wasInQuote == FALSE) {
966 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
967 }
968 if (src->parsedToken.charsLen != 0) {
969 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
970 }
971 src->parsedToken.charsLen++;
972 } else { /* we're doing an expansion */
973 if(wasInQuote == FALSE) {
974 extensionOffset = (uint32_t)(src->extraCurrent - src->source);
975 }
976 if (newExtensionLen != 0) {
977 ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
978 }
979 newExtensionLen++;
980 }
981
982 wasInQuote = TRUE;
983
984 ch = *(++(src->current));
985 if(ch == 0x0027) { /* copy the double quote */
986 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
987 inQuote = FALSE;
988 }
989 break;
990
991 /* '@' is french only if the strength is not currently set */
992 /* if it is, it's just a regular character in collation rules */
993 case 0x0040/*'@'*/:
994 if (newStrength == UCOL_TOK_UNSET) {
995 src->opts->frenchCollation = UCOL_ON;
996 break;
997 }
998
999 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1000 // we want to store read characters to the prefix part and continue reading
1001 // the characters (proper way would be to restart reading the chars, but in
1002 // that case we would have to complicate the token hasher, which I do not
1003 // intend to play with. Instead, we will do prefixes when prefixes are due
1004 // (before adding the elements).
1005 src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1006 src->parsedToken.prefixLen = src->parsedToken.charsLen;
1007
1008 if(inChars) { /* we're doing characters */
1009 if(wasInQuote == FALSE) {
1010 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1011 }
1012 if (src->parsedToken.charsLen != 0) {
1013 ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1014 }
1015 src->parsedToken.charsLen++;
1016 }
1017
1018 wasInQuote = TRUE;
1019
1020 do {
1021 ch = *(++(src->current));
1022 // skip whitespace between '|' and the character
1023 } while (uprv_isRuleWhiteSpace(ch));
1024 break;
1025
1026 //charsOffset = 0;
1027 //newCharsLen = 0;
1028 //break; // We want to store the whole prefix/character sequence. If we break
1029 // the '|' is going to get lost.
1030 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1031 do {
1032 ch = *(++(src->current));
1033 } while (!isCharNewLine(ch));
1034
1035 break;
1036 default:
1037 if (newStrength == UCOL_TOK_UNSET) {
1038 *status = U_INVALID_FORMAT_ERROR;
1039 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1040 return NULL;
1041 }
1042
1043 if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1044 *status = U_INVALID_FORMAT_ERROR;
1045 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1046 return NULL;
1047 }
1048
1049 if(ch == 0x0000 && src->current+1 == src->end) {
1050 break;
1051 }
1052
1053 if (inChars) {
1054 if(src->parsedToken.charsLen == 0) {
1055 src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1056 }
1057 src->parsedToken.charsLen++;
1058 } else {
1059 if(newExtensionLen == 0) {
1060 extensionOffset = (uint32_t)(src->current - src->source);
1061 }
1062 newExtensionLen++;
1063 }
1064
1065 break;
1066 }
1067 }
1068 }
1069
1070 if(wasInQuote) {
1071 if(ch != 0x27) {
1072 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1073 ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1074 }
1075 }
1076 }
1077
1078 src->current++;
1079 }
1080
1081 EndOfLoop:
1082 wasInQuote = FALSE;
1083 if (newStrength == UCOL_TOK_UNSET) {
1084 return NULL;
1085 }
1086
1087 if (src->parsedToken.charsLen == 0 && top == FALSE) {
1088 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1089 *status = U_INVALID_FORMAT_ERROR;
1090 return NULL;
1091 }
1092
1093 src->parsedToken.strength = newStrength;
1094 src->parsedToken.extensionOffset = extensionOffset;
1095 src->parsedToken.extensionLen = newExtensionLen;
1096 src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1097
1098 return src->current;
1099 }
1100
1101 /*
1102 Processing Description
1103 1 Build a ListList. Each list has a header, which contains two lists (positive
1104 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1105 reset may be null.
1106 2 As you process, you keep a LAST pointer that points to the last token you
1107 handled.
1108 */
1109
ucol_tok_initAReset(UColTokenParser * src,UChar * expand,uint32_t * expandNext,UParseError * parseError,UErrorCode * status)1110 static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
1111 UParseError *parseError, UErrorCode *status)
1112 {
1113 if(src->resultLen == src->listCapacity) {
1114 // Unfortunately, this won't work, as we store addresses of lhs in token
1115 src->listCapacity *= 2;
1116 src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1117 if(src->lh == NULL) {
1118 *status = U_MEMORY_ALLOCATION_ERROR;
1119 return NULL;
1120 }
1121 }
1122 /* do the reset thing */
1123 UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1124 /* test for NULL */
1125 if (sourceToken == NULL) {
1126 *status = U_MEMORY_ALLOCATION_ERROR;
1127 return NULL;
1128 }
1129 sourceToken->rulesToParse = src->source;
1130 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1131 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1132
1133 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1134 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1135
1136 // keep the flags around so that we know about before
1137 sourceToken->flags = src->parsedToken.flags;
1138
1139 if(src->parsedToken.prefixOffset != 0) {
1140 // this is a syntax error
1141 *status = U_INVALID_FORMAT_ERROR;
1142 syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1143 return 0;
1144 } else {
1145 sourceToken->prefix = 0;
1146 }
1147
1148 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1149 sourceToken->strength = UCOL_TOK_RESET;
1150 sourceToken->next = NULL;
1151 sourceToken->previous = NULL;
1152 sourceToken->noOfCEs = 0;
1153 sourceToken->noOfExpCEs = 0;
1154 sourceToken->listHeader = &src->lh[src->resultLen];
1155
1156 src->lh[src->resultLen].first = NULL;
1157 src->lh[src->resultLen].last = NULL;
1158 src->lh[src->resultLen].first = NULL;
1159 src->lh[src->resultLen].last = NULL;
1160
1161 src->lh[src->resultLen].reset = sourceToken;
1162
1163 /*
1164 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1165 First convert all expansions into normal form. Examples:
1166 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1167 d * ... into &x * c/y * d * ...
1168 Note: reset values can never have expansions, although they can cause the
1169 very next item to have one. They may be contractions, if they are found
1170 earlier in the list.
1171 */
1172 *expandNext = 0;
1173 if(expand != NULL) {
1174 /* check to see if there is an expansion */
1175 if(src->parsedToken.charsLen > 1) {
1176 uint32_t resetCharsOffset;
1177 resetCharsOffset = (uint32_t)(expand - src->source);
1178 sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1179 *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1180 }
1181 }
1182
1183 src->resultLen++;
1184
1185 uhash_put(src->tailored, sourceToken, sourceToken, status);
1186
1187 return sourceToken;
1188 }
1189
1190 static
getVirginBefore(UColTokenParser * src,UColToken * sourceToken,uint8_t strength,UParseError * parseError,UErrorCode * status)1191 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1192 if(U_FAILURE(*status)) {
1193 return NULL;
1194 }
1195 /* this is a virgin before - we need to fish the anchor from the UCA */
1196 collIterate s;
1197 uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1198 uint32_t CE, SecondCE;
1199 uint32_t invPos;
1200 if(sourceToken != NULL) {
1201 uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
1202 } else {
1203 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
1204 }
1205
1206 baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1207 baseContCE = ucol_getNextCE(src->UCA, &s, status);
1208 if(baseContCE == UCOL_NO_MORE_CES) {
1209 baseContCE = 0;
1210 }
1211
1212
1213 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1214 uint32_t ch = 0;
1215 uint32_t expandNext = 0;
1216 UColToken key;
1217
1218 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1219 uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1220 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1221 ch = uprv_uca_getCodePointFromRaw(raw-1);
1222 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1223 CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1224 SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1225
1226 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1227 *src->extraCurrent++ = 0xFFFE;
1228 *src->extraCurrent++ = (UChar)ch;
1229 src->parsedToken.charsLen++;
1230
1231 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1232 key.rulesToParse = src->source;
1233
1234 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1235 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1236
1237 if(sourceToken == NULL) {
1238 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1239 if(isContinuation(SecondCE)) {
1240 src->lh[src->resultLen].baseContCE = SecondCE;
1241 } else {
1242 src->lh[src->resultLen].baseContCE = 0;
1243 }
1244 src->lh[src->resultLen].nextCE = 0;
1245 src->lh[src->resultLen].nextContCE = 0;
1246 src->lh[src->resultLen].previousCE = 0;
1247 src->lh[src->resultLen].previousContCE = 0;
1248
1249 src->lh[src->resultLen].indirect = FALSE;
1250
1251 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1252 }
1253
1254 } else {
1255 invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1256
1257 // we got the previous CE. Now we need to see if the difference between
1258 // the two CEs is really of the requested strength.
1259 // if it's a bigger difference (we asked for secondary and got primary), we
1260 // need to modify the CE.
1261 if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1262 // adjust the strength
1263 // now we are in the situation where our baseCE should actually be modified in
1264 // order to get the CE in the right position.
1265 if(strength == UCOL_SECONDARY) {
1266 CE = baseCE - 0x0200;
1267 } else { // strength == UCOL_TERTIARY
1268 CE = baseCE - 0x02;
1269 }
1270 if(baseContCE) {
1271 if(strength == UCOL_SECONDARY) {
1272 SecondCE = baseContCE - 0x0200;
1273 } else { // strength == UCOL_TERTIARY
1274 SecondCE = baseContCE - 0x02;
1275 }
1276 }
1277 }
1278
1279 #if 0
1280 // the code below relies on getting a code point from the inverse table, in order to be
1281 // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1282 // 1. There are many code points that have the same CE
1283 // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1284 // Also, in case when there is no equivalent strength before an element, we have to actually
1285 // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1286 // before a is a primary difference.
1287
1288 //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1289
1290
1291 ch = CETable[3*invPos+2];
1292
1293 if((ch & UCOL_INV_SIZEMASK) != 0) {
1294 uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1295 uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1296 ch = conts[offset];
1297 }
1298
1299 *src->extraCurrent++ = (UChar)ch;
1300 src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1301 src->parsedToken.charsLen = 1;
1302
1303 // We got an UCA before. However, this might have been tailored.
1304 // example:
1305 // &\u30ca = \u306a
1306 // &[before 3]\u306a<<<\u306a|\u309d
1307
1308
1309 // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1310 key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1311 key.rulesToParse = src->source;
1312
1313 //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1314 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1315 #endif
1316
1317 // here is how it should be. The situation such as &[before 1]a < x, should be
1318 // resolved exactly as if we wrote &a > x.
1319 // therefore, I don't really care if the UCA value before a has been changed.
1320 // However, I do care if the strength between my element and the previous element
1321 // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1322 // have to construct the base CE.
1323
1324
1325
1326 // if we found a tailored thing, we have to use the UCA value and construct
1327 // a new reset token with constructed name
1328 //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1329 // character to which we want to anchor is already tailored.
1330 // We need to construct a new token which will be the anchor
1331 // point
1332 //*(src->extraCurrent-1) = 0xFFFE;
1333 //*src->extraCurrent++ = (UChar)ch;
1334 // grab before
1335 src->parsedToken.charsOffset -= 10;
1336 src->parsedToken.charsLen += 10;
1337 src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1338 if(isContinuation(SecondCE)) {
1339 src->lh[src->resultLen].baseContCE = SecondCE;
1340 } else {
1341 src->lh[src->resultLen].baseContCE = 0;
1342 }
1343 src->lh[src->resultLen].nextCE = 0;
1344 src->lh[src->resultLen].nextContCE = 0;
1345 src->lh[src->resultLen].previousCE = 0;
1346 src->lh[src->resultLen].previousContCE = 0;
1347
1348 src->lh[src->resultLen].indirect = FALSE;
1349
1350 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1351 //}
1352 }
1353
1354 return sourceToken;
1355
1356 }
1357
ucol_tok_assembleTokenList(UColTokenParser * src,UParseError * parseError,UErrorCode * status)1358 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1359 UColToken *lastToken = NULL;
1360 const UChar *parseEnd = NULL;
1361 uint32_t expandNext = 0;
1362 UBool variableTop = FALSE;
1363 UBool top = FALSE;
1364 uint16_t specs = 0;
1365 UColTokListHeader *ListList = NULL;
1366
1367 src->parsedToken.strength = UCOL_TOK_UNSET;
1368
1369 ListList = src->lh;
1370
1371 if(U_FAILURE(*status)) {
1372 return 0;
1373 }
1374
1375 while(src->current < src->end) {
1376 src->parsedToken.prefixOffset = 0;
1377
1378 parseEnd = ucol_tok_parseNextToken(src,
1379 (UBool)(lastToken == NULL),
1380 parseError,
1381 status);
1382
1383 specs = src->parsedToken.flags;
1384
1385
1386 variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1387 top = ((specs & UCOL_TOK_TOP) != 0);
1388
1389 if(U_SUCCESS(*status) && parseEnd != NULL) {
1390 UColToken *sourceToken = NULL;
1391 //uint32_t key = 0;
1392 uint32_t lastStrength = UCOL_TOK_UNSET;
1393
1394 if(lastToken != NULL ) {
1395 lastStrength = lastToken->strength;
1396 }
1397
1398 //key = newCharsLen << 24 | charsOffset;
1399 UColToken key;
1400 key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1401 key.rulesToParse = src->source;
1402
1403 /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
1404 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1405
1406 if(src->parsedToken.strength != UCOL_TOK_RESET) {
1407 if(lastToken == NULL) { /* this means that rules haven't started properly */
1408 *status = U_INVALID_FORMAT_ERROR;
1409 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1410 return 0;
1411 }
1412 /* 6 Otherwise (when relation != reset) */
1413 if(sourceToken == NULL) {
1414 /* If sourceToken is null, create new one, */
1415 sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1416 /* test for NULL */
1417 if (sourceToken == NULL) {
1418 *status = U_MEMORY_ALLOCATION_ERROR;
1419 return 0;
1420 }
1421 sourceToken->rulesToParse = src->source;
1422 sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1423
1424 sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1425
1426 sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1427 sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1428
1429 sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1430 sourceToken->next = NULL;
1431 sourceToken->previous = NULL;
1432 sourceToken->noOfCEs = 0;
1433 sourceToken->noOfExpCEs = 0;
1434 // keep the flags around so that we know about before
1435 sourceToken->flags = src->parsedToken.flags;
1436 uhash_put(src->tailored, sourceToken, sourceToken, status);
1437 } else {
1438 /* we could have fished out a reset here */
1439 if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1440 /* otherwise remove sourceToken from where it was. */
1441 if(sourceToken->next != NULL) {
1442 if(sourceToken->next->strength > sourceToken->strength) {
1443 sourceToken->next->strength = sourceToken->strength;
1444 }
1445 sourceToken->next->previous = sourceToken->previous;
1446 } else {
1447 sourceToken->listHeader->last = sourceToken->previous;
1448 }
1449
1450 if(sourceToken->previous != NULL) {
1451 sourceToken->previous->next = sourceToken->next;
1452 } else {
1453 sourceToken->listHeader->first = sourceToken->next;
1454 }
1455 sourceToken->next = NULL;
1456 sourceToken->previous = NULL;
1457 }
1458 }
1459
1460 sourceToken->strength = src->parsedToken.strength;
1461 sourceToken->listHeader = lastToken->listHeader;
1462
1463 /*
1464 1. Find the strongest strength in each list, and set strongestP and strongestN
1465 accordingly in the headers.
1466 */
1467 if(lastStrength == UCOL_TOK_RESET
1468 || sourceToken->listHeader->first == 0) {
1469 /* If LAST is a reset
1470 insert sourceToken in the list. */
1471 if(sourceToken->listHeader->first == 0) {
1472 sourceToken->listHeader->first = sourceToken;
1473 sourceToken->listHeader->last = sourceToken;
1474 } else { /* we need to find a place for us */
1475 /* and we'll get in front of the same strength */
1476 if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1477 sourceToken->next = sourceToken->listHeader->first;
1478 sourceToken->next->previous = sourceToken;
1479 sourceToken->listHeader->first = sourceToken;
1480 sourceToken->previous = NULL;
1481 } else {
1482 lastToken = sourceToken->listHeader->first;
1483 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1484 lastToken = lastToken->next;
1485 }
1486 if(lastToken->next != NULL) {
1487 lastToken->next->previous = sourceToken;
1488 } else {
1489 sourceToken->listHeader->last = sourceToken;
1490 }
1491 sourceToken->previous = lastToken;
1492 sourceToken->next = lastToken->next;
1493 lastToken->next = sourceToken;
1494 }
1495 }
1496 } else {
1497 /* Otherwise (when LAST is not a reset)
1498 if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1499 otherwise insert before.
1500 when inserting after or before, search to the next position with the same
1501 strength in that direction. (This is called postpone insertion). */
1502 if(sourceToken != lastToken) {
1503 if(lastToken->polarity == sourceToken->polarity) {
1504 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1505 lastToken = lastToken->next;
1506 }
1507 sourceToken->previous = lastToken;
1508 if(lastToken->next != NULL) {
1509 lastToken->next->previous = sourceToken;
1510 } else {
1511 sourceToken->listHeader->last = sourceToken;
1512 }
1513
1514 sourceToken->next = lastToken->next;
1515 lastToken->next = sourceToken;
1516 } else {
1517 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1518 lastToken = lastToken->previous;
1519 }
1520 sourceToken->next = lastToken;
1521 if(lastToken->previous != NULL) {
1522 lastToken->previous->next = sourceToken;
1523 } else {
1524 sourceToken->listHeader->first = sourceToken;
1525 }
1526 sourceToken->previous = lastToken->previous;
1527 lastToken->previous = sourceToken;
1528 }
1529 } else { /* repeated one thing twice in rules, stay with the stronger strength */
1530 if(lastStrength < sourceToken->strength) {
1531 sourceToken->strength = lastStrength;
1532 }
1533 }
1534 }
1535
1536 /* if the token was a variable top, we're gonna put it in */
1537 if(variableTop == TRUE && src->varTop == NULL) {
1538 variableTop = FALSE;
1539 src->varTop = sourceToken;
1540 }
1541
1542 // Treat the expansions.
1543 // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1544 // (&abc * d * e <=> &ab * d / c * e / c)
1545 // if both of them are in effect for a token, they are combined.
1546
1547 sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1548
1549 if(expandNext != 0) {
1550 if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1551 expandNext = 0;
1552 } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1553 sourceToken->expansion = expandNext;
1554 } else { /* there is both explicit and implicit expansion. We need to make a combination */
1555 uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1556 uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1557 sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1558 src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1559 }
1560 }
1561
1562 // This is just for debugging purposes
1563 if(sourceToken->expansion != 0) {
1564 sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1565 } else {
1566 sourceToken->debugExpansion = 0;
1567 }
1568 // if the previous token was a reset before, the strength of this
1569 // token must match the strength of before. Otherwise we have an
1570 // undefined situation.
1571 // In other words, we currently have a cludge which we use to
1572 // represent &a >> x. This is written as &[before 2]a << x.
1573 if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1574 uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1575 if(beforeStrength != sourceToken->strength) {
1576 *status = U_INVALID_FORMAT_ERROR;
1577 syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1578 return 0;
1579 }
1580 }
1581 } else {
1582 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1583 /* if the previous token was also a reset, */
1584 /*this means that we have two consecutive resets */
1585 /* and we want to remove the previous one if empty*/
1586 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1587 src->resultLen--;
1588 }
1589 }
1590
1591 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1592 uint32_t searchCharsLen = src->parsedToken.charsLen;
1593 while(searchCharsLen > 1 && sourceToken == NULL) {
1594 searchCharsLen--;
1595 //key = searchCharsLen << 24 | charsOffset;
1596 UColToken key;
1597 key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1598 key.rulesToParse = src->source;
1599 sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1600 }
1601 if(sourceToken != NULL) {
1602 expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1603 }
1604 }
1605
1606 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1607 if(top == FALSE) { /* there is no indirection */
1608 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1609 if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1610 /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1611 while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1612 sourceToken = sourceToken->previous;
1613 }
1614 /* here, either we hit the strength or NULL */
1615 if(sourceToken->strength == strength) {
1616 if(sourceToken->previous != NULL) {
1617 sourceToken = sourceToken->previous;
1618 } else { /* start of list */
1619 sourceToken = sourceToken->listHeader->reset;
1620 }
1621 } else { /* we hit NULL */
1622 /* we should be doing the else part */
1623 sourceToken = sourceToken->listHeader->reset;
1624 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1625 }
1626 } else {
1627 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1628 }
1629 } else { /* this is both before and indirection */
1630 top = FALSE;
1631 ListList[src->resultLen].previousCE = 0;
1632 ListList[src->resultLen].previousContCE = 0;
1633 ListList[src->resultLen].indirect = TRUE;
1634 /* we need to do slightly more work. we need to get the baseCE using the */
1635 /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
1636 /* in ucol_bld */
1637 uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1638 uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1639 uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
1640 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1641
1642 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1643 if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1644 uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1645 uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1646 uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1647 CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1648 SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1649 } else {
1650 /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
1651 ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1652 }
1653
1654 ListList[src->resultLen].baseCE = CE;
1655 ListList[src->resultLen].baseContCE = SecondCE;
1656 ListList[src->resultLen].nextCE = 0;
1657 ListList[src->resultLen].nextContCE = 0;
1658
1659 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1660 }
1661 }
1662
1663
1664 /* 5 If the relation is a reset:
1665 If sourceToken is null
1666 Create new list, create new sourceToken, make the baseCE from source, put
1667 the sourceToken in ListHeader of the new list */
1668 if(sourceToken == NULL) {
1669 /*
1670 3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1671 First convert all expansions into normal form. Examples:
1672 If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1673 d * ... into &x * c/y * d * ...
1674 Note: reset values can never have expansions, although they can cause the
1675 very next item to have one. They may be contractions, if they are found
1676 earlier in the list.
1677 */
1678 if(top == FALSE) {
1679 collIterate s;
1680 uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1681
1682 uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
1683
1684 CE = ucol_getNextCE(src->UCA, &s, status);
1685 UChar *expand = s.pos;
1686 SecondCE = ucol_getNextCE(src->UCA, &s, status);
1687
1688 ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1689 if(isContinuation(SecondCE)) {
1690 ListList[src->resultLen].baseContCE = SecondCE;
1691 } else {
1692 ListList[src->resultLen].baseContCE = 0;
1693 }
1694 ListList[src->resultLen].nextCE = 0;
1695 ListList[src->resultLen].nextContCE = 0;
1696 ListList[src->resultLen].previousCE = 0;
1697 ListList[src->resultLen].previousContCE = 0;
1698 ListList[src->resultLen].indirect = FALSE;
1699 sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
1700 } else { /* top == TRUE */
1701 /* just use the supplied values */
1702 top = FALSE;
1703 ListList[src->resultLen].previousCE = 0;
1704 ListList[src->resultLen].previousContCE = 0;
1705 ListList[src->resultLen].indirect = TRUE;
1706 ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1707 ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
1708 ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
1709 ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
1710
1711 sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1712
1713 }
1714 } else { /* reset to something already in rules */
1715 top = FALSE;
1716 }
1717 }
1718 /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
1719 lastToken = sourceToken;
1720 } else {
1721 if(U_FAILURE(*status)) {
1722 return 0;
1723 }
1724 }
1725 }
1726
1727 if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1728 src->resultLen--;
1729 }
1730 return src->resultLen;
1731 }
1732
ucol_tok_initTokenList(UColTokenParser * src,const UChar * rules,const uint32_t rulesLength,const UCollator * UCA,UErrorCode * status)1733 void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
1734 U_NAMESPACE_USE
1735
1736 uint32_t nSize = 0;
1737 uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
1738 if(U_FAILURE(*status)) {
1739 return;
1740 }
1741
1742 // set everything to zero, so that we can clean up gracefully
1743 uprv_memset(src, 0, sizeof(UColTokenParser));
1744
1745 // first we need to find options that don't like to be normalized,
1746 // like copy and remove...
1747 //const UChar *openBrace = rules;
1748 int32_t optionNumber = -1;
1749 const UChar *setStart;
1750 uint32_t i = 0;
1751 while(i < rulesLength) {
1752 if(rules[i] == 0x005B) {
1753 // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
1754 //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
1755 optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
1756 if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
1757 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1758 if(U_SUCCESS(*status)) {
1759 if(src->copySet == NULL) {
1760 src->copySet = newSet;
1761 } else {
1762 uset_addAll(src->copySet, newSet);
1763 uset_close(newSet);
1764 }
1765 } else {
1766 return;
1767 }
1768 } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
1769 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1770 if(U_SUCCESS(*status)) {
1771 if(src->removeSet == NULL) {
1772 src->removeSet = newSet;
1773 } else {
1774 uset_addAll(src->removeSet, newSet);
1775 uset_close(newSet);
1776 }
1777 } else {
1778 return;
1779 }
1780 }
1781 }
1782 //openBrace++;
1783 i++;
1784 }
1785
1786 src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
1787 /* test for NULL */
1788 if (src->source == NULL) {
1789 *status = U_MEMORY_ALLOCATION_ERROR;
1790 return;
1791 }
1792 uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
1793 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
1794 if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
1795 *status = U_ZERO_ERROR;
1796 src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
1797 /* test for NULL */
1798 if (src->source == NULL) {
1799 *status = U_MEMORY_ALLOCATION_ERROR;
1800 return;
1801 }
1802 nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
1803 }
1804 src->current = src->source;
1805 src->end = src->source+nSize;
1806 src->sourceCurrent = src->source;
1807 src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
1808 src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1809 src->varTop = NULL;
1810 src->UCA = UCA;
1811 src->invUCA = ucol_initInverseUCA(status);
1812 src->parsedToken.charsLen = 0;
1813 src->parsedToken.charsOffset = 0;
1814 src->parsedToken.extensionLen = 0;
1815 src->parsedToken.extensionOffset = 0;
1816 src->parsedToken.prefixLen = 0;
1817 src->parsedToken.prefixOffset = 0;
1818 src->parsedToken.flags = 0;
1819 src->parsedToken.strength = UCOL_TOK_UNSET;
1820 src->buildCCTabFlag = FALSE;
1821
1822 if(U_FAILURE(*status)) {
1823 return;
1824 }
1825 src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
1826 if(U_FAILURE(*status)) {
1827 return;
1828 }
1829 uhash_setValueDeleter(src->tailored, uhash_freeBlock);
1830
1831 src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
1832 /* test for NULL */
1833 if (src->opts == NULL) {
1834 *status = U_MEMORY_ALLOCATION_ERROR;
1835 return;
1836 }
1837
1838 uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
1839
1840 // rulesToParse = src->source;
1841 src->lh = 0;
1842 src->listCapacity = 1024;
1843 src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
1844 //Test for NULL
1845 if (src->lh == NULL) {
1846 *status = U_MEMORY_ALLOCATION_ERROR;
1847 return;
1848 }
1849 uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
1850 src->resultLen = 0;
1851
1852 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1853
1854 // UCOL_RESET_TOP_VALUE
1855 setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1856 // UCOL_FIRST_PRIMARY_IGNORABLE
1857 setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
1858 // UCOL_LAST_PRIMARY_IGNORABLE
1859 setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
1860 // UCOL_FIRST_SECONDARY_IGNORABLE
1861 setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
1862 // UCOL_LAST_SECONDARY_IGNORABLE
1863 setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
1864 // UCOL_FIRST_TERTIARY_IGNORABLE
1865 setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
1866 // UCOL_LAST_TERTIARY_IGNORABLE
1867 setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
1868 // UCOL_FIRST_VARIABLE
1869 setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
1870 // UCOL_LAST_VARIABLE
1871 setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
1872 // UCOL_FIRST_NON_VARIABLE
1873 setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
1874 // UCOL_LAST_NON_VARIABLE
1875 setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1876 // UCOL_FIRST_IMPLICIT
1877 setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
1878 // UCOL_LAST_IMPLICIT
1879 setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
1880 // UCOL_FIRST_TRAILING
1881 setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
1882 // UCOL_LAST_TRAILING
1883 setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
1884 ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
1885 }
1886
1887
ucol_tok_closeTokenList(UColTokenParser * src)1888 void ucol_tok_closeTokenList(UColTokenParser *src) {
1889 if(src->copySet != NULL) {
1890 uset_close(src->copySet);
1891 }
1892 if(src->removeSet != NULL) {
1893 uset_close(src->removeSet);
1894 }
1895 if(src->tailored != NULL) {
1896 uhash_close(src->tailored);
1897 }
1898 if(src->lh != NULL) {
1899 uprv_free(src->lh);
1900 }
1901 if(src->source != NULL) {
1902 uprv_free(src->source);
1903 }
1904 if(src->opts != NULL) {
1905 uprv_free(src->opts);
1906 }
1907 }
1908
1909 #endif /* #if !UCONFIG_NO_COLLATION */
1910
1911