• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2001-2007, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ucol_tok.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created 02/22/2001
14 *   created by: Vladimir Weinstein
15 *
16 * This module reads a tailoring rule string and produces a list of
17 * tokens that will be turned into collation elements
18 *
19 */
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_COLLATION
24 
25 #include "unicode/ustring.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 
29 #include "ucol_tok.h"
30 #include "cmemory.h"
31 #include "../common/util.h"
32 
33 U_CDECL_BEGIN
34 static int32_t U_CALLCONV
uhash_hashTokens(const UHashTok k)35 uhash_hashTokens(const UHashTok k)
36 {
37     int32_t hash = 0;
38     //uint32_t key = (uint32_t)k.integer;
39     UColToken *key = (UColToken *)k.pointer;
40     if (key != 0) {
41         //int32_t len = (key & 0xFF000000)>>24;
42         int32_t len = (key->source & 0xFF000000)>>24;
43         int32_t inc = ((len - 32) / 32) + 1;
44 
45         //const UChar *p = (key & 0x00FFFFFF) + rulesToParse;
46         const UChar *p = (key->source & 0x00FFFFFF) + key->rulesToParse;
47         const UChar *limit = p + len;
48 
49         while (p<limit) {
50             hash = (hash * 37) + *p;
51             p += inc;
52         }
53     }
54     return hash;
55 }
56 
57 static UBool U_CALLCONV
uhash_compareTokens(const UHashTok key1,const UHashTok key2)58 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
59 {
60     //uint32_t p1 = (uint32_t) key1.integer;
61     //uint32_t p2 = (uint32_t) key2.integer;
62     UColToken *p1 = (UColToken *)key1.pointer;
63     UColToken *p2 = (UColToken *)key2.pointer;
64     const UChar *s1 = (p1->source & 0x00FFFFFF) + p1->rulesToParse;
65     const UChar *s2 = (p2->source & 0x00FFFFFF) + p2->rulesToParse;
66     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
67     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
68     const UChar *end = s1+s1L-1;
69 
70     if (p1 == p2) {
71         return TRUE;
72     }
73     if (p1->source == 0 || p2->source == 0) {
74         return FALSE;
75     }
76     if(s1L != s2L) {
77         return FALSE;
78     }
79     if(p1->source == p2->source) {
80         return TRUE;
81     }
82     while((s1 < end) && *s1 == *s2) {
83         ++s1;
84         ++s2;
85     }
86     if(*s1 == *s2) {
87         return TRUE;
88     } else {
89         return FALSE;
90     }
91 }
92 U_CDECL_END
93 
94 /*static inline void U_CALLCONV
95 uhash_freeBlockWrapper(void *obj) {
96     uhash_freeBlock(obj);
97 }*/
98 
99 
/*
 * One row of the indirect-positioning table: the CE and continuation CE at
 * which an indirect reset starts, and the CE and continuation CE that limit
 * it.  setIndirectBoundaries() stores 0 in both limit fields when no end
 * value is supplied.
 */
typedef struct {
    uint32_t startCE;      /* CE where the indirect position starts      */
    uint32_t startContCE;  /* continuation of startCE, 0 if there is none */
    uint32_t limitCE;      /* CE limiting the range, 0 if not a range     */
    uint32_t limitContCE;  /* continuation of limitCE                     */
} indirectBoundaries;

/*
 * These values are used for finding CE values for indirect positioning.
 * Indirect positioning is a mechanism for allowing resets on symbolic
 * values. It only works for resets, and indirect names cannot themselves be
 * tailored. An indirect name can define either an anchor point or a range.
 * An anchor point behaves in exactly the same way as a code point in a reset
 * would, except that it cannot be tailored. A range (currently only the
 * [top] range is known) explicitly sets the upper bound for generated CEs,
 * thus allowing better control over how many CEs can be squeezed into the
 * range without a performance penalty.
 * In that respect, [top] is used for tailoring locales that use CJK
 * characters. Other indirect values are currently a pure convenience: they
 * can be used to ensure that CEs are always positioned in the same place
 * relative to a point with known properties (e.g. first primary ignorable).
 */
static indirectBoundaries ucolIndirectBoundaries[15];
122 /*
123 static indirectBoundaries ucolIndirectBoundaries[11] = {
124 { UCOL_RESET_TOP_VALUE,               0,
125 UCOL_NEXT_TOP_VALUE,                0 },
126 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
127 0,                                  0 },
128 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
129 0,                                  0 },
130 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
131 0,                                  0 },
132 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
133 0,                                  0 },
134 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
135 0,                                  0 },
136 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
137 0,                                  0 },
138 { UCOL_FIRST_VARIABLE,                0,
139 0,                                  0 },
140 { UCOL_LAST_VARIABLE,                 0,
141 0,                                  0 },
142 { UCOL_FIRST_NON_VARIABLE,            0,
143 0,                                  0 },
144 { UCOL_LAST_NON_VARIABLE,             0,
145 0,                                  0 },
146 };
147 */
148 
setIndirectBoundaries(uint32_t indexR,uint32_t * start,uint32_t * end)149 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
150 
151     // Set values for the top - TODO: once we have values for all the indirects, we are going
152     // to initalize here.
153     ucolIndirectBoundaries[indexR].startCE = start[0];
154     ucolIndirectBoundaries[indexR].startContCE = start[1];
155     if(end) {
156         ucolIndirectBoundaries[indexR].limitCE = end[0];
157         ucolIndirectBoundaries[indexR].limitContCE = end[1];
158     } else {
159         ucolIndirectBoundaries[indexR].limitCE = 0;
160         ucolIndirectBoundaries[indexR].limitContCE = 0;
161     }
162 }
163 
164 
165 static inline
syntaxError(const UChar * rules,int32_t pos,int32_t rulesLen,UParseError * parseError)166 void syntaxError(const UChar* rules,
167                  int32_t pos,
168                  int32_t rulesLen,
169                  UParseError* parseError)
170 {
171     parseError->offset = pos;
172     parseError->line = 0 ; /* we are not using line numbers */
173 
174     // for pre-context
175     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
176     int32_t stop  = pos;
177 
178     u_memcpy(parseError->preContext,rules+start,stop-start);
179     //null terminate the buffer
180     parseError->preContext[stop-start] = 0;
181 
182     //for post-context
183     start = pos+1;
184     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
185     rulesLen;
186 
187     if(start < stop) {
188         u_memcpy(parseError->postContext,rules+start,stop-start);
189         //null terminate the buffer
190         parseError->postContext[stop-start]= 0;
191     } else {
192         parseError->postContext[0] = 0;
193     }
194 }
195 
196 static
ucol_uprv_tok_setOptionInImage(UColOptionSet * opts,UColAttribute attrib,UColAttributeValue value)197 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
198     switch(attrib) {
199     case UCOL_HIRAGANA_QUATERNARY_MODE:
200         opts->hiraganaQ = value;
201         break;
202     case UCOL_FRENCH_COLLATION:
203         opts->frenchCollation = value;
204         break;
205     case UCOL_ALTERNATE_HANDLING:
206         opts->alternateHandling = value;
207         break;
208     case UCOL_CASE_FIRST:
209         opts->caseFirst = value;
210         break;
211     case UCOL_CASE_LEVEL:
212         opts->caseLevel = value;
213         break;
214     case UCOL_NORMALIZATION_MODE:
215         opts->normalizationMode = value;
216         break;
217     case UCOL_STRENGTH:
218         opts->strength = value;
219         break;
220     case UCOL_NUMERIC_COLLATION:
221         opts->numericCollation = value;
222         break;
223     case UCOL_ATTRIBUTE_COUNT:
224     default:
225         break;
226     }
227 }
228 
229 #define UTOK_OPTION_COUNT 20
230 
231 static UBool didInit = FALSE;
232 /* we can be strict, or we can be lenient */
233 /* I'd surely be lenient with the option arguments */
234 /* maybe even with options */
235 U_STRING_DECL(suboption_00, "non-ignorable", 13);
236 U_STRING_DECL(suboption_01, "shifted",        7);
237 
238 U_STRING_DECL(suboption_02, "lower",          5);
239 U_STRING_DECL(suboption_03, "upper",          5);
240 U_STRING_DECL(suboption_04, "off",            3);
241 U_STRING_DECL(suboption_05, "on",             2);
242 U_STRING_DECL(suboption_06, "1",              1);
243 U_STRING_DECL(suboption_07, "2",              1);
244 U_STRING_DECL(suboption_08, "3",              1);
245 U_STRING_DECL(suboption_09, "4",              1);
246 U_STRING_DECL(suboption_10, "I",              1);
247 
248 U_STRING_DECL(suboption_11, "primary",        7);
249 U_STRING_DECL(suboption_12, "secondary",      9);
250 U_STRING_DECL(suboption_13, "tertiary",       8);
251 U_STRING_DECL(suboption_14, "variable",       8);
252 U_STRING_DECL(suboption_15, "regular",        7);
253 U_STRING_DECL(suboption_16, "implicit",       8);
254 U_STRING_DECL(suboption_17, "trailing",       8);
255 
256 
257 U_STRING_DECL(option_00,    "undefined",      9);
258 U_STRING_DECL(option_01,    "rearrange",      9);
259 U_STRING_DECL(option_02,    "alternate",      9);
260 U_STRING_DECL(option_03,    "backwards",      9);
261 U_STRING_DECL(option_04,    "variable top",  12);
262 U_STRING_DECL(option_05,    "top",            3);
263 U_STRING_DECL(option_06,    "normalization", 13);
264 U_STRING_DECL(option_07,    "caseLevel",      9);
265 U_STRING_DECL(option_08,    "caseFirst",      9);
266 U_STRING_DECL(option_09,    "scriptOrder",   11);
267 U_STRING_DECL(option_10,    "charsetname",   11);
268 U_STRING_DECL(option_11,    "charset",        7);
269 U_STRING_DECL(option_12,    "before",         6);
270 U_STRING_DECL(option_13,    "hiraganaQ",      9);
271 U_STRING_DECL(option_14,    "strength",       8);
272 U_STRING_DECL(option_15,    "first",          5);
273 U_STRING_DECL(option_16,    "last",           4);
274 U_STRING_DECL(option_17,    "optimize",       8);
275 U_STRING_DECL(option_18,    "suppressContractions",         20);
276 U_STRING_DECL(option_19,    "numericOrdering",              15);
277 
278 
279 /*
280 [last variable] last variable value
281 [last primary ignorable] largest CE for primary ignorable
282 [last secondary ignorable] largest CE for secondary ignorable
283 [last tertiary ignorable] largest CE for tertiary ignorable
284 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
285 */
286 
287 
288 static const ucolTokSuboption alternateSub[2] = {
289     {suboption_00, 13, UCOL_NON_IGNORABLE},
290     {suboption_01,  7, UCOL_SHIFTED}
291 };
292 
293 static const ucolTokSuboption caseFirstSub[3] = {
294     {suboption_02, 5, UCOL_LOWER_FIRST},
295     {suboption_03,  5, UCOL_UPPER_FIRST},
296     {suboption_04,  3, UCOL_OFF},
297 };
298 
299 static const ucolTokSuboption onOffSub[2] = {
300     {suboption_04, 3, UCOL_OFF},
301     {suboption_05, 2, UCOL_ON}
302 };
303 
304 static const ucolTokSuboption frenchSub[1] = {
305     {suboption_07, 1, UCOL_ON}
306 };
307 
308 static const ucolTokSuboption beforeSub[3] = {
309     {suboption_06, 1, UCOL_PRIMARY},
310     {suboption_07, 1, UCOL_SECONDARY},
311     {suboption_08, 1, UCOL_TERTIARY}
312 };
313 
314 static const ucolTokSuboption strengthSub[5] = {
315     {suboption_06, 1, UCOL_PRIMARY},
316     {suboption_07, 1, UCOL_SECONDARY},
317     {suboption_08, 1, UCOL_TERTIARY},
318     {suboption_09, 1, UCOL_QUATERNARY},
319     {suboption_10, 1, UCOL_IDENTICAL},
320 };
321 
322 static const ucolTokSuboption firstLastSub[7] = {
323     {suboption_11, 7, UCOL_PRIMARY},
324     {suboption_12, 9, UCOL_PRIMARY},
325     {suboption_13, 8, UCOL_PRIMARY},
326     {suboption_14, 8, UCOL_PRIMARY},
327     {suboption_15, 7, UCOL_PRIMARY},
328     {suboption_16, 8, UCOL_PRIMARY},
329     {suboption_17, 8, UCOL_PRIMARY},
330 };
331 
/*
 * Symbolic names for the rows of rulesOptions.  ucol_uprv_tok_readOption()
 * returns a row index and ucol_uprv_tok_readAndSetOption() switches on it
 * using these constants, so the order here must match the table order.
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING = 0,                              /*  0 */
    OPTION_FRENCH_COLLATION,                                    /*  1 */
    OPTION_CASE_LEVEL,                                          /*  2 */
    OPTION_CASE_FIRST,                                          /*  3 */
    OPTION_NORMALIZATION_MODE,                                  /*  4 */
    OPTION_HIRAGANA_QUATERNARY,                                 /*  5 */
    OPTION_STRENGTH,                                            /*  6 */
    OPTION_NUMERIC_COLLATION,                                   /*  7 */
    /* alias of the last option that maps onto a collator attribute */
    OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,     /*  7 */
    OPTION_VARIABLE_TOP,                                        /*  8 */
    OPTION_REARRANGE,                                           /*  9 */
    OPTION_BEFORE,                                              /* 10 */
    OPTION_TOP,                                                 /* 11 */
    OPTION_FIRST,                                               /* 12 */
    OPTION_LAST,                                                /* 13 */
    OPTION_OPTIMIZE,                                            /* 14 */
    OPTION_SUPPRESS_CONTRACTIONS,                               /* 15 */
    OPTION_UNDEFINED,                                           /* 16 */
    OPTION_SCRIPT_ORDER,                                        /* 17 */
    OPTION_CHARSET_NAME,                                        /* 18 */
    OPTION_CHARSET                                              /* 19 */
};
355 
356 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
357     /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
358     /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
359     /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
360     /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
361     /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
362     /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
363     /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
364     /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
365     /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
366     /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
367     /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
368     /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
369     /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
370     /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
371     /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
372     /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
373     /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
374     /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
375     /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
376     /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"charset"        */
377 };
378 
379 static
u_strncmpNoCase(const UChar * s1,const UChar * s2,int32_t n)380 int32_t u_strncmpNoCase(const UChar     *s1,
381                         const UChar     *s2,
382                         int32_t     n)
383 {
384     if(n > 0) {
385         int32_t rc;
386         for(;;) {
387             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
388             if(rc != 0 || *s1 == 0 || --n == 0) {
389                 return rc;
390             }
391             ++s1;
392             ++s2;
393         }
394     }
395     return 0;
396 }
397 
398 static
ucol_uprv_tok_initData()399 void ucol_uprv_tok_initData() {
400     if(!didInit) {
401         U_STRING_INIT(suboption_00, "non-ignorable", 13);
402         U_STRING_INIT(suboption_01, "shifted",        7);
403 
404         U_STRING_INIT(suboption_02, "lower",          5);
405         U_STRING_INIT(suboption_03, "upper",          5);
406         U_STRING_INIT(suboption_04, "off",            3);
407         U_STRING_INIT(suboption_05, "on",             2);
408 
409         U_STRING_INIT(suboption_06, "1",              1);
410         U_STRING_INIT(suboption_07, "2",              1);
411         U_STRING_INIT(suboption_08, "3",              1);
412         U_STRING_INIT(suboption_09, "4",              1);
413         U_STRING_INIT(suboption_10, "I",              1);
414 
415         U_STRING_INIT(suboption_11, "primary",        7);
416         U_STRING_INIT(suboption_12, "secondary",      9);
417         U_STRING_INIT(suboption_13, "tertiary",       8);
418         U_STRING_INIT(suboption_14, "variable",       8);
419         U_STRING_INIT(suboption_15, "regular",        7);
420         U_STRING_INIT(suboption_16, "implicit",       8);
421         U_STRING_INIT(suboption_17, "trailing",       8);
422 
423 
424         U_STRING_INIT(option_00, "undefined",      9);
425         U_STRING_INIT(option_01, "rearrange",      9);
426         U_STRING_INIT(option_02, "alternate",      9);
427         U_STRING_INIT(option_03, "backwards",      9);
428         U_STRING_INIT(option_04, "variable top",  12);
429         U_STRING_INIT(option_05, "top",            3);
430         U_STRING_INIT(option_06, "normalization", 13);
431         U_STRING_INIT(option_07, "caseLevel",      9);
432         U_STRING_INIT(option_08, "caseFirst",      9);
433         U_STRING_INIT(option_09, "scriptOrder",   11);
434         U_STRING_INIT(option_10, "charsetname",   11);
435         U_STRING_INIT(option_11, "charset",        7);
436         U_STRING_INIT(option_12, "before",         6);
437         U_STRING_INIT(option_13, "hiraganaQ",      9);
438         U_STRING_INIT(option_14, "strength",       8);
439         U_STRING_INIT(option_15, "first",          5);
440         U_STRING_INIT(option_16, "last",           4);
441         U_STRING_INIT(option_17, "optimize",       8);
442         U_STRING_INIT(option_18, "suppressContractions",         20);
443         U_STRING_INIT(option_19, "numericOrdering",      15);
444         didInit = TRUE;
445     }
446 }
447 
448 
449 // This function reads basic options to set in the runtime collator
450 // used by data driven tests. Should not support build time options
451 U_CAPI const UChar * U_EXPORT2
ucol_tok_getNextArgument(const UChar * start,const UChar * end,UColAttribute * attrib,UColAttributeValue * value,UErrorCode * status)452 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
453                          UColAttribute *attrib, UColAttributeValue *value,
454                          UErrorCode *status)
455 {
456     uint32_t i = 0;
457     int32_t j=0;
458     UBool foundOption = FALSE;
459     const UChar *optionArg = NULL;
460 
461     ucol_uprv_tok_initData();
462 
463     while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
464         start++;
465     }
466     if(start >= end) {
467         return NULL;
468     }
469     /* skip opening '[' */
470     if(*start == 0x005b) {
471         start++;
472     } else {
473         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
474         return NULL;
475     }
476 
477     while(i < UTOK_OPTION_COUNT) {
478         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
479             foundOption = TRUE;
480             if(end - start > rulesOptions[i].optionLen) {
481                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
482                 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
483                     optionArg++;
484                 }
485             }
486             break;
487         }
488         i++;
489     }
490 
491     if(!foundOption) {
492         *status = U_ILLEGAL_ARGUMENT_ERROR;
493         return NULL;
494     }
495 
496     if(optionArg) {
497         for(j = 0; j<rulesOptions[i].subSize; j++) {
498             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
499                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
500                 *attrib = rulesOptions[i].attr;
501                 *value = rulesOptions[i].subopts[j].attrVal;
502                 optionArg += rulesOptions[i].subopts[j].subLen;
503                 while(u_isWhitespace(*optionArg)) { /* eat whitespace */
504                     optionArg++;
505                 }
506                 if(*optionArg == 0x005d) {
507                     optionArg++;
508                     return optionArg;
509                 } else {
510                     *status = U_ILLEGAL_ARGUMENT_ERROR;
511                     return NULL;
512                 }
513             }
514         }
515     }
516     *status = U_ILLEGAL_ARGUMENT_ERROR;
517     return NULL;
518 }
519 
520 static
ucol_uprv_tok_readAndSetUnicodeSet(const UChar * start,const UChar * end,UErrorCode * status)521 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
522     while(*start != 0x005b) { /* advance while we find the first '[' */
523         start++;
524     }
525     // now we need to get a balanced set of '[]'. The problem is that a set can have
526     // many, and *end point to the first closing '['
527     int32_t noOpenBraces = 1;
528     int32_t current = 1; // skip the opening brace
529     while(start+current < end && noOpenBraces != 0) {
530         if(start[current] == 0x005b) {
531             noOpenBraces++;
532         } else if(start[current] == 0x005D) { // closing brace
533             noOpenBraces--;
534         }
535         current++;
536     }
537 
538     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
539         *status = U_ILLEGAL_ARGUMENT_ERROR;
540         return NULL;
541     }
542     return uset_openPattern(start, current, status);
543 }
544 
545 static
ucol_uprv_tok_readOption(const UChar * start,const UChar * end,const UChar ** optionArg)546 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
547     int32_t i = 0;
548     ucol_uprv_tok_initData();
549 
550     while(u_isWhitespace(*start)) { /* eat whitespace */
551         start++;
552     }
553     while(i < UTOK_OPTION_COUNT) {
554         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
555             if(end - start > rulesOptions[i].optionLen) {
556                 *optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
557                 while(u_isWhitespace(**optionArg)) { /* eat whitespace */
558                     (*optionArg)++;
559                 }
560             }
561             break;
562         }
563         i++;
564     }
565     if(i == UTOK_OPTION_COUNT) {
566         i = -1; // didn't find an option
567     }
568     return i;
569 }
570 
571 
572 // reads and conforms to various options in rules
573 // end is the position of the first closing ']'
574 // However, some of the options take an UnicodeSet definition
575 // which needs to duplicate the closing ']'
576 // for example: '[copy [\uAC00-\uD7FF]]'
577 // These options will move end to the second ']' and the
578 // caller will set the current to it.
579 static
ucol_uprv_tok_readAndSetOption(UColTokenParser * src,UErrorCode * status)580 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
581     const UChar* start = src->current;
582     int32_t i = 0;
583     int32_t j=0;
584     const UChar *optionArg = NULL;
585 
586     uint8_t result = 0;
587 
588     start++; /*skip opening '['*/
589     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
590     if(optionArg) {
591         src->current = optionArg;
592     }
593 
594     if(i < 0) {
595         *status = U_ILLEGAL_ARGUMENT_ERROR;
596     } else {
597         int32_t noOpenBraces = 1;
598         switch(i) {
599     case OPTION_ALTERNATE_HANDLING:
600     case OPTION_FRENCH_COLLATION:
601     case OPTION_CASE_LEVEL:
602     case OPTION_CASE_FIRST:
603     case OPTION_NORMALIZATION_MODE:
604     case OPTION_HIRAGANA_QUATERNARY:
605     case OPTION_STRENGTH:
606     case OPTION_NUMERIC_COLLATION:
607         if(optionArg) {
608             for(j = 0; j<rulesOptions[i].subSize; j++) {
609                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
610                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
611                     result =  UCOL_TOK_SUCCESS;
612                 }
613             }
614         }
615         if(result == 0) {
616             *status = U_ILLEGAL_ARGUMENT_ERROR;
617         }
618         break;
619     case OPTION_VARIABLE_TOP:
620         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
621         break;
622     case OPTION_REARRANGE:
623         result = UCOL_TOK_SUCCESS;
624         break;
625     case OPTION_BEFORE:
626         if(optionArg) {
627             for(j = 0; j<rulesOptions[i].subSize; j++) {
628                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
629                     result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
630                 }
631             }
632         }
633         if(result == 0) {
634             *status = U_ILLEGAL_ARGUMENT_ERROR;
635         }
636         break;
637     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
638         /* index to this array will be src->parsedToken.indirectIndex*/
639         src->parsedToken.indirectIndex = 0;
640         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
641         break;
642     case OPTION_FIRST:
643     case OPTION_LAST: /* first, last */
644         for(j = 0; j<rulesOptions[i].subSize; j++) {
645             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
646                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
647                 // element of indirect boundaries is reserved for top.
648                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
649                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
650             }
651         }
652         if(result == 0) {
653             *status = U_ILLEGAL_ARGUMENT_ERROR;
654         }
655         break;
656     case OPTION_OPTIMIZE:
657     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
658         // we need to move end here
659         src->current++; // skip opening brace
660         while(src->current < src->end && noOpenBraces != 0) {
661             if(*src->current == 0x005b) {
662                 noOpenBraces++;
663             } else if(*src->current == 0x005D) { // closing brace
664                 noOpenBraces--;
665             }
666             src->current++;
667         }
668         result = UCOL_TOK_SUCCESS;
669         break;
670     default:
671         *status = U_UNSUPPORTED_ERROR;
672         break;
673         }
674     }
675     src->current = u_memchr(src->current, 0x005d, src->end-src->current);
676     return result;
677 }
678 
679 
ucol_tok_addToExtraCurrent(UColTokenParser * src,const UChar * stuff,int32_t len,UErrorCode * status)680 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
681     if(src->extraCurrent+len >= src->extraEnd) {
682         /* reallocate */
683         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
684         if(newSrc != NULL) {
685             src->current = newSrc + (src->current - src->source);
686             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
687             src->end = newSrc + (src->end - src->source);
688             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
689             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
690             src->source = newSrc;
691         } else {
692             *status = U_MEMORY_ALLOCATION_ERROR;
693         }
694     }
695     if(len == 1) {
696         *src->extraCurrent++ = *stuff;
697     } else {
698         uprv_memcpy(src->extraCurrent, stuff, len*sizeof(UChar));
699         src->extraCurrent += len;
700     }
701 
702 
703 }
704 
ucol_tok_doSetTop(UColTokenParser * src,UErrorCode * status)705 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
706     /*
707     top = TRUE;
708     */
709     UChar buff[5];
710     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
711     buff[0] = 0xFFFE;
712     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
713     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
714     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
715         src->parsedToken.charsLen = 3;
716         ucol_tok_addToExtraCurrent(src, buff, 3, status);
717     } else {
718         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
719         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
720         src->parsedToken.charsLen = 5;
721         ucol_tok_addToExtraCurrent(src, buff, 5, status);
722     }
723     return TRUE;
724 }
725 
isCharNewLine(UChar c)726 static UBool isCharNewLine(UChar c){
727     switch(c){
728     case 0x000A: /* LF  */
729     case 0x000D: /* CR  */
730     case 0x000C: /* FF  */
731     case 0x0085: /* NEL */
732     case 0x2028: /* LS  */
733     case 0x2029: /* PS  */
734         return TRUE;
735     default:
736         return FALSE;
737     }
738 }
739 
740 U_CAPI const UChar* U_EXPORT2
ucol_tok_parseNextToken(UColTokenParser * src,UBool startOfRules,UParseError * parseError,UErrorCode * status)741 ucol_tok_parseNextToken(UColTokenParser *src,
742                         UBool startOfRules,
743                         UParseError *parseError,
744                         UErrorCode *status)
745 {
746     /* parsing part */
747     UBool variableTop = FALSE;
748     UBool top = FALSE;
749     UBool inChars = TRUE;
750     UBool inQuote = FALSE;
751     UBool wasInQuote = FALSE;
752     uint8_t before = 0;
753     UBool isEscaped = FALSE;
754     // TODO: replace these variables with src->parsedToken counterparts
755     // no need to use them anymore since we have src->parsedToken.
756     // Ideally, token parser would be a nice class... Once, when I have
757     // more time (around 2020 probably).
758     uint32_t newExtensionLen = 0;
759     uint32_t extensionOffset = 0;
760     uint32_t newStrength = UCOL_TOK_UNSET;
761     UChar buff[10];
762 
763     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
764     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
765     src->parsedToken.indirectIndex = 0;
766 
767     while (src->current < src->end) {
768         UChar ch = *(src->current);
769 
770         if (inQuote) {
771             if (ch == 0x0027/*'\''*/) {
772                 inQuote = FALSE;
773             } else {
774                 if ((src->parsedToken.charsLen == 0) || inChars) {
775                     if(src->parsedToken.charsLen == 0) {
776                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
777                     }
778                     src->parsedToken.charsLen++;
779                 } else {
780                     if(newExtensionLen == 0) {
781                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
782                     }
783                     newExtensionLen++;
784                 }
785             }
786         }else if(isEscaped){
787             isEscaped =FALSE;
788             if (newStrength == UCOL_TOK_UNSET) {
789                 *status = U_INVALID_FORMAT_ERROR;
790                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
791                 return NULL;
792                 // enabling rules to start with non-tokens a < b
793                 // newStrength = UCOL_TOK_RESET;
794             }
795             if(ch != 0x0000  && src->current != src->end) {
796                 if (inChars) {
797                     if(src->parsedToken.charsLen == 0) {
798                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
799                     }
800                     src->parsedToken.charsLen++;
801                 } else {
802                     if(newExtensionLen == 0) {
803                         extensionOffset = (uint32_t)(src->current - src->source);
804                     }
805                     newExtensionLen++;
806                 }
807             }
808         }else {
809             if(!uprv_isRuleWhiteSpace(ch)) {
810                 /* Sets the strength for this entry */
811                 switch (ch) {
812                 case 0x003D/*'='*/ :
813                     if (newStrength != UCOL_TOK_UNSET) {
814                         goto EndOfLoop;
815                     }
816 
817                     /* if we start with strength, we'll reset to top */
818                     if(startOfRules == TRUE) {
819                         src->parsedToken.indirectIndex = 5;
820                         top = ucol_tok_doSetTop(src, status);
821                         newStrength = UCOL_TOK_RESET;
822                         goto EndOfLoop;
823                     }
824                     newStrength = UCOL_IDENTICAL;
825                     break;
826 
827                 case 0x002C/*','*/:
828                     if (newStrength != UCOL_TOK_UNSET) {
829                         goto EndOfLoop;
830                     }
831 
832                     /* if we start with strength, we'll reset to top */
833                     if(startOfRules == TRUE) {
834                         src->parsedToken.indirectIndex = 5;
835                         top = ucol_tok_doSetTop(src, status);
836                         newStrength = UCOL_TOK_RESET;
837                         goto EndOfLoop;
838                     }
839                     newStrength = UCOL_TERTIARY;
840                     break;
841 
842                 case  0x003B/*';'*/:
843                     if (newStrength != UCOL_TOK_UNSET) {
844                         goto EndOfLoop;
845                     }
846 
847                     /* if we start with strength, we'll reset to top */
848                     if(startOfRules == TRUE) {
849                         src->parsedToken.indirectIndex = 5;
850                         top = ucol_tok_doSetTop(src, status);
851                         newStrength = UCOL_TOK_RESET;
852                         goto EndOfLoop;
853                     }
854                     newStrength = UCOL_SECONDARY;
855                     break;
856 
857                 case 0x003C/*'<'*/:
858                     if (newStrength != UCOL_TOK_UNSET) {
859                         goto EndOfLoop;
860                     }
861 
862                     /* if we start with strength, we'll reset to top */
863                     if(startOfRules == TRUE) {
864                         src->parsedToken.indirectIndex = 5;
865                         top = ucol_tok_doSetTop(src, status);
866                         newStrength = UCOL_TOK_RESET;
867                         goto EndOfLoop;
868                     }
869                     /* before this, do a scan to verify whether this is */
870                     /* another strength */
871                     if(*(src->current+1) == 0x003C) {
872                         src->current++;
873                         if(*(src->current+1) == 0x003C) {
874                             src->current++; /* three in a row! */
875                             newStrength = UCOL_TERTIARY;
876                         } else { /* two in a row */
877                             newStrength = UCOL_SECONDARY;
878                         }
879                     } else { /* just one */
880                         newStrength = UCOL_PRIMARY;
881                     }
882                     break;
883 
884                 case 0x0026/*'&'*/:
885                     if (newStrength != UCOL_TOK_UNSET) {
886                         /**/
887                         goto EndOfLoop;
888                     }
889 
890                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
891                     break;
892 
893                 case 0x005b/*'['*/:
894                     /* options - read an option, analyze it */
895                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
896                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
897                         if(U_SUCCESS(*status)) {
898                             if(result & UCOL_TOK_TOP) {
899                                 if(newStrength == UCOL_TOK_RESET) {
900                                     top = ucol_tok_doSetTop(src, status);
901                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
902                                         src->parsedToken.charsLen+=2;
903                                         buff[0] = 0x002d;
904                                         buff[1] = before;
905                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
906                                     }
907 
908                                     src->current++;
909                                     goto EndOfLoop;
910                                 } else {
911                                     *status = U_INVALID_FORMAT_ERROR;
912                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
913                                 }
914                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
915                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
916                                     variableTop = TRUE;
917                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
918                                     src->parsedToken.charsLen = 1;
919                                     buff[0] = 0xFFFF;
920                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
921                                     src->current++;
922                                     goto EndOfLoop;
923                                 } else {
924                                     *status = U_INVALID_FORMAT_ERROR;
925                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
926                                 }
927                             } else if (result & UCOL_TOK_BEFORE){
928                                 if(newStrength == UCOL_TOK_RESET) {
929                                     before = result & UCOL_TOK_BEFORE;
930                                 } else {
931                                     *status = U_INVALID_FORMAT_ERROR;
932                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
933 
934                                 }
935                             }
936                         } else {
937                             *status = U_INVALID_FORMAT_ERROR;
938                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
939                             return NULL;
940                         }
941                     }
942                     break;
943                 case 0x0021/*! skip java thai modifier reordering*/:
944                     break;
945                 case 0x002F/*'/'*/:
946                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
947                     inChars = FALSE; /* we're now processing expansion */
948                     break;
949                 case 0x005C /* back slash for escaped chars */:
950                     isEscaped = TRUE;
951                     break;
952                     /* found a quote, we're gonna start copying */
953                 case 0x0027/*'\''*/:
954                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
955                         *status = U_INVALID_FORMAT_ERROR;
956                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
957                         return NULL;
958                         // enabling rules to start with a non-token character a < b
959                         // newStrength = UCOL_TOK_RESET;
960                     }
961 
962                     inQuote = TRUE;
963 
964                     if(inChars) { /* we're doing characters */
965                         if(wasInQuote == FALSE) {
966                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
967                         }
968                         if (src->parsedToken.charsLen != 0) {
969                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
970                         }
971                         src->parsedToken.charsLen++;
972                     } else { /* we're doing an expansion */
973                         if(wasInQuote == FALSE) {
974                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
975                         }
976                         if (newExtensionLen != 0) {
977                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
978                         }
979                         newExtensionLen++;
980                     }
981 
982                     wasInQuote = TRUE;
983 
984                     ch = *(++(src->current));
985                     if(ch == 0x0027) { /* copy the double quote */
986                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
987                         inQuote = FALSE;
988                     }
989                     break;
990 
991                     /* '@' is french only if the strength is not currently set */
992                     /* if it is, it's just a regular character in collation rules */
993                 case 0x0040/*'@'*/:
994                     if (newStrength == UCOL_TOK_UNSET) {
995                         src->opts->frenchCollation = UCOL_ON;
996                         break;
997                     }
998 
999                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1000                     // we want to store read characters to the prefix part and continue reading
1001                     // the characters (proper way would be to restart reading the chars, but in
1002                     // that case we would have to complicate the token hasher, which I do not
1003                     // intend to play with. Instead, we will do prefixes when prefixes are due
1004                     // (before adding the elements).
1005                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1006                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
1007 
1008                     if(inChars) { /* we're doing characters */
1009                         if(wasInQuote == FALSE) {
1010                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1011                         }
1012                         if (src->parsedToken.charsLen != 0) {
1013                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1014                         }
1015                         src->parsedToken.charsLen++;
1016                     }
1017 
1018                     wasInQuote = TRUE;
1019 
1020                     do {
1021                         ch = *(++(src->current));
1022                         // skip whitespace between '|' and the character
1023                     } while (uprv_isRuleWhiteSpace(ch));
1024                     break;
1025 
1026                     //charsOffset = 0;
1027                     //newCharsLen = 0;
1028                     //break; // We want to store the whole prefix/character sequence. If we break
1029                     // the '|' is going to get lost.
1030                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1031                     do {
1032                         ch = *(++(src->current));
1033                     } while (!isCharNewLine(ch));
1034 
1035                     break;
1036                 default:
1037                     if (newStrength == UCOL_TOK_UNSET) {
1038                         *status = U_INVALID_FORMAT_ERROR;
1039                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1040                         return NULL;
1041                     }
1042 
1043                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1044                         *status = U_INVALID_FORMAT_ERROR;
1045                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1046                         return NULL;
1047                     }
1048 
1049                     if(ch == 0x0000 && src->current+1 == src->end) {
1050                         break;
1051                     }
1052 
1053                     if (inChars) {
1054                         if(src->parsedToken.charsLen == 0) {
1055                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1056                         }
1057                         src->parsedToken.charsLen++;
1058                     } else {
1059                         if(newExtensionLen == 0) {
1060                             extensionOffset = (uint32_t)(src->current - src->source);
1061                         }
1062                         newExtensionLen++;
1063                     }
1064 
1065                     break;
1066                 }
1067             }
1068         }
1069 
1070         if(wasInQuote) {
1071             if(ch != 0x27) {
1072                 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1073                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1074                 }
1075             }
1076         }
1077 
1078         src->current++;
1079     }
1080 
1081 EndOfLoop:
1082     wasInQuote = FALSE;
1083     if (newStrength == UCOL_TOK_UNSET) {
1084         return NULL;
1085     }
1086 
1087     if (src->parsedToken.charsLen == 0 && top == FALSE) {
1088         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1089         *status = U_INVALID_FORMAT_ERROR;
1090         return NULL;
1091     }
1092 
1093     src->parsedToken.strength = newStrength;
1094     src->parsedToken.extensionOffset = extensionOffset;
1095     src->parsedToken.extensionLen = newExtensionLen;
1096     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1097 
1098     return src->current;
1099 }
1100 
1101 /*
1102 Processing Description
1103 1 Build a ListList. Each list has a header, which contains two lists (positive
1104 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1105 reset may be null.
1106 2 As you process, you keep a LAST pointer that points to the last token you
1107 handled.
1108 */
1109 
ucol_tok_initAReset(UColTokenParser * src,UChar * expand,uint32_t * expandNext,UParseError * parseError,UErrorCode * status)1110 static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
1111                                       UParseError *parseError, UErrorCode *status)
1112 {
1113     if(src->resultLen == src->listCapacity) {
1114         // Unfortunately, this won't work, as we store addresses of lhs in token
1115         src->listCapacity *= 2;
1116         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1117         if(src->lh == NULL) {
1118             *status = U_MEMORY_ALLOCATION_ERROR;
1119             return NULL;
1120         }
1121     }
1122     /* do the reset thing */
1123     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1124     /* test for NULL */
1125     if (sourceToken == NULL) {
1126         *status = U_MEMORY_ALLOCATION_ERROR;
1127         return NULL;
1128     }
1129     sourceToken->rulesToParse = src->source;
1130     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1131     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1132 
1133     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1134     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1135 
1136     // keep the flags around so that we know about before
1137     sourceToken->flags = src->parsedToken.flags;
1138 
1139     if(src->parsedToken.prefixOffset != 0) {
1140         // this is a syntax error
1141         *status = U_INVALID_FORMAT_ERROR;
1142         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1143         return 0;
1144     } else {
1145         sourceToken->prefix = 0;
1146     }
1147 
1148     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1149     sourceToken->strength = UCOL_TOK_RESET;
1150     sourceToken->next = NULL;
1151     sourceToken->previous = NULL;
1152     sourceToken->noOfCEs = 0;
1153     sourceToken->noOfExpCEs = 0;
1154     sourceToken->listHeader = &src->lh[src->resultLen];
1155 
1156     src->lh[src->resultLen].first = NULL;
1157     src->lh[src->resultLen].last = NULL;
1158     src->lh[src->resultLen].first = NULL;
1159     src->lh[src->resultLen].last = NULL;
1160 
1161     src->lh[src->resultLen].reset = sourceToken;
1162 
1163     /*
1164     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1165     First convert all expansions into normal form. Examples:
1166     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1167     d * ... into &x * c/y * d * ...
1168     Note: reset values can never have expansions, although they can cause the
1169     very next item to have one. They may be contractions, if they are found
1170     earlier in the list.
1171     */
1172     *expandNext = 0;
1173     if(expand != NULL) {
1174         /* check to see if there is an expansion */
1175         if(src->parsedToken.charsLen > 1) {
1176             uint32_t resetCharsOffset;
1177             resetCharsOffset = (uint32_t)(expand - src->source);
1178             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1179             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1180         }
1181     }
1182 
1183     src->resultLen++;
1184 
1185     uhash_put(src->tailored, sourceToken, sourceToken, status);
1186 
1187     return sourceToken;
1188 }
1189 
1190 static
getVirginBefore(UColTokenParser * src,UColToken * sourceToken,uint8_t strength,UParseError * parseError,UErrorCode * status)1191 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1192     if(U_FAILURE(*status)) {
1193         return NULL;
1194     }
1195     /* this is a virgin before - we need to fish the anchor from the UCA */
1196     collIterate s;
1197     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1198     uint32_t CE, SecondCE;
1199     uint32_t invPos;
1200     if(sourceToken != NULL) {
1201         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
1202     } else {
1203         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
1204     }
1205 
1206     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1207     baseContCE = ucol_getNextCE(src->UCA, &s, status);
1208     if(baseContCE == UCOL_NO_MORE_CES) {
1209         baseContCE = 0;
1210     }
1211 
1212 
1213     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1214     uint32_t ch = 0;
1215     uint32_t expandNext = 0;
1216     UColToken key;
1217 
1218     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1219         uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1220         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1221         ch = uprv_uca_getCodePointFromRaw(raw-1);
1222         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1223         CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1224         SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1225 
1226         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1227         *src->extraCurrent++ = 0xFFFE;
1228         *src->extraCurrent++ = (UChar)ch;
1229         src->parsedToken.charsLen++;
1230 
1231         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1232         key.rulesToParse = src->source;
1233 
1234         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1235         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1236 
1237         if(sourceToken == NULL) {
1238             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1239             if(isContinuation(SecondCE)) {
1240                 src->lh[src->resultLen].baseContCE = SecondCE;
1241             } else {
1242                 src->lh[src->resultLen].baseContCE = 0;
1243             }
1244             src->lh[src->resultLen].nextCE = 0;
1245             src->lh[src->resultLen].nextContCE = 0;
1246             src->lh[src->resultLen].previousCE = 0;
1247             src->lh[src->resultLen].previousContCE = 0;
1248 
1249             src->lh[src->resultLen].indirect = FALSE;
1250 
1251             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1252         }
1253 
1254     } else {
1255         invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1256 
1257         // we got the previous CE. Now we need to see if the difference between
1258         // the two CEs is really of the requested strength.
1259         // if it's a bigger difference (we asked for secondary and got primary), we
1260         // need to modify the CE.
1261         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1262             // adjust the strength
1263             // now we are in the situation where our baseCE should actually be modified in
1264             // order to get the CE in the right position.
1265             if(strength == UCOL_SECONDARY) {
1266                 CE = baseCE - 0x0200;
1267             } else { // strength == UCOL_TERTIARY
1268                 CE = baseCE - 0x02;
1269             }
1270             if(baseContCE) {
1271                 if(strength == UCOL_SECONDARY) {
1272                     SecondCE = baseContCE - 0x0200;
1273                 } else { // strength == UCOL_TERTIARY
1274                     SecondCE = baseContCE - 0x02;
1275                 }
1276             }
1277         }
1278 
1279 #if 0
1280         // the code below relies on getting a code point from the inverse table, in order to be
1281         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1282         // 1. There are many code points that have the same CE
1283         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1284         // Also, in case when there is no equivalent strength before an element, we have to actually
1285         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1286         // before a is a primary difference.
1287 
1288         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1289 
1290 
1291         ch = CETable[3*invPos+2];
1292 
1293         if((ch &  UCOL_INV_SIZEMASK) != 0) {
1294             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1295             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1296             ch = conts[offset];
1297         }
1298 
1299         *src->extraCurrent++ = (UChar)ch;
1300         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1301         src->parsedToken.charsLen = 1;
1302 
1303         // We got an UCA before. However, this might have been tailored.
1304         // example:
1305         // &\u30ca = \u306a
1306         // &[before 3]\u306a<<<\u306a|\u309d
1307 
1308 
1309         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1310         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1311         key.rulesToParse = src->source;
1312 
1313         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1314         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1315 #endif
1316 
1317         // here is how it should be. The situation such as &[before 1]a < x, should be
1318         // resolved exactly as if we wrote &a > x.
1319         // therefore, I don't really care if the UCA value before a has been changed.
1320         // However, I do care if the strength between my element and the previous element
1321         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1322         // have to construct the base CE.
1323 
1324 
1325 
1326         // if we found a tailored thing, we have to use the UCA value and construct
1327         // a new reset token with constructed name
1328         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1329         // character to which we want to anchor is already tailored.
1330         // We need to construct a new token which will be the anchor
1331         // point
1332         //*(src->extraCurrent-1) = 0xFFFE;
1333         //*src->extraCurrent++ = (UChar)ch;
1334         // grab before
1335         src->parsedToken.charsOffset -= 10;
1336         src->parsedToken.charsLen += 10;
1337         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1338         if(isContinuation(SecondCE)) {
1339             src->lh[src->resultLen].baseContCE = SecondCE;
1340         } else {
1341             src->lh[src->resultLen].baseContCE = 0;
1342         }
1343         src->lh[src->resultLen].nextCE = 0;
1344         src->lh[src->resultLen].nextContCE = 0;
1345         src->lh[src->resultLen].previousCE = 0;
1346         src->lh[src->resultLen].previousContCE = 0;
1347 
1348         src->lh[src->resultLen].indirect = FALSE;
1349 
1350         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1351         //}
1352     }
1353 
1354     return sourceToken;
1355 
1356 }
1357 
ucol_tok_assembleTokenList(UColTokenParser * src,UParseError * parseError,UErrorCode * status)1358 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1359     UColToken *lastToken = NULL;
1360     const UChar *parseEnd = NULL;
1361     uint32_t expandNext = 0;
1362     UBool variableTop = FALSE;
1363     UBool top = FALSE;
1364     uint16_t specs = 0;
1365     UColTokListHeader *ListList = NULL;
1366 
1367     src->parsedToken.strength = UCOL_TOK_UNSET;
1368 
1369     ListList = src->lh;
1370 
1371     if(U_FAILURE(*status)) {
1372         return 0;
1373     }
1374 
1375     while(src->current < src->end) {
1376         src->parsedToken.prefixOffset = 0;
1377 
1378         parseEnd = ucol_tok_parseNextToken(src,
1379             (UBool)(lastToken == NULL),
1380             parseError,
1381             status);
1382 
1383         specs = src->parsedToken.flags;
1384 
1385 
1386         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1387         top = ((specs & UCOL_TOK_TOP) != 0);
1388 
1389         if(U_SUCCESS(*status) && parseEnd != NULL) {
1390             UColToken *sourceToken = NULL;
1391             //uint32_t key = 0;
1392             uint32_t lastStrength = UCOL_TOK_UNSET;
1393 
1394             if(lastToken != NULL ) {
1395                 lastStrength = lastToken->strength;
1396             }
1397 
1398             //key = newCharsLen << 24 | charsOffset;
1399             UColToken key;
1400             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1401             key.rulesToParse = src->source;
1402 
1403             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
1404             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1405 
1406             if(src->parsedToken.strength != UCOL_TOK_RESET) {
1407                 if(lastToken == NULL) { /* this means that rules haven't started properly */
1408                     *status = U_INVALID_FORMAT_ERROR;
1409                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1410                     return 0;
1411                 }
1412                 /*  6 Otherwise (when relation != reset) */
1413                 if(sourceToken == NULL) {
1414                     /* If sourceToken is null, create new one, */
1415                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1416                     /* test for NULL */
1417                     if (sourceToken == NULL) {
1418                         *status = U_MEMORY_ALLOCATION_ERROR;
1419                         return 0;
1420                     }
1421                     sourceToken->rulesToParse = src->source;
1422                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1423 
1424                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1425 
1426                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1427                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1428 
1429                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1430                     sourceToken->next = NULL;
1431                     sourceToken->previous = NULL;
1432                     sourceToken->noOfCEs = 0;
1433                     sourceToken->noOfExpCEs = 0;
1434                     // keep the flags around so that we know about before
1435                     sourceToken->flags = src->parsedToken.flags;
1436                     uhash_put(src->tailored, sourceToken, sourceToken, status);
1437                 } else {
1438                     /* we could have fished out a reset here */
1439                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1440                         /* otherwise remove sourceToken from where it was. */
1441                         if(sourceToken->next != NULL) {
1442                             if(sourceToken->next->strength > sourceToken->strength) {
1443                                 sourceToken->next->strength = sourceToken->strength;
1444                             }
1445                             sourceToken->next->previous = sourceToken->previous;
1446                         } else {
1447                             sourceToken->listHeader->last = sourceToken->previous;
1448                         }
1449 
1450                         if(sourceToken->previous != NULL) {
1451                             sourceToken->previous->next = sourceToken->next;
1452                         } else {
1453                             sourceToken->listHeader->first = sourceToken->next;
1454                         }
1455                         sourceToken->next = NULL;
1456                         sourceToken->previous = NULL;
1457                     }
1458                 }
1459 
1460                 sourceToken->strength = src->parsedToken.strength;
1461                 sourceToken->listHeader = lastToken->listHeader;
1462 
1463                 /*
1464                 1.  Find the strongest strength in each list, and set strongestP and strongestN
1465                 accordingly in the headers.
1466                 */
1467                 if(lastStrength == UCOL_TOK_RESET
1468                     || sourceToken->listHeader->first == 0) {
1469                         /* If LAST is a reset
1470                         insert sourceToken in the list. */
1471                         if(sourceToken->listHeader->first == 0) {
1472                             sourceToken->listHeader->first = sourceToken;
1473                             sourceToken->listHeader->last = sourceToken;
1474                         } else { /* we need to find a place for us */
1475                             /* and we'll get in front of the same strength */
1476                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1477                                 sourceToken->next = sourceToken->listHeader->first;
1478                                 sourceToken->next->previous = sourceToken;
1479                                 sourceToken->listHeader->first = sourceToken;
1480                                 sourceToken->previous = NULL;
1481                             } else {
1482                                 lastToken = sourceToken->listHeader->first;
1483                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1484                                     lastToken = lastToken->next;
1485                                 }
1486                                 if(lastToken->next != NULL) {
1487                                     lastToken->next->previous = sourceToken;
1488                                 } else {
1489                                     sourceToken->listHeader->last = sourceToken;
1490                                 }
1491                                 sourceToken->previous = lastToken;
1492                                 sourceToken->next = lastToken->next;
1493                                 lastToken->next = sourceToken;
1494                             }
1495                         }
1496                     } else {
1497                         /* Otherwise (when LAST is not a reset)
1498                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1499                         otherwise insert before.
1500                         when inserting after or before, search to the next position with the same
1501                         strength in that direction. (This is called postpone insertion).         */
1502                         if(sourceToken != lastToken) {
1503                             if(lastToken->polarity == sourceToken->polarity) {
1504                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1505                                     lastToken = lastToken->next;
1506                                 }
1507                                 sourceToken->previous = lastToken;
1508                                 if(lastToken->next != NULL) {
1509                                     lastToken->next->previous = sourceToken;
1510                                 } else {
1511                                     sourceToken->listHeader->last = sourceToken;
1512                                 }
1513 
1514                                 sourceToken->next = lastToken->next;
1515                                 lastToken->next = sourceToken;
1516                             } else {
1517                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1518                                     lastToken = lastToken->previous;
1519                                 }
1520                                 sourceToken->next = lastToken;
1521                                 if(lastToken->previous != NULL) {
1522                                     lastToken->previous->next = sourceToken;
1523                                 } else {
1524                                     sourceToken->listHeader->first = sourceToken;
1525                                 }
1526                                 sourceToken->previous = lastToken->previous;
1527                                 lastToken->previous = sourceToken;
1528                             }
1529                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
1530                             if(lastStrength < sourceToken->strength) {
1531                                 sourceToken->strength = lastStrength;
1532                             }
1533                         }
1534                     }
1535 
1536                     /* if the token was a variable top, we're gonna put it in */
1537                     if(variableTop == TRUE && src->varTop == NULL) {
1538                         variableTop = FALSE;
1539                         src->varTop = sourceToken;
1540                     }
1541 
1542                     // Treat the expansions.
1543                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1544                     // (&abc * d * e <=> &ab * d / c * e / c)
1545                     // if both of them are in effect for a token, they are combined.
1546 
1547                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1548 
1549                     if(expandNext != 0) {
1550                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1551                             expandNext = 0;
1552                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1553                             sourceToken->expansion = expandNext;
1554                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
1555                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1556                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1557                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1558                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1559                         }
1560                     }
1561 
1562                     // This is just for debugging purposes
1563                     if(sourceToken->expansion != 0) {
1564                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1565                     } else {
1566                         sourceToken->debugExpansion = 0;
1567                     }
1568                     // if the previous token was a reset before, the strength of this
1569                     // token must match the strength of before. Otherwise we have an
1570                     // undefined situation.
1571                     // In other words, we currently have a cludge which we use to
1572                     // represent &a >> x. This is written as &[before 2]a << x.
1573                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1574                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1575                         if(beforeStrength != sourceToken->strength) {
1576                             *status = U_INVALID_FORMAT_ERROR;
1577                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1578                             return 0;
1579                         }
1580                     }
1581             } else {
1582                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1583                     /* if the previous token was also a reset, */
1584                     /*this means that we have two consecutive resets */
1585                     /* and we want to remove the previous one if empty*/
1586                     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1587                         src->resultLen--;
1588                     }
1589                 }
1590 
1591                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1592                     uint32_t searchCharsLen = src->parsedToken.charsLen;
1593                     while(searchCharsLen > 1 && sourceToken == NULL) {
1594                         searchCharsLen--;
1595                         //key = searchCharsLen << 24 | charsOffset;
1596                         UColToken key;
1597                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1598                         key.rulesToParse = src->source;
1599                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1600                     }
1601                     if(sourceToken != NULL) {
1602                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1603                     }
1604                 }
1605 
1606                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1607                     if(top == FALSE) { /* there is no indirection */
1608                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1609                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1610                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1611                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1612                                 sourceToken = sourceToken->previous;
1613                             }
1614                             /* here, either we hit the strength or NULL */
1615                             if(sourceToken->strength == strength) {
1616                                 if(sourceToken->previous != NULL) {
1617                                     sourceToken = sourceToken->previous;
1618                                 } else { /* start of list */
1619                                     sourceToken = sourceToken->listHeader->reset;
1620                                 }
1621                             } else { /* we hit NULL */
1622                                 /* we should be doing the else part */
1623                                 sourceToken = sourceToken->listHeader->reset;
1624                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1625                             }
1626                         } else {
1627                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
1628                         }
1629                     } else { /* this is both before and indirection */
1630                         top = FALSE;
1631                         ListList[src->resultLen].previousCE = 0;
1632                         ListList[src->resultLen].previousContCE = 0;
1633                         ListList[src->resultLen].indirect = TRUE;
1634                         /* we need to do slightly more work. we need to get the baseCE using the */
1635                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
1636                         /* in ucol_bld */
1637                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1638                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1639                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
1640                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1641 
1642                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1643                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1644                             uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
1645                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1646                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1647                             CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
1648                             SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
1649                         } else {
1650                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
1651                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1652                         }
1653 
1654                         ListList[src->resultLen].baseCE = CE;
1655                         ListList[src->resultLen].baseContCE = SecondCE;
1656                         ListList[src->resultLen].nextCE = 0;
1657                         ListList[src->resultLen].nextContCE = 0;
1658 
1659                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1660                     }
1661                 }
1662 
1663 
1664                 /*  5 If the relation is a reset:
1665                 If sourceToken is null
1666                 Create new list, create new sourceToken, make the baseCE from source, put
1667                 the sourceToken in ListHeader of the new list */
1668                 if(sourceToken == NULL) {
1669                     /*
1670                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1671                     First convert all expansions into normal form. Examples:
1672                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1673                     d * ... into &x * c/y * d * ...
1674                     Note: reset values can never have expansions, although they can cause the
1675                     very next item to have one. They may be contractions, if they are found
1676                     earlier in the list.
1677                     */
1678                     if(top == FALSE) {
1679                         collIterate s;
1680                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
1681 
1682                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
1683 
1684                         CE = ucol_getNextCE(src->UCA, &s, status);
1685                         UChar *expand = s.pos;
1686                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
1687 
1688                         ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1689                         if(isContinuation(SecondCE)) {
1690                             ListList[src->resultLen].baseContCE = SecondCE;
1691                         } else {
1692                             ListList[src->resultLen].baseContCE = 0;
1693                         }
1694                         ListList[src->resultLen].nextCE = 0;
1695                         ListList[src->resultLen].nextContCE = 0;
1696                         ListList[src->resultLen].previousCE = 0;
1697                         ListList[src->resultLen].previousContCE = 0;
1698                         ListList[src->resultLen].indirect = FALSE;
1699                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
1700                     } else { /* top == TRUE */
1701                         /* just use the supplied values */
1702                         top = FALSE;
1703                         ListList[src->resultLen].previousCE = 0;
1704                         ListList[src->resultLen].previousContCE = 0;
1705                         ListList[src->resultLen].indirect = TRUE;
1706                         ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
1707                         ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
1708                         ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
1709                         ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
1710 
1711                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1712 
1713                     }
1714                 } else { /* reset to something already in rules */
1715                     top = FALSE;
1716                 }
1717             }
1718             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
1719             lastToken = sourceToken;
1720         } else {
1721             if(U_FAILURE(*status)) {
1722                 return 0;
1723             }
1724         }
1725     }
1726 
1727     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1728         src->resultLen--;
1729     }
1730     return src->resultLen;
1731 }
1732 
ucol_tok_initTokenList(UColTokenParser * src,const UChar * rules,const uint32_t rulesLength,const UCollator * UCA,UErrorCode * status)1733 void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
1734     U_NAMESPACE_USE
1735 
1736     uint32_t nSize = 0;
1737     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
1738     if(U_FAILURE(*status)) {
1739         return;
1740     }
1741 
1742     // set everything to zero, so that we can clean up gracefully
1743     uprv_memset(src, 0, sizeof(UColTokenParser));
1744 
1745     // first we need to find options that don't like to be normalized,
1746     // like copy and remove...
1747     //const UChar *openBrace = rules;
1748     int32_t optionNumber = -1;
1749     const UChar *setStart;
1750     uint32_t i = 0;
1751     while(i < rulesLength) {
1752         if(rules[i] == 0x005B) {
1753             // while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
1754             //optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
1755             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
1756             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
1757                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1758                 if(U_SUCCESS(*status)) {
1759                     if(src->copySet == NULL) {
1760                         src->copySet = newSet;
1761                     } else {
1762                         uset_addAll(src->copySet, newSet);
1763                         uset_close(newSet);
1764                     }
1765                 } else {
1766                     return;
1767                 }
1768             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
1769                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
1770                 if(U_SUCCESS(*status)) {
1771                     if(src->removeSet == NULL) {
1772                         src->removeSet = newSet;
1773                     } else {
1774                         uset_addAll(src->removeSet, newSet);
1775                         uset_close(newSet);
1776                     }
1777                 } else {
1778                     return;
1779                 }
1780             }
1781         }
1782         //openBrace++;
1783         i++;
1784     }
1785 
1786     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
1787     /* test for NULL */
1788     if (src->source == NULL) {
1789         *status = U_MEMORY_ALLOCATION_ERROR;
1790         return;
1791     }
1792     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
1793     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
1794     if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
1795         *status = U_ZERO_ERROR;
1796         src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
1797         /* test for NULL */
1798         if (src->source == NULL) {
1799             *status = U_MEMORY_ALLOCATION_ERROR;
1800             return;
1801         }
1802         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
1803     }
1804     src->current = src->source;
1805     src->end = src->source+nSize;
1806     src->sourceCurrent = src->source;
1807     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
1808     src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
1809     src->varTop = NULL;
1810     src->UCA = UCA;
1811     src->invUCA = ucol_initInverseUCA(status);
1812     src->parsedToken.charsLen = 0;
1813     src->parsedToken.charsOffset = 0;
1814     src->parsedToken.extensionLen = 0;
1815     src->parsedToken.extensionOffset = 0;
1816     src->parsedToken.prefixLen = 0;
1817     src->parsedToken.prefixOffset = 0;
1818     src->parsedToken.flags = 0;
1819     src->parsedToken.strength = UCOL_TOK_UNSET;
1820     src->buildCCTabFlag = FALSE;
1821 
1822     if(U_FAILURE(*status)) {
1823         return;
1824     }
1825     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
1826     if(U_FAILURE(*status)) {
1827         return;
1828     }
1829     uhash_setValueDeleter(src->tailored, uhash_freeBlock);
1830 
1831     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
1832     /* test for NULL */
1833     if (src->opts == NULL) {
1834         *status = U_MEMORY_ALLOCATION_ERROR;
1835         return;
1836     }
1837 
1838     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
1839 
1840     // rulesToParse = src->source;
1841     src->lh = 0;
1842     src->listCapacity = 1024;
1843     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
1844     //Test for NULL
1845     if (src->lh == NULL) {
1846         *status = U_MEMORY_ALLOCATION_ERROR;
1847         return;
1848     }
1849     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
1850     src->resultLen = 0;
1851 
1852     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1853 
1854     // UCOL_RESET_TOP_VALUE
1855     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1856     // UCOL_FIRST_PRIMARY_IGNORABLE
1857     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
1858     // UCOL_LAST_PRIMARY_IGNORABLE
1859     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
1860     // UCOL_FIRST_SECONDARY_IGNORABLE
1861     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
1862     // UCOL_LAST_SECONDARY_IGNORABLE
1863     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
1864     // UCOL_FIRST_TERTIARY_IGNORABLE
1865     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
1866     // UCOL_LAST_TERTIARY_IGNORABLE
1867     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
1868     // UCOL_FIRST_VARIABLE
1869     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
1870     // UCOL_LAST_VARIABLE
1871     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
1872     // UCOL_FIRST_NON_VARIABLE
1873     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
1874     // UCOL_LAST_NON_VARIABLE
1875     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
1876     // UCOL_FIRST_IMPLICIT
1877     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
1878     // UCOL_LAST_IMPLICIT
1879     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
1880     // UCOL_FIRST_TRAILING
1881     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
1882     // UCOL_LAST_TRAILING
1883     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
1884     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
1885 }
1886 
1887 
ucol_tok_closeTokenList(UColTokenParser * src)1888 void ucol_tok_closeTokenList(UColTokenParser *src) {
1889     if(src->copySet != NULL) {
1890         uset_close(src->copySet);
1891     }
1892     if(src->removeSet != NULL) {
1893         uset_close(src->removeSet);
1894     }
1895     if(src->tailored != NULL) {
1896         uhash_close(src->tailored);
1897     }
1898     if(src->lh != NULL) {
1899         uprv_free(src->lh);
1900     }
1901     if(src->source != NULL) {
1902         uprv_free(src->source);
1903     }
1904     if(src->opts != NULL) {
1905         uprv_free(src->opts);
1906     }
1907 }
1908 
1909 #endif /* #if !UCONFIG_NO_COLLATION */
1910 
1911