• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2001-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  ucol_tok.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created 02/22/2001
14 *   created by: Vladimir Weinstein
15 *
16 * This module reads a tailoring rule string and produces a list of
17 * tokens that will be turned into collation elements
18 *
19 */
20 
21 #include "unicode/utypes.h"
22 
23 #if !UCONFIG_NO_COLLATION
24 
25 #include "unicode/uscript.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
28 #include "unicode/uniset.h"
29 
30 #include "cmemory.h"
31 #include "cstring.h"
32 #include "ucol_bld.h"
33 #include "ucol_tok.h"
34 #include "ulocimp.h"
35 #include "uresimp.h"
36 #include "util.h"
37 
38 // Define this only for debugging.
39 // #define DEBUG_FOR_COLL_RULES 1
40 
41 #ifdef DEBUG_FOR_COLL_RULES
42 #include <iostream>
43 #endif
44 
45 U_NAMESPACE_USE
46 
47 U_CDECL_BEGIN
48 static int32_t U_CALLCONV
uhash_hashTokens(const UHashTok k)49 uhash_hashTokens(const UHashTok k)
50 {
51     int32_t hash = 0;
52     //uint32_t key = (uint32_t)k.integer;
53     UColToken *key = (UColToken *)k.pointer;
54     if (key != 0) {
55         int32_t len = (key->source & 0xFF000000)>>24;
56         int32_t inc = ((len - 32) / 32) + 1;
57 
58         const UChar *p = (key->source & 0x00FFFFFF) + *(key->rulesToParseHdl);
59         const UChar *limit = p + len;
60 
61         while (p<limit) {
62             hash = (hash * 37) + *p;
63             p += inc;
64         }
65     }
66     return hash;
67 }
68 
69 static UBool U_CALLCONV
uhash_compareTokens(const UHashTok key1,const UHashTok key2)70 uhash_compareTokens(const UHashTok key1, const UHashTok key2)
71 {
72     //uint32_t p1 = (uint32_t) key1.integer;
73     //uint32_t p2 = (uint32_t) key2.integer;
74     UColToken *p1 = (UColToken *)key1.pointer;
75     UColToken *p2 = (UColToken *)key2.pointer;
76     const UChar *s1 = (p1->source & 0x00FFFFFF) + *(p1->rulesToParseHdl);
77     const UChar *s2 = (p2->source & 0x00FFFFFF) + *(p2->rulesToParseHdl);
78     uint32_t s1L = ((p1->source & 0xFF000000) >> 24);
79     uint32_t s2L = ((p2->source & 0xFF000000) >> 24);
80     const UChar *end = s1+s1L-1;
81 
82     if (p1 == p2) {
83         return TRUE;
84     }
85     if (p1->source == 0 || p2->source == 0) {
86         return FALSE;
87     }
88     if(s1L != s2L) {
89         return FALSE;
90     }
91     if(p1->source == p2->source) {
92         return TRUE;
93     }
94     while((s1 < end) && *s1 == *s2) {
95         ++s1;
96         ++s2;
97     }
98     if(*s1 == *s2) {
99         return TRUE;
100     } else {
101         return FALSE;
102     }
103 }
104 U_CDECL_END
105 
106 /*
107  * Debug messages used to pinpoint where a format error occurred.
108  * A better way is to include context-sensitive information in syntaxError() function.
109  *
 * To turn this debugging on, either uncomment the following line, or define
 * DEBUG_FOR_FORMAT_ERROR on the compile line (e.g. -DDEBUG_FOR_FORMAT_ERROR).
112  */
113 /* #define DEBUG_FOR_FORMAT_ERROR 1 */
114 
115 #ifdef DEBUG_FOR_FORMAT_ERROR
116 #define DBG_FORMAT_ERROR { printf("U_INVALID_FORMAT_ERROR at line %d", __LINE__);}
117 #else
118 #define DBG_FORMAT_ERROR
119 #endif
120 
121 
122 /*
123  * Controls debug messages so that the output can be compared before and after a
124  * big change.  Prints the information of every code point that comes out of the
125  * collation parser and its strength into a file.  When a big change in format
126  * happens, the files before and after the change should be identical.
127  *
 * To turn this debugging on, either uncomment the following line, or define
 * DEBUG_FOR_CODE_POINTS on the compile line (e.g. -DDEBUG_FOR_CODE_POINTS).
130  */
131 // #define DEBUG_FOR_CODE_POINTS 1
132 
133 #ifdef DEBUG_FOR_CODE_POINTS
134     FILE* dfcp_fp = NULL;
135 #endif
136 
137 
138 /*static inline void U_CALLCONV
139 uhash_freeBlockWrapper(void *obj) {
140     uhash_freeBlock(obj);
141 }*/
142 
143 
/* CE values recorded for one indirect (symbolic) reset position. */
typedef struct {
    uint32_t startCE;      /* first word of the start boundary            */
    uint32_t startContCE;  /* second word of the start boundary           */
    uint32_t limitCE;      /* first word of the limit; 0 when not a range */
    uint32_t limitContCE;  /* second word of the limit; 0 when not a range */
} indirectBoundaries;

/*
 * These values are used for finding CE values for indirect positioning.
 * Indirect positioning is a mechanism for allowing resets on symbolic
 * values. It only works for resets, and you cannot tailor indirect names.
 * An indirect name can define either an anchor point or a range. An
 * anchor point behaves in exactly the same way as a code point in a reset
 * would, except that it cannot be tailored. A range (we currently only
 * know of the [top] range) will explicitly set the upper bound for
 * generated CEs, thus allowing for better control over how many CEs can
 * be squeezed into the range without a performance penalty.
 * In that respect, we use [top] for tailoring of locales that use CJK
 * characters. Other indirect values are currently a pure convenience;
 * they can be used to ensure that the CEs will always be positioned in
 * the same place relative to a point with known properties (e.g. first
 * primary ignorable).
 */
static indirectBoundaries ucolIndirectBoundaries[15];
166 /*
167 static indirectBoundaries ucolIndirectBoundaries[11] = {
168 { UCOL_RESET_TOP_VALUE,               0,
169 UCOL_NEXT_TOP_VALUE,                0 },
170 { UCOL_FIRST_PRIMARY_IGNORABLE,       0,
171 0,                                  0 },
172 { UCOL_LAST_PRIMARY_IGNORABLE,        UCOL_LAST_PRIMARY_IGNORABLE_CONT,
173 0,                                  0 },
174 { UCOL_FIRST_SECONDARY_IGNORABLE,     0,
175 0,                                  0 },
176 { UCOL_LAST_SECONDARY_IGNORABLE,      0,
177 0,                                  0 },
178 { UCOL_FIRST_TERTIARY_IGNORABLE,      0,
179 0,                                  0 },
180 { UCOL_LAST_TERTIARY_IGNORABLE,       0,
181 0,                                  0 },
182 { UCOL_FIRST_VARIABLE,                0,
183 0,                                  0 },
184 { UCOL_LAST_VARIABLE,                 0,
185 0,                                  0 },
186 { UCOL_FIRST_NON_VARIABLE,            0,
187 0,                                  0 },
188 { UCOL_LAST_NON_VARIABLE,             0,
189 0,                                  0 },
190 };
191 */
192 
setIndirectBoundaries(uint32_t indexR,uint32_t * start,uint32_t * end)193 static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
194 
195     // Set values for the top - TODO: once we have values for all the indirects, we are going
196     // to initalize here.
197     ucolIndirectBoundaries[indexR].startCE = start[0];
198     ucolIndirectBoundaries[indexR].startContCE = start[1];
199     if(end) {
200         ucolIndirectBoundaries[indexR].limitCE = end[0];
201         ucolIndirectBoundaries[indexR].limitContCE = end[1];
202     } else {
203         ucolIndirectBoundaries[indexR].limitCE = 0;
204         ucolIndirectBoundaries[indexR].limitContCE = 0;
205     }
206 }
207 
208 
209 static inline
syntaxError(const UChar * rules,int32_t pos,int32_t rulesLen,UParseError * parseError)210 void syntaxError(const UChar* rules,
211                  int32_t pos,
212                  int32_t rulesLen,
213                  UParseError* parseError)
214 {
215     parseError->offset = pos;
216     parseError->line = 0 ; /* we are not using line numbers */
217 
218     // for pre-context
219     int32_t start = (pos < U_PARSE_CONTEXT_LEN)? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
220     int32_t stop  = pos;
221 
222     u_memcpy(parseError->preContext,rules+start,stop-start);
223     //null terminate the buffer
224     parseError->preContext[stop-start] = 0;
225 
226     //for post-context
227     start = pos+1;
228     stop  = ((pos+U_PARSE_CONTEXT_LEN)<= rulesLen )? (pos+(U_PARSE_CONTEXT_LEN-1)) :
229     rulesLen;
230 
231     if(start < stop) {
232         u_memcpy(parseError->postContext,rules+start,stop-start);
233         //null terminate the buffer
234         parseError->postContext[stop-start]= 0;
235     } else {
236         parseError->postContext[0] = 0;
237     }
238 }
239 
240 static
ucol_uprv_tok_setOptionInImage(UColOptionSet * opts,UColAttribute attrib,UColAttributeValue value)241 void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
242     switch(attrib) {
243     case UCOL_HIRAGANA_QUATERNARY_MODE:
244         opts->hiraganaQ = value;
245         break;
246     case UCOL_FRENCH_COLLATION:
247         opts->frenchCollation = value;
248         break;
249     case UCOL_ALTERNATE_HANDLING:
250         opts->alternateHandling = value;
251         break;
252     case UCOL_CASE_FIRST:
253         opts->caseFirst = value;
254         break;
255     case UCOL_CASE_LEVEL:
256         opts->caseLevel = value;
257         break;
258     case UCOL_NORMALIZATION_MODE:
259         opts->normalizationMode = value;
260         break;
261     case UCOL_STRENGTH:
262         opts->strength = value;
263         break;
264     case UCOL_NUMERIC_COLLATION:
265         opts->numericCollation = value;
266         break;
267     case UCOL_ATTRIBUTE_COUNT:
268     default:
269         break;
270     }
271 }
272 
273 #define UTOK_OPTION_COUNT 22
274 
275 static UBool didInit = FALSE;
276 /* we can be strict, or we can be lenient */
277 /* I'd surely be lenient with the option arguments */
278 /* maybe even with options */
279 U_STRING_DECL(suboption_00, "non-ignorable", 13);
280 U_STRING_DECL(suboption_01, "shifted",        7);
281 
282 U_STRING_DECL(suboption_02, "lower",          5);
283 U_STRING_DECL(suboption_03, "upper",          5);
284 U_STRING_DECL(suboption_04, "off",            3);
285 U_STRING_DECL(suboption_05, "on",             2);
286 U_STRING_DECL(suboption_06, "1",              1);
287 U_STRING_DECL(suboption_07, "2",              1);
288 U_STRING_DECL(suboption_08, "3",              1);
289 U_STRING_DECL(suboption_09, "4",              1);
290 U_STRING_DECL(suboption_10, "I",              1);
291 
292 U_STRING_DECL(suboption_11, "primary",        7);
293 U_STRING_DECL(suboption_12, "secondary",      9);
294 U_STRING_DECL(suboption_13, "tertiary",       8);
295 U_STRING_DECL(suboption_14, "variable",       8);
296 U_STRING_DECL(suboption_15, "regular",        7);
297 U_STRING_DECL(suboption_16, "implicit",       8);
298 U_STRING_DECL(suboption_17, "trailing",       8);
299 
300 
301 U_STRING_DECL(option_00,    "undefined",      9);
302 U_STRING_DECL(option_01,    "rearrange",      9);
303 U_STRING_DECL(option_02,    "alternate",      9);
304 U_STRING_DECL(option_03,    "backwards",      9);
305 U_STRING_DECL(option_04,    "variable top",  12);
306 U_STRING_DECL(option_05,    "top",            3);
307 U_STRING_DECL(option_06,    "normalization", 13);
308 U_STRING_DECL(option_07,    "caseLevel",      9);
309 U_STRING_DECL(option_08,    "caseFirst",      9);
310 U_STRING_DECL(option_09,    "scriptOrder",   11);
311 U_STRING_DECL(option_10,    "charsetname",   11);
312 U_STRING_DECL(option_11,    "charset",        7);
313 U_STRING_DECL(option_12,    "before",         6);
314 U_STRING_DECL(option_13,    "hiraganaQ",      9);
315 U_STRING_DECL(option_14,    "strength",       8);
316 U_STRING_DECL(option_15,    "first",          5);
317 U_STRING_DECL(option_16,    "last",           4);
318 U_STRING_DECL(option_17,    "optimize",       8);
319 U_STRING_DECL(option_18,    "suppressContractions",         20);
320 U_STRING_DECL(option_19,    "numericOrdering",              15);
321 U_STRING_DECL(option_20,    "import",         6);
322 U_STRING_DECL(option_21,    "reorder",         7);
323 
324 /*
325 [last variable] last variable value
326 [last primary ignorable] largest CE for primary ignorable
327 [last secondary ignorable] largest CE for secondary ignorable
328 [last tertiary ignorable] largest CE for tertiary ignorable
329 [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
330 */
331 
332 
333 static const ucolTokSuboption alternateSub[2] = {
334     {suboption_00, 13, UCOL_NON_IGNORABLE},
335     {suboption_01,  7, UCOL_SHIFTED}
336 };
337 
338 static const ucolTokSuboption caseFirstSub[3] = {
339     {suboption_02, 5, UCOL_LOWER_FIRST},
340     {suboption_03,  5, UCOL_UPPER_FIRST},
341     {suboption_04,  3, UCOL_OFF},
342 };
343 
344 static const ucolTokSuboption onOffSub[2] = {
345     {suboption_04, 3, UCOL_OFF},
346     {suboption_05, 2, UCOL_ON}
347 };
348 
349 static const ucolTokSuboption frenchSub[1] = {
350     {suboption_07, 1, UCOL_ON}
351 };
352 
353 static const ucolTokSuboption beforeSub[3] = {
354     {suboption_06, 1, UCOL_PRIMARY},
355     {suboption_07, 1, UCOL_SECONDARY},
356     {suboption_08, 1, UCOL_TERTIARY}
357 };
358 
359 static const ucolTokSuboption strengthSub[5] = {
360     {suboption_06, 1, UCOL_PRIMARY},
361     {suboption_07, 1, UCOL_SECONDARY},
362     {suboption_08, 1, UCOL_TERTIARY},
363     {suboption_09, 1, UCOL_QUATERNARY},
364     {suboption_10, 1, UCOL_IDENTICAL},
365 };
366 
367 static const ucolTokSuboption firstLastSub[7] = {
368     {suboption_11, 7, UCOL_PRIMARY},
369     {suboption_12, 9, UCOL_PRIMARY},
370     {suboption_13, 8, UCOL_PRIMARY},
371     {suboption_14, 8, UCOL_PRIMARY},
372     {suboption_15, 7, UCOL_PRIMARY},
373     {suboption_16, 8, UCOL_PRIMARY},
374     {suboption_17, 8, UCOL_PRIMARY},
375 };
376 
/*
 * Option indices.  Values are spelled out so that the numbering is visible
 * at a glance; OPTION_NORMAL_OPTIONS_LIMIT aliases the last of the leading
 * group of options (the same value as OPTION_NUMERIC_COLLATION).
 */
enum OptionNumber {
    OPTION_ALTERNATE_HANDLING    = 0,
    OPTION_FRENCH_COLLATION      = 1,
    OPTION_CASE_LEVEL            = 2,
    OPTION_CASE_FIRST            = 3,
    OPTION_NORMALIZATION_MODE    = 4,
    OPTION_HIRAGANA_QUATERNARY   = 5,
    OPTION_STRENGTH              = 6,
    OPTION_NUMERIC_COLLATION     = 7,
    OPTION_NORMAL_OPTIONS_LIMIT  = OPTION_NUMERIC_COLLATION,
    OPTION_VARIABLE_TOP          = 8,
    OPTION_REARRANGE             = 9,
    OPTION_BEFORE                = 10,
    OPTION_TOP                   = 11,
    OPTION_FIRST                 = 12,
    OPTION_LAST                  = 13,
    OPTION_OPTIMIZE              = 14,
    OPTION_SUPPRESS_CONTRACTIONS = 15,
    OPTION_UNDEFINED             = 16,
    OPTION_SCRIPT_ORDER          = 17,
    OPTION_CHARSET_NAME          = 18,
    OPTION_CHARSET               = 19,
    OPTION_IMPORT                = 20,
    OPTION_SCRIPTREORDER         = 21
};
402 
403 static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
404     /*00*/ {option_02,  9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
405     /*01*/ {option_03,  9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards"      */
406     /*02*/ {option_07,  9, onOffSub, 2, UCOL_CASE_LEVEL},  /*"caseLevel"      */
407     /*03*/ {option_08,  9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst"   */
408     /*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
409     /*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
410     /*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
411     /*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION},  /*"numericOrdering"*/
412     /*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top"   */
413     /*09*/ {option_01,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange"      */
414     /*10*/ {option_12,  6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before"    */
415     /*11*/ {option_05,  3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top"            */
416     /*12*/ {option_15,  5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
417     /*13*/ {option_16,  4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
418     /*14*/ {option_17,  8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize"      */
419     /*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions"      */
420     /*16*/ {option_00,  9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined"      */
421     /*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder"    */
422     /*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname"    */
423     /*19*/ {option_11,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"charset"        */
424     /*20*/ {option_20,  6, NULL, 0, UCOL_ATTRIBUTE_COUNT},  /*"import"        */
425     /*21*/ {option_21,  7, NULL, 0, UCOL_ATTRIBUTE_COUNT}  /*"reorder"        */
426 };
427 
428 static
u_strncmpNoCase(const UChar * s1,const UChar * s2,int32_t n)429 int32_t u_strncmpNoCase(const UChar     *s1,
430                         const UChar     *s2,
431                         int32_t     n)
432 {
433     if(n > 0) {
434         int32_t rc;
435         for(;;) {
436             rc = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
437             if(rc != 0 || *s1 == 0 || --n == 0) {
438                 return rc;
439             }
440             ++s1;
441             ++s2;
442         }
443     }
444     return 0;
445 }
446 
447 static
ucol_uprv_tok_initData()448 void ucol_uprv_tok_initData() {
449     if(!didInit) {
450         U_STRING_INIT(suboption_00, "non-ignorable", 13);
451         U_STRING_INIT(suboption_01, "shifted",        7);
452 
453         U_STRING_INIT(suboption_02, "lower",          5);
454         U_STRING_INIT(suboption_03, "upper",          5);
455         U_STRING_INIT(suboption_04, "off",            3);
456         U_STRING_INIT(suboption_05, "on",             2);
457 
458         U_STRING_INIT(suboption_06, "1",              1);
459         U_STRING_INIT(suboption_07, "2",              1);
460         U_STRING_INIT(suboption_08, "3",              1);
461         U_STRING_INIT(suboption_09, "4",              1);
462         U_STRING_INIT(suboption_10, "I",              1);
463 
464         U_STRING_INIT(suboption_11, "primary",        7);
465         U_STRING_INIT(suboption_12, "secondary",      9);
466         U_STRING_INIT(suboption_13, "tertiary",       8);
467         U_STRING_INIT(suboption_14, "variable",       8);
468         U_STRING_INIT(suboption_15, "regular",        7);
469         U_STRING_INIT(suboption_16, "implicit",       8);
470         U_STRING_INIT(suboption_17, "trailing",       8);
471 
472 
473         U_STRING_INIT(option_00, "undefined",      9);
474         U_STRING_INIT(option_01, "rearrange",      9);
475         U_STRING_INIT(option_02, "alternate",      9);
476         U_STRING_INIT(option_03, "backwards",      9);
477         U_STRING_INIT(option_04, "variable top",  12);
478         U_STRING_INIT(option_05, "top",            3);
479         U_STRING_INIT(option_06, "normalization", 13);
480         U_STRING_INIT(option_07, "caseLevel",      9);
481         U_STRING_INIT(option_08, "caseFirst",      9);
482         U_STRING_INIT(option_09, "scriptOrder",   11);
483         U_STRING_INIT(option_10, "charsetname",   11);
484         U_STRING_INIT(option_11, "charset",        7);
485         U_STRING_INIT(option_12, "before",         6);
486         U_STRING_INIT(option_13, "hiraganaQ",      9);
487         U_STRING_INIT(option_14, "strength",       8);
488         U_STRING_INIT(option_15, "first",          5);
489         U_STRING_INIT(option_16, "last",           4);
490         U_STRING_INIT(option_17, "optimize",       8);
491         U_STRING_INIT(option_18, "suppressContractions",         20);
492         U_STRING_INIT(option_19, "numericOrdering",      15);
493         U_STRING_INIT(option_20, "import ",        6);
494         U_STRING_INIT(option_21, "reorder",        7);
495         didInit = TRUE;
496     }
497 }
498 
499 
500 // This function reads basic options to set in the runtime collator
501 // used by data driven tests. Should not support build time options
502 U_CAPI const UChar * U_EXPORT2
ucol_tok_getNextArgument(const UChar * start,const UChar * end,UColAttribute * attrib,UColAttributeValue * value,UErrorCode * status)503 ucol_tok_getNextArgument(const UChar *start, const UChar *end,
504                          UColAttribute *attrib, UColAttributeValue *value,
505                          UErrorCode *status)
506 {
507     uint32_t i = 0;
508     int32_t j=0;
509     UBool foundOption = FALSE;
510     const UChar *optionArg = NULL;
511 
512     ucol_uprv_tok_initData();
513 
514     while(start < end && (u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start))) { /* eat whitespace */
515         start++;
516     }
517     if(start >= end) {
518         return NULL;
519     }
520     /* skip opening '[' */
521     if(*start == 0x005b) {
522         start++;
523     } else {
524         *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
525         return NULL;
526     }
527 
528     while(i < UTOK_OPTION_COUNT) {
529         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
530             foundOption = TRUE;
531             if(end - start > rulesOptions[i].optionLen) {
532                 optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
533                 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
534                     optionArg++;
535                 }
536             }
537             break;
538         }
539         i++;
540     }
541 
542     if(!foundOption) {
543         *status = U_ILLEGAL_ARGUMENT_ERROR;
544         return NULL;
545     }
546 
547     if(optionArg) {
548         for(j = 0; j<rulesOptions[i].subSize; j++) {
549             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
550                 //ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
551                 *attrib = rulesOptions[i].attr;
552                 *value = rulesOptions[i].subopts[j].attrVal;
553                 optionArg += rulesOptions[i].subopts[j].subLen;
554                 while(u_isWhitespace(*optionArg) || uprv_isRuleWhiteSpace(*optionArg)) { /* eat whitespace */
555                     optionArg++;
556                 }
557                 if(*optionArg == 0x005d) {
558                     optionArg++;
559                     return optionArg;
560                 } else {
561                     *status = U_ILLEGAL_ARGUMENT_ERROR;
562                     return NULL;
563                 }
564             }
565         }
566     }
567     *status = U_ILLEGAL_ARGUMENT_ERROR;
568     return NULL;
569 }
570 
571 static
ucol_uprv_tok_readAndSetUnicodeSet(const UChar * start,const UChar * end,UErrorCode * status)572 USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
573     while(*start != 0x005b) { /* advance while we find the first '[' */
574         start++;
575     }
576     // now we need to get a balanced set of '[]'. The problem is that a set can have
577     // many, and *end point to the first closing '['
578     int32_t noOpenBraces = 1;
579     int32_t current = 1; // skip the opening brace
580     while(start+current < end && noOpenBraces != 0) {
581         if(start[current] == 0x005b) {
582             noOpenBraces++;
583         } else if(start[current] == 0x005D) { // closing brace
584             noOpenBraces--;
585         }
586         current++;
587     }
588 
589     if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
590         *status = U_ILLEGAL_ARGUMENT_ERROR;
591         return NULL;
592     }
593     return uset_openPattern(start, current, status);
594 }
595 
596 /**
597  * Reads an option and matches the option name with the predefined options. (Case-insensitive.)
598  * @param start Pointer to the start UChar.
599  * @param end Pointer to the last valid pointer beyond which the option will not extend.
600  * @param optionArg Address of the pointer at which the options start (after the option name)
601  * @return The index of the option, or -1 if the option is not valid.
602  */
603 static
ucol_uprv_tok_readOption(const UChar * start,const UChar * end,const UChar ** optionArg)604 int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
605     int32_t i = 0;
606     ucol_uprv_tok_initData();
607 
608     while(u_isWhitespace(*start) || uprv_isRuleWhiteSpace(*start)) { /* eat whitespace */
609         start++;
610     }
611     while(i < UTOK_OPTION_COUNT) {
612         if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
613             if(end - start > rulesOptions[i].optionLen) {
614                 *optionArg = start+rulesOptions[i].optionLen; /* End of option name; start of the options */
615                 while(u_isWhitespace(**optionArg) || uprv_isRuleWhiteSpace(**optionArg)) { /* eat whitespace */
616                     (*optionArg)++;
617                 }
618             }
619             break;
620         }
621         i++;
622     }
623     if(i == UTOK_OPTION_COUNT) {
624         i = -1; // didn't find an option
625     }
626     return i;
627 }
628 
629 
630 static
ucol_tok_parseScriptReorder(UColTokenParser * src,UErrorCode * status)631 void ucol_tok_parseScriptReorder(UColTokenParser *src, UErrorCode *status) {
632     int32_t codeCount = 0;
633     int32_t codeIndex = 0;
634     char conversion[64];
635     int32_t tokenLength = 0;
636     const UChar* space;
637 
638     const UChar* current = src->current;
639     const UChar* end = u_memchr(src->current, 0x005d, src->end - src->current);
640 
641     // eat leading whitespace
642     while(current < end && u_isWhitespace(*current)) {
643         current++;
644     }
645 
646     while(current < end) {
647         space = u_memchr(current, 0x0020, end - current);
648         space = space == 0 ? end : space;
649         tokenLength = space - current;
650         if (tokenLength < 4) {
651             *status = U_INVALID_FORMAT_ERROR;
652             return;
653         }
654         codeCount++;
655         current += tokenLength;
656         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
657             ++current;
658         }
659     }
660 
661     if (codeCount == 0) {
662         *status = U_INVALID_FORMAT_ERROR;
663     }
664 
665     src->reorderCodesLength = codeCount;
666     src->reorderCodes = (int32_t*)uprv_malloc(codeCount * sizeof(int32_t));
667     current = src->current;
668 
669     // eat leading whitespace
670     while(current < end && u_isWhitespace(*current)) {
671         current++;
672     }
673 
674     while(current < end) {
675         space = u_memchr(current, 0x0020, end - current);
676         space = space == 0 ? end : space;
677         tokenLength = space - current;
678         if (tokenLength < 4) {
679             *status = U_ILLEGAL_ARGUMENT_ERROR;
680             return;
681         } else {
682             u_UCharsToChars(current, conversion, tokenLength);
683             conversion[tokenLength] = '\0';
684             src->reorderCodes[codeIndex] = ucol_findReorderingEntry(conversion);
685             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
686                 src->reorderCodes[codeIndex] = u_getPropertyValueEnum(UCHAR_SCRIPT, conversion);
687             }
688             if (src->reorderCodes[codeIndex] == USCRIPT_INVALID_CODE) {
689                 *status = U_ILLEGAL_ARGUMENT_ERROR;
690             }
691         }
692         codeIndex++;
693         current += tokenLength;
694         while(current < end && u_isWhitespace(*current)) { /* eat whitespace */
695             ++current;
696         }
697     }
698 }
699 
700 // reads and conforms to various options in rules
701 // end is the position of the first closing ']'
702 // However, some of the options take an UnicodeSet definition
703 // which needs to duplicate the closing ']'
704 // for example: '[copy [\uAC00-\uD7FF]]'
705 // These options will move end to the second ']' and the
706 // caller will set the current to it.
707 static
ucol_uprv_tok_readAndSetOption(UColTokenParser * src,UErrorCode * status)708 uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
709     const UChar* start = src->current;
710     int32_t i = 0;
711     int32_t j=0;
712     const UChar *optionArg = NULL;
713 
714     uint8_t result = 0;
715 
716     start++; /*skip opening '['*/
717     i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
718     if(optionArg) {
719         src->current = optionArg;
720     }
721 
722     if(i < 0) {
723         *status = U_ILLEGAL_ARGUMENT_ERROR;
724     } else {
725         int32_t noOpenBraces = 1;
726         switch(i) {
727     case OPTION_ALTERNATE_HANDLING:
728     case OPTION_FRENCH_COLLATION:
729     case OPTION_CASE_LEVEL:
730     case OPTION_CASE_FIRST:
731     case OPTION_NORMALIZATION_MODE:
732     case OPTION_HIRAGANA_QUATERNARY:
733     case OPTION_STRENGTH:
734     case OPTION_NUMERIC_COLLATION:
735         if(optionArg) {
736             for(j = 0; j<rulesOptions[i].subSize; j++) {
737                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
738                     ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
739                     result =  UCOL_TOK_SUCCESS;
740                 }
741             }
742         }
743         if(result == 0) {
744             *status = U_ILLEGAL_ARGUMENT_ERROR;
745         }
746         break;
747     case OPTION_VARIABLE_TOP:
748         result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
749         break;
750     case OPTION_REARRANGE:
751         result = UCOL_TOK_SUCCESS;
752         break;
753     case OPTION_BEFORE:
754         if(optionArg) {
755             for(j = 0; j<rulesOptions[i].subSize; j++) {
756                 if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
757                     result = UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1);
758                 }
759             }
760         }
761         if(result == 0) {
762             *status = U_ILLEGAL_ARGUMENT_ERROR;
763         }
764         break;
765     case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
766         /* index to this array will be src->parsedToken.indirectIndex*/
767         src->parsedToken.indirectIndex = 0;
768         result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
769         break;
770     case OPTION_FIRST:
771     case OPTION_LAST: /* first, last */
772         for(j = 0; j<rulesOptions[i].subSize; j++) {
773             if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
774                 // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
775                 // element of indirect boundaries is reserved for top.
776                 src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
777                 result =  UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
778             }
779         }
780         if(result == 0) {
781             *status = U_ILLEGAL_ARGUMENT_ERROR;
782         }
783         break;
784     case OPTION_OPTIMIZE:
785     case OPTION_SUPPRESS_CONTRACTIONS:  // copy and remove are handled before normalization
786         // we need to move end here
787         src->current++; // skip opening brace
788         while(src->current < src->end && noOpenBraces != 0) {
789             if(*src->current == 0x005b) {
790                 noOpenBraces++;
791             } else if(*src->current == 0x005D) { // closing brace
792                 noOpenBraces--;
793             }
794             src->current++;
795         }
796         result = UCOL_TOK_SUCCESS;
797         break;
798     case OPTION_SCRIPTREORDER:
799         ucol_tok_parseScriptReorder(src, status);
800         break;
801     default:
802         *status = U_UNSUPPORTED_ERROR;
803         break;
804         }
805     }
806     src->current = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
807     return result;
808 }
809 
810 
ucol_tok_addToExtraCurrent(UColTokenParser * src,const UChar * stuff,int32_t len,UErrorCode * status)811 inline void ucol_tok_addToExtraCurrent(UColTokenParser *src, const UChar *stuff, int32_t len, UErrorCode *status) {
812     if (stuff == NULL || len <= 0) {
813         return;
814     }
815     UnicodeString tempStuff(FALSE, stuff, len);
816     if(src->extraCurrent+len >= src->extraEnd) {
817         /* reallocate */
818         if (stuff >= src->source && stuff <= src->end) {
819             // Copy the "stuff" contents into tempStuff's own buffer.
820             // UnicodeString is copy-on-write.
821             if (len > 0) {
822                 tempStuff.setCharAt(0, tempStuff[0]);
823             } else {
824                 tempStuff.remove();
825             }
826         }
827         UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
828         if(newSrc != NULL) {
829             src->current = newSrc + (src->current - src->source);
830             src->extraCurrent = newSrc + (src->extraCurrent - src->source);
831             src->end = newSrc + (src->end - src->source);
832             src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
833             src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
834             src->source = newSrc;
835         } else {
836             *status = U_MEMORY_ALLOCATION_ERROR;
837             return;
838         }
839     }
840     if(len == 1) {
841         *src->extraCurrent++ = tempStuff[0];
842     } else {
843         u_memcpy(src->extraCurrent, tempStuff.getBuffer(), len);
844         src->extraCurrent += len;
845     }
846 }
847 
ucol_tok_doSetTop(UColTokenParser * src,UErrorCode * status)848 inline UBool ucol_tok_doSetTop(UColTokenParser *src, UErrorCode *status) {
849     /*
850     top = TRUE;
851     */
852     UChar buff[5];
853     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
854     buff[0] = 0xFFFE;
855     buff[1] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
856     buff[2] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
857     if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
858         src->parsedToken.charsLen = 3;
859         ucol_tok_addToExtraCurrent(src, buff, 3, status);
860     } else {
861         buff[3] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
862         buff[4] = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
863         src->parsedToken.charsLen = 5;
864         ucol_tok_addToExtraCurrent(src, buff, 5, status);
865     }
866     return TRUE;
867 }
868 
isCharNewLine(UChar c)869 static UBool isCharNewLine(UChar c){
870     switch(c){
871     case 0x000A: /* LF  */
872     case 0x000D: /* CR  */
873     case 0x000C: /* FF  */
874     case 0x0085: /* NEL */
875     case 0x2028: /* LS  */
876     case 0x2029: /* PS  */
877         return TRUE;
878     default:
879         return FALSE;
880     }
881 }
882 
883 /*
884  * This function is called several times when a range is processed.  Each time, the next code point
885  * is processed.
886  * The following variables must be set before calling this function:
887  *   src->currentRangeCp:  The current code point to process.
888  *   src->lastRangeCp: The last code point in the range.
889  * Pre-requisite: src->currentRangeCp <= src->lastRangeCp.
890  */
891 static const UChar*
ucol_tok_processNextCodePointInRange(UColTokenParser * src,UErrorCode * status)892 ucol_tok_processNextCodePointInRange(UColTokenParser *src,
893                                      UErrorCode *status)
894 {
895   // Append current code point to source
896   UChar buff[U16_MAX_LENGTH];
897   uint32_t i = 0;
898 
899   uint32_t nChars = U16_LENGTH(src->currentRangeCp);
900   src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
901   src->parsedToken.charsLen = nChars;
902 
903   U16_APPEND_UNSAFE(buff, i, src->currentRangeCp);
904   ucol_tok_addToExtraCurrent(src, buff, nChars, status);
905 
906   ++src->currentRangeCp;
907   if (src->currentRangeCp > src->lastRangeCp) {
908     src->inRange = FALSE;
909 
910     if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
911       src->isStarred = FALSE;
912     }
913   } else {
914     src->previousCp = src->currentRangeCp;
915   }
916   return src->current;
917 }
918 
919 /*
920  * This function is called several times when a starred list is processed.  Each time, the next code point
921  * in the list is processed.
922  * The following variables must be set before calling this function:
923  *   src->currentStarredCharIndex:  Index (in src->source) of the first char of the current code point.
924  *   src->lastStarredCharIndex: Index to the last character in the list.
925  * Pre-requisite: src->currentStarredCharIndex <= src->lastStarredCharIndex.
926  */
927 static const UChar*
ucol_tok_processNextTokenInStarredList(UColTokenParser * src)928 ucol_tok_processNextTokenInStarredList(UColTokenParser *src)
929 {
930   // Extract the characters corresponding to the next code point.
931   UChar32 cp;
932   src->parsedToken.charsOffset = src->currentStarredCharIndex;
933   int32_t prev = src->currentStarredCharIndex;
934   U16_NEXT(src->source, src->currentStarredCharIndex, (uint32_t)(src->end - src->source), cp);
935   src->parsedToken.charsLen = src->currentStarredCharIndex - prev;
936 
937   // When we are done parsing the starred string, turn the flag off so that
938   // the normal processing is restored.
939   if (src->currentStarredCharIndex > src->lastStarredCharIndex) {
940     src->isStarred = FALSE;
941   }
942   src->previousCp = cp;
943   return src->current;
944 }
945 
946 /*
947  * Partially parses the next token, keeps the indices in src->parsedToken, and updates the counters.
948  *
949  * This routine parses and separates almost all tokens. The following are the syntax characters recognized.
950  *  # : Comment character
951  *  & : Reset operator
952  *  = : Equality
953  *  < : Primary collation
954  *  << : Secondary collation
955  *  <<< : Tertiary collation
956  *  ; : Secondary collation
957  *  , : Tertiary collation
958  *  / : Expansions
959  *  | : Prefix
960  *  - : Range
961 
962  *  ! : Java Thai modifier, ignored
963  *  @ : French only
964 
965  * [] : Options
966  * '' : Quotes
967  *
968  *  Along with operators =, <, <<, <<<, the operator * is supported to indicate a list.  For example, &a<*bcdexyz
969  *  is equivalent to &a<b<c<d<e<x<y<z.  In lists, ranges also can be given, so &a*b-ex-z is equivalent to the above.
970  *  This function do not separate the tokens in a list.  Instead, &a<*b-ex-z is parsed as three tokens - "&a",
971  *  "<*b", "-ex", "-z".  The strength (< in this case), whether in a list, whether in a range and the previous
972  *  character returned as cached so that the calling program can do further splitting.
973  */
974 static const UChar*
ucol_tok_parseNextTokenInternal(UColTokenParser * src,UBool startOfRules,UParseError * parseError,UErrorCode * status)975 ucol_tok_parseNextTokenInternal(UColTokenParser *src,
976                                 UBool startOfRules,
977                                 UParseError *parseError,
978                                 UErrorCode *status)
979 {
980     UBool variableTop = FALSE;
981     UBool top = FALSE;
982     UBool inChars = TRUE;
983     UBool inQuote = FALSE;
984     UBool wasInQuote = FALSE;
985     uint8_t before = 0;
986     UBool isEscaped = FALSE;
987 
988     // TODO: replace these variables with src->parsedToken counterparts
989     // no need to use them anymore since we have src->parsedToken.
990     // Ideally, token parser would be a nice class... Once, when I have
991     // more time (around 2020 probably).
992     uint32_t newExtensionLen = 0;
993     uint32_t extensionOffset = 0;
994     uint32_t newStrength = UCOL_TOK_UNSET;
995     UChar buff[10];
996 
997     src->parsedToken.charsOffset = 0;  src->parsedToken.charsLen = 0;
998     src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
999     src->parsedToken.indirectIndex = 0;
1000 
1001     while (src->current < src->end) {
1002         UChar ch = *(src->current);
1003 
1004         if (inQuote) {
1005             if (ch == 0x0027/*'\''*/) {
1006                 inQuote = FALSE;
1007             } else {
1008                 if ((src->parsedToken.charsLen == 0) || inChars) {
1009                     if(src->parsedToken.charsLen == 0) {
1010                         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1011                     }
1012                     src->parsedToken.charsLen++;
1013                 } else {
1014                     if(newExtensionLen == 0) {
1015                         extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1016                     }
1017                     newExtensionLen++;
1018                 }
1019             }
1020         }else if(isEscaped){
1021             isEscaped =FALSE;
1022             if (newStrength == UCOL_TOK_UNSET) {
1023                 *status = U_INVALID_FORMAT_ERROR;
1024                 syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1025                 DBG_FORMAT_ERROR
1026                 return NULL;
1027                 // enabling rules to start with non-tokens a < b
1028                 // newStrength = UCOL_TOK_RESET;
1029             }
1030             if(ch != 0x0000  && src->current != src->end) {
1031                 if (inChars) {
1032                     if(src->parsedToken.charsLen == 0) {
1033                         src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1034                     }
1035                     src->parsedToken.charsLen++;
1036                 } else {
1037                     if(newExtensionLen == 0) {
1038                         extensionOffset = (uint32_t)(src->current - src->source);
1039                     }
1040                     newExtensionLen++;
1041                 }
1042             }
1043         }else {
1044             if(!uprv_isRuleWhiteSpace(ch)) {
1045                 /* Sets the strength for this entry */
1046                 switch (ch) {
1047                 case 0x003D/*'='*/ :
1048                     if (newStrength != UCOL_TOK_UNSET) {
1049                         goto EndOfLoop;
1050                     }
1051 
1052                     /* if we start with strength, we'll reset to top */
1053                     if(startOfRules == TRUE) {
1054                         src->parsedToken.indirectIndex = 5;
1055                         top = ucol_tok_doSetTop(src, status);
1056                         newStrength = UCOL_TOK_RESET;
1057                         goto EndOfLoop;
1058                     }
1059                     newStrength = UCOL_IDENTICAL;
1060                     if(*(src->current+1) == 0x002A) {/*'*'*/
1061                         src->current++;
1062                         src->isStarred = TRUE;
1063                     }
1064                     break;
1065 
1066                 case 0x002C/*','*/:
1067                     if (newStrength != UCOL_TOK_UNSET) {
1068                         goto EndOfLoop;
1069                     }
1070 
1071                     /* if we start with strength, we'll reset to top */
1072                     if(startOfRules == TRUE) {
1073                         src->parsedToken.indirectIndex = 5;
1074                         top = ucol_tok_doSetTop(src, status);
1075                         newStrength = UCOL_TOK_RESET;
1076                         goto EndOfLoop;
1077                     }
1078                     newStrength = UCOL_TERTIARY;
1079                     break;
1080 
1081                 case  0x003B/*';'*/:
1082                     if (newStrength != UCOL_TOK_UNSET) {
1083                         goto EndOfLoop;
1084                     }
1085 
1086                     /* if we start with strength, we'll reset to top */
1087                     if(startOfRules == TRUE) {
1088                         src->parsedToken.indirectIndex = 5;
1089                         top = ucol_tok_doSetTop(src, status);
1090                         newStrength = UCOL_TOK_RESET;
1091                         goto EndOfLoop;
1092                     }
1093                     newStrength = UCOL_SECONDARY;
1094                     break;
1095 
1096                 case 0x003C/*'<'*/:
1097                     if (newStrength != UCOL_TOK_UNSET) {
1098                         goto EndOfLoop;
1099                     }
1100 
1101                     /* if we start with strength, we'll reset to top */
1102                     if(startOfRules == TRUE) {
1103                         src->parsedToken.indirectIndex = 5;
1104                         top = ucol_tok_doSetTop(src, status);
1105                         newStrength = UCOL_TOK_RESET;
1106                         goto EndOfLoop;
1107                     }
1108                     /* before this, do a scan to verify whether this is */
1109                     /* another strength */
1110                     if(*(src->current+1) == 0x003C) {
1111                         src->current++;
1112                         if(*(src->current+1) == 0x003C) {
1113                             src->current++; /* three in a row! */
1114                             newStrength = UCOL_TERTIARY;
1115                         } else { /* two in a row */
1116                             newStrength = UCOL_SECONDARY;
1117                         }
1118                     } else { /* just one */
1119                         newStrength = UCOL_PRIMARY;
1120                     }
1121                     if(*(src->current+1) == 0x002A) {/*'*'*/
1122                         src->current++;
1123                         src->isStarred = TRUE;
1124                     }
1125                     break;
1126 
1127                 case 0x0026/*'&'*/:
1128                     if (newStrength != UCOL_TOK_UNSET) {
1129                         /**/
1130                         goto EndOfLoop;
1131                     }
1132 
1133                     newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
1134                     break;
1135 
1136                 case 0x005b/*'['*/:
1137                     /* options - read an option, analyze it */
1138                     if(u_strchr(src->current, 0x005d /*']'*/) != NULL) {
1139                         uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
1140                         if(U_SUCCESS(*status)) {
1141                             if(result & UCOL_TOK_TOP) {
1142                                 if(newStrength == UCOL_TOK_RESET) {
1143                                     top = ucol_tok_doSetTop(src, status);
1144                                     if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
1145                                         src->parsedToken.charsLen+=2;
1146                                         buff[0] = 0x002d;
1147                                         buff[1] = before;
1148                                         ucol_tok_addToExtraCurrent(src, buff, 2, status);
1149                                     }
1150 
1151                                     src->current++;
1152                                     goto EndOfLoop;
1153                                 } else {
1154                                     *status = U_INVALID_FORMAT_ERROR;
1155                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1156                                     DBG_FORMAT_ERROR
1157                                 }
1158                             } else if(result & UCOL_TOK_VARIABLE_TOP) {
1159                                 if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
1160                                     variableTop = TRUE;
1161                                     src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1162                                     src->parsedToken.charsLen = 1;
1163                                     buff[0] = 0xFFFF;
1164                                     ucol_tok_addToExtraCurrent(src, buff, 1, status);
1165                                     src->current++;
1166                                     goto EndOfLoop;
1167                                 } else {
1168                                     *status = U_INVALID_FORMAT_ERROR;
1169                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1170                                     DBG_FORMAT_ERROR
1171                                 }
1172                             } else if (result & UCOL_TOK_BEFORE){
1173                                 if(newStrength == UCOL_TOK_RESET) {
1174                                     before = result & UCOL_TOK_BEFORE;
1175                                 } else {
1176                                     *status = U_INVALID_FORMAT_ERROR;
1177                                     syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1178                                     DBG_FORMAT_ERROR
1179                                 }
1180                             }
1181                         } else {
1182                             *status = U_INVALID_FORMAT_ERROR;
1183                             syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1184                             DBG_FORMAT_ERROR
1185                             return NULL;
1186                         }
1187                     }
1188                     break;
1189                 case 0x0021/*! skip java thai modifier reordering*/:
1190                     break;
1191                 case 0x002F/*'/'*/:
1192                     wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
1193                     inChars = FALSE; /* we're now processing expansion */
1194                     break;
1195                 case 0x005C /* back slash for escaped chars */:
1196                     isEscaped = TRUE;
1197                     break;
1198                     /* found a quote, we're gonna start copying */
1199                 case 0x0027/*'\''*/:
1200                     if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
1201                       *status = U_INVALID_FORMAT_ERROR;
1202                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1203                       DBG_FORMAT_ERROR
1204                       return NULL;
1205                       // enabling rules to start with a non-token character a < b
1206                       // newStrength = UCOL_TOK_RESET;
1207                     }
1208 
1209                     inQuote = TRUE;
1210 
1211                     if(inChars) { /* we're doing characters */
1212                         if(wasInQuote == FALSE) {
1213                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1214                         }
1215                         if (src->parsedToken.charsLen != 0) {
1216                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1217                         }
1218                         src->parsedToken.charsLen++;
1219                     } else { /* we're doing an expansion */
1220                         if(wasInQuote == FALSE) {
1221                             extensionOffset = (uint32_t)(src->extraCurrent - src->source);
1222                         }
1223                         if (newExtensionLen != 0) {
1224                             ucol_tok_addToExtraCurrent(src, src->current - newExtensionLen, newExtensionLen, status);
1225                         }
1226                         newExtensionLen++;
1227                     }
1228 
1229                     wasInQuote = TRUE;
1230 
1231                     ch = *(++(src->current));
1232                     if(ch == 0x0027) { /* copy the double quote */
1233                         ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1234                         inQuote = FALSE;
1235                     }
1236                     break;
1237 
1238                     /* '@' is french only if the strength is not currently set */
1239                     /* if it is, it's just a regular character in collation rules */
1240                 case 0x0040/*'@'*/:
1241                     if (newStrength == UCOL_TOK_UNSET) {
1242                         src->opts->frenchCollation = UCOL_ON;
1243                         break;
1244                     }
1245 
1246                 case 0x007C /*|*/: /* this means we have actually been reading prefix part */
1247                     // we want to store read characters to the prefix part and continue reading
1248                     // the characters (proper way would be to restart reading the chars, but in
1249                     // that case we would have to complicate the token hasher, which I do not
1250                     // intend to play with. Instead, we will do prefixes when prefixes are due
1251                     // (before adding the elements).
1252                     src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
1253                     src->parsedToken.prefixLen = src->parsedToken.charsLen;
1254 
1255                     if(inChars) { /* we're doing characters */
1256                         if(wasInQuote == FALSE) {
1257                             src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1258                         }
1259                         if (src->parsedToken.charsLen != 0) {
1260                             ucol_tok_addToExtraCurrent(src, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen, status);
1261                         }
1262                         src->parsedToken.charsLen++;
1263                     }
1264 
1265                     wasInQuote = TRUE;
1266 
1267                     do {
1268                         ch = *(++(src->current));
1269                         // skip whitespace between '|' and the character
1270                     } while (uprv_isRuleWhiteSpace(ch));
1271                     break;
1272 
1273                     //charsOffset = 0;
1274                     //newCharsLen = 0;
1275                     //break; // We want to store the whole prefix/character sequence. If we break
1276                     // the '|' is going to get lost.
1277 
1278                 case 0x002D /*-*/: /* A range. */
1279                     if (newStrength != UCOL_TOK_UNSET) {
1280                       // While processing the pending token, the isStarred field
1281                       // is reset, so it needs to be saved for the next
1282                       // invocation.
1283                       src->savedIsStarred = src->isStarred;
1284                       goto EndOfLoop;
1285                    }
1286                    src->isStarred = src->savedIsStarred;
1287 
1288                    // Ranges are valid only in starred tokens.
1289                    if (!src->isStarred) {
1290                      *status = U_INVALID_FORMAT_ERROR;
1291                      syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1292                      DBG_FORMAT_ERROR
1293                      return NULL;
1294                    }
1295                    newStrength = src->parsedToken.strength;
1296                    src->inRange = TRUE;
1297                    break;
1298 
1299                 case 0x0023 /*#*/: /* this is a comment, skip everything through the end of line */
1300                     do {
1301                         ch = *(++(src->current));
1302                     } while (!isCharNewLine(ch));
1303 
1304                     break;
1305                 default:
1306                     if (newStrength == UCOL_TOK_UNSET) {
1307                       *status = U_INVALID_FORMAT_ERROR;
1308                       syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1309                       DBG_FORMAT_ERROR
1310                       return NULL;
1311                     }
1312 
1313                     if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
1314                         *status = U_INVALID_FORMAT_ERROR;
1315                         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1316                         DBG_FORMAT_ERROR
1317                         return NULL;
1318                     }
1319 
1320                     if(ch == 0x0000 && src->current+1 == src->end) {
1321                         break;
1322                     }
1323 
1324                     if (inChars) {
1325                         if(src->parsedToken.charsLen == 0) {
1326                             src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
1327                         }
1328                         src->parsedToken.charsLen++;
1329                     } else {
1330                         if(newExtensionLen == 0) {
1331                             extensionOffset = (uint32_t)(src->current - src->source);
1332                         }
1333                         newExtensionLen++;
1334                     }
1335 
1336                     break;
1337                 }
1338             }
1339         }
1340 
1341         if(wasInQuote) {
1342             if(ch != 0x27) {
1343                 if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
1344                     ucol_tok_addToExtraCurrent(src, &ch, 1, status);
1345                 }
1346             }
1347         }
1348 
1349         src->current++;
1350     }
1351 
1352 EndOfLoop:
1353     wasInQuote = FALSE;
1354     if (newStrength == UCOL_TOK_UNSET) {
1355         return NULL;
1356     }
1357 
1358     if (src->parsedToken.charsLen == 0 && top == FALSE) {
1359         syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
1360         *status = U_INVALID_FORMAT_ERROR;
1361         DBG_FORMAT_ERROR
1362         return NULL;
1363     }
1364 
1365     src->parsedToken.strength = newStrength;
1366     src->parsedToken.extensionOffset = extensionOffset;
1367     src->parsedToken.extensionLen = newExtensionLen;
1368     src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
1369 
1370     return src->current;
1371 }
1372 
1373 /*
1374  * Parses the next token, keeps the indices in src->parsedToken, and updates the counters.
1375  * @see ucol_tok_parseNextTokenInternal() for the description of what operators are supported.
1376  *
1377  * In addition to what ucol_tok_parseNextTokenInternal() does, this function does the following:
1378  *  1) ucol_tok_parseNextTokenInternal() returns a range as a single token.  This function separates
1379  *     it to separate tokens and returns one by one.  In order to do that, the necessary states are
1380  *     cached as member variables of the token parser.
1381  *  2) When encountering a range, ucol_tok_parseNextTokenInternal() processes characters up to the
1382  *     starting character as a single list token (which is separated into individual characters here)
1383  *     and as another list token starting with the last character in the range.  Before expanding it
1384  *     as a list of tokens, this function expands the range by filling the intermediate characters and
1385  *     returns them one by one as separate tokens.
1386  * Necessary checks are done for invalid combinations.
1387  */
1388 U_CAPI const UChar* U_EXPORT2
ucol_tok_parseNextToken(UColTokenParser * src,UBool startOfRules,UParseError * parseError,UErrorCode * status)1389 ucol_tok_parseNextToken(UColTokenParser *src,
1390                         UBool startOfRules,
1391                         UParseError *parseError,
1392                         UErrorCode *status)
1393 {
1394   const UChar *nextToken;
1395 
1396   if (src->inRange) {
1397     // We are not done processing a range.  Continue it.
1398     return ucol_tok_processNextCodePointInRange(src, status);
1399   } else if (src->isStarred) {
1400     // We are not done processing a starred token.  Continue it.
1401     return ucol_tok_processNextTokenInStarredList(src);
1402   }
1403 
1404   // Get the next token.
1405   nextToken = ucol_tok_parseNextTokenInternal(src, startOfRules, parseError, status);
1406 
1407   if (nextToken == NULL) {
1408     return NULL;
1409   }
1410 
1411   if (src->inRange) {
1412     // A new range has started.
1413     // Check whether it is a chain of ranges with more than one hyphen.
1414     if (src->lastRangeCp > 0 && src->lastRangeCp == src->previousCp) {
1415         *status = U_INVALID_FORMAT_ERROR;
1416         syntaxError(src->source,src->parsedToken.charsOffset-1,
1417                     src->parsedToken.charsOffset+src->parsedToken.charsLen, parseError);
1418         DBG_FORMAT_ERROR
1419         return NULL;
1420     }
1421 
1422     // The current token indicates the second code point of the range.
1423     // Process just that, and then proceed with the star.
1424     src->currentStarredCharIndex = src->parsedToken.charsOffset;
1425     U16_NEXT(src->source, src->currentStarredCharIndex,
1426              (uint32_t)(src->end - src->source), src->lastRangeCp);
1427     if (src->lastRangeCp <= src->previousCp) {
1428         *status = U_INVALID_FORMAT_ERROR;
1429         syntaxError(src->source,src->parsedToken.charsOffset-1,
1430                     src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1431         DBG_FORMAT_ERROR
1432         return NULL;
1433     }
1434 
1435     // Set current range code point to process the range loop
1436     src->currentRangeCp = src->previousCp + 1;
1437 
1438     src->lastStarredCharIndex = src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1439 
1440     return ucol_tok_processNextCodePointInRange(src, status);
1441  } else if (src->isStarred) {
1442     // We define two indices m_currentStarredCharIndex_ and m_lastStarredCharIndex_ so that
1443     // [m_currentStarredCharIndex_ .. m_lastStarredCharIndex_], both inclusive, need to be
1444     // separated into several tokens and returned.
1445     src->currentStarredCharIndex = src->parsedToken.charsOffset;
1446     src->lastStarredCharIndex =  src->parsedToken.charsOffset + src->parsedToken.charsLen - 1;
1447 
1448     return ucol_tok_processNextTokenInStarredList(src);
1449   } else {
1450     // Set previous codepoint
1451     U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->end - src->source), src->previousCp);
1452   }
1453   return nextToken;
1454 }
1455 
1456 
1457 /*
1458 Processing Description
1459 1 Build a ListList. Each list has a header, which contains two lists (positive
1460 and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
1461 reset may be null.
1462 2 As you process, you keep a LAST pointer that points to the last token you
1463 handled.
1464 
1465 */
1466 
ucol_tok_initAReset(UColTokenParser * src,const UChar * expand,uint32_t * expandNext,UParseError * parseError,UErrorCode * status)1467 static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
1468                                       UParseError *parseError, UErrorCode *status)
1469 {
1470     if(src->resultLen == src->listCapacity) {
1471         // Unfortunately, this won't work, as we store addresses of lhs in token
1472         src->listCapacity *= 2;
1473         src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
1474         if(src->lh == NULL) {
1475             *status = U_MEMORY_ALLOCATION_ERROR;
1476             return NULL;
1477         }
1478     }
1479     /* do the reset thing */
1480     UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1481     /* test for NULL */
1482     if (sourceToken == NULL) {
1483         *status = U_MEMORY_ALLOCATION_ERROR;
1484         return NULL;
1485     }
1486     sourceToken->rulesToParseHdl = &(src->source);
1487     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1488     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1489 
1490     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1491     sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1492 
1493     // keep the flags around so that we know about before
1494     sourceToken->flags = src->parsedToken.flags;
1495 
1496     if(src->parsedToken.prefixOffset != 0) {
1497         // this is a syntax error
1498         *status = U_INVALID_FORMAT_ERROR;
1499         syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
1500         DBG_FORMAT_ERROR
1501         uprv_free(sourceToken);
1502         return 0;
1503     } else {
1504         sourceToken->prefix = 0;
1505     }
1506 
1507     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1508     sourceToken->strength = UCOL_TOK_RESET;
1509     sourceToken->next = NULL;
1510     sourceToken->previous = NULL;
1511     sourceToken->noOfCEs = 0;
1512     sourceToken->noOfExpCEs = 0;
1513     sourceToken->listHeader = &src->lh[src->resultLen];
1514 
1515     src->lh[src->resultLen].first = NULL;
1516     src->lh[src->resultLen].last = NULL;
1517     src->lh[src->resultLen].first = NULL;
1518     src->lh[src->resultLen].last = NULL;
1519 
1520     src->lh[src->resultLen].reset = sourceToken;
1521 
1522     /*
1523     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
1524     First convert all expansions into normal form. Examples:
1525     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
1526     d * ... into &x * c/y * d * ...
1527     Note: reset values can never have expansions, although they can cause the
1528     very next item to have one. They may be contractions, if they are found
1529     earlier in the list.
1530     */
1531     *expandNext = 0;
1532     if(expand != NULL) {
1533         /* check to see if there is an expansion */
1534         if(src->parsedToken.charsLen > 1) {
1535             uint32_t resetCharsOffset;
1536             resetCharsOffset = (uint32_t)(expand - src->source);
1537             sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
1538             *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
1539         }
1540     }
1541 
1542     src->resultLen++;
1543 
1544     uhash_put(src->tailored, sourceToken, sourceToken, status);
1545 
1546     return sourceToken;
1547 }
1548 
1549 static
getVirginBefore(UColTokenParser * src,UColToken * sourceToken,uint8_t strength,UParseError * parseError,UErrorCode * status)1550 inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
1551     if(U_FAILURE(*status)) {
1552         return NULL;
1553     }
1554     /* this is a virgin before - we need to fish the anchor from the UCA */
1555     collIterate s;
1556     uint32_t baseCE = UCOL_NOT_FOUND, baseContCE = UCOL_NOT_FOUND;
1557     uint32_t CE, SecondCE;
1558     uint32_t invPos;
1559     if(sourceToken != NULL) {
1560         uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
1561     } else {
1562         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
1563     }
1564     if(U_FAILURE(*status)) {
1565         return NULL;
1566     }
1567 
1568     baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
1569     baseContCE = ucol_getNextCE(src->UCA, &s, status);
1570     if(baseContCE == UCOL_NO_MORE_CES) {
1571         baseContCE = 0;
1572     }
1573 
1574 
1575     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
1576     uint32_t ch = 0;
1577     uint32_t expandNext = 0;
1578     UColToken key;
1579 
1580     if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
1581         uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
1582         uint32_t raw = uprv_uca_getRawFromImplicit(primary);
1583         ch = uprv_uca_getCodePointFromRaw(raw-1);
1584         uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
1585         CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
1586         SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
1587 
1588         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
1589         *src->extraCurrent++ = 0xFFFE;
1590         *src->extraCurrent++ = (UChar)ch;
1591         src->parsedToken.charsLen++;
1592 
1593         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1594         key.rulesToParseHdl = &(src->source);
1595 
1596         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1597         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1598 
1599         if(sourceToken == NULL) {
1600             src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1601             if(isContinuation(SecondCE)) {
1602                 src->lh[src->resultLen].baseContCE = SecondCE;
1603             } else {
1604                 src->lh[src->resultLen].baseContCE = 0;
1605             }
1606             src->lh[src->resultLen].nextCE = 0;
1607             src->lh[src->resultLen].nextContCE = 0;
1608             src->lh[src->resultLen].previousCE = 0;
1609             src->lh[src->resultLen].previousContCE = 0;
1610 
1611             src->lh[src->resultLen].indirect = FALSE;
1612 
1613             sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1614         }
1615 
1616     } else {
1617         invPos = ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
1618 
1619         // we got the previous CE. Now we need to see if the difference between
1620         // the two CEs is really of the requested strength.
1621         // if it's a bigger difference (we asked for secondary and got primary), we
1622         // need to modify the CE.
1623         if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
1624             // adjust the strength
1625             // now we are in the situation where our baseCE should actually be modified in
1626             // order to get the CE in the right position.
1627             if(strength == UCOL_SECONDARY) {
1628                 CE = baseCE - 0x0200;
1629             } else { // strength == UCOL_TERTIARY
1630                 CE = baseCE - 0x02;
1631             }
1632             if(baseContCE) {
1633                 if(strength == UCOL_SECONDARY) {
1634                     SecondCE = baseContCE - 0x0200;
1635                 } else { // strength == UCOL_TERTIARY
1636                     SecondCE = baseContCE - 0x02;
1637                 }
1638             }
1639         }
1640 
1641 #if 0
1642         // the code below relies on getting a code point from the inverse table, in order to be
1643         // able to merge the situations like &x < 9 &[before 1]a < d. This won't work:
1644         // 1. There are many code points that have the same CE
1645         // 2. The CE to codepoint table (things pointed to by CETable[3*invPos+2] are broken.
1646         // Also, in case when there is no equivalent strength before an element, we have to actually
1647         // construct one. For example, &[before 2]a << x won't result in x << a, because the element
1648         // before a is a primary difference.
1649 
1650         //uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
1651 
1652 
1653         ch = CETable[3*invPos+2];
1654 
1655         if((ch &  UCOL_INV_SIZEMASK) != 0) {
1656             uint16_t *conts = (uint16_t *)((uint8_t *)src->invUCA+src->invUCA->conts);
1657             uint32_t offset = (ch & UCOL_INV_OFFSETMASK);
1658             ch = conts[offset];
1659         }
1660 
1661         *src->extraCurrent++ = (UChar)ch;
1662         src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source - 1);
1663         src->parsedToken.charsLen = 1;
1664 
1665         // We got an UCA before. However, this might have been tailored.
1666         // example:
1667         // &\u30ca = \u306a
1668         // &[before 3]\u306a<<<\u306a|\u309d
1669 
1670 
1671         // uint32_t key = (*newCharsLen << 24) | *charsOffset;
1672         key.source = (src->parsedToken.charsLen/**newCharsLen*/ << 24) | src->parsedToken.charsOffset/**charsOffset*/;
1673         key.rulesToParseHdl = &(src->source);
1674 
1675         //sourceToken = (UColToken *)uhash_iget(src->tailored, (int32_t)key);
1676         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1677 #endif
1678 
1679         // here is how it should be. The situation such as &[before 1]a < x, should be
1680         // resolved exactly as if we wrote &a > x.
1681         // therefore, I don't really care if the UCA value before a has been changed.
1682         // However, I do care if the strength between my element and the previous element
1683         // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
1684         // have to construct the base CE.
1685 
1686 
1687 
1688         // if we found a tailored thing, we have to use the UCA value and construct
1689         // a new reset token with constructed name
1690         //if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1691         // character to which we want to anchor is already tailored.
1692         // We need to construct a new token which will be the anchor
1693         // point
1694         //*(src->extraCurrent-1) = 0xFFFE;
1695         //*src->extraCurrent++ = (UChar)ch;
1696         // grab before
1697         src->parsedToken.charsOffset -= 10;
1698         src->parsedToken.charsLen += 10;
1699         src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
1700         if(isContinuation(SecondCE)) {
1701             src->lh[src->resultLen].baseContCE = SecondCE;
1702         } else {
1703             src->lh[src->resultLen].baseContCE = 0;
1704         }
1705         src->lh[src->resultLen].nextCE = 0;
1706         src->lh[src->resultLen].nextContCE = 0;
1707         src->lh[src->resultLen].previousCE = 0;
1708         src->lh[src->resultLen].previousContCE = 0;
1709 
1710         src->lh[src->resultLen].indirect = FALSE;
1711 
1712         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
1713         //}
1714     }
1715 
1716     return sourceToken;
1717 
1718 }
1719 
ucol_tok_assembleTokenList(UColTokenParser * src,UParseError * parseError,UErrorCode * status)1720 uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
1721     UColToken *lastToken = NULL;
1722     const UChar *parseEnd = NULL;
1723     uint32_t expandNext = 0;
1724     UBool variableTop = FALSE;
1725     UBool top = FALSE;
1726     uint16_t specs = 0;
1727     UColTokListHeader *ListList = NULL;
1728 
1729     src->parsedToken.strength = UCOL_TOK_UNSET;
1730 
1731     ListList = src->lh;
1732 
1733     if(U_FAILURE(*status)) {
1734         return 0;
1735     }
1736 #ifdef DEBUG_FOR_CODE_POINTS
1737     char filename[35];
1738     sprintf(filename, "/tmp/debug_for_cp_%09d.txt", getpid());
1739     dfcp_fp = fopen(filename, "a");
1740     fprintf(stdout, "Output is in the file %s.\n", filename);
1741 #endif
1742 
1743 #ifdef DEBUG_FOR_COLL_RULES
1744     std::string s3;
1745     UnicodeString(src->source).toUTF8String(s3);
1746     std::cout << "src->source = " << s3 << std::endl;
1747 #endif
1748 
1749     while(src->current < src->end || src->isStarred) {
1750         src->parsedToken.prefixOffset = 0;
1751 
1752         parseEnd = ucol_tok_parseNextToken(src,
1753             (UBool)(lastToken == NULL),
1754             parseError,
1755             status);
1756 
1757         specs = src->parsedToken.flags;
1758 
1759 
1760         variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
1761         top = ((specs & UCOL_TOK_TOP) != 0);
1762 
1763         if(U_SUCCESS(*status) && parseEnd != NULL) {
1764             UColToken *sourceToken = NULL;
1765             //uint32_t key = 0;
1766             uint32_t lastStrength = UCOL_TOK_UNSET;
1767 
1768             if(lastToken != NULL ) {
1769                 lastStrength = lastToken->strength;
1770             }
1771 
1772 #ifdef DEBUG_FOR_CODE_POINTS
1773             UChar32 cp;
1774             U16_GET(src->source, 0, src->parsedToken.charsOffset, (uint32_t)(src->extraEnd - src->source), cp);
1775             fprintf(dfcp_fp, "Code point = %x, Strength = %x\n", cp, src->parsedToken.strength);
1776 #endif
1777             //key = newCharsLen << 24 | charsOffset;
1778             UColToken key;
1779             key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1780             key.rulesToParseHdl = &(src->source);
1781 
1782             /*  4 Lookup each source in the CharsToToken map, and find a sourceToken */
1783             sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1784 
1785             if(src->parsedToken.strength != UCOL_TOK_RESET) {
1786                 if(lastToken == NULL) { /* this means that rules haven't started properly */
1787                     *status = U_INVALID_FORMAT_ERROR;
1788                     syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1789                     DBG_FORMAT_ERROR
1790                     return 0;
1791                 }
1792                 /*  6 Otherwise (when relation != reset) */
1793                 if(sourceToken == NULL) {
1794                     /* If sourceToken is null, create new one, */
1795                     sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
1796                     /* test for NULL */
1797                     if (sourceToken == NULL) {
1798                         *status = U_MEMORY_ALLOCATION_ERROR;
1799                         return 0;
1800                     }
1801                     sourceToken->rulesToParseHdl = &(src->source);
1802                     sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
1803 
1804                     sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
1805 
1806                     sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
1807                     sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);
1808 
1809                     sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
1810                     sourceToken->next = NULL;
1811                     sourceToken->previous = NULL;
1812                     sourceToken->noOfCEs = 0;
1813                     sourceToken->noOfExpCEs = 0;
1814                     // keep the flags around so that we know about before
1815                     sourceToken->flags = src->parsedToken.flags;
1816                     uhash_put(src->tailored, sourceToken, sourceToken, status);
1817                     if(U_FAILURE(*status)) {
1818                         return 0;
1819                     }
1820                 } else {
1821                     /* we could have fished out a reset here */
1822                     if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
1823                         /* otherwise remove sourceToken from where it was. */
1824                         if(sourceToken->next != NULL) {
1825                             if(sourceToken->next->strength > sourceToken->strength) {
1826                                 sourceToken->next->strength = sourceToken->strength;
1827                             }
1828                             sourceToken->next->previous = sourceToken->previous;
1829                         } else {
1830                             sourceToken->listHeader->last = sourceToken->previous;
1831                         }
1832 
1833                         if(sourceToken->previous != NULL) {
1834                             sourceToken->previous->next = sourceToken->next;
1835                         } else {
1836                             sourceToken->listHeader->first = sourceToken->next;
1837                         }
1838                         sourceToken->next = NULL;
1839                         sourceToken->previous = NULL;
1840                     }
1841                 }
1842 
1843                 sourceToken->strength = src->parsedToken.strength;
1844                 sourceToken->listHeader = lastToken->listHeader;
1845 
1846                 /*
1847                 1.  Find the strongest strength in each list, and set strongestP and strongestN
1848                 accordingly in the headers.
1849                 */
1850                 if(lastStrength == UCOL_TOK_RESET
1851                     || sourceToken->listHeader->first == 0) {
1852                         /* If LAST is a reset
1853                         insert sourceToken in the list. */
1854                         if(sourceToken->listHeader->first == 0) {
1855                             sourceToken->listHeader->first = sourceToken;
1856                             sourceToken->listHeader->last = sourceToken;
1857                         } else { /* we need to find a place for us */
1858                             /* and we'll get in front of the same strength */
1859                             if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
1860                                 sourceToken->next = sourceToken->listHeader->first;
1861                                 sourceToken->next->previous = sourceToken;
1862                                 sourceToken->listHeader->first = sourceToken;
1863                                 sourceToken->previous = NULL;
1864                             } else {
1865                                 lastToken = sourceToken->listHeader->first;
1866                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1867                                     lastToken = lastToken->next;
1868                                 }
1869                                 if(lastToken->next != NULL) {
1870                                     lastToken->next->previous = sourceToken;
1871                                 } else {
1872                                     sourceToken->listHeader->last = sourceToken;
1873                                 }
1874                                 sourceToken->previous = lastToken;
1875                                 sourceToken->next = lastToken->next;
1876                                 lastToken->next = sourceToken;
1877                             }
1878                         }
1879                     } else {
1880                         /* Otherwise (when LAST is not a reset)
1881                         if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
1882                         otherwise insert before.
1883                         when inserting after or before, search to the next position with the same
1884                         strength in that direction. (This is called postpone insertion).         */
1885                         if(sourceToken != lastToken) {
1886                             if(lastToken->polarity == sourceToken->polarity) {
1887                                 while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
1888                                     lastToken = lastToken->next;
1889                                 }
1890                                 sourceToken->previous = lastToken;
1891                                 if(lastToken->next != NULL) {
1892                                     lastToken->next->previous = sourceToken;
1893                                 } else {
1894                                     sourceToken->listHeader->last = sourceToken;
1895                                 }
1896 
1897                                 sourceToken->next = lastToken->next;
1898                                 lastToken->next = sourceToken;
1899                             } else {
1900                                 while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
1901                                     lastToken = lastToken->previous;
1902                                 }
1903                                 sourceToken->next = lastToken;
1904                                 if(lastToken->previous != NULL) {
1905                                     lastToken->previous->next = sourceToken;
1906                                 } else {
1907                                     sourceToken->listHeader->first = sourceToken;
1908                                 }
1909                                 sourceToken->previous = lastToken->previous;
1910                                 lastToken->previous = sourceToken;
1911                             }
1912                         } else { /* repeated one thing twice in rules, stay with the stronger strength */
1913                             if(lastStrength < sourceToken->strength) {
1914                                 sourceToken->strength = lastStrength;
1915                             }
1916                         }
1917                     }
1918 
1919                     /* if the token was a variable top, we're gonna put it in */
1920                     if(variableTop == TRUE && src->varTop == NULL) {
1921                         variableTop = FALSE;
1922                         src->varTop = sourceToken;
1923                     }
1924 
1925                     // Treat the expansions.
1926                     // There are two types of expansions: explicit (x / y) and reset based propagating expansions
1927                     // (&abc * d * e <=> &ab * d / c * e / c)
1928                     // if both of them are in effect for a token, they are combined.
1929 
1930                     sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;
1931 
1932                     if(expandNext != 0) {
1933                         if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
1934                             expandNext = 0;
1935                         } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
1936                             sourceToken->expansion = expandNext;
1937                         } else { /* there is both explicit and implicit expansion. We need to make a combination */
1938                             uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
1939                             uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
1940                             sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (uint32_t)(src->extraCurrent - src->source));
1941                             src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
1942                         }
1943                     }
1944 
1945                     // This is just for debugging purposes
1946                     if(sourceToken->expansion != 0) {
1947                         sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
1948                     } else {
1949                         sourceToken->debugExpansion = 0;
1950                     }
1951                     // if the previous token was a reset before, the strength of this
1952                     // token must match the strength of before. Otherwise we have an
1953                     // undefined situation.
1954                     // In other words, we currently have a cludge which we use to
1955                     // represent &a >> x. This is written as &[before 2]a << x.
1956                     if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
1957                         uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
1958                         if(beforeStrength != sourceToken->strength) {
1959                             *status = U_INVALID_FORMAT_ERROR;
1960                             syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
1961                             DBG_FORMAT_ERROR
1962                             return 0;
1963                         }
1964                     }
1965             } else {
1966                 if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
1967                     /* if the previous token was also a reset, */
1968                     /*this means that we have two consecutive resets */
1969                     /* and we want to remove the previous one if empty*/
1970                     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
1971                         src->resultLen--;
1972                     }
1973                 }
1974 
1975                 if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
1976                     uint32_t searchCharsLen = src->parsedToken.charsLen;
1977                     while(searchCharsLen > 1 && sourceToken == NULL) {
1978                         searchCharsLen--;
1979                         //key = searchCharsLen << 24 | charsOffset;
1980                         UColToken key;
1981                         key.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
1982                         key.rulesToParseHdl = &(src->source);
1983                         sourceToken = (UColToken *)uhash_get(src->tailored, &key);
1984                     }
1985                     if(sourceToken != NULL) {
1986                         expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
1987                     }
1988                 }
1989 
1990                 if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
1991                     if(top == FALSE) { /* there is no indirection */
1992                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
1993                         if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
1994                             /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
1995                             while(sourceToken->strength > strength && sourceToken->previous != NULL) {
1996                                 sourceToken = sourceToken->previous;
1997                             }
1998                             /* here, either we hit the strength or NULL */
1999                             if(sourceToken->strength == strength) {
2000                                 if(sourceToken->previous != NULL) {
2001                                     sourceToken = sourceToken->previous;
2002                                 } else { /* start of list */
2003                                     sourceToken = sourceToken->listHeader->reset;
2004                                 }
2005                             } else { /* we hit NULL */
2006                                 /* we should be doing the else part */
2007                                 sourceToken = sourceToken->listHeader->reset;
2008                                 sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2009                             }
2010                         } else {
2011                             sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
2012                         }
2013                     } else { /* this is both before and indirection */
2014                         top = FALSE;
2015                         ListList[src->resultLen].previousCE = 0;
2016                         ListList[src->resultLen].previousContCE = 0;
2017                         ListList[src->resultLen].indirect = TRUE;
2018                         /* we need to do slightly more work. we need to get the baseCE using the */
2019                         /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
2020                         /* in ucol_bld */
2021                         uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
2022                         uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2023                         uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
2024                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2025 
2026                         UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2027                         if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) &&
2028                            (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
2029                             uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
2030                             uint32_t raw = uprv_uca_getRawFromImplicit(primary);
2031                             uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
2032                             CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
2033                             SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;
2034                         } else {
2035                             /*int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);*/
2036                             ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
2037                         }
2038 
2039                         ListList[src->resultLen].baseCE = CE;
2040                         ListList[src->resultLen].baseContCE = SecondCE;
2041                         ListList[src->resultLen].nextCE = 0;
2042                         ListList[src->resultLen].nextContCE = 0;
2043 
2044                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2045                     }
2046                 }
2047 
2048 
2049                 /*  5 If the relation is a reset:
2050                 If sourceToken is null
2051                 Create new list, create new sourceToken, make the baseCE from source, put
2052                 the sourceToken in ListHeader of the new list */
2053                 if(sourceToken == NULL) {
2054                     /*
2055                     3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
2056                     First convert all expansions into normal form. Examples:
2057                     If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
2058                     d * ... into &x * c/y * d * ...
2059                     Note: reset values can never have expansions, although they can cause the
2060                     very next item to have one. They may be contractions, if they are found
2061                     earlier in the list.
2062                     */
2063                     if(top == FALSE) {
2064                         collIterate s;
2065                         uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
2066 
2067                         uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
2068 
2069                         CE = ucol_getNextCE(src->UCA, &s, status);
2070                         const UChar *expand = s.pos;
2071                         SecondCE = ucol_getNextCE(src->UCA, &s, status);
2072 
2073                         ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
2074                         if(isContinuation(SecondCE)) {
2075                             ListList[src->resultLen].baseContCE = SecondCE;
2076                         } else {
2077                             ListList[src->resultLen].baseContCE = 0;
2078                         }
2079                         ListList[src->resultLen].nextCE = 0;
2080                         ListList[src->resultLen].nextContCE = 0;
2081                         ListList[src->resultLen].previousCE = 0;
2082                         ListList[src->resultLen].previousContCE = 0;
2083                         ListList[src->resultLen].indirect = FALSE;
2084                         sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
2085                     } else { /* top == TRUE */
2086                         /* just use the supplied values */
2087                         top = FALSE;
2088                         ListList[src->resultLen].previousCE = 0;
2089                         ListList[src->resultLen].previousContCE = 0;
2090                         ListList[src->resultLen].indirect = TRUE;
2091                         ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
2092                         ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
2093                         ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
2094                         ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
2095 
2096                         sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
2097 
2098                     }
2099                 } else { /* reset to something already in rules */
2100                     top = FALSE;
2101                 }
2102             }
2103             /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
2104             lastToken = sourceToken;
2105         } else {
2106             if(U_FAILURE(*status)) {
2107                 return 0;
2108             }
2109         }
2110     }
2111 #ifdef DEBUG_FOR_CODE_POINTS
2112     fclose(dfcp_fp);
2113 #endif
2114 
2115 
2116     if(src->resultLen > 0 && ListList[src->resultLen-1].first == NULL) {
2117         src->resultLen--;
2118     }
2119     return src->resultLen;
2120 }
2121 
ucol_tok_getRulesFromBundle(void *,const char * locale,const char * type,int32_t * pLength,UErrorCode * status)2122 const UChar* ucol_tok_getRulesFromBundle(
2123     void* /*context*/,
2124     const char* locale,
2125     const char* type,
2126     int32_t* pLength,
2127     UErrorCode* status)
2128 {
2129     const UChar* rules = NULL;
2130     UResourceBundle* bundle;
2131     UResourceBundle* collations;
2132     UResourceBundle* collation;
2133 
2134     *pLength = 0;
2135 
2136     bundle = ures_open(U_ICUDATA_COLL, locale, status);
2137     if(U_SUCCESS(*status)){
2138         collations = ures_getByKey(bundle, "collations", NULL, status);
2139         if(U_SUCCESS(*status)){
2140             collation = ures_getByKey(collations, type, NULL, status);
2141             if(U_SUCCESS(*status)){
2142                 rules = ures_getStringByKey(collation, "Sequence", pLength, status);
2143                 if(U_FAILURE(*status)){
2144                     *pLength = 0;
2145                     rules = NULL;
2146                 }
2147                 ures_close(collation);
2148             }
2149             ures_close(collations);
2150         }
2151     }
2152 
2153     ures_close(bundle);
2154 
2155     return rules;
2156 }
2157 
ucol_tok_initTokenList(UColTokenParser * src,const UChar * rules,uint32_t rulesLength,const UCollator * UCA,GetCollationRulesFunction importFunc,void * context,UErrorCode * status)2158 void ucol_tok_initTokenList(
2159     UColTokenParser *src,
2160     const UChar *rules,
2161     uint32_t rulesLength,
2162     const UCollator *UCA,
2163     GetCollationRulesFunction importFunc,
2164     void* context,
2165     UErrorCode *status) {
2166     U_NAMESPACE_USE
2167 
2168     uint32_t nSize = 0;
2169     uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
2170 
2171     bool needToDeallocRules = false;
2172 
2173     if(U_FAILURE(*status)) {
2174         return;
2175     }
2176 
2177     // set everything to zero, so that we can clean up gracefully
2178     uprv_memset(src, 0, sizeof(UColTokenParser));
2179 
2180     // first we need to find options that don't like to be normalized,
2181     // like copy and remove...
2182     //const UChar *openBrace = rules;
2183     int32_t optionNumber = -1;
2184     const UChar *setStart = NULL;
2185     uint32_t i = 0;
2186     while(i < rulesLength) {
2187         if(rules[i] == 0x005B) {    // '[': start of an option
2188             /* Gets the following:
2189                optionNumber: The index of the option.
2190                setStart: The pointer at which the option arguments start.
2191              */
2192             optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
2193 
2194             if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
2195                 // [optimize]
2196                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2197                 if(U_SUCCESS(*status)) {
2198                     if(src->copySet == NULL) {
2199                         src->copySet = newSet;
2200                     } else {
2201                         uset_addAll(src->copySet, newSet);
2202                         uset_close(newSet);
2203                     }
2204                 } else {
2205                     return;
2206                 }
2207             } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
2208                 USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
2209                 if(U_SUCCESS(*status)) {
2210                     if(src->removeSet == NULL) {
2211                         src->removeSet = newSet;
2212                     } else {
2213                         uset_addAll(src->removeSet, newSet);
2214                         uset_close(newSet);
2215                     }
2216                 } else {
2217                     return;
2218                 }
2219             } else if(optionNumber == OPTION_IMPORT){
2220                 // [import <collation-name>]
2221 
2222                 // Find the address of the closing ].
2223                 UChar* import_end = u_strchr(setStart, 0x005D);
2224                 int32_t optionEndOffset = (int32_t)(import_end + 1 - rules);
2225                 // Ignore trailing whitespace.
2226                 while(uprv_isRuleWhiteSpace(*(import_end-1))) {
2227                     --import_end;
2228                 }
2229 
2230                 int32_t optionLength = (int32_t)(import_end - setStart);
2231                 char option[50];
2232                 if(optionLength >= (int32_t)sizeof(option)) {
2233                     *status = U_ILLEGAL_ARGUMENT_ERROR;
2234                     return;
2235                 }
2236                 u_UCharsToChars(setStart, option, optionLength);
2237                 option[optionLength] = 0;
2238 
2239                 *status = U_ZERO_ERROR;
2240                 char locale[50];
2241                 int32_t templ;
2242                 uloc_forLanguageTag(option, locale, (int32_t)sizeof(locale), &templ, status);
2243                 if(U_FAILURE(*status)) {
2244                     *status = U_ILLEGAL_ARGUMENT_ERROR;
2245                     return;
2246                 }
2247 
2248                 char type[50];
2249                 if (uloc_getKeywordValue(locale, "collation", type, (int32_t)sizeof(type), status) <= 0 ||
2250                     U_FAILURE(*status)
2251                 ) {
2252                     *status = U_ZERO_ERROR;
2253                     uprv_strcpy(type, "standard");
2254                 }
2255 
2256                 // TODO: Use public functions when available, see ticket #8134.
2257                 char *keywords = (char *)locale_getKeywordsStart(locale);
2258                 if(keywords != NULL) {
2259                     *keywords = 0;
2260                 }
2261 
2262                 int32_t importRulesLength = 0;
2263                 const UChar* importRules = importFunc(context, locale, type, &importRulesLength, status);
2264 
2265 #ifdef DEBUG_FOR_COLL_RULES
2266                 std::string s;
2267                 UnicodeString(importRules).toUTF8String(s);
2268                 std::cout << "Import rules = " << s << std::endl;
2269 #endif
2270 
2271                 // Add the length of the imported rules to length of the original rules,
2272                 // and subtract the length of the import option.
2273                 uint32_t newRulesLength = rulesLength + importRulesLength - (optionEndOffset - i);
2274 
2275                 UChar* newRules = (UChar*)uprv_malloc(newRulesLength*sizeof(UChar));
2276 
2277 #ifdef DEBUG_FOR_COLL_RULES
2278                 std::string s1;
2279                 UnicodeString(rules).toUTF8String(s1);
2280                 std::cout << "Original rules = " << s1 << std::endl;
2281 #endif
2282 
2283 
2284                 // Copy the section of the original rules leading up to the import
2285                 uprv_memcpy(newRules, rules, i*sizeof(UChar));
2286                 // Copy the imported rules
2287                 uprv_memcpy(newRules+i, importRules, importRulesLength*sizeof(UChar));
2288                 // Copy the rest of the original rules (minus the import option itself)
2289                 uprv_memcpy(newRules+i+importRulesLength,
2290                             rules+optionEndOffset,
2291                             (rulesLength-optionEndOffset)*sizeof(UChar));
2292 
2293 #ifdef DEBUG_FOR_COLL_RULES
2294                 std::string s2;
2295                 UnicodeString(newRules).toUTF8String(s2);
2296                 std::cout << "Resulting rules = " << s2 << std::endl;
2297 #endif
2298 
2299                 if(needToDeallocRules){
2300                     // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2301                     uprv_free((void*)rules);
2302                 }
2303                 needToDeallocRules = true;
2304                 rules = newRules;
2305                 rulesLength = newRulesLength;
2306 
2307                 estimatedSize += importRulesLength*2;
2308 
2309                 // First character of the new rules needs to be processed
2310                 i--;
2311             }
2312         }
2313         //openBrace++;
2314         i++;
2315     }
2316 
2317     src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
2318     /* test for NULL */
2319     if (src->source == NULL) {
2320         *status = U_MEMORY_ALLOCATION_ERROR;
2321         return;
2322     }
2323     uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
2324     nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
2325     if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
2326         *status = U_ZERO_ERROR;
2327         src->source = (UChar *)uprv_realloc(src->source, (nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
2328         /* test for NULL */
2329         if (src->source == NULL) {
2330             *status = U_MEMORY_ALLOCATION_ERROR;
2331             return;
2332         }
2333         nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE, status);
2334     }
2335     if(needToDeallocRules){
2336         // if needToDeallocRules is set, then we allocated rules, so it's safe to cast and free
2337         uprv_free((void*)rules);
2338     }
2339 
2340 
2341     src->current = src->source;
2342     src->end = src->source+nSize;
2343     src->sourceCurrent = src->source;
2344     src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
2345     src->extraEnd = src->source+estimatedSize; //src->end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
2346     src->varTop = NULL;
2347     src->UCA = UCA;
2348     src->invUCA = ucol_initInverseUCA(status);
2349     src->parsedToken.charsLen = 0;
2350     src->parsedToken.charsOffset = 0;
2351     src->parsedToken.extensionLen = 0;
2352     src->parsedToken.extensionOffset = 0;
2353     src->parsedToken.prefixLen = 0;
2354     src->parsedToken.prefixOffset = 0;
2355     src->parsedToken.flags = 0;
2356     src->parsedToken.strength = UCOL_TOK_UNSET;
2357     src->buildCCTabFlag = FALSE;
2358     src->isStarred = FALSE;
2359     src->inRange = FALSE;
2360     src->lastRangeCp = 0;
2361     src->previousCp = 0;
2362 
2363     if(U_FAILURE(*status)) {
2364         return;
2365     }
2366     src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, NULL, status);
2367     if(U_FAILURE(*status)) {
2368         return;
2369     }
2370     uhash_setValueDeleter(src->tailored, uhash_freeBlock);
2371 
2372     src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
2373     /* test for NULL */
2374     if (src->opts == NULL) {
2375         *status = U_MEMORY_ALLOCATION_ERROR;
2376         return;
2377     }
2378 
2379     uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));
2380 
2381     src->lh = 0;
2382     src->listCapacity = 1024;
2383     src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
2384     //Test for NULL
2385     if (src->lh == NULL) {
2386         *status = U_MEMORY_ALLOCATION_ERROR;
2387         return;
2388     }
2389     uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
2390     src->resultLen = 0;
2391 
2392     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
2393 
2394     // UCOL_RESET_TOP_VALUE
2395     setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2396     // UCOL_FIRST_PRIMARY_IGNORABLE
2397     setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
2398     // UCOL_LAST_PRIMARY_IGNORABLE
2399     setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
2400     // UCOL_FIRST_SECONDARY_IGNORABLE
2401     setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
2402     // UCOL_LAST_SECONDARY_IGNORABLE
2403     setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
2404     // UCOL_FIRST_TERTIARY_IGNORABLE
2405     setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
2406     // UCOL_LAST_TERTIARY_IGNORABLE
2407     setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
2408     // UCOL_FIRST_VARIABLE
2409     setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
2410     // UCOL_LAST_VARIABLE
2411     setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
2412     // UCOL_FIRST_NON_VARIABLE
2413     setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
2414     // UCOL_LAST_NON_VARIABLE
2415     setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
2416     // UCOL_FIRST_IMPLICIT
2417     setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
2418     // UCOL_LAST_IMPLICIT
2419     setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
2420     // UCOL_FIRST_TRAILING
2421     setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
2422     // UCOL_LAST_TRAILING
2423     setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
2424     ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
2425 }
2426 
2427 
ucol_tok_closeTokenList(UColTokenParser * src)2428 void ucol_tok_closeTokenList(UColTokenParser *src) {
2429     if(src->copySet != NULL) {
2430         uset_close(src->copySet);
2431     }
2432     if(src->removeSet != NULL) {
2433         uset_close(src->removeSet);
2434     }
2435     if(src->tailored != NULL) {
2436         uhash_close(src->tailored);
2437     }
2438     if(src->lh != NULL) {
2439         uprv_free(src->lh);
2440     }
2441     if(src->source != NULL) {
2442         uprv_free(src->source);
2443     }
2444     if(src->opts != NULL) {
2445         uprv_free(src->opts);
2446     }
2447     if (src->reorderCodes != NULL) {
2448         uprv_free(src->reorderCodes);
2449     }
2450 }
2451 
2452 #endif /* #if !UCONFIG_NO_COLLATION */
2453