• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationruleparser.cpp
7 *
8 * (replaced the former ucol_tok.cpp)
9 *
10 * created on: 2013apr10
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/normalizer2.h"
19 #include "unicode/parseerr.h"
20 #include "unicode/uchar.h"
21 #include "unicode/ucol.h"
22 #include "unicode/uloc.h"
23 #include "unicode/unistr.h"
24 #include "unicode/utf16.h"
25 #include "charstr.h"
26 #include "cmemory.h"
27 #include "collation.h"
28 #include "collationdata.h"
29 #include "collationruleparser.h"
30 #include "collationsettings.h"
31 #include "collationtailoring.h"
32 #include "cstring.h"
33 #include "patternprops.h"
34 #include "uassert.h"
35 #include "uvectr32.h"
36 
// Number of elements of a C array, as an int32_t.
// The whole expansion is parenthesized so that the macro behaves as a single
// operand no matter which operators surround it at the point of use.
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
38 
39 U_NAMESPACE_BEGIN
40 
41 namespace {
42 
43 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
44 const int32_t BEFORE_LENGTH = 7;
45 
46 }  // namespace
47 
~Sink()48 CollationRuleParser::Sink::~Sink() {}
49 
50 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)51 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52 
53 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)54 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55 
~Importer()56 CollationRuleParser::Importer::~Importer() {}
57 
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)58 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59         : nfd(*Normalizer2::getNFDInstance(errorCode)),
60           nfc(*Normalizer2::getNFCInstance(errorCode)),
61           rules(NULL), baseData(base), settings(NULL),
62           parseError(NULL), errorReason(NULL),
63           sink(NULL), importer(NULL),
64           ruleIndex(0) {
65 }
66 
~CollationRuleParser()67 CollationRuleParser::~CollationRuleParser() {
68 }
69 
70 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)71 CollationRuleParser::parse(const UnicodeString &ruleString,
72                            CollationSettings &outSettings,
73                            UParseError *outParseError,
74                            UErrorCode &errorCode) {
75     if(U_FAILURE(errorCode)) { return; }
76     settings = &outSettings;
77     parseError = outParseError;
78     if(parseError != NULL) {
79         parseError->line = 0;
80         parseError->offset = -1;
81         parseError->preContext[0] = 0;
82         parseError->postContext[0] = 0;
83     }
84     errorReason = NULL;
85     parse(ruleString, errorCode);
86 }
87 
88 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)89 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90     if(U_FAILURE(errorCode)) { return; }
91     rules = &ruleString;
92     ruleIndex = 0;
93 
94     while(ruleIndex < rules->length()) {
95         UChar c = rules->charAt(ruleIndex);
96         if(PatternProps::isWhiteSpace(c)) {
97             ++ruleIndex;
98             continue;
99         }
100         switch(c) {
101         case 0x26:  // '&'
102             parseRuleChain(errorCode);
103             break;
104         case 0x5b:  // '['
105             parseSetting(errorCode);
106             break;
107         case 0x23:  // '#' starts a comment, until the end of the line
108             ruleIndex = skipComment(ruleIndex + 1);
109             break;
110         case 0x40:  // '@' is equivalent to [backwards 2]
111             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112                               UCOL_ON, 0, errorCode);
113             ++ruleIndex;
114             break;
115         case 0x21:  // '!' used to turn on Thai/Lao character reversal
116             // Accept but ignore. The root collator has contractions
117             // that are equivalent to the character reversal, where appropriate.
118             ++ruleIndex;
119             break;
120         default:
121             setParseError("expected a reset or setting or comment", errorCode);
122             break;
123         }
124         if(U_FAILURE(errorCode)) { return; }
125     }
126 }
127 
128 void
parseRuleChain(UErrorCode & errorCode)129 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130     int32_t resetStrength = parseResetAndPosition(errorCode);
131     UBool isFirstRelation = TRUE;
132     for(;;) {
133         int32_t result = parseRelationOperator(errorCode);
134         if(U_FAILURE(errorCode)) { return; }
135         if(result < 0) {
136             if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137                 // '#' starts a comment, until the end of the line
138                 ruleIndex = skipComment(ruleIndex + 1);
139                 continue;
140             }
141             if(isFirstRelation) {
142                 setParseError("reset not followed by a relation", errorCode);
143             }
144             return;
145         }
146         int32_t strength = result & STRENGTH_MASK;
147         if(resetStrength < UCOL_IDENTICAL) {
148             // reset-before rule chain
149             if(isFirstRelation) {
150                 if(strength != resetStrength) {
151                     setParseError("reset-before strength differs from its first relation", errorCode);
152                     return;
153                 }
154             } else {
155                 if(strength < resetStrength) {
156                     setParseError("reset-before strength followed by a stronger relation", errorCode);
157                     return;
158                 }
159             }
160         }
161         int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
162         if((result & STARRED_FLAG) == 0) {
163             parseRelationStrings(strength, i, errorCode);
164         } else {
165             parseStarredCharacters(strength, i, errorCode);
166         }
167         if(U_FAILURE(errorCode)) { return; }
168         isFirstRelation = FALSE;
169     }
170 }
171 
172 int32_t
parseResetAndPosition(UErrorCode & errorCode)173 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175     int32_t i = skipWhiteSpace(ruleIndex + 1);
176     int32_t j;
177     UChar c;
178     int32_t resetStrength;
179     if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180             (j = i + BEFORE_LENGTH) < rules->length() &&
181             PatternProps::isWhiteSpace(rules->charAt(j)) &&
182             ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183             0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184             rules->charAt(j + 1) == 0x5d) {
185         // &[before n] with n=1 or 2 or 3
186         resetStrength = UCOL_PRIMARY + (c - 0x31);
187         i = skipWhiteSpace(j + 2);
188     } else {
189         resetStrength = UCOL_IDENTICAL;
190     }
191     if(i >= rules->length()) {
192         setParseError("reset without position", errorCode);
193         return UCOL_DEFAULT;
194     }
195     UnicodeString str;
196     if(rules->charAt(i) == 0x5b) {  // '['
197         i = parseSpecialPosition(i, str, errorCode);
198     } else {
199         i = parseTailoringString(i, str, errorCode);
200     }
201     sink->addReset(resetStrength, str, errorReason, errorCode);
202     if(U_FAILURE(errorCode)) { setErrorContext(); }
203     ruleIndex = i;
204     return resetStrength;
205 }
206 
207 int32_t
parseRelationOperator(UErrorCode & errorCode)208 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210     ruleIndex = skipWhiteSpace(ruleIndex);
211     if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212     int32_t strength;
213     int32_t i = ruleIndex;
214     UChar c = rules->charAt(i++);
215     switch(c) {
216     case 0x3c:  // '<'
217         if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
218             ++i;
219             if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
220                 ++i;
221                 if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
222                     ++i;
223                     strength = UCOL_QUATERNARY;
224                 } else {
225                     strength = UCOL_TERTIARY;
226                 }
227             } else {
228                 strength = UCOL_SECONDARY;
229             }
230         } else {
231             strength = UCOL_PRIMARY;
232         }
233         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
234             ++i;
235             strength |= STARRED_FLAG;
236         }
237         break;
238     case 0x3b:  // ';' same as <<
239         strength = UCOL_SECONDARY;
240         break;
241     case 0x2c:  // ',' same as <<<
242         strength = UCOL_TERTIARY;
243         break;
244     case 0x3d:  // '='
245         strength = UCOL_IDENTICAL;
246         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
247             ++i;
248             strength |= STARRED_FLAG;
249         }
250         break;
251     default:
252         return UCOL_DEFAULT;
253     }
254     return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
255 }
256 
257 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)258 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259     // Parse
260     //     prefix | str / extension
261     // where prefix and extension are optional.
262     UnicodeString prefix, str, extension;
263     i = parseTailoringString(i, str, errorCode);
264     if(U_FAILURE(errorCode)) { return; }
265     UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266     if(next == 0x7c) {  // '|' separates the context prefix from the string.
267         prefix = str;
268         i = parseTailoringString(i + 1, str, errorCode);
269         if(U_FAILURE(errorCode)) { return; }
270         next = (i < rules->length()) ? rules->charAt(i) : 0;
271     }
272     if(next == 0x2f) {  // '/' separates the string from the extension.
273         i = parseTailoringString(i + 1, extension, errorCode);
274     }
275     if(!prefix.isEmpty()) {
276         UChar32 prefix0 = prefix.char32At(0);
277         UChar32 c = str.char32At(0);
278         if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280                           errorCode);
281             return;
282         }
283     }
284     sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285     if(U_FAILURE(errorCode)) { setErrorContext(); }
286     ruleIndex = i;
287 }
288 
289 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)290 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291     UnicodeString empty, raw;
292     i = parseString(skipWhiteSpace(i), raw, errorCode);
293     if(U_FAILURE(errorCode)) { return; }
294     if(raw.isEmpty()) {
295         setParseError("missing starred-relation string", errorCode);
296         return;
297     }
298     UChar32 prev = -1;
299     int32_t j = 0;
300     for(;;) {
301         while(j < raw.length()) {
302             UChar32 c = raw.char32At(j);
303             if(!nfd.isInert(c)) {
304                 setParseError("starred-relation string is not all NFD-inert", errorCode);
305                 return;
306             }
307             sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308             if(U_FAILURE(errorCode)) {
309                 setErrorContext();
310                 return;
311             }
312             j += U16_LENGTH(c);
313             prev = c;
314         }
315         if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
316             break;
317         }
318         if(prev < 0) {
319             setParseError("range without start in starred-relation string", errorCode);
320             return;
321         }
322         i = parseString(i + 1, raw, errorCode);
323         if(U_FAILURE(errorCode)) { return; }
324         if(raw.isEmpty()) {
325             setParseError("range without end in starred-relation string", errorCode);
326             return;
327         }
328         UChar32 c = raw.char32At(0);
329         if(c < prev) {
330             setParseError("range start greater than end in starred-relation string", errorCode);
331             return;
332         }
333         // range prev-c
334         UnicodeString s;
335         while(++prev <= c) {
336             if(!nfd.isInert(prev)) {
337                 setParseError("starred-relation string range is not all NFD-inert", errorCode);
338                 return;
339             }
340             if(U_IS_SURROGATE(prev)) {
341                 setParseError("starred-relation string range contains a surrogate", errorCode);
342                 return;
343             }
344             if(0xfffd <= prev && prev <= 0xffff) {
345                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346                 return;
347             }
348             s.setTo(prev);
349             sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350             if(U_FAILURE(errorCode)) {
351                 setErrorContext();
352                 return;
353             }
354         }
355         prev = -1;
356         j = U16_LENGTH(c);
357     }
358     ruleIndex = skipWhiteSpace(i);
359 }
360 
361 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)362 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363     i = parseString(skipWhiteSpace(i), raw, errorCode);
364     if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365         setParseError("missing relation string", errorCode);
366     }
367     return skipWhiteSpace(i);
368 }
369 
370 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)371 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372     if(U_FAILURE(errorCode)) { return i; }
373     raw.remove();
374     while(i < rules->length()) {
375         UChar32 c = rules->charAt(i++);
376         if(isSyntaxChar(c)) {
377             if(c == 0x27) {  // apostrophe
378                 if(i < rules->length() && rules->charAt(i) == 0x27) {
379                     // Double apostrophe, encodes a single one.
380                     raw.append((UChar)0x27);
381                     ++i;
382                     continue;
383                 }
384                 // Quote literal text until the next single apostrophe.
385                 for(;;) {
386                     if(i == rules->length()) {
387                         setParseError("quoted literal text missing terminating apostrophe", errorCode);
388                         return i;
389                     }
390                     c = rules->charAt(i++);
391                     if(c == 0x27) {
392                         if(i < rules->length() && rules->charAt(i) == 0x27) {
393                             // Double apostrophe inside quoted literal text,
394                             // still encodes a single apostrophe.
395                             ++i;
396                         } else {
397                             break;
398                         }
399                     }
400                     raw.append((UChar)c);
401                 }
402             } else if(c == 0x5c) {  // backslash
403                 if(i == rules->length()) {
404                     setParseError("backslash escape at the end of the rule string", errorCode);
405                     return i;
406                 }
407                 c = rules->char32At(i);
408                 raw.append(c);
409                 i += U16_LENGTH(c);
410             } else {
411                 // Any other syntax character terminates a string.
412                 --i;
413                 break;
414             }
415         } else if(PatternProps::isWhiteSpace(c)) {
416             // Unquoted white space terminates a string.
417             --i;
418             break;
419         } else {
420             raw.append((UChar)c);
421         }
422     }
423     for(int32_t j = 0; j < raw.length();) {
424         UChar32 c = raw.char32At(j);
425         if(U_IS_SURROGATE(c)) {
426             setParseError("string contains an unpaired surrogate", errorCode);
427             return i;
428         }
429         if(0xfffd <= c && c <= 0xffff) {
430             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431             return i;
432         }
433         j += U16_LENGTH(c);
434     }
435     return i;
436 }
437 
namespace {

// Names of the special reset positions, as written inside [brackets].
// The order matters: parseSpecialPosition() encodes a match as
// POS_BASE + its index in this table.
const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
458 
459 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)460 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461     if(U_FAILURE(errorCode)) { return 0; }
462     UnicodeString raw;
463     int32_t j = readWords(i + 1, raw);
464     if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
465         ++j;
466         for(int32_t pos = 0; pos < LENGTHOF(positions); ++pos) {
467             if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468                 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469                 return j;
470             }
471         }
472         if(raw == UNICODE_STRING_SIMPLE("top")) {
473             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474             return j;
475         }
476         if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478             return j;
479         }
480     }
481     setParseError("not a valid special reset position", errorCode);
482     return i;
483 }
484 
485 void
parseSetting(UErrorCode & errorCode)486 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487     if(U_FAILURE(errorCode)) { return; }
488     UnicodeString raw;
489     int32_t i = ruleIndex + 1;
490     int32_t j = readWords(i, raw);
491     if(j <= i || raw.isEmpty()) {
492         setParseError("expected a setting/option at '['", errorCode);
493     }
494     if(rules->charAt(j) == 0x5d) {  // words end with ]
495         ++j;
496         if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497                 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498             parseReordering(raw, errorCode);
499             ruleIndex = j;
500             return;
501         }
502         if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504                               UCOL_ON, 0, errorCode);
505             ruleIndex = j;
506             return;
507         }
508         UnicodeString v;
509         int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510         if(valueIndex >= 0) {
511             v.setTo(raw, valueIndex + 1);
512             raw.truncate(valueIndex);
513         }
514         if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515             int32_t value = UCOL_DEFAULT;
516             UChar c = v.charAt(0);
517             if(0x31 <= c && c <= 0x34) {  // 1..4
518                 value = UCOL_PRIMARY + (c - 0x31);
519             } else if(c == 0x49) {  // 'I'
520                 value = UCOL_IDENTICAL;
521             }
522             if(value != UCOL_DEFAULT) {
523                 settings->setStrength(value, 0, errorCode);
524                 ruleIndex = j;
525                 return;
526             }
527         } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528             UColAttributeValue value = UCOL_DEFAULT;
529             if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530                 value = UCOL_NON_IGNORABLE;
531             } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532                 value = UCOL_SHIFTED;
533             }
534             if(value != UCOL_DEFAULT) {
535                 settings->setAlternateHandling(value, 0, errorCode);
536                 ruleIndex = j;
537                 return;
538             }
539         } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540             int32_t value = UCOL_DEFAULT;
541             if(v == UNICODE_STRING_SIMPLE("space")) {
542                 value = CollationSettings::MAX_VAR_SPACE;
543             } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544                 value = CollationSettings::MAX_VAR_PUNCT;
545             } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546                 value = CollationSettings::MAX_VAR_SYMBOL;
547             } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548                 value = CollationSettings::MAX_VAR_CURRENCY;
549             }
550             if(value != UCOL_DEFAULT) {
551                 settings->setMaxVariable(value, 0, errorCode);
552                 settings->variableTop = baseData->getLastPrimaryForGroup(
553                     UCOL_REORDER_CODE_FIRST + value);
554                 U_ASSERT(settings->variableTop != 0);
555                 ruleIndex = j;
556                 return;
557             }
558         } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559             UColAttributeValue value = UCOL_DEFAULT;
560             if(v == UNICODE_STRING_SIMPLE("off")) {
561                 value = UCOL_OFF;
562             } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563                 value = UCOL_LOWER_FIRST;
564             } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565                 value = UCOL_UPPER_FIRST;
566             }
567             if(value != UCOL_DEFAULT) {
568                 settings->setCaseFirst(value, 0, errorCode);
569                 ruleIndex = j;
570                 return;
571             }
572         } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573             UColAttributeValue value = getOnOffValue(v);
574             if(value != UCOL_DEFAULT) {
575                 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576                 ruleIndex = j;
577                 return;
578             }
579         } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580             UColAttributeValue value = getOnOffValue(v);
581             if(value != UCOL_DEFAULT) {
582                 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583                 ruleIndex = j;
584                 return;
585             }
586         } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587             UColAttributeValue value = getOnOffValue(v);
588             if(value != UCOL_DEFAULT) {
589                 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590                 ruleIndex = j;
591                 return;
592             }
593         } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594             UColAttributeValue value = getOnOffValue(v);
595             if(value != UCOL_DEFAULT) {
596                 if(value == UCOL_ON) {
597                     setParseError("[hiraganaQ on] is not supported", errorCode);
598                 }
599                 ruleIndex = j;
600                 return;
601             }
602         } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603             CharString lang;
604             lang.appendInvariantChars(v, errorCode);
605             if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606             // BCP 47 language tag -> ICU locale ID
607             char localeID[ULOC_FULLNAME_CAPACITY];
608             int32_t parsedLength;
609             int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610                                                  &parsedLength, &errorCode);
611             if(U_FAILURE(errorCode) ||
612                     parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613                 errorCode = U_ZERO_ERROR;
614                 setParseError("expected language tag in [import langTag]", errorCode);
615                 return;
616             }
617             // localeID minus all keywords
618             char baseID[ULOC_FULLNAME_CAPACITY];
619             length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621                 errorCode = U_ZERO_ERROR;
622                 setParseError("expected language tag in [import langTag]", errorCode);
623                 return;
624             }
625             // @collation=type, or length=0 if not specified
626             char collationType[ULOC_KEYWORDS_CAPACITY];
627             length = uloc_getKeywordValue(localeID, "collation",
628                                           collationType, ULOC_KEYWORDS_CAPACITY,
629                                           &errorCode);
630             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
631                 errorCode = U_ZERO_ERROR;
632                 setParseError("expected language tag in [import langTag]", errorCode);
633                 return;
634             }
635             if(importer == NULL) {
636                 setParseError("[import langTag] is not supported", errorCode);
637             } else {
638                 const UnicodeString *importedRules =
639                     importer->getRules(baseID,
640                                        length > 0 ? collationType : "standard",
641                                        errorReason, errorCode);
642                 if(U_FAILURE(errorCode)) {
643                     if(errorReason == NULL) {
644                         errorReason = "[import langTag] failed";
645                     }
646                     setErrorContext();
647                     return;
648                 }
649                 const UnicodeString *outerRules = rules;
650                 int32_t outerRuleIndex = ruleIndex;
651                 parse(*importedRules, errorCode);
652                 if(U_FAILURE(errorCode)) {
653                     if(parseError != NULL) {
654                         parseError->offset = outerRuleIndex;
655                     }
656                 }
657                 rules = outerRules;
658                 ruleIndex = j;
659             }
660             return;
661         }
662     } else if(rules->charAt(j) == 0x5b) {  // words end with [
663         UnicodeSet set;
664         j = parseUnicodeSet(j, set, errorCode);
665         if(U_FAILURE(errorCode)) { return; }
666         if(raw == UNICODE_STRING_SIMPLE("optimize")) {
667             sink->optimize(set, errorReason, errorCode);
668             if(U_FAILURE(errorCode)) { setErrorContext(); }
669             ruleIndex = j;
670             return;
671         } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
672             sink->suppressContractions(set, errorReason, errorCode);
673             if(U_FAILURE(errorCode)) { setErrorContext(); }
674             ruleIndex = j;
675             return;
676         }
677     }
678     setParseError("not a valid setting/option", errorCode);
679 }
680 
681 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)682 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
683     if(U_FAILURE(errorCode)) { return; }
684     int32_t i = 7;  // after "reorder"
685     if(i == raw.length()) {
686         // empty [reorder] with no codes
687         settings->resetReordering();
688         return;
689     }
690     // Parse the codes in [reorder aa bb cc].
691     UVector32 reorderCodes(errorCode);
692     if(U_FAILURE(errorCode)) { return; }
693     CharString word;
694     while(i < raw.length()) {
695         ++i;  // skip the word-separating space
696         int32_t limit = raw.indexOf((UChar)0x20, i);
697         if(limit < 0) { limit = raw.length(); }
698         word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
699         if(U_FAILURE(errorCode)) { return; }
700         int32_t code = getReorderCode(word.data());
701         if(code < 0) {
702             setParseError("unknown script or reorder code", errorCode);
703             return;
704         }
705         reorderCodes.addElement(code, errorCode);
706         if(U_FAILURE(errorCode)) { return; }
707         i = limit;
708     }
709     int32_t length = reorderCodes.size();
710     if(length == 1 && reorderCodes.elementAti(0) == UCOL_REORDER_CODE_DEFAULT) {
711         // The root collator does not have a reordering, by definition.
712         settings->resetReordering();
713         return;
714     }
715     uint8_t table[256];
716     baseData->makeReorderTable(reorderCodes.getBuffer(), length, table, errorCode);
717     if(U_FAILURE(errorCode)) { return; }
718     if(!settings->setReordering(reorderCodes.getBuffer(), length, table)) {
719         errorCode = U_MEMORY_ALLOCATION_ERROR;
720     }
721 }
722 
// Names of the special (non-script) reorder groups.
// getReorderCode() maps entry k to UCOL_REORDER_CODE_FIRST + k,
// so the order of the entries is significant.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
726 
727 int32_t
getReorderCode(const char * word)728 CollationRuleParser::getReorderCode(const char *word) {
729     for(int32_t i = 0; i < LENGTHOF(gSpecialReorderCodes); ++i) {
730         if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
731             return UCOL_REORDER_CODE_FIRST + i;
732         }
733     }
734     int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
735     if(script >= 0) {
736         return script;
737     }
738     if(uprv_stricmp(word, "default") == 0) {
739         return UCOL_REORDER_CODE_DEFAULT;
740     }
741     return -2;
742 }
743 
744 UColAttributeValue
getOnOffValue(const UnicodeString & s)745 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
746     if(s == UNICODE_STRING_SIMPLE("on")) {
747         return UCOL_ON;
748     } else if(s == UNICODE_STRING_SIMPLE("off")) {
749         return UCOL_OFF;
750     } else {
751         return UCOL_DEFAULT;
752     }
753 }
754 
755 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)756 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
757     // Collect a UnicodeSet pattern between a balanced pair of [brackets].
758     int32_t level = 0;
759     int32_t j = i;
760     for(;;) {
761         if(j == rules->length()) {
762             setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
763             return j;
764         }
765         UChar c = rules->charAt(j++);
766         if(c == 0x5b) {  // '['
767             ++level;
768         } else if(c == 0x5d) {  // ']'
769             if(--level == 0) { break; }
770         }
771     }
772     set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
773     if(U_FAILURE(errorCode)) {
774         errorCode = U_ZERO_ERROR;
775         setParseError("not a valid UnicodeSet pattern", errorCode);
776         return j;
777     }
778     j = skipWhiteSpace(j);
779     if(j == rules->length() || rules->charAt(j) != 0x5d) {
780         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
781         return j;
782     }
783     return ++j;
784 }
785 
786 int32_t
readWords(int32_t i,UnicodeString & raw) const787 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
788     static const UChar sp = 0x20;
789     raw.remove();
790     i = skipWhiteSpace(i);
791     for(;;) {
792         if(i >= rules->length()) { return 0; }
793         UChar c = rules->charAt(i);
794         if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
795             if(raw.isEmpty()) { return i; }
796             if(raw.endsWith(&sp, 1)) {  // remove trailing space
797                 raw.truncate(raw.length() - 1);
798             }
799             return i;
800         }
801         if(PatternProps::isWhiteSpace(c)) {
802             raw.append(0x20);
803             i = skipWhiteSpace(i + 1);
804         } else {
805             raw.append(c);
806             ++i;
807         }
808     }
809 }
810 
811 int32_t
skipComment(int32_t i) const812 CollationRuleParser::skipComment(int32_t i) const {
813     // skip to past the newline
814     while(i < rules->length()) {
815         UChar c = rules->charAt(i++);
816         // LF or FF or CR or NEL or LS or PS
817         if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
818             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
819             // NLF (new line function) = CR or LF or CR+LF or NEL.
820             // No need to collect all of CR+LF because a following LF will be ignored anyway.
821             break;
822         }
823     }
824     return i;
825 }
826 
827 void
setParseError(const char * reason,UErrorCode & errorCode)828 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
829     if(U_FAILURE(errorCode)) { return; }
830     // Error code consistent with the old parser (from ca. 2001),
831     // rather than U_PARSE_ERROR;
832     errorCode = U_INVALID_FORMAT_ERROR;
833     errorReason = reason;
834     if(parseError != NULL) { setErrorContext(); }
835 }
836 
837 void
setErrorContext()838 CollationRuleParser::setErrorContext() {
839     if(parseError == NULL) { return; }
840 
841     // Note: This relies on the calling code maintaining the ruleIndex
842     // at a position that is useful for debugging.
843     // For example, at the beginning of a reset or relation etc.
844     parseError->offset = ruleIndex;
845     parseError->line = 0;  // We are not counting line numbers.
846 
847     // before ruleIndex
848     int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
849     if(start < 0) {
850         start = 0;
851     } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
852         ++start;
853     }
854     int32_t length = ruleIndex - start;
855     rules->extract(start, length, parseError->preContext);
856     parseError->preContext[length] = 0;
857 
858     // starting from ruleIndex
859     length = rules->length() - ruleIndex;
860     if(length >= U_PARSE_CONTEXT_LEN) {
861         length = U_PARSE_CONTEXT_LEN - 1;
862         if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
863             --length;
864         }
865     }
866     rules->extract(ruleIndex, length, parseError->postContext);
867     parseError->postContext[length] = 0;
868 }
869 
870 UBool
isSyntaxChar(UChar32 c)871 CollationRuleParser::isSyntaxChar(UChar32 c) {
872     return 0x21 <= c && c <= 0x7e &&
873             (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
874             (0x5b <= c && c <= 0x60) || (0x7b <= c));
875 }
876 
877 int32_t
skipWhiteSpace(int32_t i) const878 CollationRuleParser::skipWhiteSpace(int32_t i) const {
879     while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
880         ++i;
881     }
882     return i;
883 }
884 
885 U_NAMESPACE_END
886 
887 #endif  // !UCONFIG_NO_COLLATION
888