• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
9 *
10 * (replaced the former ucol_tok.cpp)
11 *
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
14 */
15 
16 #include "unicode/utypes.h"
17 
18 #if !UCONFIG_NO_COLLATION
19 
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
27 #include "charstr.h"
28 #include "cmemory.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationruleparser.h"
32 #include "collationsettings.h"
33 #include "collationtailoring.h"
34 #include "cstring.h"
35 #include "patternprops.h"
36 #include "uassert.h"
37 #include "ulocimp.h"
38 #include "uvectr32.h"
39 
40 U_NAMESPACE_BEGIN
41 
namespace {

// The keyword that introduces a reset-before position, including its
// opening bracket: "[before". Stored as NUL-terminated UTF-16.
constexpr char16_t BEFORE[] = u"[before";
// Number of code units in BEFORE, not counting the terminating NUL.
constexpr int32_t BEFORE_LENGTH =
        static_cast<int32_t>(sizeof(BEFORE) / sizeof(BEFORE[0])) - 1;

}  // namespace
48 
~Sink()49 CollationRuleParser::Sink::~Sink() {}
50 
51 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)52 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
53 
54 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)55 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
56 
~Importer()57 CollationRuleParser::Importer::~Importer() {}
58 
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)59 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
60         : nfd(*Normalizer2::getNFDInstance(errorCode)),
61           nfc(*Normalizer2::getNFCInstance(errorCode)),
62           rules(nullptr), baseData(base), settings(nullptr),
63           parseError(nullptr), errorReason(nullptr),
64           sink(nullptr), importer(nullptr),
65           ruleIndex(0) {
66 }
67 
~CollationRuleParser()68 CollationRuleParser::~CollationRuleParser() {
69 }
70 
71 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)72 CollationRuleParser::parse(const UnicodeString &ruleString,
73                            CollationSettings &outSettings,
74                            UParseError *outParseError,
75                            UErrorCode &errorCode) {
76     if(U_FAILURE(errorCode)) { return; }
77     settings = &outSettings;
78     parseError = outParseError;
79     if(parseError != nullptr) {
80         parseError->line = 0;
81         parseError->offset = -1;
82         parseError->preContext[0] = 0;
83         parseError->postContext[0] = 0;
84     }
85     errorReason = nullptr;
86     parse(ruleString, errorCode);
87 }
88 
89 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)90 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
91     if(U_FAILURE(errorCode)) { return; }
92     rules = &ruleString;
93     ruleIndex = 0;
94 
95     while(ruleIndex < rules->length()) {
96         char16_t c = rules->charAt(ruleIndex);
97         if(PatternProps::isWhiteSpace(c)) {
98             ++ruleIndex;
99             continue;
100         }
101         switch(c) {
102         case 0x26:  // '&'
103             parseRuleChain(errorCode);
104             break;
105         case 0x5b:  // '['
106             parseSetting(errorCode);
107             break;
108         case 0x23:  // '#' starts a comment, until the end of the line
109             ruleIndex = skipComment(ruleIndex + 1);
110             break;
111         case 0x40:  // '@' is equivalent to [backwards 2]
112             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
113                               UCOL_ON, 0, errorCode);
114             ++ruleIndex;
115             break;
116         case 0x21:  // '!' used to turn on Thai/Lao character reversal
117             // Accept but ignore. The root collator has contractions
118             // that are equivalent to the character reversal, where appropriate.
119             ++ruleIndex;
120             break;
121         default:
122             setParseError("expected a reset or setting or comment", errorCode);
123             break;
124         }
125         if(U_FAILURE(errorCode)) { return; }
126     }
127 }
128 
129 void
parseRuleChain(UErrorCode & errorCode)130 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
131     int32_t resetStrength = parseResetAndPosition(errorCode);
132     UBool isFirstRelation = true;
133     for(;;) {
134         int32_t result = parseRelationOperator(errorCode);
135         if(U_FAILURE(errorCode)) { return; }
136         if(result < 0) {
137             if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
138                 // '#' starts a comment, until the end of the line
139                 ruleIndex = skipComment(ruleIndex + 1);
140                 continue;
141             }
142             if(isFirstRelation) {
143                 setParseError("reset not followed by a relation", errorCode);
144             }
145             return;
146         }
147         int32_t strength = result & STRENGTH_MASK;
148         if(resetStrength < UCOL_IDENTICAL) {
149             // reset-before rule chain
150             if(isFirstRelation) {
151                 if(strength != resetStrength) {
152                     setParseError("reset-before strength differs from its first relation", errorCode);
153                     return;
154                 }
155             } else {
156                 if(strength < resetStrength) {
157                     setParseError("reset-before strength followed by a stronger relation", errorCode);
158                     return;
159                 }
160             }
161         }
162         int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
163         if((result & STARRED_FLAG) == 0) {
164             parseRelationStrings(strength, i, errorCode);
165         } else {
166             parseStarredCharacters(strength, i, errorCode);
167         }
168         if(U_FAILURE(errorCode)) { return; }
169         isFirstRelation = false;
170     }
171 }
172 
173 int32_t
parseResetAndPosition(UErrorCode & errorCode)174 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
175     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
176     int32_t i = skipWhiteSpace(ruleIndex + 1);
177     int32_t j;
178     char16_t c;
179     int32_t resetStrength;
180     if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
181             (j = i + BEFORE_LENGTH) < rules->length() &&
182             PatternProps::isWhiteSpace(rules->charAt(j)) &&
183             ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
184             0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
185             rules->charAt(j + 1) == 0x5d) {
186         // &[before n] with n=1 or 2 or 3
187         resetStrength = UCOL_PRIMARY + (c - 0x31);
188         i = skipWhiteSpace(j + 2);
189     } else {
190         resetStrength = UCOL_IDENTICAL;
191     }
192     if(i >= rules->length()) {
193         setParseError("reset without position", errorCode);
194         return UCOL_DEFAULT;
195     }
196     UnicodeString str;
197     if(rules->charAt(i) == 0x5b) {  // '['
198         i = parseSpecialPosition(i, str, errorCode);
199     } else {
200         i = parseTailoringString(i, str, errorCode);
201     }
202     sink->addReset(resetStrength, str, errorReason, errorCode);
203     if(U_FAILURE(errorCode)) { setErrorContext(); }
204     ruleIndex = i;
205     return resetStrength;
206 }
207 
208 int32_t
parseRelationOperator(UErrorCode & errorCode)209 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
210     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
211     ruleIndex = skipWhiteSpace(ruleIndex);
212     if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
213     int32_t strength;
214     int32_t i = ruleIndex;
215     char16_t c = rules->charAt(i++);
216     switch(c) {
217     case 0x3c:  // '<'
218         if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
219             ++i;
220             if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
221                 ++i;
222                 if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
223                     ++i;
224                     strength = UCOL_QUATERNARY;
225                 } else {
226                     strength = UCOL_TERTIARY;
227                 }
228             } else {
229                 strength = UCOL_SECONDARY;
230             }
231         } else {
232             strength = UCOL_PRIMARY;
233         }
234         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
235             ++i;
236             strength |= STARRED_FLAG;
237         }
238         break;
239     case 0x3b:  // ';' same as <<
240         strength = UCOL_SECONDARY;
241         break;
242     case 0x2c:  // ',' same as <<<
243         strength = UCOL_TERTIARY;
244         break;
245     case 0x3d:  // '='
246         strength = UCOL_IDENTICAL;
247         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
248             ++i;
249             strength |= STARRED_FLAG;
250         }
251         break;
252     default:
253         return UCOL_DEFAULT;
254     }
255     return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
256 }
257 
258 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)259 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
260     // Parse
261     //     prefix | str / extension
262     // where prefix and extension are optional.
263     UnicodeString prefix, str, extension;
264     i = parseTailoringString(i, str, errorCode);
265     if(U_FAILURE(errorCode)) { return; }
266     char16_t next = (i < rules->length()) ? rules->charAt(i) : 0;
267     if(next == 0x7c) {  // '|' separates the context prefix from the string.
268         prefix = str;
269         i = parseTailoringString(i + 1, str, errorCode);
270         if(U_FAILURE(errorCode)) { return; }
271         next = (i < rules->length()) ? rules->charAt(i) : 0;
272     }
273     if(next == 0x2f) {  // '/' separates the string from the extension.
274         i = parseTailoringString(i + 1, extension, errorCode);
275     }
276     if(!prefix.isEmpty()) {
277         UChar32 prefix0 = prefix.char32At(0);
278         UChar32 c = str.char32At(0);
279         if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
280             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
281                           errorCode);
282             return;
283         }
284     }
285     sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
286     if(U_FAILURE(errorCode)) { setErrorContext(); }
287     ruleIndex = i;
288 }
289 
290 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)291 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
292     UnicodeString empty, raw;
293     i = parseString(skipWhiteSpace(i), raw, errorCode);
294     if(U_FAILURE(errorCode)) { return; }
295     if(raw.isEmpty()) {
296         setParseError("missing starred-relation string", errorCode);
297         return;
298     }
299     UChar32 prev = -1;
300     int32_t j = 0;
301     for(;;) {
302         while(j < raw.length()) {
303             UChar32 c = raw.char32At(j);
304             if(!nfd.isInert(c)) {
305                 setParseError("starred-relation string is not all NFD-inert", errorCode);
306                 return;
307             }
308             sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
309             if(U_FAILURE(errorCode)) {
310                 setErrorContext();
311                 return;
312             }
313             j += U16_LENGTH(c);
314             prev = c;
315         }
316         if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
317             break;
318         }
319         if(prev < 0) {
320             setParseError("range without start in starred-relation string", errorCode);
321             return;
322         }
323         i = parseString(i + 1, raw, errorCode);
324         if(U_FAILURE(errorCode)) { return; }
325         if(raw.isEmpty()) {
326             setParseError("range without end in starred-relation string", errorCode);
327             return;
328         }
329         UChar32 c = raw.char32At(0);
330         if(c < prev) {
331             setParseError("range start greater than end in starred-relation string", errorCode);
332             return;
333         }
334         // range prev-c
335         UnicodeString s;
336         while(++prev <= c) {
337             if(!nfd.isInert(prev)) {
338                 setParseError("starred-relation string range is not all NFD-inert", errorCode);
339                 return;
340             }
341             if(U_IS_SURROGATE(prev)) {
342                 setParseError("starred-relation string range contains a surrogate", errorCode);
343                 return;
344             }
345             if(0xfffd <= prev && prev <= 0xffff) {
346                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
347                 return;
348             }
349             s.setTo(prev);
350             sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
351             if(U_FAILURE(errorCode)) {
352                 setErrorContext();
353                 return;
354             }
355         }
356         prev = -1;
357         j = U16_LENGTH(c);
358     }
359     ruleIndex = skipWhiteSpace(i);
360 }
361 
362 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)363 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
364     i = parseString(skipWhiteSpace(i), raw, errorCode);
365     if(U_SUCCESS(errorCode) && raw.isEmpty()) {
366         setParseError("missing relation string", errorCode);
367     }
368     return skipWhiteSpace(i);
369 }
370 
371 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)372 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
373     if(U_FAILURE(errorCode)) { return i; }
374     raw.remove();
375     while(i < rules->length()) {
376         UChar32 c = rules->charAt(i++);
377         if(isSyntaxChar(c)) {
378             if(c == 0x27) {  // apostrophe
379                 if(i < rules->length() && rules->charAt(i) == 0x27) {
380                     // Double apostrophe, encodes a single one.
381                     raw.append(static_cast<char16_t>(0x27));
382                     ++i;
383                     continue;
384                 }
385                 // Quote literal text until the next single apostrophe.
386                 for(;;) {
387                     if(i == rules->length()) {
388                         setParseError("quoted literal text missing terminating apostrophe", errorCode);
389                         return i;
390                     }
391                     c = rules->charAt(i++);
392                     if(c == 0x27) {
393                         if(i < rules->length() && rules->charAt(i) == 0x27) {
394                             // Double apostrophe inside quoted literal text,
395                             // still encodes a single apostrophe.
396                             ++i;
397                         } else {
398                             break;
399                         }
400                     }
401                     raw.append(static_cast<char16_t>(c));
402                 }
403             } else if(c == 0x5c) {  // backslash
404                 if(i == rules->length()) {
405                     setParseError("backslash escape at the end of the rule string", errorCode);
406                     return i;
407                 }
408                 c = rules->char32At(i);
409                 raw.append(c);
410                 i += U16_LENGTH(c);
411             } else {
412                 // Any other syntax character terminates a string.
413                 --i;
414                 break;
415             }
416         } else if(PatternProps::isWhiteSpace(c)) {
417             // Unquoted white space terminates a string.
418             --i;
419             break;
420         } else {
421             raw.append(static_cast<char16_t>(c));
422         }
423     }
424     for(int32_t j = 0; j < raw.length();) {
425         UChar32 c = raw.char32At(j);
426         if(U_IS_SURROGATE(c)) {
427             setParseError("string contains an unpaired surrogate", errorCode);
428             return i;
429         }
430         if(0xfffd <= c && c <= 0xffff) {
431             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
432             return i;
433         }
434         j += U16_LENGTH(c);
435     }
436     return i;
437 }
438 
namespace {

// Names of the special reset positions as written in rules, "[first ...]"
// and "[last ...]". The array index of a name is added to POS_BASE when the
// position is encoded, so the order of the entries is significant.
// NOTE(review): presumably this order mirrors the parser's position enum
// (LAST_VARIABLE, LAST_REGULAR, ...) -- confirm against the header.
constexpr const char *positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
459 
460 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)461 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
462     if(U_FAILURE(errorCode)) { return 0; }
463     UnicodeString raw;
464     int32_t j = readWords(i + 1, raw);
465     if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
466         ++j;
467         for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
468             if(raw == UnicodeString(positions[pos], -1, US_INV)) {
469                 str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + pos));
470                 return j;
471             }
472         }
473         if(raw == UNICODE_STRING_SIMPLE("top")) {
474             str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + LAST_REGULAR));
475             return j;
476         }
477         if(raw == UNICODE_STRING_SIMPLE("variable top")) {
478             str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + LAST_VARIABLE));
479             return j;
480         }
481     }
482     setParseError("not a valid special reset position", errorCode);
483     return i;
484 }
485 
486 void
parseSetting(UErrorCode & errorCode)487 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
488     if(U_FAILURE(errorCode)) { return; }
489     UnicodeString raw;
490     int32_t i = ruleIndex + 1;
491     int32_t j = readWords(i, raw);
492     if(j <= i || raw.isEmpty()) {
493         setParseError("expected a setting/option at '['", errorCode);
494     }
495     if(rules->charAt(j) == 0x5d) {  // words end with ]
496         ++j;
497         if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
498                 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
499             parseReordering(raw, errorCode);
500             ruleIndex = j;
501             return;
502         }
503         if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
504             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
505                               UCOL_ON, 0, errorCode);
506             ruleIndex = j;
507             return;
508         }
509         UnicodeString v;
510         int32_t valueIndex = raw.lastIndexOf(static_cast<char16_t>(0x20));
511         if(valueIndex >= 0) {
512             v.setTo(raw, valueIndex + 1);
513             raw.truncate(valueIndex);
514         }
515         if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
516             int32_t value = UCOL_DEFAULT;
517             char16_t c = v.charAt(0);
518             if(0x31 <= c && c <= 0x34) {  // 1..4
519                 value = UCOL_PRIMARY + (c - 0x31);
520             } else if(c == 0x49) {  // 'I'
521                 value = UCOL_IDENTICAL;
522             }
523             if(value != UCOL_DEFAULT) {
524                 settings->setStrength(value, 0, errorCode);
525                 ruleIndex = j;
526                 return;
527             }
528         } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
529             UColAttributeValue value = UCOL_DEFAULT;
530             if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
531                 value = UCOL_NON_IGNORABLE;
532             } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
533                 value = UCOL_SHIFTED;
534             }
535             if(value != UCOL_DEFAULT) {
536                 settings->setAlternateHandling(value, 0, errorCode);
537                 ruleIndex = j;
538                 return;
539             }
540         } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
541             int32_t value = UCOL_DEFAULT;
542             if(v == UNICODE_STRING_SIMPLE("space")) {
543                 value = CollationSettings::MAX_VAR_SPACE;
544             } else if(v == UNICODE_STRING_SIMPLE("punct")) {
545                 value = CollationSettings::MAX_VAR_PUNCT;
546             } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
547                 value = CollationSettings::MAX_VAR_SYMBOL;
548             } else if(v == UNICODE_STRING_SIMPLE("currency")) {
549                 value = CollationSettings::MAX_VAR_CURRENCY;
550             }
551             if(value != UCOL_DEFAULT) {
552                 settings->setMaxVariable(value, 0, errorCode);
553                 settings->variableTop = baseData->getLastPrimaryForGroup(
554                     UCOL_REORDER_CODE_FIRST + value);
555                 U_ASSERT(settings->variableTop != 0);
556                 ruleIndex = j;
557                 return;
558             }
559         } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
560             UColAttributeValue value = UCOL_DEFAULT;
561             if(v == UNICODE_STRING_SIMPLE("off")) {
562                 value = UCOL_OFF;
563             } else if(v == UNICODE_STRING_SIMPLE("lower")) {
564                 value = UCOL_LOWER_FIRST;
565             } else if(v == UNICODE_STRING_SIMPLE("upper")) {
566                 value = UCOL_UPPER_FIRST;
567             }
568             if(value != UCOL_DEFAULT) {
569                 settings->setCaseFirst(value, 0, errorCode);
570                 ruleIndex = j;
571                 return;
572             }
573         } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
574             UColAttributeValue value = getOnOffValue(v);
575             if(value != UCOL_DEFAULT) {
576                 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
577                 ruleIndex = j;
578                 return;
579             }
580         } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
581             UColAttributeValue value = getOnOffValue(v);
582             if(value != UCOL_DEFAULT) {
583                 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
584                 ruleIndex = j;
585                 return;
586             }
587         } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
588             UColAttributeValue value = getOnOffValue(v);
589             if(value != UCOL_DEFAULT) {
590                 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
591                 ruleIndex = j;
592                 return;
593             }
594         } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
595             UColAttributeValue value = getOnOffValue(v);
596             if(value != UCOL_DEFAULT) {
597                 if(value == UCOL_ON) {
598                     setParseError("[hiraganaQ on] is not supported", errorCode);
599                 }
600                 ruleIndex = j;
601                 return;
602             }
603         } else if(raw == UNICODE_STRING_SIMPLE("import")) {
604             CharString lang;
605             lang.appendInvariantChars(v, errorCode);
606             if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
607             // BCP 47 language tag -> ICU locale ID
608             int32_t parsedLength;
609             CharString localeID = ulocimp_forLanguageTag(lang.data(), -1, &parsedLength, errorCode);
610             if(U_FAILURE(errorCode) || parsedLength != lang.length()) {
611                 errorCode = U_ZERO_ERROR;
612                 setParseError("expected language tag in [import langTag]", errorCode);
613                 return;
614             }
615             // localeID minus all keywords
616             char baseID[ULOC_FULLNAME_CAPACITY];
617             int32_t length = uloc_getBaseName(localeID.data(), baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
618             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
619                 errorCode = U_ZERO_ERROR;
620                 setParseError("expected language tag in [import langTag]", errorCode);
621                 return;
622             }
623             if(length == 0) {
624                 uprv_strcpy(baseID, "root");
625             } else if(*baseID == '_') {
626                 uprv_memmove(baseID + 3, baseID, length + 1);
627                 uprv_memcpy(baseID, "und", 3);
628             }
629             // @collation=type, or length=0 if not specified
630             CharString collationType = ulocimp_getKeywordValue(localeID.data(), "collation", errorCode);
631             if(U_FAILURE(errorCode)) {
632                 errorCode = U_ZERO_ERROR;
633                 setParseError("expected language tag in [import langTag]", errorCode);
634                 return;
635             }
636             if(importer == nullptr) {
637                 setParseError("[import langTag] is not supported", errorCode);
638             } else {
639                 UnicodeString importedRules;
640                 importer->getRules(baseID,
641                                    !collationType.isEmpty() ? collationType.data() : "standard",
642                                    importedRules, errorReason, errorCode);
643                 if(U_FAILURE(errorCode)) {
644                     if(errorReason == nullptr) {
645                         errorReason = "[import langTag] failed";
646                     }
647                     setErrorContext();
648                     return;
649                 }
650                 const UnicodeString *outerRules = rules;
651                 int32_t outerRuleIndex = ruleIndex;
652                 parse(importedRules, errorCode);
653                 if(U_FAILURE(errorCode)) {
654                     if(parseError != nullptr) {
655                         parseError->offset = outerRuleIndex;
656                     }
657                 }
658                 rules = outerRules;
659                 ruleIndex = j;
660             }
661             return;
662         }
663     } else if(rules->charAt(j) == 0x5b) {  // words end with [
664         UnicodeSet set;
665         j = parseUnicodeSet(j, set, errorCode);
666         if(U_FAILURE(errorCode)) { return; }
667         if(raw == UNICODE_STRING_SIMPLE("optimize")) {
668             sink->optimize(set, errorReason, errorCode);
669             if(U_FAILURE(errorCode)) { setErrorContext(); }
670             ruleIndex = j;
671             return;
672         } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
673             sink->suppressContractions(set, errorReason, errorCode);
674             if(U_FAILURE(errorCode)) { setErrorContext(); }
675             ruleIndex = j;
676             return;
677         }
678     }
679     setParseError("not a valid setting/option", errorCode);
680 }
681 
682 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)683 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
684     if(U_FAILURE(errorCode)) { return; }
685     int32_t i = 7;  // after "reorder"
686     if(i == raw.length()) {
687         // empty [reorder] with no codes
688         settings->resetReordering();
689         return;
690     }
691     // Parse the codes in [reorder aa bb cc].
692     UVector32 reorderCodes(errorCode);
693     if(U_FAILURE(errorCode)) { return; }
694     CharString word;
695     while(i < raw.length()) {
696         ++i;  // skip the word-separating space
697         int32_t limit = raw.indexOf(static_cast<char16_t>(0x20), i);
698         if(limit < 0) { limit = raw.length(); }
699         word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
700         if(U_FAILURE(errorCode)) { return; }
701         int32_t code = getReorderCode(word.data());
702         if(code < 0) {
703             setParseError("unknown script or reorder code", errorCode);
704             return;
705         }
706         reorderCodes.addElement(code, errorCode);
707         if(U_FAILURE(errorCode)) { return; }
708         i = limit;
709     }
710     settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
711 }
712 
// Names of the special (non-script) reorder groups.
// The array index of a name is its offset from UCOL_REORDER_CODE_FIRST,
// so the order of the entries is significant.
static constexpr const char *gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
716 
717 int32_t
getReorderCode(const char * word)718 CollationRuleParser::getReorderCode(const char *word) {
719     for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
720         if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
721             return UCOL_REORDER_CODE_FIRST + i;
722         }
723     }
724     int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
725     if(script >= 0) {
726         return script;
727     }
728     if(uprv_stricmp(word, "others") == 0) {
729         return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
730     }
731     return -1;
732 }
733 
734 UColAttributeValue
getOnOffValue(const UnicodeString & s)735 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
736     if(s == UNICODE_STRING_SIMPLE("on")) {
737         return UCOL_ON;
738     } else if(s == UNICODE_STRING_SIMPLE("off")) {
739         return UCOL_OFF;
740     } else {
741         return UCOL_DEFAULT;
742     }
743 }
744 
745 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)746 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
747     // Collect a UnicodeSet pattern between a balanced pair of [brackets].
748     int32_t level = 0;
749     int32_t j = i;
750     for(;;) {
751         if(j == rules->length()) {
752             setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
753             return j;
754         }
755         char16_t c = rules->charAt(j++);
756         if(c == 0x5b) {  // '['
757             ++level;
758         } else if(c == 0x5d) {  // ']'
759             if(--level == 0) { break; }
760         }
761     }
762     set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
763     if(U_FAILURE(errorCode)) {
764         errorCode = U_ZERO_ERROR;
765         setParseError("not a valid UnicodeSet pattern", errorCode);
766         return j;
767     }
768     j = skipWhiteSpace(j);
769     if(j == rules->length() || rules->charAt(j) != 0x5d) {
770         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
771         return j;
772     }
773     return ++j;
774 }
775 
776 int32_t
readWords(int32_t i,UnicodeString & raw) const777 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
778     static const char16_t sp = 0x20;
779     raw.remove();
780     i = skipWhiteSpace(i);
781     for(;;) {
782         if(i >= rules->length()) { return 0; }
783         char16_t c = rules->charAt(i);
784         if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_
785             if(raw.isEmpty()) { return i; }
786             if(raw.endsWith(&sp, 1)) {  // remove trailing space
787                 raw.truncate(raw.length() - 1);
788             }
789             return i;
790         }
791         if(PatternProps::isWhiteSpace(c)) {
792             raw.append(sp);
793             i = skipWhiteSpace(i + 1);
794         } else {
795             raw.append(c);
796             ++i;
797         }
798     }
799 }
800 
801 int32_t
skipComment(int32_t i) const802 CollationRuleParser::skipComment(int32_t i) const {
803     // skip to past the newline
804     while(i < rules->length()) {
805         char16_t c = rules->charAt(i++);
806         // LF or FF or CR or NEL or LS or PS
807         if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
808             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
809             // NLF (new line function) = CR or LF or CR+LF or NEL.
810             // No need to collect all of CR+LF because a following LF will be ignored anyway.
811             break;
812         }
813     }
814     return i;
815 }
816 
817 void
setParseError(const char * reason,UErrorCode & errorCode)818 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
819     if(U_FAILURE(errorCode)) { return; }
820     // Error code consistent with the old parser (from ca. 2001),
821     // rather than U_PARSE_ERROR;
822     errorCode = U_INVALID_FORMAT_ERROR;
823     errorReason = reason;
824     if(parseError != nullptr) { setErrorContext(); }
825 }
826 
827 void
setErrorContext()828 CollationRuleParser::setErrorContext() {
829     if(parseError == nullptr) { return; }
830 
831     // Note: This relies on the calling code maintaining the ruleIndex
832     // at a position that is useful for debugging.
833     // For example, at the beginning of a reset or relation etc.
834     parseError->offset = ruleIndex;
835     parseError->line = 0;  // We are not counting line numbers.
836 
837     // before ruleIndex
838     int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
839     if(start < 0) {
840         start = 0;
841     } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
842         ++start;
843     }
844     int32_t length = ruleIndex - start;
845     rules->extract(start, length, parseError->preContext);
846     parseError->preContext[length] = 0;
847 
848     // starting from ruleIndex
849     length = rules->length() - ruleIndex;
850     if(length >= U_PARSE_CONTEXT_LEN) {
851         length = U_PARSE_CONTEXT_LEN - 1;
852         if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
853             --length;
854         }
855     }
856     rules->extract(ruleIndex, length, parseError->postContext);
857     parseError->postContext[length] = 0;
858 }
859 
860 UBool
isSyntaxChar(UChar32 c)861 CollationRuleParser::isSyntaxChar(UChar32 c) {
862     return 0x21 <= c && c <= 0x7e &&
863             (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
864             (0x5b <= c && c <= 0x60) || (0x7b <= c));
865 }
866 
867 int32_t
skipWhiteSpace(int32_t i) const868 CollationRuleParser::skipWhiteSpace(int32_t i) const {
869     while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
870         ++i;
871     }
872     return i;
873 }
874 
875 U_NAMESPACE_END
876 
877 #endif  // !UCONFIG_NO_COLLATION
878