1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
9 *
10 * (replaced the former ucol_tok.cpp)
11 *
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
14 */
15
16 #include "unicode/utypes.h"
17
18 #if !UCONFIG_NO_COLLATION
19
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
27 #include "charstr.h"
28 #include "cmemory.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationruleparser.h"
32 #include "collationsettings.h"
33 #include "collationtailoring.h"
34 #include "cstring.h"
35 #include "patternprops.h"
36 #include "uassert.h"
37 #include "ulocimp.h"
38 #include "uvectr32.h"
39
40 U_NAMESPACE_BEGIN
41
namespace {

// The UTF-16 text "[before", which introduces a reset-before position
// such as "&[before 1]".
const char16_t BEFORE[] = u"[before";
// Number of code units in BEFORE, not counting the terminating NUL.
const int32_t BEFORE_LENGTH = 7;

}  // namespace
48
~Sink()49 CollationRuleParser::Sink::~Sink() {}
50
51 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)52 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
53
54 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)55 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
56
~Importer()57 CollationRuleParser::Importer::~Importer() {}
58
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)59 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
60 : nfd(*Normalizer2::getNFDInstance(errorCode)),
61 nfc(*Normalizer2::getNFCInstance(errorCode)),
62 rules(nullptr), baseData(base), settings(nullptr),
63 parseError(nullptr), errorReason(nullptr),
64 sink(nullptr), importer(nullptr),
65 ruleIndex(0) {
66 }
67
~CollationRuleParser()68 CollationRuleParser::~CollationRuleParser() {
69 }
70
71 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)72 CollationRuleParser::parse(const UnicodeString &ruleString,
73 CollationSettings &outSettings,
74 UParseError *outParseError,
75 UErrorCode &errorCode) {
76 if(U_FAILURE(errorCode)) { return; }
77 settings = &outSettings;
78 parseError = outParseError;
79 if(parseError != nullptr) {
80 parseError->line = 0;
81 parseError->offset = -1;
82 parseError->preContext[0] = 0;
83 parseError->postContext[0] = 0;
84 }
85 errorReason = nullptr;
86 parse(ruleString, errorCode);
87 }
88
89 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)90 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
91 if(U_FAILURE(errorCode)) { return; }
92 rules = &ruleString;
93 ruleIndex = 0;
94
95 while(ruleIndex < rules->length()) {
96 char16_t c = rules->charAt(ruleIndex);
97 if(PatternProps::isWhiteSpace(c)) {
98 ++ruleIndex;
99 continue;
100 }
101 switch(c) {
102 case 0x26: // '&'
103 parseRuleChain(errorCode);
104 break;
105 case 0x5b: // '['
106 parseSetting(errorCode);
107 break;
108 case 0x23: // '#' starts a comment, until the end of the line
109 ruleIndex = skipComment(ruleIndex + 1);
110 break;
111 case 0x40: // '@' is equivalent to [backwards 2]
112 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
113 UCOL_ON, 0, errorCode);
114 ++ruleIndex;
115 break;
116 case 0x21: // '!' used to turn on Thai/Lao character reversal
117 // Accept but ignore. The root collator has contractions
118 // that are equivalent to the character reversal, where appropriate.
119 ++ruleIndex;
120 break;
121 default:
122 setParseError("expected a reset or setting or comment", errorCode);
123 break;
124 }
125 if(U_FAILURE(errorCode)) { return; }
126 }
127 }
128
129 void
parseRuleChain(UErrorCode & errorCode)130 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
131 int32_t resetStrength = parseResetAndPosition(errorCode);
132 UBool isFirstRelation = true;
133 for(;;) {
134 int32_t result = parseRelationOperator(errorCode);
135 if(U_FAILURE(errorCode)) { return; }
136 if(result < 0) {
137 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
138 // '#' starts a comment, until the end of the line
139 ruleIndex = skipComment(ruleIndex + 1);
140 continue;
141 }
142 if(isFirstRelation) {
143 setParseError("reset not followed by a relation", errorCode);
144 }
145 return;
146 }
147 int32_t strength = result & STRENGTH_MASK;
148 if(resetStrength < UCOL_IDENTICAL) {
149 // reset-before rule chain
150 if(isFirstRelation) {
151 if(strength != resetStrength) {
152 setParseError("reset-before strength differs from its first relation", errorCode);
153 return;
154 }
155 } else {
156 if(strength < resetStrength) {
157 setParseError("reset-before strength followed by a stronger relation", errorCode);
158 return;
159 }
160 }
161 }
162 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
163 if((result & STARRED_FLAG) == 0) {
164 parseRelationStrings(strength, i, errorCode);
165 } else {
166 parseStarredCharacters(strength, i, errorCode);
167 }
168 if(U_FAILURE(errorCode)) { return; }
169 isFirstRelation = false;
170 }
171 }
172
173 int32_t
parseResetAndPosition(UErrorCode & errorCode)174 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
175 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
176 int32_t i = skipWhiteSpace(ruleIndex + 1);
177 int32_t j;
178 char16_t c;
179 int32_t resetStrength;
180 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
181 (j = i + BEFORE_LENGTH) < rules->length() &&
182 PatternProps::isWhiteSpace(rules->charAt(j)) &&
183 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
184 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
185 rules->charAt(j + 1) == 0x5d) {
186 // &[before n] with n=1 or 2 or 3
187 resetStrength = UCOL_PRIMARY + (c - 0x31);
188 i = skipWhiteSpace(j + 2);
189 } else {
190 resetStrength = UCOL_IDENTICAL;
191 }
192 if(i >= rules->length()) {
193 setParseError("reset without position", errorCode);
194 return UCOL_DEFAULT;
195 }
196 UnicodeString str;
197 if(rules->charAt(i) == 0x5b) { // '['
198 i = parseSpecialPosition(i, str, errorCode);
199 } else {
200 i = parseTailoringString(i, str, errorCode);
201 }
202 sink->addReset(resetStrength, str, errorReason, errorCode);
203 if(U_FAILURE(errorCode)) { setErrorContext(); }
204 ruleIndex = i;
205 return resetStrength;
206 }
207
208 int32_t
parseRelationOperator(UErrorCode & errorCode)209 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
210 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
211 ruleIndex = skipWhiteSpace(ruleIndex);
212 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
213 int32_t strength;
214 int32_t i = ruleIndex;
215 char16_t c = rules->charAt(i++);
216 switch(c) {
217 case 0x3c: // '<'
218 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
219 ++i;
220 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
221 ++i;
222 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
223 ++i;
224 strength = UCOL_QUATERNARY;
225 } else {
226 strength = UCOL_TERTIARY;
227 }
228 } else {
229 strength = UCOL_SECONDARY;
230 }
231 } else {
232 strength = UCOL_PRIMARY;
233 }
234 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
235 ++i;
236 strength |= STARRED_FLAG;
237 }
238 break;
239 case 0x3b: // ';' same as <<
240 strength = UCOL_SECONDARY;
241 break;
242 case 0x2c: // ',' same as <<<
243 strength = UCOL_TERTIARY;
244 break;
245 case 0x3d: // '='
246 strength = UCOL_IDENTICAL;
247 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
248 ++i;
249 strength |= STARRED_FLAG;
250 }
251 break;
252 default:
253 return UCOL_DEFAULT;
254 }
255 return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
256 }
257
258 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)259 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
260 // Parse
261 // prefix | str / extension
262 // where prefix and extension are optional.
263 UnicodeString prefix, str, extension;
264 i = parseTailoringString(i, str, errorCode);
265 if(U_FAILURE(errorCode)) { return; }
266 char16_t next = (i < rules->length()) ? rules->charAt(i) : 0;
267 if(next == 0x7c) { // '|' separates the context prefix from the string.
268 prefix = str;
269 i = parseTailoringString(i + 1, str, errorCode);
270 if(U_FAILURE(errorCode)) { return; }
271 next = (i < rules->length()) ? rules->charAt(i) : 0;
272 }
273 if(next == 0x2f) { // '/' separates the string from the extension.
274 i = parseTailoringString(i + 1, extension, errorCode);
275 }
276 if(!prefix.isEmpty()) {
277 UChar32 prefix0 = prefix.char32At(0);
278 UChar32 c = str.char32At(0);
279 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
280 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
281 errorCode);
282 return;
283 }
284 }
285 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
286 if(U_FAILURE(errorCode)) { setErrorContext(); }
287 ruleIndex = i;
288 }
289
290 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)291 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
292 UnicodeString empty, raw;
293 i = parseString(skipWhiteSpace(i), raw, errorCode);
294 if(U_FAILURE(errorCode)) { return; }
295 if(raw.isEmpty()) {
296 setParseError("missing starred-relation string", errorCode);
297 return;
298 }
299 UChar32 prev = -1;
300 int32_t j = 0;
301 for(;;) {
302 while(j < raw.length()) {
303 UChar32 c = raw.char32At(j);
304 if(!nfd.isInert(c)) {
305 setParseError("starred-relation string is not all NFD-inert", errorCode);
306 return;
307 }
308 sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
309 if(U_FAILURE(errorCode)) {
310 setErrorContext();
311 return;
312 }
313 j += U16_LENGTH(c);
314 prev = c;
315 }
316 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
317 break;
318 }
319 if(prev < 0) {
320 setParseError("range without start in starred-relation string", errorCode);
321 return;
322 }
323 i = parseString(i + 1, raw, errorCode);
324 if(U_FAILURE(errorCode)) { return; }
325 if(raw.isEmpty()) {
326 setParseError("range without end in starred-relation string", errorCode);
327 return;
328 }
329 UChar32 c = raw.char32At(0);
330 if(c < prev) {
331 setParseError("range start greater than end in starred-relation string", errorCode);
332 return;
333 }
334 // range prev-c
335 UnicodeString s;
336 while(++prev <= c) {
337 if(!nfd.isInert(prev)) {
338 setParseError("starred-relation string range is not all NFD-inert", errorCode);
339 return;
340 }
341 if(U_IS_SURROGATE(prev)) {
342 setParseError("starred-relation string range contains a surrogate", errorCode);
343 return;
344 }
345 if(0xfffd <= prev && prev <= 0xffff) {
346 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
347 return;
348 }
349 s.setTo(prev);
350 sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
351 if(U_FAILURE(errorCode)) {
352 setErrorContext();
353 return;
354 }
355 }
356 prev = -1;
357 j = U16_LENGTH(c);
358 }
359 ruleIndex = skipWhiteSpace(i);
360 }
361
362 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)363 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
364 i = parseString(skipWhiteSpace(i), raw, errorCode);
365 if(U_SUCCESS(errorCode) && raw.isEmpty()) {
366 setParseError("missing relation string", errorCode);
367 }
368 return skipWhiteSpace(i);
369 }
370
371 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)372 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
373 if(U_FAILURE(errorCode)) { return i; }
374 raw.remove();
375 while(i < rules->length()) {
376 UChar32 c = rules->charAt(i++);
377 if(isSyntaxChar(c)) {
378 if(c == 0x27) { // apostrophe
379 if(i < rules->length() && rules->charAt(i) == 0x27) {
380 // Double apostrophe, encodes a single one.
381 raw.append(static_cast<char16_t>(0x27));
382 ++i;
383 continue;
384 }
385 // Quote literal text until the next single apostrophe.
386 for(;;) {
387 if(i == rules->length()) {
388 setParseError("quoted literal text missing terminating apostrophe", errorCode);
389 return i;
390 }
391 c = rules->charAt(i++);
392 if(c == 0x27) {
393 if(i < rules->length() && rules->charAt(i) == 0x27) {
394 // Double apostrophe inside quoted literal text,
395 // still encodes a single apostrophe.
396 ++i;
397 } else {
398 break;
399 }
400 }
401 raw.append(static_cast<char16_t>(c));
402 }
403 } else if(c == 0x5c) { // backslash
404 if(i == rules->length()) {
405 setParseError("backslash escape at the end of the rule string", errorCode);
406 return i;
407 }
408 c = rules->char32At(i);
409 raw.append(c);
410 i += U16_LENGTH(c);
411 } else {
412 // Any other syntax character terminates a string.
413 --i;
414 break;
415 }
416 } else if(PatternProps::isWhiteSpace(c)) {
417 // Unquoted white space terminates a string.
418 --i;
419 break;
420 } else {
421 raw.append(static_cast<char16_t>(c));
422 }
423 }
424 for(int32_t j = 0; j < raw.length();) {
425 UChar32 c = raw.char32At(j);
426 if(U_IS_SURROGATE(c)) {
427 setParseError("string contains an unpaired surrogate", errorCode);
428 return i;
429 }
430 if(0xfffd <= c && c <= 0xffff) {
431 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
432 return i;
433 }
434 j += U16_LENGTH(c);
435 }
436 return i;
437 }
438
namespace {

// Names of the special reset positions accepted in "&[...]".
// The array index of a name is the offset that parseSpecialPosition()
// adds to POS_BASE, so the order of the entries must not change.
const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
459
460 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)461 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
462 if(U_FAILURE(errorCode)) { return 0; }
463 UnicodeString raw;
464 int32_t j = readWords(i + 1, raw);
465 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
466 ++j;
467 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
468 if(raw == UnicodeString(positions[pos], -1, US_INV)) {
469 str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + pos));
470 return j;
471 }
472 }
473 if(raw == UNICODE_STRING_SIMPLE("top")) {
474 str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + LAST_REGULAR));
475 return j;
476 }
477 if(raw == UNICODE_STRING_SIMPLE("variable top")) {
478 str.setTo(POS_LEAD).append(static_cast<char16_t>(POS_BASE + LAST_VARIABLE));
479 return j;
480 }
481 }
482 setParseError("not a valid special reset position", errorCode);
483 return i;
484 }
485
486 void
parseSetting(UErrorCode & errorCode)487 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
488 if(U_FAILURE(errorCode)) { return; }
489 UnicodeString raw;
490 int32_t i = ruleIndex + 1;
491 int32_t j = readWords(i, raw);
492 if(j <= i || raw.isEmpty()) {
493 setParseError("expected a setting/option at '['", errorCode);
494 }
495 if(rules->charAt(j) == 0x5d) { // words end with ]
496 ++j;
497 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
498 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
499 parseReordering(raw, errorCode);
500 ruleIndex = j;
501 return;
502 }
503 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
504 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
505 UCOL_ON, 0, errorCode);
506 ruleIndex = j;
507 return;
508 }
509 UnicodeString v;
510 int32_t valueIndex = raw.lastIndexOf(static_cast<char16_t>(0x20));
511 if(valueIndex >= 0) {
512 v.setTo(raw, valueIndex + 1);
513 raw.truncate(valueIndex);
514 }
515 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
516 int32_t value = UCOL_DEFAULT;
517 char16_t c = v.charAt(0);
518 if(0x31 <= c && c <= 0x34) { // 1..4
519 value = UCOL_PRIMARY + (c - 0x31);
520 } else if(c == 0x49) { // 'I'
521 value = UCOL_IDENTICAL;
522 }
523 if(value != UCOL_DEFAULT) {
524 settings->setStrength(value, 0, errorCode);
525 ruleIndex = j;
526 return;
527 }
528 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
529 UColAttributeValue value = UCOL_DEFAULT;
530 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
531 value = UCOL_NON_IGNORABLE;
532 } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
533 value = UCOL_SHIFTED;
534 }
535 if(value != UCOL_DEFAULT) {
536 settings->setAlternateHandling(value, 0, errorCode);
537 ruleIndex = j;
538 return;
539 }
540 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
541 int32_t value = UCOL_DEFAULT;
542 if(v == UNICODE_STRING_SIMPLE("space")) {
543 value = CollationSettings::MAX_VAR_SPACE;
544 } else if(v == UNICODE_STRING_SIMPLE("punct")) {
545 value = CollationSettings::MAX_VAR_PUNCT;
546 } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
547 value = CollationSettings::MAX_VAR_SYMBOL;
548 } else if(v == UNICODE_STRING_SIMPLE("currency")) {
549 value = CollationSettings::MAX_VAR_CURRENCY;
550 }
551 if(value != UCOL_DEFAULT) {
552 settings->setMaxVariable(value, 0, errorCode);
553 settings->variableTop = baseData->getLastPrimaryForGroup(
554 UCOL_REORDER_CODE_FIRST + value);
555 U_ASSERT(settings->variableTop != 0);
556 ruleIndex = j;
557 return;
558 }
559 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
560 UColAttributeValue value = UCOL_DEFAULT;
561 if(v == UNICODE_STRING_SIMPLE("off")) {
562 value = UCOL_OFF;
563 } else if(v == UNICODE_STRING_SIMPLE("lower")) {
564 value = UCOL_LOWER_FIRST;
565 } else if(v == UNICODE_STRING_SIMPLE("upper")) {
566 value = UCOL_UPPER_FIRST;
567 }
568 if(value != UCOL_DEFAULT) {
569 settings->setCaseFirst(value, 0, errorCode);
570 ruleIndex = j;
571 return;
572 }
573 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
574 UColAttributeValue value = getOnOffValue(v);
575 if(value != UCOL_DEFAULT) {
576 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
577 ruleIndex = j;
578 return;
579 }
580 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
581 UColAttributeValue value = getOnOffValue(v);
582 if(value != UCOL_DEFAULT) {
583 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
584 ruleIndex = j;
585 return;
586 }
587 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
588 UColAttributeValue value = getOnOffValue(v);
589 if(value != UCOL_DEFAULT) {
590 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
591 ruleIndex = j;
592 return;
593 }
594 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
595 UColAttributeValue value = getOnOffValue(v);
596 if(value != UCOL_DEFAULT) {
597 if(value == UCOL_ON) {
598 setParseError("[hiraganaQ on] is not supported", errorCode);
599 }
600 ruleIndex = j;
601 return;
602 }
603 } else if(raw == UNICODE_STRING_SIMPLE("import")) {
604 CharString lang;
605 lang.appendInvariantChars(v, errorCode);
606 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
607 // BCP 47 language tag -> ICU locale ID
608 int32_t parsedLength;
609 CharString localeID = ulocimp_forLanguageTag(lang.data(), -1, &parsedLength, errorCode);
610 if(U_FAILURE(errorCode) || parsedLength != lang.length()) {
611 errorCode = U_ZERO_ERROR;
612 setParseError("expected language tag in [import langTag]", errorCode);
613 return;
614 }
615 // localeID minus all keywords
616 char baseID[ULOC_FULLNAME_CAPACITY];
617 int32_t length = uloc_getBaseName(localeID.data(), baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
618 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
619 errorCode = U_ZERO_ERROR;
620 setParseError("expected language tag in [import langTag]", errorCode);
621 return;
622 }
623 if(length == 0) {
624 uprv_strcpy(baseID, "root");
625 } else if(*baseID == '_') {
626 uprv_memmove(baseID + 3, baseID, length + 1);
627 uprv_memcpy(baseID, "und", 3);
628 }
629 // @collation=type, or length=0 if not specified
630 CharString collationType = ulocimp_getKeywordValue(localeID.data(), "collation", errorCode);
631 if(U_FAILURE(errorCode)) {
632 errorCode = U_ZERO_ERROR;
633 setParseError("expected language tag in [import langTag]", errorCode);
634 return;
635 }
636 if(importer == nullptr) {
637 setParseError("[import langTag] is not supported", errorCode);
638 } else {
639 UnicodeString importedRules;
640 importer->getRules(baseID,
641 !collationType.isEmpty() ? collationType.data() : "standard",
642 importedRules, errorReason, errorCode);
643 if(U_FAILURE(errorCode)) {
644 if(errorReason == nullptr) {
645 errorReason = "[import langTag] failed";
646 }
647 setErrorContext();
648 return;
649 }
650 const UnicodeString *outerRules = rules;
651 int32_t outerRuleIndex = ruleIndex;
652 parse(importedRules, errorCode);
653 if(U_FAILURE(errorCode)) {
654 if(parseError != nullptr) {
655 parseError->offset = outerRuleIndex;
656 }
657 }
658 rules = outerRules;
659 ruleIndex = j;
660 }
661 return;
662 }
663 } else if(rules->charAt(j) == 0x5b) { // words end with [
664 UnicodeSet set;
665 j = parseUnicodeSet(j, set, errorCode);
666 if(U_FAILURE(errorCode)) { return; }
667 if(raw == UNICODE_STRING_SIMPLE("optimize")) {
668 sink->optimize(set, errorReason, errorCode);
669 if(U_FAILURE(errorCode)) { setErrorContext(); }
670 ruleIndex = j;
671 return;
672 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
673 sink->suppressContractions(set, errorReason, errorCode);
674 if(U_FAILURE(errorCode)) { setErrorContext(); }
675 ruleIndex = j;
676 return;
677 }
678 }
679 setParseError("not a valid setting/option", errorCode);
680 }
681
682 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)683 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
684 if(U_FAILURE(errorCode)) { return; }
685 int32_t i = 7; // after "reorder"
686 if(i == raw.length()) {
687 // empty [reorder] with no codes
688 settings->resetReordering();
689 return;
690 }
691 // Parse the codes in [reorder aa bb cc].
692 UVector32 reorderCodes(errorCode);
693 if(U_FAILURE(errorCode)) { return; }
694 CharString word;
695 while(i < raw.length()) {
696 ++i; // skip the word-separating space
697 int32_t limit = raw.indexOf(static_cast<char16_t>(0x20), i);
698 if(limit < 0) { limit = raw.length(); }
699 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
700 if(U_FAILURE(errorCode)) { return; }
701 int32_t code = getReorderCode(word.data());
702 if(code < 0) {
703 setParseError("unknown script or reorder code", errorCode);
704 return;
705 }
706 reorderCodes.addElement(code, errorCode);
707 if(U_FAILURE(errorCode)) { return; }
708 i = limit;
709 }
710 settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
711 }
712
// Names of the special (non-script) reorder groups.
// getReorderCode() maps the name at index n to UCOL_REORDER_CODE_FIRST + n,
// so the order of the entries must not change.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
716
717 int32_t
getReorderCode(const char * word)718 CollationRuleParser::getReorderCode(const char *word) {
719 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
720 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
721 return UCOL_REORDER_CODE_FIRST + i;
722 }
723 }
724 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
725 if(script >= 0) {
726 return script;
727 }
728 if(uprv_stricmp(word, "others") == 0) {
729 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
730 }
731 return -1;
732 }
733
734 UColAttributeValue
getOnOffValue(const UnicodeString & s)735 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
736 if(s == UNICODE_STRING_SIMPLE("on")) {
737 return UCOL_ON;
738 } else if(s == UNICODE_STRING_SIMPLE("off")) {
739 return UCOL_OFF;
740 } else {
741 return UCOL_DEFAULT;
742 }
743 }
744
745 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)746 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
747 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
748 int32_t level = 0;
749 int32_t j = i;
750 for(;;) {
751 if(j == rules->length()) {
752 setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
753 return j;
754 }
755 char16_t c = rules->charAt(j++);
756 if(c == 0x5b) { // '['
757 ++level;
758 } else if(c == 0x5d) { // ']'
759 if(--level == 0) { break; }
760 }
761 }
762 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
763 if(U_FAILURE(errorCode)) {
764 errorCode = U_ZERO_ERROR;
765 setParseError("not a valid UnicodeSet pattern", errorCode);
766 return j;
767 }
768 j = skipWhiteSpace(j);
769 if(j == rules->length() || rules->charAt(j) != 0x5d) {
770 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
771 return j;
772 }
773 return ++j;
774 }
775
776 int32_t
readWords(int32_t i,UnicodeString & raw) const777 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
778 static const char16_t sp = 0x20;
779 raw.remove();
780 i = skipWhiteSpace(i);
781 for(;;) {
782 if(i >= rules->length()) { return 0; }
783 char16_t c = rules->charAt(i);
784 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
785 if(raw.isEmpty()) { return i; }
786 if(raw.endsWith(&sp, 1)) { // remove trailing space
787 raw.truncate(raw.length() - 1);
788 }
789 return i;
790 }
791 if(PatternProps::isWhiteSpace(c)) {
792 raw.append(sp);
793 i = skipWhiteSpace(i + 1);
794 } else {
795 raw.append(c);
796 ++i;
797 }
798 }
799 }
800
801 int32_t
skipComment(int32_t i) const802 CollationRuleParser::skipComment(int32_t i) const {
803 // skip to past the newline
804 while(i < rules->length()) {
805 char16_t c = rules->charAt(i++);
806 // LF or FF or CR or NEL or LS or PS
807 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
808 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
809 // NLF (new line function) = CR or LF or CR+LF or NEL.
810 // No need to collect all of CR+LF because a following LF will be ignored anyway.
811 break;
812 }
813 }
814 return i;
815 }
816
817 void
setParseError(const char * reason,UErrorCode & errorCode)818 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
819 if(U_FAILURE(errorCode)) { return; }
820 // Error code consistent with the old parser (from ca. 2001),
821 // rather than U_PARSE_ERROR;
822 errorCode = U_INVALID_FORMAT_ERROR;
823 errorReason = reason;
824 if(parseError != nullptr) { setErrorContext(); }
825 }
826
827 void
setErrorContext()828 CollationRuleParser::setErrorContext() {
829 if(parseError == nullptr) { return; }
830
831 // Note: This relies on the calling code maintaining the ruleIndex
832 // at a position that is useful for debugging.
833 // For example, at the beginning of a reset or relation etc.
834 parseError->offset = ruleIndex;
835 parseError->line = 0; // We are not counting line numbers.
836
837 // before ruleIndex
838 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
839 if(start < 0) {
840 start = 0;
841 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
842 ++start;
843 }
844 int32_t length = ruleIndex - start;
845 rules->extract(start, length, parseError->preContext);
846 parseError->preContext[length] = 0;
847
848 // starting from ruleIndex
849 length = rules->length() - ruleIndex;
850 if(length >= U_PARSE_CONTEXT_LEN) {
851 length = U_PARSE_CONTEXT_LEN - 1;
852 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
853 --length;
854 }
855 }
856 rules->extract(ruleIndex, length, parseError->postContext);
857 parseError->postContext[length] = 0;
858 }
859
860 UBool
isSyntaxChar(UChar32 c)861 CollationRuleParser::isSyntaxChar(UChar32 c) {
862 return 0x21 <= c && c <= 0x7e &&
863 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
864 (0x5b <= c && c <= 0x60) || (0x7b <= c));
865 }
866
867 int32_t
skipWhiteSpace(int32_t i) const868 CollationRuleParser::skipWhiteSpace(int32_t i) const {
869 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
870 ++i;
871 }
872 return i;
873 }
874
875 U_NAMESPACE_END
876
877 #endif // !UCONFIG_NO_COLLATION
878