1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * collationruleparser.cpp
9 *
10 * (replaced the former ucol_tok.cpp)
11 *
12 * created on: 2013apr10
13 * created by: Markus W. Scherer
14 */
15
16 #include "unicode/utypes.h"
17
18 #if !UCONFIG_NO_COLLATION
19
20 #include "unicode/normalizer2.h"
21 #include "unicode/parseerr.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucol.h"
24 #include "unicode/uloc.h"
25 #include "unicode/unistr.h"
26 #include "unicode/utf16.h"
27 #include "charstr.h"
28 #include "cmemory.h"
29 #include "collation.h"
30 #include "collationdata.h"
31 #include "collationruleparser.h"
32 #include "collationsettings.h"
33 #include "collationtailoring.h"
34 #include "cstring.h"
35 #include "patternprops.h"
36 #include "uassert.h"
37 #include "uvectr32.h"
38
39 U_NAMESPACE_BEGIN
40
41 namespace {
42
43 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
44 const int32_t BEFORE_LENGTH = 7;
45
46 } // namespace
47
~Sink()48 CollationRuleParser::Sink::~Sink() {}
49
50 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)51 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52
53 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)54 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55
~Importer()56 CollationRuleParser::Importer::~Importer() {}
57
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)58 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59 : nfd(*Normalizer2::getNFDInstance(errorCode)),
60 nfc(*Normalizer2::getNFCInstance(errorCode)),
61 rules(NULL), baseData(base), settings(NULL),
62 parseError(NULL), errorReason(NULL),
63 sink(NULL), importer(NULL),
64 ruleIndex(0) {
65 }
66
~CollationRuleParser()67 CollationRuleParser::~CollationRuleParser() {
68 }
69
70 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)71 CollationRuleParser::parse(const UnicodeString &ruleString,
72 CollationSettings &outSettings,
73 UParseError *outParseError,
74 UErrorCode &errorCode) {
75 if(U_FAILURE(errorCode)) { return; }
76 settings = &outSettings;
77 parseError = outParseError;
78 if(parseError != NULL) {
79 parseError->line = 0;
80 parseError->offset = -1;
81 parseError->preContext[0] = 0;
82 parseError->postContext[0] = 0;
83 }
84 errorReason = NULL;
85 parse(ruleString, errorCode);
86 }
87
88 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)89 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90 if(U_FAILURE(errorCode)) { return; }
91 rules = &ruleString;
92 ruleIndex = 0;
93
94 while(ruleIndex < rules->length()) {
95 UChar c = rules->charAt(ruleIndex);
96 if(PatternProps::isWhiteSpace(c)) {
97 ++ruleIndex;
98 continue;
99 }
100 switch(c) {
101 case 0x26: // '&'
102 parseRuleChain(errorCode);
103 break;
104 case 0x5b: // '['
105 parseSetting(errorCode);
106 break;
107 case 0x23: // '#' starts a comment, until the end of the line
108 ruleIndex = skipComment(ruleIndex + 1);
109 break;
110 case 0x40: // '@' is equivalent to [backwards 2]
111 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112 UCOL_ON, 0, errorCode);
113 ++ruleIndex;
114 break;
115 case 0x21: // '!' used to turn on Thai/Lao character reversal
116 // Accept but ignore. The root collator has contractions
117 // that are equivalent to the character reversal, where appropriate.
118 ++ruleIndex;
119 break;
120 default:
121 setParseError("expected a reset or setting or comment", errorCode);
122 break;
123 }
124 if(U_FAILURE(errorCode)) { return; }
125 }
126 }
127
128 void
parseRuleChain(UErrorCode & errorCode)129 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130 int32_t resetStrength = parseResetAndPosition(errorCode);
131 UBool isFirstRelation = TRUE;
132 for(;;) {
133 int32_t result = parseRelationOperator(errorCode);
134 if(U_FAILURE(errorCode)) { return; }
135 if(result < 0) {
136 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137 // '#' starts a comment, until the end of the line
138 ruleIndex = skipComment(ruleIndex + 1);
139 continue;
140 }
141 if(isFirstRelation) {
142 setParseError("reset not followed by a relation", errorCode);
143 }
144 return;
145 }
146 int32_t strength = result & STRENGTH_MASK;
147 if(resetStrength < UCOL_IDENTICAL) {
148 // reset-before rule chain
149 if(isFirstRelation) {
150 if(strength != resetStrength) {
151 setParseError("reset-before strength differs from its first relation", errorCode);
152 return;
153 }
154 } else {
155 if(strength < resetStrength) {
156 setParseError("reset-before strength followed by a stronger relation", errorCode);
157 return;
158 }
159 }
160 }
161 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
162 if((result & STARRED_FLAG) == 0) {
163 parseRelationStrings(strength, i, errorCode);
164 } else {
165 parseStarredCharacters(strength, i, errorCode);
166 }
167 if(U_FAILURE(errorCode)) { return; }
168 isFirstRelation = FALSE;
169 }
170 }
171
172 int32_t
parseResetAndPosition(UErrorCode & errorCode)173 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175 int32_t i = skipWhiteSpace(ruleIndex + 1);
176 int32_t j;
177 UChar c;
178 int32_t resetStrength;
179 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180 (j = i + BEFORE_LENGTH) < rules->length() &&
181 PatternProps::isWhiteSpace(rules->charAt(j)) &&
182 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184 rules->charAt(j + 1) == 0x5d) {
185 // &[before n] with n=1 or 2 or 3
186 resetStrength = UCOL_PRIMARY + (c - 0x31);
187 i = skipWhiteSpace(j + 2);
188 } else {
189 resetStrength = UCOL_IDENTICAL;
190 }
191 if(i >= rules->length()) {
192 setParseError("reset without position", errorCode);
193 return UCOL_DEFAULT;
194 }
195 UnicodeString str;
196 if(rules->charAt(i) == 0x5b) { // '['
197 i = parseSpecialPosition(i, str, errorCode);
198 } else {
199 i = parseTailoringString(i, str, errorCode);
200 }
201 sink->addReset(resetStrength, str, errorReason, errorCode);
202 if(U_FAILURE(errorCode)) { setErrorContext(); }
203 ruleIndex = i;
204 return resetStrength;
205 }
206
207 int32_t
parseRelationOperator(UErrorCode & errorCode)208 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210 ruleIndex = skipWhiteSpace(ruleIndex);
211 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212 int32_t strength;
213 int32_t i = ruleIndex;
214 UChar c = rules->charAt(i++);
215 switch(c) {
216 case 0x3c: // '<'
217 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
218 ++i;
219 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
220 ++i;
221 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
222 ++i;
223 strength = UCOL_QUATERNARY;
224 } else {
225 strength = UCOL_TERTIARY;
226 }
227 } else {
228 strength = UCOL_SECONDARY;
229 }
230 } else {
231 strength = UCOL_PRIMARY;
232 }
233 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
234 ++i;
235 strength |= STARRED_FLAG;
236 }
237 break;
238 case 0x3b: // ';' same as <<
239 strength = UCOL_SECONDARY;
240 break;
241 case 0x2c: // ',' same as <<<
242 strength = UCOL_TERTIARY;
243 break;
244 case 0x3d: // '='
245 strength = UCOL_IDENTICAL;
246 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
247 ++i;
248 strength |= STARRED_FLAG;
249 }
250 break;
251 default:
252 return UCOL_DEFAULT;
253 }
254 return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
255 }
256
257 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)258 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259 // Parse
260 // prefix | str / extension
261 // where prefix and extension are optional.
262 UnicodeString prefix, str, extension;
263 i = parseTailoringString(i, str, errorCode);
264 if(U_FAILURE(errorCode)) { return; }
265 UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266 if(next == 0x7c) { // '|' separates the context prefix from the string.
267 prefix = str;
268 i = parseTailoringString(i + 1, str, errorCode);
269 if(U_FAILURE(errorCode)) { return; }
270 next = (i < rules->length()) ? rules->charAt(i) : 0;
271 }
272 if(next == 0x2f) { // '/' separates the string from the extension.
273 i = parseTailoringString(i + 1, extension, errorCode);
274 }
275 if(!prefix.isEmpty()) {
276 UChar32 prefix0 = prefix.char32At(0);
277 UChar32 c = str.char32At(0);
278 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280 errorCode);
281 return;
282 }
283 }
284 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285 if(U_FAILURE(errorCode)) { setErrorContext(); }
286 ruleIndex = i;
287 }
288
289 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)290 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291 UnicodeString empty, raw;
292 i = parseString(skipWhiteSpace(i), raw, errorCode);
293 if(U_FAILURE(errorCode)) { return; }
294 if(raw.isEmpty()) {
295 setParseError("missing starred-relation string", errorCode);
296 return;
297 }
298 UChar32 prev = -1;
299 int32_t j = 0;
300 for(;;) {
301 while(j < raw.length()) {
302 UChar32 c = raw.char32At(j);
303 if(!nfd.isInert(c)) {
304 setParseError("starred-relation string is not all NFD-inert", errorCode);
305 return;
306 }
307 sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308 if(U_FAILURE(errorCode)) {
309 setErrorContext();
310 return;
311 }
312 j += U16_LENGTH(c);
313 prev = c;
314 }
315 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
316 break;
317 }
318 if(prev < 0) {
319 setParseError("range without start in starred-relation string", errorCode);
320 return;
321 }
322 i = parseString(i + 1, raw, errorCode);
323 if(U_FAILURE(errorCode)) { return; }
324 if(raw.isEmpty()) {
325 setParseError("range without end in starred-relation string", errorCode);
326 return;
327 }
328 UChar32 c = raw.char32At(0);
329 if(c < prev) {
330 setParseError("range start greater than end in starred-relation string", errorCode);
331 return;
332 }
333 // range prev-c
334 UnicodeString s;
335 while(++prev <= c) {
336 if(!nfd.isInert(prev)) {
337 setParseError("starred-relation string range is not all NFD-inert", errorCode);
338 return;
339 }
340 if(U_IS_SURROGATE(prev)) {
341 setParseError("starred-relation string range contains a surrogate", errorCode);
342 return;
343 }
344 if(0xfffd <= prev && prev <= 0xffff) {
345 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346 return;
347 }
348 s.setTo(prev);
349 sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350 if(U_FAILURE(errorCode)) {
351 setErrorContext();
352 return;
353 }
354 }
355 prev = -1;
356 j = U16_LENGTH(c);
357 }
358 ruleIndex = skipWhiteSpace(i);
359 }
360
361 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)362 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363 i = parseString(skipWhiteSpace(i), raw, errorCode);
364 if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365 setParseError("missing relation string", errorCode);
366 }
367 return skipWhiteSpace(i);
368 }
369
370 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)371 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372 if(U_FAILURE(errorCode)) { return i; }
373 raw.remove();
374 while(i < rules->length()) {
375 UChar32 c = rules->charAt(i++);
376 if(isSyntaxChar(c)) {
377 if(c == 0x27) { // apostrophe
378 if(i < rules->length() && rules->charAt(i) == 0x27) {
379 // Double apostrophe, encodes a single one.
380 raw.append((UChar)0x27);
381 ++i;
382 continue;
383 }
384 // Quote literal text until the next single apostrophe.
385 for(;;) {
386 if(i == rules->length()) {
387 setParseError("quoted literal text missing terminating apostrophe", errorCode);
388 return i;
389 }
390 c = rules->charAt(i++);
391 if(c == 0x27) {
392 if(i < rules->length() && rules->charAt(i) == 0x27) {
393 // Double apostrophe inside quoted literal text,
394 // still encodes a single apostrophe.
395 ++i;
396 } else {
397 break;
398 }
399 }
400 raw.append((UChar)c);
401 }
402 } else if(c == 0x5c) { // backslash
403 if(i == rules->length()) {
404 setParseError("backslash escape at the end of the rule string", errorCode);
405 return i;
406 }
407 c = rules->char32At(i);
408 raw.append(c);
409 i += U16_LENGTH(c);
410 } else {
411 // Any other syntax character terminates a string.
412 --i;
413 break;
414 }
415 } else if(PatternProps::isWhiteSpace(c)) {
416 // Unquoted white space terminates a string.
417 --i;
418 break;
419 } else {
420 raw.append((UChar)c);
421 }
422 }
423 for(int32_t j = 0; j < raw.length();) {
424 UChar32 c = raw.char32At(j);
425 if(U_IS_SURROGATE(c)) {
426 setParseError("string contains an unpaired surrogate", errorCode);
427 return i;
428 }
429 if(0xfffd <= c && c <= 0xffff) {
430 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431 return i;
432 }
433 j += U16_LENGTH(c);
434 }
435 return i;
436 }
437
namespace {

// Names of the special reset positions, as written between [brackets].
// parseSpecialPosition() encodes a match as POS_BASE + its index in this
// table, so the order of the entries matters.
const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
458
459 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)460 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461 if(U_FAILURE(errorCode)) { return 0; }
462 UnicodeString raw;
463 int32_t j = readWords(i + 1, raw);
464 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
465 ++j;
466 for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
467 if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469 return j;
470 }
471 }
472 if(raw == UNICODE_STRING_SIMPLE("top")) {
473 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474 return j;
475 }
476 if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478 return j;
479 }
480 }
481 setParseError("not a valid special reset position", errorCode);
482 return i;
483 }
484
485 void
parseSetting(UErrorCode & errorCode)486 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487 if(U_FAILURE(errorCode)) { return; }
488 UnicodeString raw;
489 int32_t i = ruleIndex + 1;
490 int32_t j = readWords(i, raw);
491 if(j <= i || raw.isEmpty()) {
492 setParseError("expected a setting/option at '['", errorCode);
493 }
494 if(rules->charAt(j) == 0x5d) { // words end with ]
495 ++j;
496 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498 parseReordering(raw, errorCode);
499 ruleIndex = j;
500 return;
501 }
502 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504 UCOL_ON, 0, errorCode);
505 ruleIndex = j;
506 return;
507 }
508 UnicodeString v;
509 int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510 if(valueIndex >= 0) {
511 v.setTo(raw, valueIndex + 1);
512 raw.truncate(valueIndex);
513 }
514 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515 int32_t value = UCOL_DEFAULT;
516 UChar c = v.charAt(0);
517 if(0x31 <= c && c <= 0x34) { // 1..4
518 value = UCOL_PRIMARY + (c - 0x31);
519 } else if(c == 0x49) { // 'I'
520 value = UCOL_IDENTICAL;
521 }
522 if(value != UCOL_DEFAULT) {
523 settings->setStrength(value, 0, errorCode);
524 ruleIndex = j;
525 return;
526 }
527 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528 UColAttributeValue value = UCOL_DEFAULT;
529 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530 value = UCOL_NON_IGNORABLE;
531 } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532 value = UCOL_SHIFTED;
533 }
534 if(value != UCOL_DEFAULT) {
535 settings->setAlternateHandling(value, 0, errorCode);
536 ruleIndex = j;
537 return;
538 }
539 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540 int32_t value = UCOL_DEFAULT;
541 if(v == UNICODE_STRING_SIMPLE("space")) {
542 value = CollationSettings::MAX_VAR_SPACE;
543 } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544 value = CollationSettings::MAX_VAR_PUNCT;
545 } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546 value = CollationSettings::MAX_VAR_SYMBOL;
547 } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548 value = CollationSettings::MAX_VAR_CURRENCY;
549 }
550 if(value != UCOL_DEFAULT) {
551 settings->setMaxVariable(value, 0, errorCode);
552 settings->variableTop = baseData->getLastPrimaryForGroup(
553 UCOL_REORDER_CODE_FIRST + value);
554 U_ASSERT(settings->variableTop != 0);
555 ruleIndex = j;
556 return;
557 }
558 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559 UColAttributeValue value = UCOL_DEFAULT;
560 if(v == UNICODE_STRING_SIMPLE("off")) {
561 value = UCOL_OFF;
562 } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563 value = UCOL_LOWER_FIRST;
564 } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565 value = UCOL_UPPER_FIRST;
566 }
567 if(value != UCOL_DEFAULT) {
568 settings->setCaseFirst(value, 0, errorCode);
569 ruleIndex = j;
570 return;
571 }
572 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573 UColAttributeValue value = getOnOffValue(v);
574 if(value != UCOL_DEFAULT) {
575 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576 ruleIndex = j;
577 return;
578 }
579 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580 UColAttributeValue value = getOnOffValue(v);
581 if(value != UCOL_DEFAULT) {
582 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583 ruleIndex = j;
584 return;
585 }
586 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587 UColAttributeValue value = getOnOffValue(v);
588 if(value != UCOL_DEFAULT) {
589 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590 ruleIndex = j;
591 return;
592 }
593 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594 UColAttributeValue value = getOnOffValue(v);
595 if(value != UCOL_DEFAULT) {
596 if(value == UCOL_ON) {
597 setParseError("[hiraganaQ on] is not supported", errorCode);
598 }
599 ruleIndex = j;
600 return;
601 }
602 } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603 CharString lang;
604 lang.appendInvariantChars(v, errorCode);
605 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606 // BCP 47 language tag -> ICU locale ID
607 char localeID[ULOC_FULLNAME_CAPACITY];
608 int32_t parsedLength;
609 int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610 &parsedLength, &errorCode);
611 if(U_FAILURE(errorCode) ||
612 parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613 errorCode = U_ZERO_ERROR;
614 setParseError("expected language tag in [import langTag]", errorCode);
615 return;
616 }
617 // localeID minus all keywords
618 char baseID[ULOC_FULLNAME_CAPACITY];
619 length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621 errorCode = U_ZERO_ERROR;
622 setParseError("expected language tag in [import langTag]", errorCode);
623 return;
624 }
625 if(length == 0) {
626 uprv_strcpy(baseID, "root");
627 } else if(*baseID == '_') {
628 uprv_memmove(baseID + 3, baseID, length + 1);
629 uprv_memcpy(baseID, "und", 3);
630 }
631 // @collation=type, or length=0 if not specified
632 char collationType[ULOC_KEYWORDS_CAPACITY];
633 length = uloc_getKeywordValue(localeID, "collation",
634 collationType, ULOC_KEYWORDS_CAPACITY,
635 &errorCode);
636 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
637 errorCode = U_ZERO_ERROR;
638 setParseError("expected language tag in [import langTag]", errorCode);
639 return;
640 }
641 if(importer == NULL) {
642 setParseError("[import langTag] is not supported", errorCode);
643 } else {
644 UnicodeString importedRules;
645 importer->getRules(baseID, length > 0 ? collationType : "standard",
646 importedRules, errorReason, errorCode);
647 if(U_FAILURE(errorCode)) {
648 if(errorReason == NULL) {
649 errorReason = "[import langTag] failed";
650 }
651 setErrorContext();
652 return;
653 }
654 const UnicodeString *outerRules = rules;
655 int32_t outerRuleIndex = ruleIndex;
656 parse(importedRules, errorCode);
657 if(U_FAILURE(errorCode)) {
658 if(parseError != NULL) {
659 parseError->offset = outerRuleIndex;
660 }
661 }
662 rules = outerRules;
663 ruleIndex = j;
664 }
665 return;
666 }
667 } else if(rules->charAt(j) == 0x5b) { // words end with [
668 UnicodeSet set;
669 j = parseUnicodeSet(j, set, errorCode);
670 if(U_FAILURE(errorCode)) { return; }
671 if(raw == UNICODE_STRING_SIMPLE("optimize")) {
672 sink->optimize(set, errorReason, errorCode);
673 if(U_FAILURE(errorCode)) { setErrorContext(); }
674 ruleIndex = j;
675 return;
676 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
677 sink->suppressContractions(set, errorReason, errorCode);
678 if(U_FAILURE(errorCode)) { setErrorContext(); }
679 ruleIndex = j;
680 return;
681 }
682 }
683 setParseError("not a valid setting/option", errorCode);
684 }
685
686 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)687 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
688 if(U_FAILURE(errorCode)) { return; }
689 int32_t i = 7; // after "reorder"
690 if(i == raw.length()) {
691 // empty [reorder] with no codes
692 settings->resetReordering();
693 return;
694 }
695 // Parse the codes in [reorder aa bb cc].
696 UVector32 reorderCodes(errorCode);
697 if(U_FAILURE(errorCode)) { return; }
698 CharString word;
699 while(i < raw.length()) {
700 ++i; // skip the word-separating space
701 int32_t limit = raw.indexOf((UChar)0x20, i);
702 if(limit < 0) { limit = raw.length(); }
703 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
704 if(U_FAILURE(errorCode)) { return; }
705 int32_t code = getReorderCode(word.data());
706 if(code < 0) {
707 setParseError("unknown script or reorder code", errorCode);
708 return;
709 }
710 reorderCodes.addElement(code, errorCode);
711 if(U_FAILURE(errorCode)) { return; }
712 i = limit;
713 }
714 settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);
715 }
716
// Names of the special (non-script) reorder groups.
// getReorderCode() maps the entry at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order of the entries matters.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
720
721 int32_t
getReorderCode(const char * word)722 CollationRuleParser::getReorderCode(const char *word) {
723 for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {
724 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
725 return UCOL_REORDER_CODE_FIRST + i;
726 }
727 }
728 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
729 if(script >= 0) {
730 return script;
731 }
732 if(uprv_stricmp(word, "others") == 0) {
733 return UCOL_REORDER_CODE_OTHERS; // same as Zzzz = USCRIPT_UNKNOWN
734 }
735 return -1;
736 }
737
738 UColAttributeValue
getOnOffValue(const UnicodeString & s)739 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
740 if(s == UNICODE_STRING_SIMPLE("on")) {
741 return UCOL_ON;
742 } else if(s == UNICODE_STRING_SIMPLE("off")) {
743 return UCOL_OFF;
744 } else {
745 return UCOL_DEFAULT;
746 }
747 }
748
749 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)750 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
751 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
752 int32_t level = 0;
753 int32_t j = i;
754 for(;;) {
755 if(j == rules->length()) {
756 setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
757 return j;
758 }
759 UChar c = rules->charAt(j++);
760 if(c == 0x5b) { // '['
761 ++level;
762 } else if(c == 0x5d) { // ']'
763 if(--level == 0) { break; }
764 }
765 }
766 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
767 if(U_FAILURE(errorCode)) {
768 errorCode = U_ZERO_ERROR;
769 setParseError("not a valid UnicodeSet pattern", errorCode);
770 return j;
771 }
772 j = skipWhiteSpace(j);
773 if(j == rules->length() || rules->charAt(j) != 0x5d) {
774 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
775 return j;
776 }
777 return ++j;
778 }
779
780 int32_t
readWords(int32_t i,UnicodeString & raw) const781 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
782 static const UChar sp = 0x20;
783 raw.remove();
784 i = skipWhiteSpace(i);
785 for(;;) {
786 if(i >= rules->length()) { return 0; }
787 UChar c = rules->charAt(i);
788 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
789 if(raw.isEmpty()) { return i; }
790 if(raw.endsWith(&sp, 1)) { // remove trailing space
791 raw.truncate(raw.length() - 1);
792 }
793 return i;
794 }
795 if(PatternProps::isWhiteSpace(c)) {
796 raw.append(sp);
797 i = skipWhiteSpace(i + 1);
798 } else {
799 raw.append(c);
800 ++i;
801 }
802 }
803 }
804
805 int32_t
skipComment(int32_t i) const806 CollationRuleParser::skipComment(int32_t i) const {
807 // skip to past the newline
808 while(i < rules->length()) {
809 UChar c = rules->charAt(i++);
810 // LF or FF or CR or NEL or LS or PS
811 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
812 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
813 // NLF (new line function) = CR or LF or CR+LF or NEL.
814 // No need to collect all of CR+LF because a following LF will be ignored anyway.
815 break;
816 }
817 }
818 return i;
819 }
820
821 void
setParseError(const char * reason,UErrorCode & errorCode)822 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
823 if(U_FAILURE(errorCode)) { return; }
824 // Error code consistent with the old parser (from ca. 2001),
825 // rather than U_PARSE_ERROR;
826 errorCode = U_INVALID_FORMAT_ERROR;
827 errorReason = reason;
828 if(parseError != NULL) { setErrorContext(); }
829 }
830
831 void
setErrorContext()832 CollationRuleParser::setErrorContext() {
833 if(parseError == NULL) { return; }
834
835 // Note: This relies on the calling code maintaining the ruleIndex
836 // at a position that is useful for debugging.
837 // For example, at the beginning of a reset or relation etc.
838 parseError->offset = ruleIndex;
839 parseError->line = 0; // We are not counting line numbers.
840
841 // before ruleIndex
842 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
843 if(start < 0) {
844 start = 0;
845 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
846 ++start;
847 }
848 int32_t length = ruleIndex - start;
849 rules->extract(start, length, parseError->preContext);
850 parseError->preContext[length] = 0;
851
852 // starting from ruleIndex
853 length = rules->length() - ruleIndex;
854 if(length >= U_PARSE_CONTEXT_LEN) {
855 length = U_PARSE_CONTEXT_LEN - 1;
856 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
857 --length;
858 }
859 }
860 rules->extract(ruleIndex, length, parseError->postContext);
861 parseError->postContext[length] = 0;
862 }
863
864 UBool
isSyntaxChar(UChar32 c)865 CollationRuleParser::isSyntaxChar(UChar32 c) {
866 return 0x21 <= c && c <= 0x7e &&
867 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
868 (0x5b <= c && c <= 0x60) || (0x7b <= c));
869 }
870
871 int32_t
skipWhiteSpace(int32_t i) const872 CollationRuleParser::skipWhiteSpace(int32_t i) const {
873 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
874 ++i;
875 }
876 return i;
877 }
878
879 U_NAMESPACE_END
880
881 #endif // !UCONFIG_NO_COLLATION
882