1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationruleparser.cpp
7 *
8 * (replaced the former ucol_tok.cpp)
9 *
10 * created on: 2013apr10
11 * created by: Markus W. Scherer
12 */
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_COLLATION
17
18 #include "unicode/normalizer2.h"
19 #include "unicode/parseerr.h"
20 #include "unicode/uchar.h"
21 #include "unicode/ucol.h"
22 #include "unicode/uloc.h"
23 #include "unicode/unistr.h"
24 #include "unicode/utf16.h"
25 #include "charstr.h"
26 #include "cmemory.h"
27 #include "collation.h"
28 #include "collationdata.h"
29 #include "collationruleparser.h"
30 #include "collationsettings.h"
31 #include "collationtailoring.h"
32 #include "cstring.h"
33 #include "patternprops.h"
34 #include "uassert.h"
35 #include "uvectr32.h"
36
37 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
38
39 U_NAMESPACE_BEGIN
40
41 namespace {
42
43 static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 }; // "[before"
44 const int32_t BEFORE_LENGTH = 7;
45
46 } // namespace
47
~Sink()48 CollationRuleParser::Sink::~Sink() {}
49
50 void
suppressContractions(const UnicodeSet &,const char * &,UErrorCode &)51 CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
52
53 void
optimize(const UnicodeSet &,const char * &,UErrorCode &)54 CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
55
~Importer()56 CollationRuleParser::Importer::~Importer() {}
57
CollationRuleParser(const CollationData * base,UErrorCode & errorCode)58 CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
59 : nfd(*Normalizer2::getNFDInstance(errorCode)),
60 nfc(*Normalizer2::getNFCInstance(errorCode)),
61 rules(NULL), baseData(base), settings(NULL),
62 parseError(NULL), errorReason(NULL),
63 sink(NULL), importer(NULL),
64 ruleIndex(0) {
65 }
66
~CollationRuleParser()67 CollationRuleParser::~CollationRuleParser() {
68 }
69
70 void
parse(const UnicodeString & ruleString,CollationSettings & outSettings,UParseError * outParseError,UErrorCode & errorCode)71 CollationRuleParser::parse(const UnicodeString &ruleString,
72 CollationSettings &outSettings,
73 UParseError *outParseError,
74 UErrorCode &errorCode) {
75 if(U_FAILURE(errorCode)) { return; }
76 settings = &outSettings;
77 parseError = outParseError;
78 if(parseError != NULL) {
79 parseError->line = 0;
80 parseError->offset = -1;
81 parseError->preContext[0] = 0;
82 parseError->postContext[0] = 0;
83 }
84 errorReason = NULL;
85 parse(ruleString, errorCode);
86 }
87
88 void
parse(const UnicodeString & ruleString,UErrorCode & errorCode)89 CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
90 if(U_FAILURE(errorCode)) { return; }
91 rules = &ruleString;
92 ruleIndex = 0;
93
94 while(ruleIndex < rules->length()) {
95 UChar c = rules->charAt(ruleIndex);
96 if(PatternProps::isWhiteSpace(c)) {
97 ++ruleIndex;
98 continue;
99 }
100 switch(c) {
101 case 0x26: // '&'
102 parseRuleChain(errorCode);
103 break;
104 case 0x5b: // '['
105 parseSetting(errorCode);
106 break;
107 case 0x23: // '#' starts a comment, until the end of the line
108 ruleIndex = skipComment(ruleIndex + 1);
109 break;
110 case 0x40: // '@' is equivalent to [backwards 2]
111 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
112 UCOL_ON, 0, errorCode);
113 ++ruleIndex;
114 break;
115 case 0x21: // '!' used to turn on Thai/Lao character reversal
116 // Accept but ignore. The root collator has contractions
117 // that are equivalent to the character reversal, where appropriate.
118 ++ruleIndex;
119 break;
120 default:
121 setParseError("expected a reset or setting or comment", errorCode);
122 break;
123 }
124 if(U_FAILURE(errorCode)) { return; }
125 }
126 }
127
128 void
parseRuleChain(UErrorCode & errorCode)129 CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
130 int32_t resetStrength = parseResetAndPosition(errorCode);
131 UBool isFirstRelation = TRUE;
132 for(;;) {
133 int32_t result = parseRelationOperator(errorCode);
134 if(U_FAILURE(errorCode)) { return; }
135 if(result < 0) {
136 if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
137 // '#' starts a comment, until the end of the line
138 ruleIndex = skipComment(ruleIndex + 1);
139 continue;
140 }
141 if(isFirstRelation) {
142 setParseError("reset not followed by a relation", errorCode);
143 }
144 return;
145 }
146 int32_t strength = result & STRENGTH_MASK;
147 if(resetStrength < UCOL_IDENTICAL) {
148 // reset-before rule chain
149 if(isFirstRelation) {
150 if(strength != resetStrength) {
151 setParseError("reset-before strength differs from its first relation", errorCode);
152 return;
153 }
154 } else {
155 if(strength < resetStrength) {
156 setParseError("reset-before strength followed by a stronger relation", errorCode);
157 return;
158 }
159 }
160 }
161 int32_t i = ruleIndex + (result >> OFFSET_SHIFT); // skip over the relation operator
162 if((result & STARRED_FLAG) == 0) {
163 parseRelationStrings(strength, i, errorCode);
164 } else {
165 parseStarredCharacters(strength, i, errorCode);
166 }
167 if(U_FAILURE(errorCode)) { return; }
168 isFirstRelation = FALSE;
169 }
170 }
171
172 int32_t
parseResetAndPosition(UErrorCode & errorCode)173 CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
174 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
175 int32_t i = skipWhiteSpace(ruleIndex + 1);
176 int32_t j;
177 UChar c;
178 int32_t resetStrength;
179 if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
180 (j = i + BEFORE_LENGTH) < rules->length() &&
181 PatternProps::isWhiteSpace(rules->charAt(j)) &&
182 ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
183 0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
184 rules->charAt(j + 1) == 0x5d) {
185 // &[before n] with n=1 or 2 or 3
186 resetStrength = UCOL_PRIMARY + (c - 0x31);
187 i = skipWhiteSpace(j + 2);
188 } else {
189 resetStrength = UCOL_IDENTICAL;
190 }
191 if(i >= rules->length()) {
192 setParseError("reset without position", errorCode);
193 return UCOL_DEFAULT;
194 }
195 UnicodeString str;
196 if(rules->charAt(i) == 0x5b) { // '['
197 i = parseSpecialPosition(i, str, errorCode);
198 } else {
199 i = parseTailoringString(i, str, errorCode);
200 }
201 sink->addReset(resetStrength, str, errorReason, errorCode);
202 if(U_FAILURE(errorCode)) { setErrorContext(); }
203 ruleIndex = i;
204 return resetStrength;
205 }
206
207 int32_t
parseRelationOperator(UErrorCode & errorCode)208 CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
209 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
210 ruleIndex = skipWhiteSpace(ruleIndex);
211 if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
212 int32_t strength;
213 int32_t i = ruleIndex;
214 UChar c = rules->charAt(i++);
215 switch(c) {
216 case 0x3c: // '<'
217 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<
218 ++i;
219 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<
220 ++i;
221 if(i < rules->length() && rules->charAt(i) == 0x3c) { // <<<<
222 ++i;
223 strength = UCOL_QUATERNARY;
224 } else {
225 strength = UCOL_TERTIARY;
226 }
227 } else {
228 strength = UCOL_SECONDARY;
229 }
230 } else {
231 strength = UCOL_PRIMARY;
232 }
233 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
234 ++i;
235 strength |= STARRED_FLAG;
236 }
237 break;
238 case 0x3b: // ';' same as <<
239 strength = UCOL_SECONDARY;
240 break;
241 case 0x2c: // ',' same as <<<
242 strength = UCOL_TERTIARY;
243 break;
244 case 0x3d: // '='
245 strength = UCOL_IDENTICAL;
246 if(i < rules->length() && rules->charAt(i) == 0x2a) { // '*'
247 ++i;
248 strength |= STARRED_FLAG;
249 }
250 break;
251 default:
252 return UCOL_DEFAULT;
253 }
254 return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
255 }
256
257 void
parseRelationStrings(int32_t strength,int32_t i,UErrorCode & errorCode)258 CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
259 // Parse
260 // prefix | str / extension
261 // where prefix and extension are optional.
262 UnicodeString prefix, str, extension;
263 i = parseTailoringString(i, str, errorCode);
264 if(U_FAILURE(errorCode)) { return; }
265 UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
266 if(next == 0x7c) { // '|' separates the context prefix from the string.
267 prefix = str;
268 i = parseTailoringString(i + 1, str, errorCode);
269 if(U_FAILURE(errorCode)) { return; }
270 next = (i < rules->length()) ? rules->charAt(i) : 0;
271 }
272 if(next == 0x2f) { // '/' separates the string from the extension.
273 i = parseTailoringString(i + 1, extension, errorCode);
274 }
275 if(!prefix.isEmpty()) {
276 UChar32 prefix0 = prefix.char32At(0);
277 UChar32 c = str.char32At(0);
278 if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
279 setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
280 errorCode);
281 return;
282 }
283 }
284 sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
285 if(U_FAILURE(errorCode)) { setErrorContext(); }
286 ruleIndex = i;
287 }
288
289 void
parseStarredCharacters(int32_t strength,int32_t i,UErrorCode & errorCode)290 CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
291 UnicodeString empty, raw;
292 i = parseString(skipWhiteSpace(i), raw, errorCode);
293 if(U_FAILURE(errorCode)) { return; }
294 if(raw.isEmpty()) {
295 setParseError("missing starred-relation string", errorCode);
296 return;
297 }
298 UChar32 prev = -1;
299 int32_t j = 0;
300 for(;;) {
301 while(j < raw.length()) {
302 UChar32 c = raw.char32At(j);
303 if(!nfd.isInert(c)) {
304 setParseError("starred-relation string is not all NFD-inert", errorCode);
305 return;
306 }
307 sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
308 if(U_FAILURE(errorCode)) {
309 setErrorContext();
310 return;
311 }
312 j += U16_LENGTH(c);
313 prev = c;
314 }
315 if(i >= rules->length() || rules->charAt(i) != 0x2d) { // '-'
316 break;
317 }
318 if(prev < 0) {
319 setParseError("range without start in starred-relation string", errorCode);
320 return;
321 }
322 i = parseString(i + 1, raw, errorCode);
323 if(U_FAILURE(errorCode)) { return; }
324 if(raw.isEmpty()) {
325 setParseError("range without end in starred-relation string", errorCode);
326 return;
327 }
328 UChar32 c = raw.char32At(0);
329 if(c < prev) {
330 setParseError("range start greater than end in starred-relation string", errorCode);
331 return;
332 }
333 // range prev-c
334 UnicodeString s;
335 while(++prev <= c) {
336 if(!nfd.isInert(prev)) {
337 setParseError("starred-relation string range is not all NFD-inert", errorCode);
338 return;
339 }
340 if(U_IS_SURROGATE(prev)) {
341 setParseError("starred-relation string range contains a surrogate", errorCode);
342 return;
343 }
344 if(0xfffd <= prev && prev <= 0xffff) {
345 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
346 return;
347 }
348 s.setTo(prev);
349 sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
350 if(U_FAILURE(errorCode)) {
351 setErrorContext();
352 return;
353 }
354 }
355 prev = -1;
356 j = U16_LENGTH(c);
357 }
358 ruleIndex = skipWhiteSpace(i);
359 }
360
361 int32_t
parseTailoringString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)362 CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
363 i = parseString(skipWhiteSpace(i), raw, errorCode);
364 if(U_SUCCESS(errorCode) && raw.isEmpty()) {
365 setParseError("missing relation string", errorCode);
366 }
367 return skipWhiteSpace(i);
368 }
369
370 int32_t
parseString(int32_t i,UnicodeString & raw,UErrorCode & errorCode)371 CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
372 if(U_FAILURE(errorCode)) { return i; }
373 raw.remove();
374 while(i < rules->length()) {
375 UChar32 c = rules->charAt(i++);
376 if(isSyntaxChar(c)) {
377 if(c == 0x27) { // apostrophe
378 if(i < rules->length() && rules->charAt(i) == 0x27) {
379 // Double apostrophe, encodes a single one.
380 raw.append((UChar)0x27);
381 ++i;
382 continue;
383 }
384 // Quote literal text until the next single apostrophe.
385 for(;;) {
386 if(i == rules->length()) {
387 setParseError("quoted literal text missing terminating apostrophe", errorCode);
388 return i;
389 }
390 c = rules->charAt(i++);
391 if(c == 0x27) {
392 if(i < rules->length() && rules->charAt(i) == 0x27) {
393 // Double apostrophe inside quoted literal text,
394 // still encodes a single apostrophe.
395 ++i;
396 } else {
397 break;
398 }
399 }
400 raw.append((UChar)c);
401 }
402 } else if(c == 0x5c) { // backslash
403 if(i == rules->length()) {
404 setParseError("backslash escape at the end of the rule string", errorCode);
405 return i;
406 }
407 c = rules->char32At(i);
408 raw.append(c);
409 i += U16_LENGTH(c);
410 } else {
411 // Any other syntax character terminates a string.
412 --i;
413 break;
414 }
415 } else if(PatternProps::isWhiteSpace(c)) {
416 // Unquoted white space terminates a string.
417 --i;
418 break;
419 } else {
420 raw.append((UChar)c);
421 }
422 }
423 for(int32_t j = 0; j < raw.length();) {
424 UChar32 c = raw.char32At(j);
425 if(U_IS_SURROGATE(c)) {
426 setParseError("string contains an unpaired surrogate", errorCode);
427 return i;
428 }
429 if(0xfffd <= c && c <= 0xffff) {
430 setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
431 return i;
432 }
433 j += U16_LENGTH(c);
434 }
435 return i;
436 }
437
namespace {

// Names of the special reset positions. The array index of a name is added
// to POS_BASE by parseSpecialPosition(), so the order must not change.
static const char *const positions[] = {
    "first tertiary ignorable",  "last tertiary ignorable",
    "first secondary ignorable", "last secondary ignorable",
    "first primary ignorable",   "last primary ignorable",
    "first variable",            "last variable",
    "first regular",             "last regular",
    "first implicit",            "last implicit",
    "first trailing",            "last trailing"
};

}  // namespace
458
459 int32_t
parseSpecialPosition(int32_t i,UnicodeString & str,UErrorCode & errorCode)460 CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
461 if(U_FAILURE(errorCode)) { return 0; }
462 UnicodeString raw;
463 int32_t j = readWords(i + 1, raw);
464 if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) { // words end with ]
465 ++j;
466 for(int32_t pos = 0; pos < LENGTHOF(positions); ++pos) {
467 if(raw == UnicodeString(positions[pos], -1, US_INV)) {
468 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
469 return j;
470 }
471 }
472 if(raw == UNICODE_STRING_SIMPLE("top")) {
473 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
474 return j;
475 }
476 if(raw == UNICODE_STRING_SIMPLE("variable top")) {
477 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
478 return j;
479 }
480 }
481 setParseError("not a valid special reset position", errorCode);
482 return i;
483 }
484
485 void
parseSetting(UErrorCode & errorCode)486 CollationRuleParser::parseSetting(UErrorCode &errorCode) {
487 if(U_FAILURE(errorCode)) { return; }
488 UnicodeString raw;
489 int32_t i = ruleIndex + 1;
490 int32_t j = readWords(i, raw);
491 if(j <= i || raw.isEmpty()) {
492 setParseError("expected a setting/option at '['", errorCode);
493 }
494 if(rules->charAt(j) == 0x5d) { // words end with ]
495 ++j;
496 if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
497 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
498 parseReordering(raw, errorCode);
499 ruleIndex = j;
500 return;
501 }
502 if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
503 settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
504 UCOL_ON, 0, errorCode);
505 ruleIndex = j;
506 return;
507 }
508 UnicodeString v;
509 int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
510 if(valueIndex >= 0) {
511 v.setTo(raw, valueIndex + 1);
512 raw.truncate(valueIndex);
513 }
514 if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
515 int32_t value = UCOL_DEFAULT;
516 UChar c = v.charAt(0);
517 if(0x31 <= c && c <= 0x34) { // 1..4
518 value = UCOL_PRIMARY + (c - 0x31);
519 } else if(c == 0x49) { // 'I'
520 value = UCOL_IDENTICAL;
521 }
522 if(value != UCOL_DEFAULT) {
523 settings->setStrength(value, 0, errorCode);
524 ruleIndex = j;
525 return;
526 }
527 } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
528 UColAttributeValue value = UCOL_DEFAULT;
529 if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
530 value = UCOL_NON_IGNORABLE;
531 } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
532 value = UCOL_SHIFTED;
533 }
534 if(value != UCOL_DEFAULT) {
535 settings->setAlternateHandling(value, 0, errorCode);
536 ruleIndex = j;
537 return;
538 }
539 } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
540 int32_t value = UCOL_DEFAULT;
541 if(v == UNICODE_STRING_SIMPLE("space")) {
542 value = CollationSettings::MAX_VAR_SPACE;
543 } else if(v == UNICODE_STRING_SIMPLE("punct")) {
544 value = CollationSettings::MAX_VAR_PUNCT;
545 } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
546 value = CollationSettings::MAX_VAR_SYMBOL;
547 } else if(v == UNICODE_STRING_SIMPLE("currency")) {
548 value = CollationSettings::MAX_VAR_CURRENCY;
549 }
550 if(value != UCOL_DEFAULT) {
551 settings->setMaxVariable(value, 0, errorCode);
552 settings->variableTop = baseData->getLastPrimaryForGroup(
553 UCOL_REORDER_CODE_FIRST + value);
554 U_ASSERT(settings->variableTop != 0);
555 ruleIndex = j;
556 return;
557 }
558 } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
559 UColAttributeValue value = UCOL_DEFAULT;
560 if(v == UNICODE_STRING_SIMPLE("off")) {
561 value = UCOL_OFF;
562 } else if(v == UNICODE_STRING_SIMPLE("lower")) {
563 value = UCOL_LOWER_FIRST;
564 } else if(v == UNICODE_STRING_SIMPLE("upper")) {
565 value = UCOL_UPPER_FIRST;
566 }
567 if(value != UCOL_DEFAULT) {
568 settings->setCaseFirst(value, 0, errorCode);
569 ruleIndex = j;
570 return;
571 }
572 } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
573 UColAttributeValue value = getOnOffValue(v);
574 if(value != UCOL_DEFAULT) {
575 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
576 ruleIndex = j;
577 return;
578 }
579 } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
580 UColAttributeValue value = getOnOffValue(v);
581 if(value != UCOL_DEFAULT) {
582 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
583 ruleIndex = j;
584 return;
585 }
586 } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
587 UColAttributeValue value = getOnOffValue(v);
588 if(value != UCOL_DEFAULT) {
589 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
590 ruleIndex = j;
591 return;
592 }
593 } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
594 UColAttributeValue value = getOnOffValue(v);
595 if(value != UCOL_DEFAULT) {
596 if(value == UCOL_ON) {
597 setParseError("[hiraganaQ on] is not supported", errorCode);
598 }
599 ruleIndex = j;
600 return;
601 }
602 } else if(raw == UNICODE_STRING_SIMPLE("import")) {
603 CharString lang;
604 lang.appendInvariantChars(v, errorCode);
605 if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
606 // BCP 47 language tag -> ICU locale ID
607 char localeID[ULOC_FULLNAME_CAPACITY];
608 int32_t parsedLength;
609 int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
610 &parsedLength, &errorCode);
611 if(U_FAILURE(errorCode) ||
612 parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
613 errorCode = U_ZERO_ERROR;
614 setParseError("expected language tag in [import langTag]", errorCode);
615 return;
616 }
617 // localeID minus all keywords
618 char baseID[ULOC_FULLNAME_CAPACITY];
619 length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
620 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
621 errorCode = U_ZERO_ERROR;
622 setParseError("expected language tag in [import langTag]", errorCode);
623 return;
624 }
625 // @collation=type, or length=0 if not specified
626 char collationType[ULOC_KEYWORDS_CAPACITY];
627 length = uloc_getKeywordValue(localeID, "collation",
628 collationType, ULOC_KEYWORDS_CAPACITY,
629 &errorCode);
630 if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
631 errorCode = U_ZERO_ERROR;
632 setParseError("expected language tag in [import langTag]", errorCode);
633 return;
634 }
635 if(importer == NULL) {
636 setParseError("[import langTag] is not supported", errorCode);
637 } else {
638 const UnicodeString *importedRules =
639 importer->getRules(baseID,
640 length > 0 ? collationType : "standard",
641 errorReason, errorCode);
642 if(U_FAILURE(errorCode)) {
643 if(errorReason == NULL) {
644 errorReason = "[import langTag] failed";
645 }
646 setErrorContext();
647 return;
648 }
649 const UnicodeString *outerRules = rules;
650 int32_t outerRuleIndex = ruleIndex;
651 parse(*importedRules, errorCode);
652 if(U_FAILURE(errorCode)) {
653 if(parseError != NULL) {
654 parseError->offset = outerRuleIndex;
655 }
656 }
657 rules = outerRules;
658 ruleIndex = j;
659 }
660 return;
661 }
662 } else if(rules->charAt(j) == 0x5b) { // words end with [
663 UnicodeSet set;
664 j = parseUnicodeSet(j, set, errorCode);
665 if(U_FAILURE(errorCode)) { return; }
666 if(raw == UNICODE_STRING_SIMPLE("optimize")) {
667 sink->optimize(set, errorReason, errorCode);
668 if(U_FAILURE(errorCode)) { setErrorContext(); }
669 ruleIndex = j;
670 return;
671 } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
672 sink->suppressContractions(set, errorReason, errorCode);
673 if(U_FAILURE(errorCode)) { setErrorContext(); }
674 ruleIndex = j;
675 return;
676 }
677 }
678 setParseError("not a valid setting/option", errorCode);
679 }
680
681 void
parseReordering(const UnicodeString & raw,UErrorCode & errorCode)682 CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {
683 if(U_FAILURE(errorCode)) { return; }
684 int32_t i = 7; // after "reorder"
685 if(i == raw.length()) {
686 // empty [reorder] with no codes
687 settings->resetReordering();
688 return;
689 }
690 // Parse the codes in [reorder aa bb cc].
691 UVector32 reorderCodes(errorCode);
692 if(U_FAILURE(errorCode)) { return; }
693 CharString word;
694 while(i < raw.length()) {
695 ++i; // skip the word-separating space
696 int32_t limit = raw.indexOf((UChar)0x20, i);
697 if(limit < 0) { limit = raw.length(); }
698 word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);
699 if(U_FAILURE(errorCode)) { return; }
700 int32_t code = getReorderCode(word.data());
701 if(code < 0) {
702 setParseError("unknown script or reorder code", errorCode);
703 return;
704 }
705 reorderCodes.addElement(code, errorCode);
706 if(U_FAILURE(errorCode)) { return; }
707 i = limit;
708 }
709 int32_t length = reorderCodes.size();
710 if(length == 1 && reorderCodes.elementAti(0) == UCOL_REORDER_CODE_DEFAULT) {
711 // The root collator does not have a reordering, by definition.
712 settings->resetReordering();
713 return;
714 }
715 uint8_t table[256];
716 baseData->makeReorderTable(reorderCodes.getBuffer(), length, table, errorCode);
717 if(U_FAILURE(errorCode)) { return; }
718 if(!settings->setReordering(reorderCodes.getBuffer(), length, table)) {
719 errorCode = U_MEMORY_ALLOCATION_ERROR;
720 }
721 }
722
// Names of the special (non-script) reorder groups.
// getReorderCode() maps the name at index i to UCOL_REORDER_CODE_FIRST + i,
// so the order must not change.
static const char *const gSpecialReorderCodes[] = {
    "space",
    "punct",
    "symbol",
    "currency",
    "digit"
};
726
727 int32_t
getReorderCode(const char * word)728 CollationRuleParser::getReorderCode(const char *word) {
729 for(int32_t i = 0; i < LENGTHOF(gSpecialReorderCodes); ++i) {
730 if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
731 return UCOL_REORDER_CODE_FIRST + i;
732 }
733 }
734 int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);
735 if(script >= 0) {
736 return script;
737 }
738 if(uprv_stricmp(word, "default") == 0) {
739 return UCOL_REORDER_CODE_DEFAULT;
740 }
741 return -2;
742 }
743
744 UColAttributeValue
getOnOffValue(const UnicodeString & s)745 CollationRuleParser::getOnOffValue(const UnicodeString &s) {
746 if(s == UNICODE_STRING_SIMPLE("on")) {
747 return UCOL_ON;
748 } else if(s == UNICODE_STRING_SIMPLE("off")) {
749 return UCOL_OFF;
750 } else {
751 return UCOL_DEFAULT;
752 }
753 }
754
755 int32_t
parseUnicodeSet(int32_t i,UnicodeSet & set,UErrorCode & errorCode)756 CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {
757 // Collect a UnicodeSet pattern between a balanced pair of [brackets].
758 int32_t level = 0;
759 int32_t j = i;
760 for(;;) {
761 if(j == rules->length()) {
762 setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
763 return j;
764 }
765 UChar c = rules->charAt(j++);
766 if(c == 0x5b) { // '['
767 ++level;
768 } else if(c == 0x5d) { // ']'
769 if(--level == 0) { break; }
770 }
771 }
772 set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);
773 if(U_FAILURE(errorCode)) {
774 errorCode = U_ZERO_ERROR;
775 setParseError("not a valid UnicodeSet pattern", errorCode);
776 return j;
777 }
778 j = skipWhiteSpace(j);
779 if(j == rules->length() || rules->charAt(j) != 0x5d) {
780 setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
781 return j;
782 }
783 return ++j;
784 }
785
786 int32_t
readWords(int32_t i,UnicodeString & raw) const787 CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {
788 static const UChar sp = 0x20;
789 raw.remove();
790 i = skipWhiteSpace(i);
791 for(;;) {
792 if(i >= rules->length()) { return 0; }
793 UChar c = rules->charAt(i);
794 if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) { // syntax except -_
795 if(raw.isEmpty()) { return i; }
796 if(raw.endsWith(&sp, 1)) { // remove trailing space
797 raw.truncate(raw.length() - 1);
798 }
799 return i;
800 }
801 if(PatternProps::isWhiteSpace(c)) {
802 raw.append(0x20);
803 i = skipWhiteSpace(i + 1);
804 } else {
805 raw.append(c);
806 ++i;
807 }
808 }
809 }
810
811 int32_t
skipComment(int32_t i) const812 CollationRuleParser::skipComment(int32_t i) const {
813 // skip to past the newline
814 while(i < rules->length()) {
815 UChar c = rules->charAt(i++);
816 // LF or FF or CR or NEL or LS or PS
817 if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
818 // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
819 // NLF (new line function) = CR or LF or CR+LF or NEL.
820 // No need to collect all of CR+LF because a following LF will be ignored anyway.
821 break;
822 }
823 }
824 return i;
825 }
826
827 void
setParseError(const char * reason,UErrorCode & errorCode)828 CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
829 if(U_FAILURE(errorCode)) { return; }
830 // Error code consistent with the old parser (from ca. 2001),
831 // rather than U_PARSE_ERROR;
832 errorCode = U_INVALID_FORMAT_ERROR;
833 errorReason = reason;
834 if(parseError != NULL) { setErrorContext(); }
835 }
836
837 void
setErrorContext()838 CollationRuleParser::setErrorContext() {
839 if(parseError == NULL) { return; }
840
841 // Note: This relies on the calling code maintaining the ruleIndex
842 // at a position that is useful for debugging.
843 // For example, at the beginning of a reset or relation etc.
844 parseError->offset = ruleIndex;
845 parseError->line = 0; // We are not counting line numbers.
846
847 // before ruleIndex
848 int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);
849 if(start < 0) {
850 start = 0;
851 } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
852 ++start;
853 }
854 int32_t length = ruleIndex - start;
855 rules->extract(start, length, parseError->preContext);
856 parseError->preContext[length] = 0;
857
858 // starting from ruleIndex
859 length = rules->length() - ruleIndex;
860 if(length >= U_PARSE_CONTEXT_LEN) {
861 length = U_PARSE_CONTEXT_LEN - 1;
862 if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
863 --length;
864 }
865 }
866 rules->extract(ruleIndex, length, parseError->postContext);
867 parseError->postContext[length] = 0;
868 }
869
870 UBool
isSyntaxChar(UChar32 c)871 CollationRuleParser::isSyntaxChar(UChar32 c) {
872 return 0x21 <= c && c <= 0x7e &&
873 (c <= 0x2f || (0x3a <= c && c <= 0x40) ||
874 (0x5b <= c && c <= 0x60) || (0x7b <= c));
875 }
876
877 int32_t
skipWhiteSpace(int32_t i) const878 CollationRuleParser::skipWhiteSpace(int32_t i) const {
879 while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {
880 ++i;
881 }
882 return i;
883 }
884
885 U_NAMESPACE_END
886
887 #endif // !UCONFIG_NO_COLLATION
888