1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 * Copyright (C) 2015, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 * file name: affixpatternparser.cpp
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_FORMATTING
13
14 #include "unicode/dcfmtsym.h"
15 #include "unicode/plurrule.h"
16 #include "unicode/ucurr.h"
17 #include "affixpatternparser.h"
18 #include "charstr.h"
19 #include "precision.h"
20 #include "uassert.h"
21 #include "unistrappender.h"
22
23 static UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};
24
25 static UChar gPercent = 0x25;
26 static UChar gPerMill = 0x2030;
27 static UChar gNegative = 0x2D;
28 static UChar gPositive = 0x2B;
29
// Layout of one encoded token (a single UChar):
//   bits 0-7  : length byte
//   bits 8-14 : token type
//   bit  15   : "long" flag, marks a continuation unit of a literal length

// Packs token type t and the low 8 bits of l into one UChar.
// Both arguments are fully parenthesized so that expressions such as
// (a | b) are masked as a whole.
#define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | ((l) & 0xFF)))

// Extracts the token type, ignoring the "long" flag.
#define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))

// Non-zero if the "long" flag is set on c.
#define UNPACK_LONG(c) (((c) >> 8) & 0x80)

// Extracts the length byte.
#define UNPACK_LENGTH(c) ((c) & 0xFF)
37
38 U_NAMESPACE_BEGIN
39
40 static int32_t
nextToken(const UChar * buffer,int32_t idx,int32_t len,UChar * token)41 nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
42 if (buffer[idx] != 0x27 || idx + 1 == len) {
43 *token = buffer[idx];
44 return 1;
45 }
46 *token = buffer[idx + 1];
47 if (buffer[idx + 1] == 0xA4) {
48 int32_t i = 2;
49 for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i)
50 ;
51 return i;
52 }
53 return 2;
54 }
55
56 static int32_t
nextUserToken(const UChar * buffer,int32_t idx,int32_t len,UChar * token)57 nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
58 *token = buffer[idx];
59 int32_t max;
60 switch (buffer[idx]) {
61 case 0x27:
62 max = 2;
63 break;
64 case 0xA4:
65 max = 3;
66 break;
67 default:
68 max = 1;
69 break;
70 }
71 int32_t i = 1;
72 for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i)
73 ;
74 return i;
75 }
76
CurrencyAffixInfo()77 CurrencyAffixInfo::CurrencyAffixInfo()
78 : fSymbol(gDefaultSymbols, 1),
79 fISO(gDefaultSymbols, 2),
80 fLong(DigitAffix(gDefaultSymbols, 3)),
81 fIsDefault(TRUE) {
82 }
83
84 void
set(const char * locale,const PluralRules * rules,const UChar * currency,UErrorCode & status)85 CurrencyAffixInfo::set(
86 const char *locale,
87 const PluralRules *rules,
88 const UChar *currency,
89 UErrorCode &status) {
90 if (U_FAILURE(status)) {
91 return;
92 }
93 fIsDefault = FALSE;
94 if (currency == NULL) {
95 fSymbol.setTo(gDefaultSymbols, 1);
96 fISO.setTo(gDefaultSymbols, 2);
97 fLong.remove();
98 fLong.append(gDefaultSymbols, 3);
99 fIsDefault = TRUE;
100 return;
101 }
102 int32_t len;
103 UBool unusedIsChoice;
104 const UChar *symbol = ucurr_getName(
105 currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
106 &len, &status);
107 if (U_FAILURE(status)) {
108 return;
109 }
110 fSymbol.setTo(symbol, len);
111 fISO.setTo(currency, u_strlen(currency));
112 fLong.remove();
113 StringEnumeration* keywords = rules->getKeywords(status);
114 if (U_FAILURE(status)) {
115 return;
116 }
117 const UnicodeString* pluralCount;
118 while ((pluralCount = keywords->snext(status)) != NULL) {
119 CharString pCount;
120 pCount.appendInvariantChars(*pluralCount, status);
121 const UChar *pluralName = ucurr_getPluralName(
122 currency, locale, &unusedIsChoice, pCount.data(),
123 &len, &status);
124 fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
125 }
126 delete keywords;
127 }
128
129 void
adjustPrecision(const UChar * currency,const UCurrencyUsage usage,FixedPrecision & precision,UErrorCode & status)130 CurrencyAffixInfo::adjustPrecision(
131 const UChar *currency, const UCurrencyUsage usage,
132 FixedPrecision &precision, UErrorCode &status) {
133 if (U_FAILURE(status)) {
134 return;
135 }
136
137 int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
138 currency, usage, &status);
139 precision.fMin.setFracDigitCount(digitCount);
140 precision.fMax.setFracDigitCount(digitCount);
141 double increment = ucurr_getRoundingIncrementForUsage(
142 currency, usage, &status);
143 if (increment == 0.0) {
144 precision.fRoundingIncrement.clear();
145 } else {
146 precision.fRoundingIncrement.set(increment);
147 // guard against round-off error
148 precision.fRoundingIncrement.round(6);
149 }
150 }
151
152 void
addLiteral(const UChar * literal,int32_t start,int32_t len)153 AffixPattern::addLiteral(
154 const UChar *literal, int32_t start, int32_t len) {
155 char32Count += u_countChar32(literal + start, len);
156 literals.append(literal, start, len);
157 int32_t tlen = tokens.length();
158 // Takes 4 UChars to encode maximum literal length.
159 UChar *tokenChars = tokens.getBuffer(tlen + 4);
160
161 // find start of literal size. May be tlen if there is no literal.
162 // While finding start of literal size, compute literal length
163 int32_t literalLength = 0;
164 int32_t tLiteralStart = tlen;
165 while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
166 tLiteralStart--;
167 literalLength <<= 8;
168 literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
169 }
170 // Add number of chars we just added to literal
171 literalLength += len;
172
173 // Now encode the new length starting at tLiteralStart
174 tlen = tLiteralStart;
175 tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
176 literalLength >>= 8;
177 while (literalLength) {
178 tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
179 literalLength >>= 8;
180 }
181 tokens.releaseBuffer(tlen);
182 }
183
184 void
add(ETokenType t)185 AffixPattern::add(ETokenType t) {
186 add(t, 1);
187 }
188
189 void
addCurrency(uint8_t count)190 AffixPattern::addCurrency(uint8_t count) {
191 add(kCurrency, count);
192 }
193
194 void
add(ETokenType t,uint8_t count)195 AffixPattern::add(ETokenType t, uint8_t count) {
196 U_ASSERT(t != kLiteral);
197 char32Count += count;
198 switch (t) {
199 case kCurrency:
200 hasCurrencyToken = TRUE;
201 break;
202 case kPercent:
203 hasPercentToken = TRUE;
204 break;
205 case kPerMill:
206 hasPermillToken = TRUE;
207 break;
208 default:
209 // Do nothing
210 break;
211 }
212 tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
213 }
214
215 AffixPattern &
append(const AffixPattern & other)216 AffixPattern::append(const AffixPattern &other) {
217 AffixPatternIterator iter;
218 other.iterator(iter);
219 UnicodeString literal;
220 while (iter.nextToken()) {
221 switch (iter.getTokenType()) {
222 case kLiteral:
223 iter.getLiteral(literal);
224 addLiteral(literal.getBuffer(), 0, literal.length());
225 break;
226 case kCurrency:
227 addCurrency(iter.getTokenLength());
228 break;
229 default:
230 add(iter.getTokenType());
231 break;
232 }
233 }
234 return *this;
235 }
236
237 void
remove()238 AffixPattern::remove() {
239 tokens.remove();
240 literals.remove();
241 hasCurrencyToken = FALSE;
242 hasPercentToken = FALSE;
243 hasPermillToken = FALSE;
244 char32Count = 0;
245 }
246
247 // escapes literals for strings where special characters are NOT escaped
248 // except for apostrophe.
escapeApostropheInLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)249 static void escapeApostropheInLiteral(
250 const UnicodeString &literal, UnicodeStringAppender &appender) {
251 int32_t len = literal.length();
252 const UChar *buffer = literal.getBuffer();
253 for (int32_t i = 0; i < len; ++i) {
254 UChar ch = buffer[i];
255 switch (ch) {
256 case 0x27:
257 appender.append((UChar) 0x27);
258 appender.append((UChar) 0x27);
259 break;
260 default:
261 appender.append(ch);
262 break;
263 }
264 }
265 }
266
267
268 // escapes literals for user strings where special characters in literals
269 // are escaped with apostrophe.
escapeLiteral(const UnicodeString & literal,UnicodeStringAppender & appender)270 static void escapeLiteral(
271 const UnicodeString &literal, UnicodeStringAppender &appender) {
272 int32_t len = literal.length();
273 const UChar *buffer = literal.getBuffer();
274 for (int32_t i = 0; i < len; ++i) {
275 UChar ch = buffer[i];
276 switch (ch) {
277 case 0x27:
278 appender.append((UChar) 0x27);
279 appender.append((UChar) 0x27);
280 break;
281 case 0x25:
282 appender.append((UChar) 0x27);
283 appender.append((UChar) 0x25);
284 appender.append((UChar) 0x27);
285 break;
286 case 0x2030:
287 appender.append((UChar) 0x27);
288 appender.append((UChar) 0x2030);
289 appender.append((UChar) 0x27);
290 break;
291 case 0xA4:
292 appender.append((UChar) 0x27);
293 appender.append((UChar) 0xA4);
294 appender.append((UChar) 0x27);
295 break;
296 case 0x2D:
297 appender.append((UChar) 0x27);
298 appender.append((UChar) 0x2D);
299 appender.append((UChar) 0x27);
300 break;
301 case 0x2B:
302 appender.append((UChar) 0x27);
303 appender.append((UChar) 0x2B);
304 appender.append((UChar) 0x27);
305 break;
306 default:
307 appender.append(ch);
308 break;
309 }
310 }
311 }
312
313 UnicodeString &
toString(UnicodeString & appendTo) const314 AffixPattern::toString(UnicodeString &appendTo) const {
315 AffixPatternIterator iter;
316 iterator(iter);
317 UnicodeStringAppender appender(appendTo);
318 UnicodeString literal;
319 while (iter.nextToken()) {
320 switch (iter.getTokenType()) {
321 case kLiteral:
322 escapeApostropheInLiteral(iter.getLiteral(literal), appender);
323 break;
324 case kPercent:
325 appender.append((UChar) 0x27);
326 appender.append((UChar) 0x25);
327 break;
328 case kPerMill:
329 appender.append((UChar) 0x27);
330 appender.append((UChar) 0x2030);
331 break;
332 case kCurrency:
333 {
334 appender.append((UChar) 0x27);
335 int32_t cl = iter.getTokenLength();
336 for (int32_t i = 0; i < cl; ++i) {
337 appender.append((UChar) 0xA4);
338 }
339 }
340 break;
341 case kNegative:
342 appender.append((UChar) 0x27);
343 appender.append((UChar) 0x2D);
344 break;
345 case kPositive:
346 appender.append((UChar) 0x27);
347 appender.append((UChar) 0x2B);
348 break;
349 default:
350 U_ASSERT(FALSE);
351 break;
352 }
353 }
354 return appendTo;
355 }
356
357 UnicodeString &
toUserString(UnicodeString & appendTo) const358 AffixPattern::toUserString(UnicodeString &appendTo) const {
359 AffixPatternIterator iter;
360 iterator(iter);
361 UnicodeStringAppender appender(appendTo);
362 UnicodeString literal;
363 while (iter.nextToken()) {
364 switch (iter.getTokenType()) {
365 case kLiteral:
366 escapeLiteral(iter.getLiteral(literal), appender);
367 break;
368 case kPercent:
369 appender.append((UChar) 0x25);
370 break;
371 case kPerMill:
372 appender.append((UChar) 0x2030);
373 break;
374 case kCurrency:
375 {
376 int32_t cl = iter.getTokenLength();
377 for (int32_t i = 0; i < cl; ++i) {
378 appender.append((UChar) 0xA4);
379 }
380 }
381 break;
382 case kNegative:
383 appender.append((UChar) 0x2D);
384 break;
385 case kPositive:
386 appender.append((UChar) 0x2B);
387 break;
388 default:
389 U_ASSERT(FALSE);
390 break;
391 }
392 }
393 return appendTo;
394 }
395
396 class AffixPatternAppender : public UMemory {
397 public:
AffixPatternAppender(AffixPattern & dest)398 AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
399
append(UChar x)400 inline void append(UChar x) {
401 if (fIdx == UPRV_LENGTHOF(fBuffer)) {
402 fDest->addLiteral(fBuffer, 0, fIdx);
403 fIdx = 0;
404 }
405 fBuffer[fIdx++] = x;
406 }
407
append(UChar32 x)408 inline void append(UChar32 x) {
409 if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
410 fDest->addLiteral(fBuffer, 0, fIdx);
411 fIdx = 0;
412 }
413 U16_APPEND_UNSAFE(fBuffer, fIdx, x);
414 }
415
flush()416 inline void flush() {
417 if (fIdx) {
418 fDest->addLiteral(fBuffer, 0, fIdx);
419 }
420 fIdx = 0;
421 }
422
423 /**
424 * flush the buffer when we go out of scope.
425 */
~AffixPatternAppender()426 ~AffixPatternAppender() {
427 flush();
428 }
429 private:
430 AffixPattern *fDest;
431 int32_t fIdx;
432 UChar fBuffer[32];
433 AffixPatternAppender(const AffixPatternAppender &other);
434 AffixPatternAppender &operator=(const AffixPatternAppender &other);
435 };
436
437
438 AffixPattern &
parseUserAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)439 AffixPattern::parseUserAffixString(
440 const UnicodeString &affixStr,
441 AffixPattern &appendTo,
442 UErrorCode &status) {
443 if (U_FAILURE(status)) {
444 return appendTo;
445 }
446 int32_t len = affixStr.length();
447 const UChar *buffer = affixStr.getBuffer();
448 // 0 = not quoted; 1 = quoted.
449 int32_t state = 0;
450 AffixPatternAppender appender(appendTo);
451 for (int32_t i = 0; i < len; ) {
452 UChar token;
453 int32_t tokenSize = nextUserToken(buffer, i, len, &token);
454 i += tokenSize;
455 if (token == 0x27 && tokenSize == 1) { // quote
456 state = 1 - state;
457 continue;
458 }
459 if (state == 0) {
460 switch (token) {
461 case 0x25:
462 appender.flush();
463 appendTo.add(kPercent, 1);
464 break;
465 case 0x27: // double quote
466 appender.append((UChar) 0x27);
467 break;
468 case 0x2030:
469 appender.flush();
470 appendTo.add(kPerMill, 1);
471 break;
472 case 0x2D:
473 appender.flush();
474 appendTo.add(kNegative, 1);
475 break;
476 case 0x2B:
477 appender.flush();
478 appendTo.add(kPositive, 1);
479 break;
480 case 0xA4:
481 appender.flush();
482 appendTo.add(kCurrency, tokenSize);
483 break;
484 default:
485 appender.append(token);
486 break;
487 }
488 } else {
489 switch (token) {
490 case 0x27: // double quote
491 appender.append((UChar) 0x27);
492 break;
493 case 0xA4: // included b/c tokenSize can be > 1
494 for (int32_t j = 0; j < tokenSize; ++j) {
495 appender.append((UChar) 0xA4);
496 }
497 break;
498 default:
499 appender.append(token);
500 break;
501 }
502 }
503 }
504 return appendTo;
505 }
506
507 AffixPattern &
parseAffixString(const UnicodeString & affixStr,AffixPattern & appendTo,UErrorCode & status)508 AffixPattern::parseAffixString(
509 const UnicodeString &affixStr,
510 AffixPattern &appendTo,
511 UErrorCode &status) {
512 if (U_FAILURE(status)) {
513 return appendTo;
514 }
515 int32_t len = affixStr.length();
516 const UChar *buffer = affixStr.getBuffer();
517 for (int32_t i = 0; i < len; ) {
518 UChar token;
519 int32_t tokenSize = nextToken(buffer, i, len, &token);
520 if (tokenSize == 1) {
521 int32_t literalStart = i;
522 ++i;
523 while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
524 ++i;
525 }
526 appendTo.addLiteral(buffer, literalStart, i - literalStart);
527
528 // If we reached end of string, we are done
529 if (i == len) {
530 return appendTo;
531 }
532 }
533 i += tokenSize;
534 switch (token) {
535 case 0x25:
536 appendTo.add(kPercent, 1);
537 break;
538 case 0x2030:
539 appendTo.add(kPerMill, 1);
540 break;
541 case 0x2D:
542 appendTo.add(kNegative, 1);
543 break;
544 case 0x2B:
545 appendTo.add(kPositive, 1);
546 break;
547 case 0xA4:
548 {
549 if (tokenSize - 1 > 3) {
550 status = U_PARSE_ERROR;
551 return appendTo;
552 }
553 appendTo.add(kCurrency, tokenSize - 1);
554 }
555 break;
556 default:
557 appendTo.addLiteral(&token, 0, 1);
558 break;
559 }
560 }
561 return appendTo;
562 }
563
564 AffixPatternIterator &
iterator(AffixPatternIterator & result) const565 AffixPattern::iterator(AffixPatternIterator &result) const {
566 result.nextLiteralIndex = 0;
567 result.lastLiteralLength = 0;
568 result.nextTokenIndex = 0;
569 result.tokens = &tokens;
570 result.literals = &literals;
571 return result;
572 }
573
574 UBool
nextToken()575 AffixPatternIterator::nextToken() {
576 int32_t tlen = tokens->length();
577 if (nextTokenIndex == tlen) {
578 return FALSE;
579 }
580 ++nextTokenIndex;
581 const UChar *tokenBuffer = tokens->getBuffer();
582 if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
583 AffixPattern::kLiteral) {
584 while (nextTokenIndex < tlen &&
585 UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
586 ++nextTokenIndex;
587 }
588 lastLiteralLength = 0;
589 int32_t i = nextTokenIndex - 1;
590 for (; UNPACK_LONG(tokenBuffer[i]); --i) {
591 lastLiteralLength <<= 8;
592 lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
593 }
594 lastLiteralLength <<= 8;
595 lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
596 nextLiteralIndex += lastLiteralLength;
597 }
598 return TRUE;
599 }
600
601 AffixPattern::ETokenType
getTokenType() const602 AffixPatternIterator::getTokenType() const {
603 return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
604 }
605
606 UnicodeString &
getLiteral(UnicodeString & result) const607 AffixPatternIterator::getLiteral(UnicodeString &result) const {
608 const UChar *buffer = literals->getBuffer();
609 result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
610 return result;
611 }
612
613 int32_t
getTokenLength() const614 AffixPatternIterator::getTokenLength() const {
615 const UChar *tokenBuffer = tokens->getBuffer();
616 AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
617 return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
618 }
619
AffixPatternParser()620 AffixPatternParser::AffixPatternParser()
621 : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
622 }
623
AffixPatternParser(const DecimalFormatSymbols & symbols)624 AffixPatternParser::AffixPatternParser(
625 const DecimalFormatSymbols &symbols) {
626 setDecimalFormatSymbols(symbols);
627 }
628
629 void
setDecimalFormatSymbols(const DecimalFormatSymbols & symbols)630 AffixPatternParser::setDecimalFormatSymbols(
631 const DecimalFormatSymbols &symbols) {
632 fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
633 fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
634 fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
635 fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
636 }
637
638 PluralAffix &
parse(const AffixPattern & affixPattern,const CurrencyAffixInfo & currencyAffixInfo,PluralAffix & appendTo,UErrorCode & status) const639 AffixPatternParser::parse(
640 const AffixPattern &affixPattern,
641 const CurrencyAffixInfo ¤cyAffixInfo,
642 PluralAffix &appendTo,
643 UErrorCode &status) const {
644 if (U_FAILURE(status)) {
645 return appendTo;
646 }
647 AffixPatternIterator iter;
648 affixPattern.iterator(iter);
649 UnicodeString literal;
650 while (iter.nextToken()) {
651 switch (iter.getTokenType()) {
652 case AffixPattern::kPercent:
653 appendTo.append(fPercent, UNUM_PERCENT_FIELD);
654 break;
655 case AffixPattern::kPerMill:
656 appendTo.append(fPermill, UNUM_PERMILL_FIELD);
657 break;
658 case AffixPattern::kNegative:
659 appendTo.append(fNegative, UNUM_SIGN_FIELD);
660 break;
661 case AffixPattern::kPositive:
662 appendTo.append(fPositive, UNUM_SIGN_FIELD);
663 break;
664 case AffixPattern::kCurrency:
665 switch (iter.getTokenLength()) {
666 case 1:
667 appendTo.append(
668 currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
669 break;
670 case 2:
671 appendTo.append(
672 currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
673 break;
674 case 3:
675 appendTo.append(
676 currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
677 break;
678 default:
679 U_ASSERT(FALSE);
680 break;
681 }
682 break;
683 case AffixPattern::kLiteral:
684 appendTo.append(iter.getLiteral(literal));
685 break;
686 default:
687 U_ASSERT(FALSE);
688 break;
689 }
690 }
691 return appendTo;
692 }
693
694
695 U_NAMESPACE_END
696 #endif /* #if !UCONFIG_NO_FORMATTING */
697