• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 #include "number_affixutils.h"
9 #include "unicode/utf16.h"
10 #include "unicode/uniset.h"
11 
12 using namespace icu;
13 using namespace icu::number;
14 using namespace icu::number::impl;
15 
16 TokenConsumer::~TokenConsumer() = default;
17 SymbolProvider::~SymbolProvider() = default;
18 
estimateLength(const UnicodeString & patternString,UErrorCode & status)19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20     AffixPatternState state = STATE_BASE;
21     int32_t offset = 0;
22     int32_t length = 0;
23     for (; offset < patternString.length();) {
24         UChar32 cp = patternString.char32At(offset);
25 
26         switch (state) {
27             case STATE_BASE:
28                 if (cp == u'\'') {
29                     // First quote
30                     state = STATE_FIRST_QUOTE;
31                 } else {
32                     // Unquoted symbol
33                     length++;
34                 }
35                 break;
36             case STATE_FIRST_QUOTE:
37                 if (cp == u'\'') {
38                     // Repeated quote
39                     length++;
40                     state = STATE_BASE;
41                 } else {
42                     // Quoted code point
43                     length++;
44                     state = STATE_INSIDE_QUOTE;
45                 }
46                 break;
47             case STATE_INSIDE_QUOTE:
48                 if (cp == u'\'') {
49                     // End of quoted sequence
50                     state = STATE_AFTER_QUOTE;
51                 } else {
52                     // Quoted code point
53                     length++;
54                 }
55                 break;
56             case STATE_AFTER_QUOTE:
57                 if (cp == u'\'') {
58                     // Double quote inside of quoted sequence
59                     length++;
60                     state = STATE_INSIDE_QUOTE;
61                 } else {
62                     // Unquoted symbol
63                     length++;
64                 }
65                 break;
66             default:
67                 UPRV_UNREACHABLE_EXIT;
68         }
69 
70         offset += U16_LENGTH(cp);
71     }
72 
73     switch (state) {
74         case STATE_FIRST_QUOTE:
75         case STATE_INSIDE_QUOTE:
76             status = U_ILLEGAL_ARGUMENT_ERROR;
77             break;
78         default:
79             break;
80     }
81 
82     return length;
83 }
84 
// Quotes the characters of a literal string that have special meaning in the
// affix pattern syntax, so that the result is read back as plain code points.
//
// - An apostrophe is always emitted as a doubled apostrophe.
// - The symbol characters '-', '+', '~', '%', U+2030 and U+00A4 are wrapped in
//   a quoted run; consecutive symbol characters share one run.
// - Any other code point is copied through, closing an open quoted run first.
//
// '~' is included because nextToken() tokenizes an unquoted '~' as
// TYPE_APPROXIMATELY_SIGN; without quoting it, a literal '~' would not survive
// an escape/unescape round trip.
//
// @param input The literal text to escape.
// @return The escaped affix pattern.
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    // Only STATE_BASE and STATE_INSIDE_QUOTE are used here, to track whether a
    // quoted run is currently open in the output.
    AffixPatternState state = STATE_BASE;
    int32_t offset = 0;
    UnicodeString output;
    for (; offset < input.length();) {
        UChar32 cp = input.char32At(offset);

        switch (cp) {
            case u'\'':
                // A doubled apostrophe is a literal apostrophe both inside and
                // outside of a quoted run, so the state does not change.
                output.append(u"''", -1);
                break;

            case u'-':
            case u'+':
            case u'~':
            case u'%':
            case u'‰':
            case u'¤':
                if (state == STATE_BASE) {
                    // Open a quoted run before the symbol character.
                    output.append(u'\'');
                    output.append(cp);
                    state = STATE_INSIDE_QUOTE;
                } else {
                    output.append(cp);
                }
                break;

            default:
                if (state == STATE_INSIDE_QUOTE) {
                    // Close the open quoted run before the ordinary code point.
                    output.append(u'\'');
                    output.append(cp);
                    state = STATE_BASE;
                } else {
                    output.append(cp);
                }
                break;
        }
        offset += U16_LENGTH(cp);
    }

    // Close a quoted run that is still open at the end of the input.
    if (state == STATE_INSIDE_QUOTE) {
        output.append(u'\'');
    }

    return output;
}
130 
getFieldForType(AffixPatternType type)131 Field AffixUtils::getFieldForType(AffixPatternType type) {
132     switch (type) {
133         case TYPE_MINUS_SIGN:
134             return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
135         case TYPE_PLUS_SIGN:
136             return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
137         case TYPE_APPROXIMATELY_SIGN:
138             // TODO: Introduce a new field for the approximately sign?
139             return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
140         case TYPE_PERCENT:
141             return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
142         case TYPE_PERMILLE:
143             return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
144         case TYPE_CURRENCY_SINGLE:
145             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
146         case TYPE_CURRENCY_DOUBLE:
147             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
148         case TYPE_CURRENCY_TRIPLE:
149             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
150         case TYPE_CURRENCY_QUAD:
151             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
152         case TYPE_CURRENCY_QUINT:
153             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
154         case TYPE_CURRENCY_OVERFLOW:
155             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
156         default:
157             UPRV_UNREACHABLE_EXIT;
158     }
159 }
160 
161 int32_t
unescape(const UnicodeString & affixPattern,FormattedStringBuilder & output,int32_t position,const SymbolProvider & provider,Field field,UErrorCode & status)162 AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
163                      const SymbolProvider &provider, Field field, UErrorCode &status) {
164     int32_t length = 0;
165     AffixTag tag;
166     while (hasNext(tag, affixPattern)) {
167         tag = nextToken(tag, affixPattern, status);
168         if (U_FAILURE(status)) { return length; }
169         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
170             // Don't go to the provider for this special case
171             length += output.insertCodePoint(
172                 position + length,
173                 0xFFFD,
174                 {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
175                 status);
176         } else if (tag.type < 0) {
177             length += output.insert(
178                     position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
179         } else {
180             length += output.insertCodePoint(position + length, tag.codePoint, field, status);
181         }
182     }
183     return length;
184 }
185 
unescapedCodePointCount(const UnicodeString & affixPattern,const SymbolProvider & provider,UErrorCode & status)186 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
187                                             const SymbolProvider &provider, UErrorCode &status) {
188     int32_t length = 0;
189     AffixTag tag;
190     while (hasNext(tag, affixPattern)) {
191         tag = nextToken(tag, affixPattern, status);
192         if (U_FAILURE(status)) { return length; }
193         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
194             length += 1;
195         } else if (tag.type < 0) {
196             length += provider.getSymbol(tag.type).length();
197         } else {
198             length += U16_LENGTH(tag.codePoint);
199         }
200     }
201     return length;
202 }
203 
204 bool
containsType(const UnicodeString & affixPattern,AffixPatternType type,UErrorCode & status)205 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
206     if (affixPattern.length() == 0) {
207         return false;
208     }
209     AffixTag tag;
210     while (hasNext(tag, affixPattern)) {
211         tag = nextToken(tag, affixPattern, status);
212         if (U_FAILURE(status)) { return false; }
213         if (tag.type == type) {
214             return true;
215         }
216     }
217     return false;
218 }
219 
hasCurrencySymbols(const UnicodeString & affixPattern,UErrorCode & status)220 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
221     if (affixPattern.length() == 0) {
222         return false;
223     }
224     AffixTag tag;
225     while (hasNext(tag, affixPattern)) {
226         tag = nextToken(tag, affixPattern, status);
227         if (U_FAILURE(status)) { return false; }
228         if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
229             return true;
230         }
231     }
232     return false;
233 }
234 
// Returns a copy of the affix pattern in which, for every token of the given
// type, the single code unit just before the token's end offset is replaced
// by replacementChar.
//
// If tokenizing fails, the copy is returned with the replacements made so far.
// NOTE(review): only one code unit is replaced per match, which presumably
// assumes single-character symbol types -- confirm against callers.
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // copy
    if (affixPattern.isEmpty()) {
        return result;
    }
    // Tokens are read from the untouched input; edits are applied to the copy.
    for (AffixTag token; hasNext(token, affixPattern);) {
        token = nextToken(token, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (token.type != type) {
            continue;
        }
        result.replace(token.offset - 1, 1, replacementChar);
    }
    return result;
}
251 
containsOnlySymbolsAndIgnorables(const UnicodeString & affixPattern,const UnicodeSet & ignorables,UErrorCode & status)252 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
253                                                   const UnicodeSet& ignorables, UErrorCode& status) {
254     if (affixPattern.length() == 0) {
255         return true;
256     }
257     AffixTag tag;
258     while (hasNext(tag, affixPattern)) {
259         tag = nextToken(tag, affixPattern, status);
260         if (U_FAILURE(status)) { return false; }
261         if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
262             return false;
263         }
264     }
265     return true;
266 }
267 
iterateWithConsumer(const UnicodeString & affixPattern,TokenConsumer & consumer,UErrorCode & status)268 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
269                                      UErrorCode& status) {
270     if (affixPattern.length() == 0) {
271         return;
272     }
273     AffixTag tag;
274     while (hasNext(tag, affixPattern)) {
275         tag = nextToken(tag, affixPattern, status);
276         if (U_FAILURE(status)) { return; }
277         consumer.consumeToken(tag.type, tag.codePoint, status);
278         if (U_FAILURE(status)) { return; }
279     }
280 }
281 
nextToken(AffixTag tag,const UnicodeString & patternString,UErrorCode & status)282 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
283     int32_t offset = tag.offset;
284     int32_t state = tag.state;
285     for (; offset < patternString.length();) {
286         UChar32 cp = patternString.char32At(offset);
287         int32_t count = U16_LENGTH(cp);
288 
289         switch (state) {
290             case STATE_BASE:
291                 switch (cp) {
292                     case u'\'':
293                         state = STATE_FIRST_QUOTE;
294                         offset += count;
295                         // continue to the next code point
296                         break;
297                     case u'-':
298                         return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
299                     case u'+':
300                         return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
301                     case u'~':
302                         return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
303                     case u'%':
304                         return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
305                     case u'‰':
306                         return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
307                     case u'¤':
308                         state = STATE_FIRST_CURR;
309                         offset += count;
310                         // continue to the next code point
311                         break;
312                     default:
313                         return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
314                 }
315                 break;
316             case STATE_FIRST_QUOTE:
317                 if (cp == u'\'') {
318                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
319                 } else {
320                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
321                 }
322             case STATE_INSIDE_QUOTE:
323                 if (cp == u'\'') {
324                     state = STATE_AFTER_QUOTE;
325                     offset += count;
326                     // continue to the next code point
327                     break;
328                 } else {
329                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
330                 }
331             case STATE_AFTER_QUOTE:
332                 if (cp == u'\'') {
333                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
334                 } else {
335                     state = STATE_BASE;
336                     // re-evaluate this code point
337                     break;
338                 }
339             case STATE_FIRST_CURR:
340                 if (cp == u'¤') {
341                     state = STATE_SECOND_CURR;
342                     offset += count;
343                     // continue to the next code point
344                     break;
345                 } else {
346                     return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
347                 }
348             case STATE_SECOND_CURR:
349                 if (cp == u'¤') {
350                     state = STATE_THIRD_CURR;
351                     offset += count;
352                     // continue to the next code point
353                     break;
354                 } else {
355                     return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
356                 }
357             case STATE_THIRD_CURR:
358                 if (cp == u'¤') {
359                     state = STATE_FOURTH_CURR;
360                     offset += count;
361                     // continue to the next code point
362                     break;
363                 } else {
364                     return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
365                 }
366             case STATE_FOURTH_CURR:
367                 if (cp == u'¤') {
368                     state = STATE_FIFTH_CURR;
369                     offset += count;
370                     // continue to the next code point
371                     break;
372                 } else {
373                     return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
374                 }
375             case STATE_FIFTH_CURR:
376                 if (cp == u'¤') {
377                     state = STATE_OVERFLOW_CURR;
378                     offset += count;
379                     // continue to the next code point
380                     break;
381                 } else {
382                     return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
383                 }
384             case STATE_OVERFLOW_CURR:
385                 if (cp == u'¤') {
386                     offset += count;
387                     // continue to the next code point and loop back to this state
388                     break;
389                 } else {
390                     return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
391                 }
392             default:
393                 UPRV_UNREACHABLE_EXIT;
394         }
395     }
396     // End of string
397     switch (state) {
398         case STATE_BASE:
399             // No more tokens in string.
400             return {-1};
401         case STATE_FIRST_QUOTE:
402         case STATE_INSIDE_QUOTE:
403             // For consistent behavior with the JDK and ICU 58, set an error here.
404             status = U_ILLEGAL_ARGUMENT_ERROR;
405             return {-1};
406         case STATE_AFTER_QUOTE:
407             // No more tokens in string.
408             return {-1};
409         case STATE_FIRST_CURR:
410             return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
411         case STATE_SECOND_CURR:
412             return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
413         case STATE_THIRD_CURR:
414             return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
415         case STATE_FOURTH_CURR:
416             return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
417         case STATE_FIFTH_CURR:
418             return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
419         case STATE_OVERFLOW_CURR:
420             return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
421         default:
422             UPRV_UNREACHABLE_EXIT;
423     }
424 }
425 
hasNext(const AffixTag & tag,const UnicodeString & string)426 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
427     // First check for the {-1} and default initializer syntax.
428     if (tag.offset < 0) {
429         return false;
430     } else if (tag.offset == 0) {
431         return string.length() > 0;
432     }
433     // The rest of the fields are safe to use now.
434     // Special case: the last character in string is an end quote.
435     if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
436         string.charAt(tag.offset) == u'\'') {
437         return false;
438     } else if (tag.state != STATE_BASE) {
439         return true;
440     } else {
441         return tag.offset < string.length();
442     }
443 }
444 
445 #endif /* #if !UCONFIG_NO_FORMATTING */
446