• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 #include "number_affixutils.h"
9 #include "unicode/utf16.h"
10 #include "unicode/uniset.h"
11 
12 using namespace icu;
13 using namespace icu::number;
14 using namespace icu::number::impl;
15 
// Out-of-line, compiler-generated (defaulted) destructor definitions for the
// TokenConsumer and SymbolProvider types.
TokenConsumer::~TokenConsumer() = default;
SymbolProvider::~SymbolProvider() = default;
18 
estimateLength(const UnicodeString & patternString,UErrorCode & status)19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20     AffixPatternState state = STATE_BASE;
21     int32_t offset = 0;
22     int32_t length = 0;
23     for (; offset < patternString.length();) {
24         UChar32 cp = patternString.char32At(offset);
25 
26         switch (state) {
27             case STATE_BASE:
28                 if (cp == u'\'') {
29                     // First quote
30                     state = STATE_FIRST_QUOTE;
31                 } else {
32                     // Unquoted symbol
33                     length++;
34                 }
35                 break;
36             case STATE_FIRST_QUOTE:
37                 if (cp == u'\'') {
38                     // Repeated quote
39                     length++;
40                     state = STATE_BASE;
41                 } else {
42                     // Quoted code point
43                     length++;
44                     state = STATE_INSIDE_QUOTE;
45                 }
46                 break;
47             case STATE_INSIDE_QUOTE:
48                 if (cp == u'\'') {
49                     // End of quoted sequence
50                     state = STATE_AFTER_QUOTE;
51                 } else {
52                     // Quoted code point
53                     length++;
54                 }
55                 break;
56             case STATE_AFTER_QUOTE:
57                 if (cp == u'\'') {
58                     // Double quote inside of quoted sequence
59                     length++;
60                     state = STATE_INSIDE_QUOTE;
61                 } else {
62                     // Unquoted symbol
63                     length++;
64                 }
65                 break;
66             default:
67                 UPRV_UNREACHABLE_EXIT;
68         }
69 
70         offset += U16_LENGTH(cp);
71     }
72 
73     switch (state) {
74         case STATE_FIRST_QUOTE:
75         case STATE_INSIDE_QUOTE:
76             status = U_ILLEGAL_ARGUMENT_ERROR;
77             break;
78         default:
79             break;
80     }
81 
82     return length;
83 }
84 
/**
 * Quotes the characters of a literal string that have a special meaning in affix
 * pattern syntax, so that the result can be used as an affix pattern.
 *
 * An apostrophe is always written as "''". A run of special symbol characters is
 * wrapped in a single pair of apostrophes; the quote is closed again before the next
 * ordinary code point or at the end of the input.
 *
 * '~' is quoted as well because nextToken() treats an unquoted '~' as
 * TYPE_APPROXIMATELY_SIGN; leaving it bare would turn a literal tilde into a symbol
 * placeholder.
 *
 * @param input The literal text to escape.
 * @return The escaped affix pattern.
 */
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    // Only STATE_BASE (not inside quotes) and STATE_INSIDE_QUOTE are used here.
    AffixPatternState state = STATE_BASE;
    int32_t offset = 0;
    UnicodeString output;
    while (offset < input.length()) {
        UChar32 cp = input.char32At(offset);

        switch (cp) {
            case u'\'':
                // A doubled apostrophe is a literal apostrophe both inside and outside quotes.
                output.append(u"''", -1);
                break;

            case u'-':
            case u'+':
            case u'~':
            case u'%':
            case u'‰':
            case u'¤':
                if (state == STATE_BASE) {
                    // Open a quoted sequence in front of the first special character.
                    output.append(u'\'');
                    output.append(cp);
                    state = STATE_INSIDE_QUOTE;
                } else {
                    output.append(cp);
                }
                break;

            default:
                if (state == STATE_INSIDE_QUOTE) {
                    // Close the quoted sequence before emitting an ordinary code point.
                    output.append(u'\'');
                    output.append(cp);
                    state = STATE_BASE;
                } else {
                    output.append(cp);
                }
                break;
        }
        offset += U16_LENGTH(cp);
    }

    // Close a quoted sequence that is still open at the end of the input.
    if (state == STATE_INSIDE_QUOTE) {
        output.append(u'\'');
    }

    return output;
}
130 
getFieldForType(AffixPatternType type)131 Field AffixUtils::getFieldForType(AffixPatternType type) {
132     switch (type) {
133         case TYPE_MINUS_SIGN:
134             return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
135         case TYPE_PLUS_SIGN:
136             return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
137         case TYPE_APPROXIMATELY_SIGN:
138             return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD};
139         case TYPE_PERCENT:
140             return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
141         case TYPE_PERMILLE:
142             return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
143         case TYPE_CURRENCY_SINGLE:
144             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
145         case TYPE_CURRENCY_DOUBLE:
146             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
147         case TYPE_CURRENCY_TRIPLE:
148             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
149         case TYPE_CURRENCY_QUAD:
150             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
151         case TYPE_CURRENCY_QUINT:
152             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
153         case TYPE_CURRENCY_OVERFLOW:
154             return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
155         default:
156             UPRV_UNREACHABLE_EXIT;
157     }
158 }
159 
160 int32_t
unescape(const UnicodeString & affixPattern,FormattedStringBuilder & output,int32_t position,const SymbolProvider & provider,Field field,UErrorCode & status)161 AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
162                      const SymbolProvider &provider, Field field, UErrorCode &status) {
163     int32_t length = 0;
164     AffixTag tag;
165     while (hasNext(tag, affixPattern)) {
166         tag = nextToken(tag, affixPattern, status);
167         if (U_FAILURE(status)) { return length; }
168         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
169             // Don't go to the provider for this special case
170             length += output.insertCodePoint(
171                 position + length,
172                 0xFFFD,
173                 {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
174                 status);
175         } else if (tag.type < 0) {
176             length += output.insert(
177                     position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
178         } else {
179             length += output.insertCodePoint(position + length, tag.codePoint, field, status);
180         }
181     }
182     return length;
183 }
184 
unescapedCodePointCount(const UnicodeString & affixPattern,const SymbolProvider & provider,UErrorCode & status)185 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
186                                             const SymbolProvider &provider, UErrorCode &status) {
187     int32_t length = 0;
188     AffixTag tag;
189     while (hasNext(tag, affixPattern)) {
190         tag = nextToken(tag, affixPattern, status);
191         if (U_FAILURE(status)) { return length; }
192         if (tag.type == TYPE_CURRENCY_OVERFLOW) {
193             length += 1;
194         } else if (tag.type < 0) {
195             length += provider.getSymbol(tag.type).length();
196         } else {
197             length += U16_LENGTH(tag.codePoint);
198         }
199     }
200     return length;
201 }
202 
203 bool
containsType(const UnicodeString & affixPattern,AffixPatternType type,UErrorCode & status)204 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
205     if (affixPattern.length() == 0) {
206         return false;
207     }
208     AffixTag tag;
209     while (hasNext(tag, affixPattern)) {
210         tag = nextToken(tag, affixPattern, status);
211         if (U_FAILURE(status)) { return false; }
212         if (tag.type == type) {
213             return true;
214         }
215     }
216     return false;
217 }
218 
hasCurrencySymbols(const UnicodeString & affixPattern,UErrorCode & status)219 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
220     if (affixPattern.length() == 0) {
221         return false;
222     }
223     AffixTag tag;
224     while (hasNext(tag, affixPattern)) {
225         tag = nextToken(tag, affixPattern, status);
226         if (U_FAILURE(status)) { return false; }
227         if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
228             return true;
229         }
230     }
231     return false;
232 }
233 
/**
 * Returns a copy of an affix pattern in which tokens of the given type are overwritten
 * with a replacement character.
 *
 * For each matching token, the single UTF-16 code unit directly in front of the token's
 * end offset is replaced; the pattern length does not change.
 *
 * @param affixPattern The affix pattern to copy and modify.
 * @param type The token type to replace.
 * @param replacementChar The code unit written in place of each matching token.
 * @param status Error code; on failure the copy is returned with the replacements made
 *               up to that point.
 * @return The modified copy.
 */
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // work on a copy
    if (affixPattern.isEmpty()) {
        return result;
    }
    for (AffixTag token; hasNext(token, affixPattern);) {
        token = nextToken(token, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (token.type == type) {
            // token.offset points just past the token, so offset - 1 is its last code unit.
            result.replace(token.offset - 1, 1, replacementChar);
        }
    }
    return result;
}
250 
containsOnlySymbolsAndIgnorables(const UnicodeString & affixPattern,const UnicodeSet & ignorables,UErrorCode & status)251 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
252                                                   const UnicodeSet& ignorables, UErrorCode& status) {
253     if (affixPattern.length() == 0) {
254         return true;
255     }
256     AffixTag tag;
257     while (hasNext(tag, affixPattern)) {
258         tag = nextToken(tag, affixPattern, status);
259         if (U_FAILURE(status)) { return false; }
260         if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
261             return false;
262         }
263     }
264     return true;
265 }
266 
iterateWithConsumer(const UnicodeString & affixPattern,TokenConsumer & consumer,UErrorCode & status)267 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
268                                      UErrorCode& status) {
269     if (affixPattern.length() == 0) {
270         return;
271     }
272     AffixTag tag;
273     while (hasNext(tag, affixPattern)) {
274         tag = nextToken(tag, affixPattern, status);
275         if (U_FAILURE(status)) { return; }
276         consumer.consumeToken(tag.type, tag.codePoint, status);
277         if (U_FAILURE(status)) { return; }
278     }
279 }
280 
nextToken(AffixTag tag,const UnicodeString & patternString,UErrorCode & status)281 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
282     int32_t offset = tag.offset;
283     int32_t state = tag.state;
284     for (; offset < patternString.length();) {
285         UChar32 cp = patternString.char32At(offset);
286         int32_t count = U16_LENGTH(cp);
287 
288         switch (state) {
289             case STATE_BASE:
290                 switch (cp) {
291                     case u'\'':
292                         state = STATE_FIRST_QUOTE;
293                         offset += count;
294                         // continue to the next code point
295                         break;
296                     case u'-':
297                         return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
298                     case u'+':
299                         return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
300                     case u'~':
301                         return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
302                     case u'%':
303                         return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
304                     case u'‰':
305                         return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
306                     case u'¤':
307                         state = STATE_FIRST_CURR;
308                         offset += count;
309                         // continue to the next code point
310                         break;
311                     default:
312                         return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
313                 }
314                 break;
315             case STATE_FIRST_QUOTE:
316                 if (cp == u'\'') {
317                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
318                 } else {
319                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
320                 }
321             case STATE_INSIDE_QUOTE:
322                 if (cp == u'\'') {
323                     state = STATE_AFTER_QUOTE;
324                     offset += count;
325                     // continue to the next code point
326                     break;
327                 } else {
328                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
329                 }
330             case STATE_AFTER_QUOTE:
331                 if (cp == u'\'') {
332                     return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
333                 } else {
334                     state = STATE_BASE;
335                     // re-evaluate this code point
336                     break;
337                 }
338             case STATE_FIRST_CURR:
339                 if (cp == u'¤') {
340                     state = STATE_SECOND_CURR;
341                     offset += count;
342                     // continue to the next code point
343                     break;
344                 } else {
345                     return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
346                 }
347             case STATE_SECOND_CURR:
348                 if (cp == u'¤') {
349                     state = STATE_THIRD_CURR;
350                     offset += count;
351                     // continue to the next code point
352                     break;
353                 } else {
354                     return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
355                 }
356             case STATE_THIRD_CURR:
357                 if (cp == u'¤') {
358                     state = STATE_FOURTH_CURR;
359                     offset += count;
360                     // continue to the next code point
361                     break;
362                 } else {
363                     return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
364                 }
365             case STATE_FOURTH_CURR:
366                 if (cp == u'¤') {
367                     state = STATE_FIFTH_CURR;
368                     offset += count;
369                     // continue to the next code point
370                     break;
371                 } else {
372                     return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
373                 }
374             case STATE_FIFTH_CURR:
375                 if (cp == u'¤') {
376                     state = STATE_OVERFLOW_CURR;
377                     offset += count;
378                     // continue to the next code point
379                     break;
380                 } else {
381                     return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
382                 }
383             case STATE_OVERFLOW_CURR:
384                 if (cp == u'¤') {
385                     offset += count;
386                     // continue to the next code point and loop back to this state
387                     break;
388                 } else {
389                     return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
390                 }
391             default:
392                 UPRV_UNREACHABLE_EXIT;
393         }
394     }
395     // End of string
396     switch (state) {
397         case STATE_BASE:
398             // No more tokens in string.
399             return {-1};
400         case STATE_FIRST_QUOTE:
401         case STATE_INSIDE_QUOTE:
402             // For consistent behavior with the JDK and ICU 58, set an error here.
403             status = U_ILLEGAL_ARGUMENT_ERROR;
404             return {-1};
405         case STATE_AFTER_QUOTE:
406             // No more tokens in string.
407             return {-1};
408         case STATE_FIRST_CURR:
409             return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
410         case STATE_SECOND_CURR:
411             return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
412         case STATE_THIRD_CURR:
413             return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
414         case STATE_FOURTH_CURR:
415             return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
416         case STATE_FIFTH_CURR:
417             return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
418         case STATE_OVERFLOW_CURR:
419             return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
420         default:
421             UPRV_UNREACHABLE_EXIT;
422     }
423 }
424 
hasNext(const AffixTag & tag,const UnicodeString & string)425 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
426     // First check for the {-1} and default initializer syntax.
427     if (tag.offset < 0) {
428         return false;
429     } else if (tag.offset == 0) {
430         return string.length() > 0;
431     }
432     // The rest of the fields are safe to use now.
433     // Special case: the last character in string is an end quote.
434     if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
435         string.charAt(tag.offset) == u'\'') {
436         return false;
437     } else if (tag.state != STATE_BASE) {
438         return true;
439     } else {
440         return tag.offset < string.length();
441     }
442 }
443 
444 #endif /* #if !UCONFIG_NO_FORMATTING */
445