1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include "unicode/utypes.h"
5
6 #if !UCONFIG_NO_FORMATTING
7
8 #include "number_affixutils.h"
9 #include "unicode/utf16.h"
10 #include "unicode/uniset.h"
11
12 using namespace icu;
13 using namespace icu::number;
14 using namespace icu::number::impl;
15
16 TokenConsumer::~TokenConsumer() = default;
17 SymbolProvider::~SymbolProvider() = default;
18
estimateLength(const UnicodeString & patternString,UErrorCode & status)19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20 AffixPatternState state = STATE_BASE;
21 int32_t offset = 0;
22 int32_t length = 0;
23 for (; offset < patternString.length();) {
24 UChar32 cp = patternString.char32At(offset);
25
26 switch (state) {
27 case STATE_BASE:
28 if (cp == u'\'') {
29 // First quote
30 state = STATE_FIRST_QUOTE;
31 } else {
32 // Unquoted symbol
33 length++;
34 }
35 break;
36 case STATE_FIRST_QUOTE:
37 if (cp == u'\'') {
38 // Repeated quote
39 length++;
40 state = STATE_BASE;
41 } else {
42 // Quoted code point
43 length++;
44 state = STATE_INSIDE_QUOTE;
45 }
46 break;
47 case STATE_INSIDE_QUOTE:
48 if (cp == u'\'') {
49 // End of quoted sequence
50 state = STATE_AFTER_QUOTE;
51 } else {
52 // Quoted code point
53 length++;
54 }
55 break;
56 case STATE_AFTER_QUOTE:
57 if (cp == u'\'') {
58 // Double quote inside of quoted sequence
59 length++;
60 state = STATE_INSIDE_QUOTE;
61 } else {
62 // Unquoted symbol
63 length++;
64 }
65 break;
66 default:
67 UPRV_UNREACHABLE_EXIT;
68 }
69
70 offset += U16_LENGTH(cp);
71 }
72
73 switch (state) {
74 case STATE_FIRST_QUOTE:
75 case STATE_INSIDE_QUOTE:
76 status = U_ILLEGAL_ARGUMENT_ERROR;
77 break;
78 default:
79 break;
80 }
81
82 return length;
83 }
84
/**
 * Quotes the characters of a literal string that have special meaning in affix pattern
 * syntax, so that the result is read back as literal text.
 *
 * Each apostrophe is doubled. Each run of pattern symbols ('-', '+', '~', '%', '‰', '¤')
 * is wrapped in a pair of apostrophes. All other code points are copied unchanged.
 * For example, "-$x" becomes "'-'$x".
 *
 * '~' is included because nextToken() interprets an unquoted '~' as the approximately
 * sign; leaving it unquoted would change the meaning of the escaped text.
 *
 * @param input The literal string to escape.
 * @return The escaped affix pattern.
 */
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    AffixPatternState state = STATE_BASE;
    UnicodeString output;
    for (int32_t offset = 0; offset < input.length();) {
        const UChar32 cp = input.char32At(offset);

        switch (cp) {
            case u'\'':
                // A doubled apostrophe is a literal apostrophe both inside and outside quotes.
                output.append(u"''", -1);
                break;

            case u'-':
            case u'+':
            case u'~':
            case u'%':
            case u'‰':
            case u'¤':
                if (state == STATE_BASE) {
                    // Open a quoted run before the first symbol.
                    output.append(u'\'');
                    output.append(cp);
                    state = STATE_INSIDE_QUOTE;
                } else {
                    output.append(cp);
                }
                break;

            default:
                if (state == STATE_INSIDE_QUOTE) {
                    // Close the quoted run before the first non-symbol.
                    output.append(u'\'');
                    output.append(cp);
                    state = STATE_BASE;
                } else {
                    output.append(cp);
                }
                break;
        }
        offset += U16_LENGTH(cp);
    }

    // Close a quoted run that extends to the end of the input.
    if (state == STATE_INSIDE_QUOTE) {
        output.append(u'\'');
    }

    return output;
}
130
getFieldForType(AffixPatternType type)131 Field AffixUtils::getFieldForType(AffixPatternType type) {
132 switch (type) {
133 case TYPE_MINUS_SIGN:
134 return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
135 case TYPE_PLUS_SIGN:
136 return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
137 case TYPE_APPROXIMATELY_SIGN:
138 return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD};
139 case TYPE_PERCENT:
140 return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
141 case TYPE_PERMILLE:
142 return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
143 case TYPE_CURRENCY_SINGLE:
144 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
145 case TYPE_CURRENCY_DOUBLE:
146 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
147 case TYPE_CURRENCY_TRIPLE:
148 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
149 case TYPE_CURRENCY_QUAD:
150 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
151 case TYPE_CURRENCY_QUINT:
152 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
153 case TYPE_CURRENCY_OVERFLOW:
154 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
155 default:
156 UPRV_UNREACHABLE_EXIT;
157 }
158 }
159
160 int32_t
unescape(const UnicodeString & affixPattern,FormattedStringBuilder & output,int32_t position,const SymbolProvider & provider,Field field,UErrorCode & status)161 AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
162 const SymbolProvider &provider, Field field, UErrorCode &status) {
163 int32_t length = 0;
164 AffixTag tag;
165 while (hasNext(tag, affixPattern)) {
166 tag = nextToken(tag, affixPattern, status);
167 if (U_FAILURE(status)) { return length; }
168 if (tag.type == TYPE_CURRENCY_OVERFLOW) {
169 // Don't go to the provider for this special case
170 length += output.insertCodePoint(
171 position + length,
172 0xFFFD,
173 {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
174 status);
175 } else if (tag.type < 0) {
176 length += output.insert(
177 position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
178 } else {
179 length += output.insertCodePoint(position + length, tag.codePoint, field, status);
180 }
181 }
182 return length;
183 }
184
unescapedCodePointCount(const UnicodeString & affixPattern,const SymbolProvider & provider,UErrorCode & status)185 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
186 const SymbolProvider &provider, UErrorCode &status) {
187 int32_t length = 0;
188 AffixTag tag;
189 while (hasNext(tag, affixPattern)) {
190 tag = nextToken(tag, affixPattern, status);
191 if (U_FAILURE(status)) { return length; }
192 if (tag.type == TYPE_CURRENCY_OVERFLOW) {
193 length += 1;
194 } else if (tag.type < 0) {
195 length += provider.getSymbol(tag.type).length();
196 } else {
197 length += U16_LENGTH(tag.codePoint);
198 }
199 }
200 return length;
201 }
202
203 bool
containsType(const UnicodeString & affixPattern,AffixPatternType type,UErrorCode & status)204 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
205 if (affixPattern.length() == 0) {
206 return false;
207 }
208 AffixTag tag;
209 while (hasNext(tag, affixPattern)) {
210 tag = nextToken(tag, affixPattern, status);
211 if (U_FAILURE(status)) { return false; }
212 if (tag.type == type) {
213 return true;
214 }
215 }
216 return false;
217 }
218
hasCurrencySymbols(const UnicodeString & affixPattern,UErrorCode & status)219 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
220 if (affixPattern.length() == 0) {
221 return false;
222 }
223 AffixTag tag;
224 while (hasNext(tag, affixPattern)) {
225 tag = nextToken(tag, affixPattern, status);
226 if (U_FAILURE(status)) { return false; }
227 if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
228 return true;
229 }
230 }
231 return false;
232 }
233
/**
 * Returns a copy of the affix pattern in which, for every token of the given type,
 * the code unit immediately before the token's end offset is overwritten with
 * replacementChar.
 *
 * @param affixPattern The affix pattern to copy and modify.
 * @param type The token type whose occurrences are replaced.
 * @param replacementChar The code unit written in place of each occurrence.
 * @param status Error code; on a tokenizer failure the copy is returned with the
 *               replacements made so far.
 * @return The modified copy of the pattern.
 */
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // work on a copy; offsets come from the original
    if (affixPattern.isEmpty()) {
        return result;
    }
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (tag.type != type) {
            continue;
        }
        // tag.offset points just past the token, so the token's last unit is at offset - 1.
        result.replace(tag.offset - 1, 1, replacementChar);
    }
    return result;
}
250
containsOnlySymbolsAndIgnorables(const UnicodeString & affixPattern,const UnicodeSet & ignorables,UErrorCode & status)251 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
252 const UnicodeSet& ignorables, UErrorCode& status) {
253 if (affixPattern.length() == 0) {
254 return true;
255 }
256 AffixTag tag;
257 while (hasNext(tag, affixPattern)) {
258 tag = nextToken(tag, affixPattern, status);
259 if (U_FAILURE(status)) { return false; }
260 if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
261 return false;
262 }
263 }
264 return true;
265 }
266
iterateWithConsumer(const UnicodeString & affixPattern,TokenConsumer & consumer,UErrorCode & status)267 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
268 UErrorCode& status) {
269 if (affixPattern.length() == 0) {
270 return;
271 }
272 AffixTag tag;
273 while (hasNext(tag, affixPattern)) {
274 tag = nextToken(tag, affixPattern, status);
275 if (U_FAILURE(status)) { return; }
276 consumer.consumeToken(tag.type, tag.codePoint, status);
277 if (U_FAILURE(status)) { return; }
278 }
279 }
280
nextToken(AffixTag tag,const UnicodeString & patternString,UErrorCode & status)281 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
282 int32_t offset = tag.offset;
283 int32_t state = tag.state;
284 for (; offset < patternString.length();) {
285 UChar32 cp = patternString.char32At(offset);
286 int32_t count = U16_LENGTH(cp);
287
288 switch (state) {
289 case STATE_BASE:
290 switch (cp) {
291 case u'\'':
292 state = STATE_FIRST_QUOTE;
293 offset += count;
294 // continue to the next code point
295 break;
296 case u'-':
297 return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
298 case u'+':
299 return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
300 case u'~':
301 return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
302 case u'%':
303 return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
304 case u'‰':
305 return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
306 case u'¤':
307 state = STATE_FIRST_CURR;
308 offset += count;
309 // continue to the next code point
310 break;
311 default:
312 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
313 }
314 break;
315 case STATE_FIRST_QUOTE:
316 if (cp == u'\'') {
317 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
318 } else {
319 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
320 }
321 case STATE_INSIDE_QUOTE:
322 if (cp == u'\'') {
323 state = STATE_AFTER_QUOTE;
324 offset += count;
325 // continue to the next code point
326 break;
327 } else {
328 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
329 }
330 case STATE_AFTER_QUOTE:
331 if (cp == u'\'') {
332 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
333 } else {
334 state = STATE_BASE;
335 // re-evaluate this code point
336 break;
337 }
338 case STATE_FIRST_CURR:
339 if (cp == u'¤') {
340 state = STATE_SECOND_CURR;
341 offset += count;
342 // continue to the next code point
343 break;
344 } else {
345 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
346 }
347 case STATE_SECOND_CURR:
348 if (cp == u'¤') {
349 state = STATE_THIRD_CURR;
350 offset += count;
351 // continue to the next code point
352 break;
353 } else {
354 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
355 }
356 case STATE_THIRD_CURR:
357 if (cp == u'¤') {
358 state = STATE_FOURTH_CURR;
359 offset += count;
360 // continue to the next code point
361 break;
362 } else {
363 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
364 }
365 case STATE_FOURTH_CURR:
366 if (cp == u'¤') {
367 state = STATE_FIFTH_CURR;
368 offset += count;
369 // continue to the next code point
370 break;
371 } else {
372 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
373 }
374 case STATE_FIFTH_CURR:
375 if (cp == u'¤') {
376 state = STATE_OVERFLOW_CURR;
377 offset += count;
378 // continue to the next code point
379 break;
380 } else {
381 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
382 }
383 case STATE_OVERFLOW_CURR:
384 if (cp == u'¤') {
385 offset += count;
386 // continue to the next code point and loop back to this state
387 break;
388 } else {
389 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
390 }
391 default:
392 UPRV_UNREACHABLE_EXIT;
393 }
394 }
395 // End of string
396 switch (state) {
397 case STATE_BASE:
398 // No more tokens in string.
399 return {-1};
400 case STATE_FIRST_QUOTE:
401 case STATE_INSIDE_QUOTE:
402 // For consistent behavior with the JDK and ICU 58, set an error here.
403 status = U_ILLEGAL_ARGUMENT_ERROR;
404 return {-1};
405 case STATE_AFTER_QUOTE:
406 // No more tokens in string.
407 return {-1};
408 case STATE_FIRST_CURR:
409 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
410 case STATE_SECOND_CURR:
411 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
412 case STATE_THIRD_CURR:
413 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
414 case STATE_FOURTH_CURR:
415 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
416 case STATE_FIFTH_CURR:
417 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
418 case STATE_OVERFLOW_CURR:
419 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
420 default:
421 UPRV_UNREACHABLE_EXIT;
422 }
423 }
424
hasNext(const AffixTag & tag,const UnicodeString & string)425 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
426 // First check for the {-1} and default initializer syntax.
427 if (tag.offset < 0) {
428 return false;
429 } else if (tag.offset == 0) {
430 return string.length() > 0;
431 }
432 // The rest of the fields are safe to use now.
433 // Special case: the last character in string is an end quote.
434 if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
435 string.charAt(tag.offset) == u'\'') {
436 return false;
437 } else if (tag.state != STATE_BASE) {
438 return true;
439 } else {
440 return tag.offset < string.length();
441 }
442 }
443
444 #endif /* #if !UCONFIG_NO_FORMATTING */
445