1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include "unicode/utypes.h"
5
6 #if !UCONFIG_NO_FORMATTING
7
8 #include "number_affixutils.h"
9 #include "unicode/utf16.h"
10 #include "unicode/uniset.h"
11
12 using namespace icu;
13 using namespace icu::number;
14 using namespace icu::number::impl;
15
16 TokenConsumer::~TokenConsumer() = default;
17 SymbolProvider::~SymbolProvider() = default;
18
estimateLength(const UnicodeString & patternString,UErrorCode & status)19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20 AffixPatternState state = STATE_BASE;
21 int32_t offset = 0;
22 int32_t length = 0;
23 for (; offset < patternString.length();) {
24 UChar32 cp = patternString.char32At(offset);
25
26 switch (state) {
27 case STATE_BASE:
28 if (cp == u'\'') {
29 // First quote
30 state = STATE_FIRST_QUOTE;
31 } else {
32 // Unquoted symbol
33 length++;
34 }
35 break;
36 case STATE_FIRST_QUOTE:
37 if (cp == u'\'') {
38 // Repeated quote
39 length++;
40 state = STATE_BASE;
41 } else {
42 // Quoted code point
43 length++;
44 state = STATE_INSIDE_QUOTE;
45 }
46 break;
47 case STATE_INSIDE_QUOTE:
48 if (cp == u'\'') {
49 // End of quoted sequence
50 state = STATE_AFTER_QUOTE;
51 } else {
52 // Quoted code point
53 length++;
54 }
55 break;
56 case STATE_AFTER_QUOTE:
57 if (cp == u'\'') {
58 // Double quote inside of quoted sequence
59 length++;
60 state = STATE_INSIDE_QUOTE;
61 } else {
62 // Unquoted symbol
63 length++;
64 }
65 break;
66 default:
67 UPRV_UNREACHABLE_EXIT;
68 }
69
70 offset += U16_LENGTH(cp);
71 }
72
73 switch (state) {
74 case STATE_FIRST_QUOTE:
75 case STATE_INSIDE_QUOTE:
76 status = U_ILLEGAL_ARGUMENT_ERROR;
77 break;
78 default:
79 break;
80 }
81
82 return length;
83 }
84
// Converts a literal string into affix-pattern syntax so that tokenizing the result
// yields the input's code points as plain TYPE_CODEPOINT tokens.
//
// - Every apostrophe is doubled.
// - Each maximal run of pattern symbols is wrapped in a single pair of apostrophes.
//   An apostrophe inside such a run is doubled in place and does not end the run.
// - All other code points are copied unchanged.
//
// '~' is quoted together with the other symbols because nextToken() interprets an
// unquoted '~' as TYPE_APPROXIMATELY_SIGN; leaving it bare would turn a literal tilde
// into a locale symbol when the pattern is unescaped.
//
// @param input The literal text to escape.
// @return The escaped affix pattern.
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    AffixPatternState state = STATE_BASE;
    int32_t offset = 0;
    UnicodeString output;
    while (offset < input.length()) {
        UChar32 cp = input.char32At(offset);

        switch (cp) {
            case u'\'':
                // A doubled apostrophe is a literal apostrophe both inside and
                // outside of a quoted sequence, so the state does not change.
                output.append(u"''", -1);
                break;

            case u'-':
            case u'+':
            case u'~':
            case u'%':
            case u'‰':
            case u'¤':
                if (state == STATE_BASE) {
                    // Open a quoted sequence in front of the first symbol of a run.
                    output.append(u'\'');
                    output.append(cp);
                    state = STATE_INSIDE_QUOTE;
                } else {
                    output.append(cp);
                }
                break;

            default:
                if (state == STATE_INSIDE_QUOTE) {
                    // The symbol run is over: close the quote before this code point.
                    output.append(u'\'');
                    output.append(cp);
                    state = STATE_BASE;
                } else {
                    output.append(cp);
                }
                break;
        }
        offset += U16_LENGTH(cp);
    }

    // Close a quoted sequence that runs to the end of the input.
    if (state == STATE_INSIDE_QUOTE) {
        output.append(u'\'');
    }

    return output;
}
130
getFieldForType(AffixPatternType type)131 Field AffixUtils::getFieldForType(AffixPatternType type) {
132 switch (type) {
133 case TYPE_MINUS_SIGN:
134 return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
135 case TYPE_PLUS_SIGN:
136 return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
137 case TYPE_APPROXIMATELY_SIGN:
138 // TODO: Introduce a new field for the approximately sign?
139 return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
140 case TYPE_PERCENT:
141 return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
142 case TYPE_PERMILLE:
143 return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
144 case TYPE_CURRENCY_SINGLE:
145 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
146 case TYPE_CURRENCY_DOUBLE:
147 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
148 case TYPE_CURRENCY_TRIPLE:
149 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
150 case TYPE_CURRENCY_QUAD:
151 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
152 case TYPE_CURRENCY_QUINT:
153 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
154 case TYPE_CURRENCY_OVERFLOW:
155 return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
156 default:
157 UPRV_UNREACHABLE_EXIT;
158 }
159 }
160
161 int32_t
unescape(const UnicodeString & affixPattern,FormattedStringBuilder & output,int32_t position,const SymbolProvider & provider,Field field,UErrorCode & status)162 AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
163 const SymbolProvider &provider, Field field, UErrorCode &status) {
164 int32_t length = 0;
165 AffixTag tag;
166 while (hasNext(tag, affixPattern)) {
167 tag = nextToken(tag, affixPattern, status);
168 if (U_FAILURE(status)) { return length; }
169 if (tag.type == TYPE_CURRENCY_OVERFLOW) {
170 // Don't go to the provider for this special case
171 length += output.insertCodePoint(
172 position + length,
173 0xFFFD,
174 {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD},
175 status);
176 } else if (tag.type < 0) {
177 length += output.insert(
178 position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
179 } else {
180 length += output.insertCodePoint(position + length, tag.codePoint, field, status);
181 }
182 }
183 return length;
184 }
185
unescapedCodePointCount(const UnicodeString & affixPattern,const SymbolProvider & provider,UErrorCode & status)186 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
187 const SymbolProvider &provider, UErrorCode &status) {
188 int32_t length = 0;
189 AffixTag tag;
190 while (hasNext(tag, affixPattern)) {
191 tag = nextToken(tag, affixPattern, status);
192 if (U_FAILURE(status)) { return length; }
193 if (tag.type == TYPE_CURRENCY_OVERFLOW) {
194 length += 1;
195 } else if (tag.type < 0) {
196 length += provider.getSymbol(tag.type).length();
197 } else {
198 length += U16_LENGTH(tag.codePoint);
199 }
200 }
201 return length;
202 }
203
204 bool
containsType(const UnicodeString & affixPattern,AffixPatternType type,UErrorCode & status)205 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
206 if (affixPattern.length() == 0) {
207 return false;
208 }
209 AffixTag tag;
210 while (hasNext(tag, affixPattern)) {
211 tag = nextToken(tag, affixPattern, status);
212 if (U_FAILURE(status)) { return false; }
213 if (tag.type == type) {
214 return true;
215 }
216 }
217 return false;
218 }
219
hasCurrencySymbols(const UnicodeString & affixPattern,UErrorCode & status)220 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
221 if (affixPattern.length() == 0) {
222 return false;
223 }
224 AffixTag tag;
225 while (hasNext(tag, affixPattern)) {
226 tag = nextToken(tag, affixPattern, status);
227 if (U_FAILURE(status)) { return false; }
228 if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) {
229 return true;
230 }
231 }
232 return false;
233 }
234
// Returns a copy of the affix pattern in which, for every token of the given type,
// the single code unit immediately before the token's end offset is replaced by
// `replacementChar`. If tokenizing fails, the copy with the replacements made so
// far is returned and `status` holds the error.
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // work on a copy; the input is only read
    if (affixPattern.length() == 0) {
        return result;
    }
    for (AffixTag tag; hasNext(tag, affixPattern);) {
        tag = nextToken(tag, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (tag.type != type) {
            continue;
        }
        // tag.offset points just past the token, so the code unit to overwrite sits
        // at offset - 1. Offsets stay valid because the replacement is one-for-one.
        result.replace(tag.offset - 1, 1, replacementChar);
    }
    return result;
}
251
containsOnlySymbolsAndIgnorables(const UnicodeString & affixPattern,const UnicodeSet & ignorables,UErrorCode & status)252 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
253 const UnicodeSet& ignorables, UErrorCode& status) {
254 if (affixPattern.length() == 0) {
255 return true;
256 }
257 AffixTag tag;
258 while (hasNext(tag, affixPattern)) {
259 tag = nextToken(tag, affixPattern, status);
260 if (U_FAILURE(status)) { return false; }
261 if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
262 return false;
263 }
264 }
265 return true;
266 }
267
iterateWithConsumer(const UnicodeString & affixPattern,TokenConsumer & consumer,UErrorCode & status)268 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
269 UErrorCode& status) {
270 if (affixPattern.length() == 0) {
271 return;
272 }
273 AffixTag tag;
274 while (hasNext(tag, affixPattern)) {
275 tag = nextToken(tag, affixPattern, status);
276 if (U_FAILURE(status)) { return; }
277 consumer.consumeToken(tag.type, tag.codePoint, status);
278 if (U_FAILURE(status)) { return; }
279 }
280 }
281
nextToken(AffixTag tag,const UnicodeString & patternString,UErrorCode & status)282 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
283 int32_t offset = tag.offset;
284 int32_t state = tag.state;
285 for (; offset < patternString.length();) {
286 UChar32 cp = patternString.char32At(offset);
287 int32_t count = U16_LENGTH(cp);
288
289 switch (state) {
290 case STATE_BASE:
291 switch (cp) {
292 case u'\'':
293 state = STATE_FIRST_QUOTE;
294 offset += count;
295 // continue to the next code point
296 break;
297 case u'-':
298 return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
299 case u'+':
300 return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
301 case u'~':
302 return makeTag(offset + count, TYPE_APPROXIMATELY_SIGN, STATE_BASE, 0);
303 case u'%':
304 return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
305 case u'‰':
306 return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
307 case u'¤':
308 state = STATE_FIRST_CURR;
309 offset += count;
310 // continue to the next code point
311 break;
312 default:
313 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
314 }
315 break;
316 case STATE_FIRST_QUOTE:
317 if (cp == u'\'') {
318 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
319 } else {
320 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
321 }
322 case STATE_INSIDE_QUOTE:
323 if (cp == u'\'') {
324 state = STATE_AFTER_QUOTE;
325 offset += count;
326 // continue to the next code point
327 break;
328 } else {
329 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
330 }
331 case STATE_AFTER_QUOTE:
332 if (cp == u'\'') {
333 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
334 } else {
335 state = STATE_BASE;
336 // re-evaluate this code point
337 break;
338 }
339 case STATE_FIRST_CURR:
340 if (cp == u'¤') {
341 state = STATE_SECOND_CURR;
342 offset += count;
343 // continue to the next code point
344 break;
345 } else {
346 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
347 }
348 case STATE_SECOND_CURR:
349 if (cp == u'¤') {
350 state = STATE_THIRD_CURR;
351 offset += count;
352 // continue to the next code point
353 break;
354 } else {
355 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
356 }
357 case STATE_THIRD_CURR:
358 if (cp == u'¤') {
359 state = STATE_FOURTH_CURR;
360 offset += count;
361 // continue to the next code point
362 break;
363 } else {
364 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
365 }
366 case STATE_FOURTH_CURR:
367 if (cp == u'¤') {
368 state = STATE_FIFTH_CURR;
369 offset += count;
370 // continue to the next code point
371 break;
372 } else {
373 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
374 }
375 case STATE_FIFTH_CURR:
376 if (cp == u'¤') {
377 state = STATE_OVERFLOW_CURR;
378 offset += count;
379 // continue to the next code point
380 break;
381 } else {
382 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
383 }
384 case STATE_OVERFLOW_CURR:
385 if (cp == u'¤') {
386 offset += count;
387 // continue to the next code point and loop back to this state
388 break;
389 } else {
390 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
391 }
392 default:
393 UPRV_UNREACHABLE_EXIT;
394 }
395 }
396 // End of string
397 switch (state) {
398 case STATE_BASE:
399 // No more tokens in string.
400 return {-1};
401 case STATE_FIRST_QUOTE:
402 case STATE_INSIDE_QUOTE:
403 // For consistent behavior with the JDK and ICU 58, set an error here.
404 status = U_ILLEGAL_ARGUMENT_ERROR;
405 return {-1};
406 case STATE_AFTER_QUOTE:
407 // No more tokens in string.
408 return {-1};
409 case STATE_FIRST_CURR:
410 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
411 case STATE_SECOND_CURR:
412 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
413 case STATE_THIRD_CURR:
414 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
415 case STATE_FOURTH_CURR:
416 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
417 case STATE_FIFTH_CURR:
418 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
419 case STATE_OVERFLOW_CURR:
420 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
421 default:
422 UPRV_UNREACHABLE_EXIT;
423 }
424 }
425
hasNext(const AffixTag & tag,const UnicodeString & string)426 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
427 // First check for the {-1} and default initializer syntax.
428 if (tag.offset < 0) {
429 return false;
430 } else if (tag.offset == 0) {
431 return string.length() > 0;
432 }
433 // The rest of the fields are safe to use now.
434 // Special case: the last character in string is an end quote.
435 if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
436 string.charAt(tag.offset) == u'\'') {
437 return false;
438 } else if (tag.state != STATE_BASE) {
439 return true;
440 } else {
441 return tag.offset < string.length();
442 }
443 }
444
445 #endif /* #if !UCONFIG_NO_FORMATTING */
446