1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include "unicode/utypes.h"
5
6 #if !UCONFIG_NO_FORMATTING
7
8 #include "number_affixutils.h"
9 #include "unicode/utf16.h"
10 #include "unicode/uniset.h"
11
12 using namespace icu;
13 using namespace icu::number;
14 using namespace icu::number::impl;
15
// Out-of-line definitions of the defaulted destructors for TokenConsumer and SymbolProvider.
// NOTE(review): both are presumably declared (likely virtual) in number_affixutils.h; defining
// them here gives each a single home translation unit -- confirm against the header.
16 TokenConsumer::~TokenConsumer() = default;
17 SymbolProvider::~SymbolProvider() = default;
18
estimateLength(const UnicodeString & patternString,UErrorCode & status)19 int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20 AffixPatternState state = STATE_BASE;
21 int32_t offset = 0;
22 int32_t length = 0;
23 for (; offset < patternString.length();) {
24 UChar32 cp = patternString.char32At(offset);
25
26 switch (state) {
27 case STATE_BASE:
28 if (cp == u'\'') {
29 // First quote
30 state = STATE_FIRST_QUOTE;
31 } else {
32 // Unquoted symbol
33 length++;
34 }
35 break;
36 case STATE_FIRST_QUOTE:
37 if (cp == u'\'') {
38 // Repeated quote
39 length++;
40 state = STATE_BASE;
41 } else {
42 // Quoted code point
43 length++;
44 state = STATE_INSIDE_QUOTE;
45 }
46 break;
47 case STATE_INSIDE_QUOTE:
48 if (cp == u'\'') {
49 // End of quoted sequence
50 state = STATE_AFTER_QUOTE;
51 } else {
52 // Quoted code point
53 length++;
54 }
55 break;
56 case STATE_AFTER_QUOTE:
57 if (cp == u'\'') {
58 // Double quote inside of quoted sequence
59 length++;
60 state = STATE_INSIDE_QUOTE;
61 } else {
62 // Unquoted symbol
63 length++;
64 }
65 break;
66 default:
67 U_ASSERT(false);
68 }
69
70 offset += U16_LENGTH(cp);
71 }
72
73 switch (state) {
74 case STATE_FIRST_QUOTE:
75 case STATE_INSIDE_QUOTE:
76 status = U_ILLEGAL_ARGUMENT_ERROR;
77 break;
78 default:
79 break;
80 }
81
82 return length;
83 }
84
/**
 * Produces an affix-pattern string in which the special pattern characters of the input
 * ('-', '+', '%', the per-mille sign and the currency sign) are wrapped in apostrophe quotes
 * and every apostrophe is doubled.
 *
 * @param input The literal text to escape.
 * @return The escaped pattern string.
 */
UnicodeString AffixUtils::escape(const UnicodeString &input) {
    UnicodeString escaped;
    // True while an opening apostrophe has been emitted and not yet closed.
    bool quoting = false;
    const int32_t inputLength = input.length();
    int32_t index = 0;
    while (index < inputLength) {
        const UChar32 cp = input.char32At(index);
        index += U16_LENGTH(cp);

        if (cp == u'\'') {
            // An apostrophe is always written doubled; it does not change the quoting state.
            escaped.append(u"''", -1);
            continue;
        }

        const bool needsQuoting =
                cp == u'-' || cp == u'+' || cp == u'%' || cp == u'‰' || cp == u'¤';
        if (needsQuoting != quoting) {
            // Crossing between special and ordinary characters: open or close the quote.
            escaped.append(u'\'');
            quoting = needsQuoting;
        }
        escaped.append(cp);
    }

    // Close a quote that is still open at the end of the input.
    if (quoting) {
        escaped.append(u'\'');
    }

    return escaped;
}
130
getFieldForType(AffixPatternType type)131 Field AffixUtils::getFieldForType(AffixPatternType type) {
132 switch (type) {
133 case TYPE_MINUS_SIGN:
134 return Field::UNUM_SIGN_FIELD;
135 case TYPE_PLUS_SIGN:
136 return Field::UNUM_SIGN_FIELD;
137 case TYPE_PERCENT:
138 return Field::UNUM_PERCENT_FIELD;
139 case TYPE_PERMILLE:
140 return Field::UNUM_PERMILL_FIELD;
141 case TYPE_CURRENCY_SINGLE:
142 return Field::UNUM_CURRENCY_FIELD;
143 case TYPE_CURRENCY_DOUBLE:
144 return Field::UNUM_CURRENCY_FIELD;
145 case TYPE_CURRENCY_TRIPLE:
146 return Field::UNUM_CURRENCY_FIELD;
147 case TYPE_CURRENCY_QUAD:
148 return Field::UNUM_CURRENCY_FIELD;
149 case TYPE_CURRENCY_QUINT:
150 return Field::UNUM_CURRENCY_FIELD;
151 case TYPE_CURRENCY_OVERFLOW:
152 return Field::UNUM_CURRENCY_FIELD;
153 default:
154 U_ASSERT(false);
155 return Field::UNUM_FIELD_COUNT; // suppress "control reaches end of non-void function"
156 }
157 }
158
159 int32_t
unescape(const UnicodeString & affixPattern,NumberStringBuilder & output,int32_t position,const SymbolProvider & provider,UErrorCode & status)160 AffixUtils::unescape(const UnicodeString &affixPattern, NumberStringBuilder &output, int32_t position,
161 const SymbolProvider &provider, UErrorCode &status) {
162 int32_t length = 0;
163 AffixTag tag;
164 while (hasNext(tag, affixPattern)) {
165 tag = nextToken(tag, affixPattern, status);
166 if (U_FAILURE(status)) { return length; }
167 if (tag.type == TYPE_CURRENCY_OVERFLOW) {
168 // Don't go to the provider for this special case
169 length += output.insertCodePoint(position + length, 0xFFFD, UNUM_CURRENCY_FIELD, status);
170 } else if (tag.type < 0) {
171 length += output.insert(
172 position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
173 } else {
174 length += output.insertCodePoint(position + length, tag.codePoint, UNUM_FIELD_COUNT, status);
175 }
176 }
177 return length;
178 }
179
unescapedCodePointCount(const UnicodeString & affixPattern,const SymbolProvider & provider,UErrorCode & status)180 int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
181 const SymbolProvider &provider, UErrorCode &status) {
182 int32_t length = 0;
183 AffixTag tag;
184 while (hasNext(tag, affixPattern)) {
185 tag = nextToken(tag, affixPattern, status);
186 if (U_FAILURE(status)) { return length; }
187 if (tag.type == TYPE_CURRENCY_OVERFLOW) {
188 length += 1;
189 } else if (tag.type < 0) {
190 length += provider.getSymbol(tag.type).length();
191 } else {
192 length += U16_LENGTH(tag.codePoint);
193 }
194 }
195 return length;
196 }
197
198 bool
containsType(const UnicodeString & affixPattern,AffixPatternType type,UErrorCode & status)199 AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
200 if (affixPattern.length() == 0) {
201 return false;
202 }
203 AffixTag tag;
204 while (hasNext(tag, affixPattern)) {
205 tag = nextToken(tag, affixPattern, status);
206 if (U_FAILURE(status)) { return false; }
207 if (tag.type == type) {
208 return true;
209 }
210 }
211 return false;
212 }
213
hasCurrencySymbols(const UnicodeString & affixPattern,UErrorCode & status)214 bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
215 if (affixPattern.length() == 0) {
216 return false;
217 }
218 AffixTag tag;
219 while (hasNext(tag, affixPattern)) {
220 tag = nextToken(tag, affixPattern, status);
221 if (U_FAILURE(status)) { return false; }
222 if (tag.type < 0 && getFieldForType(tag.type) == UNUM_CURRENCY_FIELD) {
223 return true;
224 }
225 }
226 return false;
227 }
228
/**
 * Returns a copy of the affix pattern in which, for every token of the given type, the code
 * unit immediately before the token's end offset is replaced by `replacementChar`.
 *
 * Only that single code unit is overwritten, so for a token spanning several code units
 * just its last unit changes.
 *
 * @return The modified copy. On a tokenizer error the copy is returned with whatever
 *         replacements were made before the error, and `status` carries the error.
 */
UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
                                      char16_t replacementChar, UErrorCode &status) {
    UnicodeString result(affixPattern); // work on a copy; the input is only read
    if (affixPattern.length() == 0) {
        return result;
    }
    for (AffixTag token; hasNext(token, affixPattern);) {
        token = nextToken(token, affixPattern, status);
        if (U_FAILURE(status)) {
            break;
        }
        if (token.type == type) {
            // token.offset points just past the token, so offset - 1 is its last code unit.
            const int32_t lastUnit = token.offset - 1;
            result.replace(lastUnit, 1, replacementChar);
        }
    }
    return result;
}
245
containsOnlySymbolsAndIgnorables(const UnicodeString & affixPattern,const UnicodeSet & ignorables,UErrorCode & status)246 bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
247 const UnicodeSet& ignorables, UErrorCode& status) {
248 if (affixPattern.length() == 0) {
249 return true;
250 };
251 AffixTag tag;
252 while (hasNext(tag, affixPattern)) {
253 tag = nextToken(tag, affixPattern, status);
254 if (U_FAILURE(status)) { return false; }
255 if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
256 return false;
257 }
258 }
259 return true;
260 }
261
iterateWithConsumer(const UnicodeString & affixPattern,TokenConsumer & consumer,UErrorCode & status)262 void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
263 UErrorCode& status) {
264 if (affixPattern.length() == 0) {
265 return;
266 };
267 AffixTag tag;
268 while (hasNext(tag, affixPattern)) {
269 tag = nextToken(tag, affixPattern, status);
270 if (U_FAILURE(status)) { return; }
271 consumer.consumeToken(tag.type, tag.codePoint, status);
272 if (U_FAILURE(status)) { return; }
273 }
274 }
275
nextToken(AffixTag tag,const UnicodeString & patternString,UErrorCode & status)276 AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
277 int32_t offset = tag.offset;
278 int32_t state = tag.state;
279 for (; offset < patternString.length();) {
280 UChar32 cp = patternString.char32At(offset);
281 int32_t count = U16_LENGTH(cp);
282
283 switch (state) {
284 case STATE_BASE:
285 switch (cp) {
286 case u'\'':
287 state = STATE_FIRST_QUOTE;
288 offset += count;
289 // continue to the next code point
290 break;
291 case u'-':
292 return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
293 case u'+':
294 return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
295 case u'%':
296 return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
297 case u'‰':
298 return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
299 case u'¤':
300 state = STATE_FIRST_CURR;
301 offset += count;
302 // continue to the next code point
303 break;
304 default:
305 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
306 }
307 break;
308 case STATE_FIRST_QUOTE:
309 if (cp == u'\'') {
310 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
311 } else {
312 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
313 }
314 case STATE_INSIDE_QUOTE:
315 if (cp == u'\'') {
316 state = STATE_AFTER_QUOTE;
317 offset += count;
318 // continue to the next code point
319 break;
320 } else {
321 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
322 }
323 case STATE_AFTER_QUOTE:
324 if (cp == u'\'') {
325 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
326 } else {
327 state = STATE_BASE;
328 // re-evaluate this code point
329 break;
330 }
331 case STATE_FIRST_CURR:
332 if (cp == u'¤') {
333 state = STATE_SECOND_CURR;
334 offset += count;
335 // continue to the next code point
336 break;
337 } else {
338 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
339 }
340 case STATE_SECOND_CURR:
341 if (cp == u'¤') {
342 state = STATE_THIRD_CURR;
343 offset += count;
344 // continue to the next code point
345 break;
346 } else {
347 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
348 }
349 case STATE_THIRD_CURR:
350 if (cp == u'¤') {
351 state = STATE_FOURTH_CURR;
352 offset += count;
353 // continue to the next code point
354 break;
355 } else {
356 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
357 }
358 case STATE_FOURTH_CURR:
359 if (cp == u'¤') {
360 state = STATE_FIFTH_CURR;
361 offset += count;
362 // continue to the next code point
363 break;
364 } else {
365 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
366 }
367 case STATE_FIFTH_CURR:
368 if (cp == u'¤') {
369 state = STATE_OVERFLOW_CURR;
370 offset += count;
371 // continue to the next code point
372 break;
373 } else {
374 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
375 }
376 case STATE_OVERFLOW_CURR:
377 if (cp == u'¤') {
378 offset += count;
379 // continue to the next code point and loop back to this state
380 break;
381 } else {
382 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
383 }
384 default:
385 U_ASSERT(false);
386 }
387 }
388 // End of string
389 switch (state) {
390 case STATE_BASE:
391 // No more tokens in string.
392 return {-1};
393 case STATE_FIRST_QUOTE:
394 case STATE_INSIDE_QUOTE:
395 // For consistent behavior with the JDK and ICU 58, set an error here.
396 status = U_ILLEGAL_ARGUMENT_ERROR;
397 return {-1};
398 case STATE_AFTER_QUOTE:
399 // No more tokens in string.
400 return {-1};
401 case STATE_FIRST_CURR:
402 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
403 case STATE_SECOND_CURR:
404 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
405 case STATE_THIRD_CURR:
406 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
407 case STATE_FOURTH_CURR:
408 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
409 case STATE_FIFTH_CURR:
410 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
411 case STATE_OVERFLOW_CURR:
412 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
413 default:
414 U_ASSERT(false);
415 return {-1}; // suppress "control reaches end of non-void function"
416 }
417 }
418
hasNext(const AffixTag & tag,const UnicodeString & string)419 bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
420 // First check for the {-1} and default initializer syntax.
421 if (tag.offset < 0) {
422 return false;
423 } else if (tag.offset == 0) {
424 return string.length() > 0;
425 }
426 // The rest of the fields are safe to use now.
427 // Special case: the last character in string is an end quote.
428 if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
429 string.charAt(tag.offset) == u'\'') {
430 return false;
431 } else if (tag.state != STATE_BASE) {
432 return true;
433 } else {
434 return tag.offset < string.length();
435 }
436 }
437
438 #endif /* #if !UCONFIG_NO_FORMATTING */
439