1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 #include "unicode/utypes.h"
5
6 #if !UCONFIG_NO_FORMATTING
7
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11
12 #include "numparse_types.h"
13 #include "numparse_decimal.h"
14 #include "static_unicode_sets.h"
15 #include "numparse_utils.h"
16 #include "unicode/uchar.h"
17 #include "putilimp.h"
18 #include "number_decimalquantity.h"
19 #include "string_segment.h"
20
21 using namespace icu;
22 using namespace icu::numparse;
23 using namespace icu::numparse::impl;
24
25
DecimalMatcher(const DecimalFormatSymbols & symbols,const Grouper & grouper,parse_flags_t parseFlags)26 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
27 parse_flags_t parseFlags) {
28 if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
29 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
30 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
31 } else {
32 groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
33 decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
34 }
35 bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
36 unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
37 : unisets::ALL_SEPARATORS;
38
39 // Attempt to find separators in the static cache
40
41 groupingUniSet = unisets::get(groupingKey);
42 unisets::Key decimalKey = unisets::chooseFrom(
43 decimalSeparator,
44 strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
45 strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
46 if (decimalKey >= 0) {
47 decimalUniSet = unisets::get(decimalKey);
48 } else if (!decimalSeparator.isEmpty()) {
49 auto* set = new UnicodeSet();
50 set->add(decimalSeparator.char32At(0));
51 set->freeze();
52 decimalUniSet = set;
53 fLocalDecimalUniSet.adoptInstead(set);
54 } else {
55 decimalUniSet = unisets::get(unisets::EMPTY);
56 }
57
58 if (groupingKey >= 0 && decimalKey >= 0) {
59 // Everything is available in the static cache
60 separatorSet = groupingUniSet;
61 leadSet = unisets::get(
62 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
63 : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
64 } else {
65 auto* set = new UnicodeSet();
66 set->addAll(*groupingUniSet);
67 set->addAll(*decimalUniSet);
68 set->freeze();
69 separatorSet = set;
70 fLocalSeparatorSet.adoptInstead(set);
71 leadSet = nullptr;
72 }
73
74 UChar32 cpZero = symbols.getCodePointZero();
75 if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
76 // Uncommon case: okay to allocate.
77 auto digitStrings = new UnicodeString[10];
78 fLocalDigitStrings.adoptInstead(digitStrings);
79 for (int32_t i = 0; i <= 9; i++) {
80 digitStrings[i] = symbols.getConstDigitSymbol(i);
81 }
82 }
83
84 requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
85 groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
86 integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
87 grouping1 = grouper.getPrimary();
88 grouping2 = grouper.getSecondary();
89
90 // Fraction grouping parsing is disabled for now but could be enabled later.
91 // See http://bugs.icu-project.org/trac/ticket/10794
92 // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
93 }
94
match(StringSegment & segment,ParsedNumber & result,UErrorCode & status) const95 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
96 return match(segment, result, 0, status);
97 }
98
match(StringSegment & segment,ParsedNumber & result,int8_t exponentSign,UErrorCode &) const99 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
100 UErrorCode&) const {
101 if (result.seenNumber() && exponentSign == 0) {
102 // A number has already been consumed.
103 return false;
104 } else if (exponentSign != 0) {
105 // scientific notation always comes after the number
106 U_ASSERT(!result.quantity.bogus);
107 }
108
109 // Initial offset before any character consumption.
110 int32_t initialOffset = segment.getOffset();
111
112 // Return value: whether to ask for more characters.
113 bool maybeMore = false;
114
115 // All digits consumed so far.
116 number::impl::DecimalQuantity digitsConsumed;
117 digitsConsumed.bogus = true;
118
119 // The total number of digits after the decimal place, used for scaling the result.
120 int32_t digitsAfterDecimalPlace = 0;
121
122 // The actual grouping and decimal separators used in the string.
123 // If non-null, we have seen that token.
124 UnicodeString actualGroupingString;
125 UnicodeString actualDecimalString;
126 actualGroupingString.setToBogus();
127 actualDecimalString.setToBogus();
128
129 // Information for two groups: the previous group and the current group.
130 //
131 // Each group has three pieces of information:
132 //
133 // Offset: the string position of the beginning of the group, including a leading separator
134 // if there was a leading separator. This is needed in case we need to rewind the parse to
135 // that position.
136 //
137 // Separator type:
138 // 0 => beginning of string
139 // 1 => lead separator is a grouping separator
140 // 2 => lead separator is a decimal separator
141 //
142 // Count: the number of digits in the group. If -1, the group has been validated.
143 int32_t currGroupOffset = 0;
144 int32_t currGroupSepType = 0;
145 int32_t currGroupCount = 0;
146 int32_t prevGroupOffset = -1;
147 int32_t prevGroupSepType = -1;
148 int32_t prevGroupCount = -1;
149
150 while (segment.length() > 0) {
151 maybeMore = false;
152
153 // Attempt to match a digit.
154 int8_t digit = -1;
155
156 // Try by code point digit value.
157 UChar32 cp = segment.getCodePoint();
158 if (u_isdigit(cp)) {
159 segment.adjustOffset(U16_LENGTH(cp));
160 digit = static_cast<int8_t>(u_digit(cp, 10));
161 }
162
163 // Try by digit string.
164 if (digit == -1 && !fLocalDigitStrings.isNull()) {
165 for (int32_t i = 0; i < 10; i++) {
166 const UnicodeString& str = fLocalDigitStrings[i];
167 if (str.isEmpty()) {
168 continue;
169 }
170 int32_t overlap = segment.getCommonPrefixLength(str);
171 if (overlap == str.length()) {
172 segment.adjustOffset(overlap);
173 digit = static_cast<int8_t>(i);
174 break;
175 }
176 maybeMore = maybeMore || (overlap == segment.length());
177 }
178 }
179
180 if (digit >= 0) {
181 // Digit was found.
182 if (digitsConsumed.bogus) {
183 digitsConsumed.bogus = false;
184 digitsConsumed.clear();
185 }
186 digitsConsumed.appendDigit(digit, 0, true);
187 currGroupCount++;
188 if (!actualDecimalString.isBogus()) {
189 digitsAfterDecimalPlace++;
190 }
191 continue;
192 }
193
194 // Attempt to match a literal grouping or decimal separator.
195 bool isDecimal = false;
196 bool isGrouping = false;
197
198 // 1) Attempt the decimal separator string literal.
199 // if (we have not seen a decimal separator yet) { ... }
200 if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
201 int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
202 maybeMore = maybeMore || (overlap == segment.length());
203 if (overlap == decimalSeparator.length()) {
204 isDecimal = true;
205 actualDecimalString = decimalSeparator;
206 }
207 }
208
209 // 2) Attempt to match the actual grouping string literal.
210 if (!actualGroupingString.isBogus()) {
211 int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
212 maybeMore = maybeMore || (overlap == segment.length());
213 if (overlap == actualGroupingString.length()) {
214 isGrouping = true;
215 }
216 }
217
218 // 2.5) Attempt to match a new the grouping separator string literal.
219 // if (we have not seen a grouping or decimal separator yet) { ... }
220 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
221 !groupingSeparator.isEmpty()) {
222 int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
223 maybeMore = maybeMore || (overlap == segment.length());
224 if (overlap == groupingSeparator.length()) {
225 isGrouping = true;
226 actualGroupingString = groupingSeparator;
227 }
228 }
229
230 // 3) Attempt to match a decimal separator from the equivalence set.
231 // if (we have not seen a decimal separator yet) { ... }
232 // The !isGrouping is to confirm that we haven't yet matched the current character.
233 if (!isGrouping && actualDecimalString.isBogus()) {
234 if (decimalUniSet->contains(cp)) {
235 isDecimal = true;
236 actualDecimalString = UnicodeString(cp);
237 }
238 }
239
240 // 4) Attempt to match a grouping separator from the equivalence set.
241 // if (we have not seen a grouping or decimal separator yet) { ... }
242 if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
243 if (groupingUniSet->contains(cp)) {
244 isGrouping = true;
245 actualGroupingString = UnicodeString(cp);
246 }
247 }
248
249 // Leave if we failed to match this as a separator.
250 if (!isDecimal && !isGrouping) {
251 break;
252 }
253
254 // Check for conditions when we don't want to accept the separator.
255 if (isDecimal && integerOnly) {
256 break;
257 } else if (currGroupSepType == 2 && isGrouping) {
258 // Fraction grouping
259 break;
260 }
261
262 // Validate intermediate grouping sizes.
263 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
264 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
265 if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
266 // Invalid grouping sizes.
267 if (isGrouping && currGroupCount == 0) {
268 // Trailing grouping separators: these are taken care of below
269 U_ASSERT(currGroupSepType == 1);
270 } else if (requireGroupingMatch) {
271 // Strict mode: reject the parse
272 digitsConsumed.clear();
273 digitsConsumed.bogus = true;
274 }
275 break;
276 } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
277 break;
278 } else {
279 // Grouping sizes OK so far.
280 prevGroupOffset = currGroupOffset;
281 prevGroupCount = currGroupCount;
282 if (isDecimal) {
283 // Do not validate this group any more.
284 prevGroupSepType = -1;
285 } else {
286 prevGroupSepType = currGroupSepType;
287 }
288 }
289
290 // OK to accept the separator.
291 // Special case: don't update currGroup if it is empty; this allows two grouping
292 // separators in a row in lenient mode.
293 if (currGroupCount != 0) {
294 currGroupOffset = segment.getOffset();
295 }
296 currGroupSepType = isGrouping ? 1 : 2;
297 currGroupCount = 0;
298 if (isGrouping) {
299 segment.adjustOffset(actualGroupingString.length());
300 } else {
301 segment.adjustOffset(actualDecimalString.length());
302 }
303 }
304
305 // End of main loop.
306 // Back up if there was a trailing grouping separator.
307 // Shift prev -> curr so we can check it as a final group.
308 if (currGroupSepType != 2 && currGroupCount == 0) {
309 maybeMore = true;
310 segment.setOffset(currGroupOffset);
311 currGroupOffset = prevGroupOffset;
312 currGroupSepType = prevGroupSepType;
313 currGroupCount = prevGroupCount;
314 prevGroupOffset = -1;
315 prevGroupSepType = 0;
316 prevGroupCount = 1;
317 }
318
319 // Validate final grouping sizes.
320 bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
321 bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
322 if (!requireGroupingMatch) {
323 // The cases we need to handle here are lone digits.
324 // Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
325 // See more examples in numberformattestspecification.txt
326 int32_t digitsToRemove = 0;
327 if (!prevValidSecondary) {
328 segment.setOffset(prevGroupOffset);
329 digitsToRemove += prevGroupCount;
330 digitsToRemove += currGroupCount;
331 } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
332 maybeMore = true;
333 segment.setOffset(currGroupOffset);
334 digitsToRemove += currGroupCount;
335 }
336 if (digitsToRemove != 0) {
337 digitsConsumed.adjustMagnitude(-digitsToRemove);
338 digitsConsumed.truncate();
339 }
340 prevValidSecondary = true;
341 currValidPrimary = true;
342 }
343 if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
344 // Grouping failure.
345 digitsConsumed.bogus = true;
346 }
347
348 // Strings that start with a separator but have no digits,
349 // or strings that failed a grouping size check.
350 if (digitsConsumed.bogus) {
351 maybeMore = maybeMore || (segment.length() == 0);
352 segment.setOffset(initialOffset);
353 return maybeMore;
354 }
355
356 // We passed all inspections. Start post-processing.
357
358 // Adjust for fraction part.
359 digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
360
361 // Set the digits, either normal or exponent.
362 if (exponentSign != 0 && segment.getOffset() != initialOffset) {
363 bool overflow = false;
364 if (digitsConsumed.fitsInLong()) {
365 int64_t exponentLong = digitsConsumed.toLong(false);
366 U_ASSERT(exponentLong >= 0);
367 if (exponentLong <= INT32_MAX) {
368 auto exponentInt = static_cast<int32_t>(exponentLong);
369 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
370 overflow = true;
371 }
372 } else {
373 overflow = true;
374 }
375 } else {
376 overflow = true;
377 }
378 if (overflow) {
379 if (exponentSign == -1) {
380 // Set to zero
381 result.quantity.clear();
382 } else {
383 // Set to infinity
384 result.quantity.bogus = true;
385 result.flags |= FLAG_INFINITY;
386 }
387 }
388 } else {
389 result.quantity = digitsConsumed;
390 }
391
392 // Set other information into the result and return.
393 if (!actualDecimalString.isBogus()) {
394 result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
395 }
396 result.setCharsConsumed(segment);
397 return segment.length() == 0 || maybeMore;
398 }
399
validateGroup(int32_t sepType,int32_t count,bool isPrimary) const400 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
401 if (requireGroupingMatch) {
402 if (sepType == -1) {
403 // No such group (prevGroup before first shift).
404 return true;
405 } else if (sepType == 0) {
406 // First group.
407 if (isPrimary) {
408 // No grouping separators is OK.
409 return true;
410 } else {
411 return count != 0 && count <= grouping2;
412 }
413 } else if (sepType == 1) {
414 // Middle group.
415 if (isPrimary) {
416 return count == grouping1;
417 } else {
418 return count == grouping2;
419 }
420 } else {
421 U_ASSERT(sepType == 2);
422 // After the decimal separator.
423 return true;
424 }
425 } else {
426 if (sepType == 1) {
427 // #11230: don't accept middle groups with only 1 digit.
428 return count != 1;
429 } else {
430 return true;
431 }
432 }
433 }
434
smokeTest(const StringSegment & segment) const435 bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
436 // The common case uses a static leadSet for efficiency.
437 if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
438 return segment.startsWith(*leadSet);
439 }
440 if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
441 return true;
442 }
443 if (fLocalDigitStrings.isNull()) {
444 return false;
445 }
446 for (int32_t i = 0; i < 10; i++) {
447 if (segment.startsWith(fLocalDigitStrings[i])) {
448 return true;
449 }
450 }
451 return false;
452 }
453
toString() const454 UnicodeString DecimalMatcher::toString() const {
455 return u"<Decimal>";
456 }
457
458
459 #endif /* #if !UCONFIG_NO_FORMATTING */
460