• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11 
12 #include "numparse_types.h"
13 #include "numparse_decimal.h"
14 #include "static_unicode_sets.h"
15 #include "numparse_utils.h"
16 #include "unicode/uchar.h"
17 #include "putilimp.h"
18 #include "number_decimalquantity.h"
19 
20 using namespace icu;
21 using namespace icu::numparse;
22 using namespace icu::numparse::impl;
23 
24 
DecimalMatcher(const DecimalFormatSymbols & symbols,const Grouper & grouper,parse_flags_t parseFlags)25 DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
26                                parse_flags_t parseFlags) {
27     if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
28         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
29         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
30     } else {
31         groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
32         decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
33     }
34     bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
35     unisets::Key groupingKey = strictSeparators ? unisets::STRICT_ALL_SEPARATORS
36                                                 : unisets::ALL_SEPARATORS;
37 
38     // Attempt to find separators in the static cache
39 
40     groupingUniSet = unisets::get(groupingKey);
41     unisets::Key decimalKey = unisets::chooseFrom(
42             decimalSeparator,
43             strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
44             strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
45     if (decimalKey >= 0) {
46         decimalUniSet = unisets::get(decimalKey);
47     } else if (!decimalSeparator.isEmpty()) {
48         auto* set = new UnicodeSet();
49         set->add(decimalSeparator.char32At(0));
50         set->freeze();
51         decimalUniSet = set;
52         fLocalDecimalUniSet.adoptInstead(set);
53     } else {
54         decimalUniSet = unisets::get(unisets::EMPTY);
55     }
56 
57     if (groupingKey >= 0 && decimalKey >= 0) {
58         // Everything is available in the static cache
59         separatorSet = groupingUniSet;
60         leadSet = unisets::get(
61                 strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
62                                  : unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
63     } else {
64         auto* set = new UnicodeSet();
65         set->addAll(*groupingUniSet);
66         set->addAll(*decimalUniSet);
67         set->freeze();
68         separatorSet = set;
69         fLocalSeparatorSet.adoptInstead(set);
70         leadSet = nullptr;
71     }
72 
73     UChar32 cpZero = symbols.getCodePointZero();
74     if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
75         // Uncommon case: okay to allocate.
76         auto digitStrings = new UnicodeString[10];
77         fLocalDigitStrings.adoptInstead(digitStrings);
78         for (int32_t i = 0; i <= 9; i++) {
79             digitStrings[i] = symbols.getConstDigitSymbol(i);
80         }
81     }
82 
83     requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
84     groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
85     integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
86     grouping1 = grouper.getPrimary();
87     grouping2 = grouper.getSecondary();
88 
89     // Fraction grouping parsing is disabled for now but could be enabled later.
90     // See http://bugs.icu-project.org/trac/ticket/10794
91     // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
92 }
93 
match(StringSegment & segment,ParsedNumber & result,UErrorCode & status) const94 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
95     return match(segment, result, 0, status);
96 }
97 
match(StringSegment & segment,ParsedNumber & result,int8_t exponentSign,UErrorCode &) const98 bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
99                            UErrorCode&) const {
100     if (result.seenNumber() && exponentSign == 0) {
101         // A number has already been consumed.
102         return false;
103     } else if (exponentSign != 0) {
104         // scientific notation always comes after the number
105         U_ASSERT(!result.quantity.bogus);
106     }
107 
108     // Initial offset before any character consumption.
109     int32_t initialOffset = segment.getOffset();
110 
111     // Return value: whether to ask for more characters.
112     bool maybeMore = false;
113 
114     // All digits consumed so far.
115     number::impl::DecimalQuantity digitsConsumed;
116     digitsConsumed.bogus = true;
117 
118     // The total number of digits after the decimal place, used for scaling the result.
119     int32_t digitsAfterDecimalPlace = 0;
120 
121     // The actual grouping and decimal separators used in the string.
122     // If non-null, we have seen that token.
123     UnicodeString actualGroupingString;
124     UnicodeString actualDecimalString;
125     actualGroupingString.setToBogus();
126     actualDecimalString.setToBogus();
127 
128     // Information for two groups: the previous group and the current group.
129     //
130     // Each group has three pieces of information:
131     //
132     // Offset: the string position of the beginning of the group, including a leading separator
133     // if there was a leading separator. This is needed in case we need to rewind the parse to
134     // that position.
135     //
136     // Separator type:
137     // 0 => beginning of string
138     // 1 => lead separator is a grouping separator
139     // 2 => lead separator is a decimal separator
140     //
141     // Count: the number of digits in the group. If -1, the group has been validated.
142     int32_t currGroupOffset = 0;
143     int32_t currGroupSepType = 0;
144     int32_t currGroupCount = 0;
145     int32_t prevGroupOffset = -1;
146     int32_t prevGroupSepType = -1;
147     int32_t prevGroupCount = -1;
148 
149     while (segment.length() > 0) {
150         maybeMore = false;
151 
152         // Attempt to match a digit.
153         int8_t digit = -1;
154 
155         // Try by code point digit value.
156         UChar32 cp = segment.getCodePoint();
157         if (u_isdigit(cp)) {
158             segment.adjustOffset(U16_LENGTH(cp));
159             digit = static_cast<int8_t>(u_digit(cp, 10));
160         }
161 
162         // Try by digit string.
163         if (digit == -1 && !fLocalDigitStrings.isNull()) {
164             for (int32_t i = 0; i < 10; i++) {
165                 const UnicodeString& str = fLocalDigitStrings[i];
166                 if (str.isEmpty()) {
167                     continue;
168                 }
169                 int32_t overlap = segment.getCommonPrefixLength(str);
170                 if (overlap == str.length()) {
171                     segment.adjustOffset(overlap);
172                     digit = static_cast<int8_t>(i);
173                     break;
174                 }
175                 maybeMore = maybeMore || (overlap == segment.length());
176             }
177         }
178 
179         if (digit >= 0) {
180             // Digit was found.
181             if (digitsConsumed.bogus) {
182                 digitsConsumed.bogus = false;
183                 digitsConsumed.clear();
184             }
185             digitsConsumed.appendDigit(digit, 0, true);
186             currGroupCount++;
187             if (!actualDecimalString.isBogus()) {
188                 digitsAfterDecimalPlace++;
189             }
190             continue;
191         }
192 
193         // Attempt to match a literal grouping or decimal separator.
194         bool isDecimal = false;
195         bool isGrouping = false;
196 
197         // 1) Attempt the decimal separator string literal.
198         // if (we have not seen a decimal separator yet) { ... }
199         if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
200             int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
201             maybeMore = maybeMore || (overlap == segment.length());
202             if (overlap == decimalSeparator.length()) {
203                 isDecimal = true;
204                 actualDecimalString = decimalSeparator;
205             }
206         }
207 
208         // 2) Attempt to match the actual grouping string literal.
209         if (!actualGroupingString.isBogus()) {
210             int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
211             maybeMore = maybeMore || (overlap == segment.length());
212             if (overlap == actualGroupingString.length()) {
213                 isGrouping = true;
214             }
215         }
216 
217         // 2.5) Attempt to match a new the grouping separator string literal.
218         // if (we have not seen a grouping or decimal separator yet) { ... }
219         if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
220             !groupingSeparator.isEmpty()) {
221             int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
222             maybeMore = maybeMore || (overlap == segment.length());
223             if (overlap == groupingSeparator.length()) {
224                 isGrouping = true;
225                 actualGroupingString = groupingSeparator;
226             }
227         }
228 
229         // 3) Attempt to match a decimal separator from the equivalence set.
230         // if (we have not seen a decimal separator yet) { ... }
231         // The !isGrouping is to confirm that we haven't yet matched the current character.
232         if (!isGrouping && actualDecimalString.isBogus()) {
233             if (decimalUniSet->contains(cp)) {
234                 isDecimal = true;
235                 actualDecimalString = UnicodeString(cp);
236             }
237         }
238 
239         // 4) Attempt to match a grouping separator from the equivalence set.
240         // if (we have not seen a grouping or decimal separator yet) { ... }
241         if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
242             if (groupingUniSet->contains(cp)) {
243                 isGrouping = true;
244                 actualGroupingString = UnicodeString(cp);
245             }
246         }
247 
248         // Leave if we failed to match this as a separator.
249         if (!isDecimal && !isGrouping) {
250             break;
251         }
252 
253         // Check for conditions when we don't want to accept the separator.
254         if (isDecimal && integerOnly) {
255             break;
256         } else if (currGroupSepType == 2 && isGrouping) {
257             // Fraction grouping
258             break;
259         }
260 
261         // Validate intermediate grouping sizes.
262         bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
263         bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
264         if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
265             // Invalid grouping sizes.
266             if (isGrouping && currGroupCount == 0) {
267                 // Trailing grouping separators: these are taken care of below
268                 U_ASSERT(currGroupSepType == 1);
269             } else if (requireGroupingMatch) {
270                 // Strict mode: reject the parse
271                 digitsConsumed.clear();
272                 digitsConsumed.bogus = true;
273             }
274             break;
275         } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
276             break;
277         } else {
278             // Grouping sizes OK so far.
279             prevGroupOffset = currGroupOffset;
280             prevGroupCount = currGroupCount;
281             if (isDecimal) {
282                 // Do not validate this group any more.
283                 prevGroupSepType = -1;
284             } else {
285                 prevGroupSepType = currGroupSepType;
286             }
287         }
288 
289         // OK to accept the separator.
290         // Special case: don't update currGroup if it is empty; this allows two grouping
291         // separators in a row in lenient mode.
292         if (currGroupCount != 0) {
293             currGroupOffset = segment.getOffset();
294         }
295         currGroupSepType = isGrouping ? 1 : 2;
296         currGroupCount = 0;
297         if (isGrouping) {
298             segment.adjustOffset(actualGroupingString.length());
299         } else {
300             segment.adjustOffset(actualDecimalString.length());
301         }
302     }
303 
304     // End of main loop.
305     // Back up if there was a trailing grouping separator.
306     // Shift prev -> curr so we can check it as a final group.
307     if (currGroupSepType != 2 && currGroupCount == 0) {
308         maybeMore = true;
309         segment.setOffset(currGroupOffset);
310         currGroupOffset = prevGroupOffset;
311         currGroupSepType = prevGroupSepType;
312         currGroupCount = prevGroupCount;
313         prevGroupOffset = -1;
314         prevGroupSepType = 0;
315         prevGroupCount = 1;
316     }
317 
318     // Validate final grouping sizes.
319     bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
320     bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
321     if (!requireGroupingMatch) {
322         // The cases we need to handle here are lone digits.
323         // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
324         // See more examples in numberformattestspecification.txt
325         int32_t digitsToRemove = 0;
326         if (!prevValidSecondary) {
327             segment.setOffset(prevGroupOffset);
328             digitsToRemove += prevGroupCount;
329             digitsToRemove += currGroupCount;
330         } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
331             maybeMore = true;
332             segment.setOffset(currGroupOffset);
333             digitsToRemove += currGroupCount;
334         }
335         if (digitsToRemove != 0) {
336             digitsConsumed.adjustMagnitude(-digitsToRemove);
337             digitsConsumed.truncate();
338         }
339         prevValidSecondary = true;
340         currValidPrimary = true;
341     }
342     if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
343         // Grouping failure.
344         digitsConsumed.bogus = true;
345     }
346 
347     // Strings that start with a separator but have no digits,
348     // or strings that failed a grouping size check.
349     if (digitsConsumed.bogus) {
350         maybeMore = maybeMore || (segment.length() == 0);
351         segment.setOffset(initialOffset);
352         return maybeMore;
353     }
354 
355     // We passed all inspections. Start post-processing.
356 
357     // Adjust for fraction part.
358     digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
359 
360     // Set the digits, either normal or exponent.
361     if (exponentSign != 0 && segment.getOffset() != initialOffset) {
362         bool overflow = false;
363         if (digitsConsumed.fitsInLong()) {
364             int64_t exponentLong = digitsConsumed.toLong(false);
365             U_ASSERT(exponentLong >= 0);
366             if (exponentLong <= INT32_MAX) {
367                 auto exponentInt = static_cast<int32_t>(exponentLong);
368                 if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
369                     overflow = true;
370                 }
371             } else {
372                 overflow = true;
373             }
374         } else {
375             overflow = true;
376         }
377         if (overflow) {
378             if (exponentSign == -1) {
379                 // Set to zero
380                 result.quantity.clear();
381             } else {
382                 // Set to infinity
383                 result.quantity.bogus = true;
384                 result.flags |= FLAG_INFINITY;
385             }
386         }
387     } else {
388         result.quantity = digitsConsumed;
389     }
390 
391     // Set other information into the result and return.
392     if (!actualDecimalString.isBogus()) {
393         result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
394     }
395     result.setCharsConsumed(segment);
396     return segment.length() == 0 || maybeMore;
397 }
398 
validateGroup(int32_t sepType,int32_t count,bool isPrimary) const399 bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
400     if (requireGroupingMatch) {
401         if (sepType == -1) {
402             // No such group (prevGroup before first shift).
403             return true;
404         } else if (sepType == 0) {
405             // First group.
406             if (isPrimary) {
407                 // No grouping separators is OK.
408                 return true;
409             } else {
410                 return count != 0 && count <= grouping2;
411             }
412         } else if (sepType == 1) {
413             // Middle group.
414             if (isPrimary) {
415                 return count == grouping1;
416             } else {
417                 return count == grouping2;
418             }
419         } else {
420             U_ASSERT(sepType == 2);
421             // After the decimal separator.
422             return true;
423         }
424     } else {
425         if (sepType == 1) {
426             // #11230: don't accept middle groups with only 1 digit.
427             return count != 1;
428         } else {
429             return true;
430         }
431     }
432 }
433 
smokeTest(const StringSegment & segment) const434 bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
435     // The common case uses a static leadSet for efficiency.
436     if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
437         return segment.startsWith(*leadSet);
438     }
439     if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
440         return true;
441     }
442     if (fLocalDigitStrings.isNull()) {
443         return false;
444     }
445     for (int32_t i = 0; i < 10; i++) {
446         if (segment.startsWith(fLocalDigitStrings[i])) {
447             return true;
448         }
449     }
450     return false;
451 }
452 
toString() const453 UnicodeString DecimalMatcher::toString() const {
454     return u"<Decimal>";
455 }
456 
457 
458 #endif /* #if !UCONFIG_NO_FORMATTING */
459