• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2018 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 #include "unicode/utypes.h"
5 
6 #if !UCONFIG_NO_FORMATTING
7 
8 // Allow implicit conversion from char16_t* to UnicodeString for this file:
9 // Helpful in toString methods and elsewhere.
10 #define UNISTR_FROM_STRING_EXPLICIT
11 
12 #include "numparse_types.h"
13 #include "numparse_affixes.h"
14 #include "numparse_utils.h"
15 #include "number_utils.h"
16 #include "string_segment.h"
17 
18 using namespace icu;
19 using namespace icu::numparse;
20 using namespace icu::numparse::impl;
21 using namespace icu::number;
22 using namespace icu::number::impl;
23 
24 
25 namespace {
26 
27 /**
28  * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
29  * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
30  * the given pattern string.
31  */
matched(const AffixPatternMatcher * affix,const UnicodeString & patternString)32 static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) {
33     return (affix == nullptr && patternString.isBogus()) ||
34            (affix != nullptr && affix->getPattern() == patternString);
35 }
36 
37 /**
38  * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
39  */
length(const AffixPatternMatcher * matcher)40 static int32_t length(const AffixPatternMatcher* matcher) {
41     return matcher == nullptr ? 0 : matcher->getPattern().length();
42 }
43 
44 /**
45  * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
46  * valid, whether they are equal according to operator==.  Similar to Java Objects.equals()
47  */
equals(const AffixPatternMatcher * lhs,const AffixPatternMatcher * rhs)48 static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) {
49     if (lhs == nullptr && rhs == nullptr) {
50         return true;
51     }
52     if (lhs == nullptr || rhs == nullptr) {
53         return false;
54     }
55     return *lhs == *rhs;
56 }
57 
58 }
59 
60 
AffixPatternMatcherBuilder(const UnicodeString & pattern,AffixTokenMatcherWarehouse & warehouse,IgnorablesMatcher * ignorables)61 AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern,
62                                                        AffixTokenMatcherWarehouse& warehouse,
63                                                        IgnorablesMatcher* ignorables)
64         : fMatchersLen(0),
65           fLastTypeOrCp(0),
66           fPattern(pattern),
67           fWarehouse(warehouse),
68           fIgnorables(ignorables) {}
69 
consumeToken(AffixPatternType type,UChar32 cp,UErrorCode & status)70 void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) {
71     // This is called by AffixUtils.iterateWithConsumer() for each token.
72 
73     // Add an ignorables matcher between tokens except between two literals, and don't put two
74     // ignorables matchers in a row.
75     if (fIgnorables != nullptr && fMatchersLen > 0 &&
76         (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) {
77         addMatcher(*fIgnorables);
78     }
79 
80     if (type != TYPE_CODEPOINT) {
81         // Case 1: the token is a symbol.
82         switch (type) {
83             case TYPE_MINUS_SIGN:
84                 addMatcher(fWarehouse.minusSign());
85                 break;
86             case TYPE_PLUS_SIGN:
87                 addMatcher(fWarehouse.plusSign());
88                 break;
89             case TYPE_PERCENT:
90                 addMatcher(fWarehouse.percent());
91                 break;
92             case TYPE_PERMILLE:
93                 addMatcher(fWarehouse.permille());
94                 break;
95             case TYPE_CURRENCY_SINGLE:
96             case TYPE_CURRENCY_DOUBLE:
97             case TYPE_CURRENCY_TRIPLE:
98             case TYPE_CURRENCY_QUAD:
99             case TYPE_CURRENCY_QUINT:
100                 // All currency symbols use the same matcher
101                 addMatcher(fWarehouse.currency(status));
102                 break;
103             default:
104                 UPRV_UNREACHABLE_EXIT;
105         }
106 
107     } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) {
108         // Case 2: the token is an ignorable literal.
109         // No action necessary: the ignorables matcher has already been added.
110 
111     } else {
112         // Case 3: the token is a non-ignorable literal.
113         if (auto* ptr = fWarehouse.nextCodePointMatcher(cp, status)) {
114             addMatcher(*ptr);
115         } else {
116             // OOM; unwind the stack
117             return;
118         }
119     }
120     fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp;
121 }
122 
addMatcher(NumberParseMatcher & matcher)123 void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) {
124     if (fMatchersLen >= fMatchers.getCapacity()) {
125         fMatchers.resize(fMatchersLen * 2, fMatchersLen);
126     }
127     fMatchers[fMatchersLen++] = &matcher;
128 }
129 
build(UErrorCode & status)130 AffixPatternMatcher AffixPatternMatcherBuilder::build(UErrorCode& status) {
131     return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern, status);
132 }
133 
AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData * setupData)134 AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
135         : fSetupData(setupData) {}
136 
minusSign()137 NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
138     return fMinusSign = {fSetupData->dfs, true};
139 }
140 
plusSign()141 NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
142     return fPlusSign = {fSetupData->dfs, true};
143 }
144 
percent()145 NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
146     return fPercent = {fSetupData->dfs};
147 }
148 
permille()149 NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
150     return fPermille = {fSetupData->dfs};
151 }
152 
currency(UErrorCode & status)153 NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
154     return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status};
155 }
156 
ignorables()157 IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
158     return fSetupData->ignorables;
159 }
160 
nextCodePointMatcher(UChar32 cp,UErrorCode & status)161 NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) {
162     if (U_FAILURE(status)) {
163         return nullptr;
164     }
165     auto* result = fCodePoints.create(cp);
166     if (result == nullptr) {
167         status = U_MEMORY_ALLOCATION_ERROR;
168     }
169     return result;
170 }
171 
hasEmptyCurrencySymbol() const172 bool AffixTokenMatcherWarehouse::hasEmptyCurrencySymbol() const {
173     return fSetupData->currencySymbols.hasEmptyCurrencySymbol();
174 }
175 
176 
CodePointMatcher(UChar32 cp)177 CodePointMatcher::CodePointMatcher(UChar32 cp)
178         : fCp(cp) {}
179 
match(StringSegment & segment,ParsedNumber & result,UErrorCode &) const180 bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
181     if (segment.startsWith(fCp)) {
182         segment.adjustOffsetByCodePoint();
183         result.setCharsConsumed(segment);
184     }
185     return false;
186 }
187 
smokeTest(const StringSegment & segment) const188 bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
189     return segment.startsWith(fCp);
190 }
191 
toString() const192 UnicodeString CodePointMatcher::toString() const {
193     return u"<CodePoint>";
194 }
195 
196 
fromAffixPattern(const UnicodeString & affixPattern,AffixTokenMatcherWarehouse & tokenWarehouse,parse_flags_t parseFlags,bool * success,UErrorCode & status)197 AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
198                                                           AffixTokenMatcherWarehouse& tokenWarehouse,
199                                                           parse_flags_t parseFlags, bool* success,
200                                                           UErrorCode& status) {
201     if (affixPattern.isEmpty()) {
202         *success = false;
203         return {};
204     }
205     *success = true;
206 
207     IgnorablesMatcher* ignorables;
208     if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) {
209         ignorables = nullptr;
210     } else {
211         ignorables = &tokenWarehouse.ignorables();
212     }
213 
214     AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
215     AffixUtils::iterateWithConsumer(affixPattern, builder, status);
216     return builder.build(status);
217 }
218 
AffixPatternMatcher(MatcherArray & matchers,int32_t matchersLen,const UnicodeString & pattern,UErrorCode & status)219 AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen,
220                                          const UnicodeString& pattern, UErrorCode& status)
221     : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern, status) {
222 }
223 
getPattern() const224 UnicodeString AffixPatternMatcher::getPattern() const {
225     return fPattern.toAliasedUnicodeString();
226 }
227 
operator ==(const AffixPatternMatcher & other) const228 bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
229     return fPattern == other.fPattern;
230 }
231 
232 
AffixMatcherWarehouse(AffixTokenMatcherWarehouse * tokenWarehouse)233 AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
234         : fTokenWarehouse(tokenWarehouse) {
235 }
236 
isInteresting(const AffixPatternProvider & patternInfo,const IgnorablesMatcher & ignorables,parse_flags_t parseFlags,UErrorCode & status)237 bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
238                                           const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
239                                           UErrorCode& status) {
240     UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX);
241     UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX);
242     UnicodeString negPrefixString;
243     UnicodeString negSuffixString;
244     if (patternInfo.hasNegativeSubpattern()) {
245         negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX);
246         negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX);
247     }
248 
249     if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) &&
250         AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) &&
251         AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) &&
252         AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) &&
253         AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status)
254         // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
255         // trailing in the pattern string.
256         && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) &&
257         !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) &&
258         !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) &&
259         !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) {
260         // The affixes contain only symbols and ignorables.
261         // No need to generate affix matchers.
262         return false;
263     }
264     return true;
265 }
266 
createAffixMatchers(const AffixPatternProvider & patternInfo,MutableMatcherCollection & output,const IgnorablesMatcher & ignorables,parse_flags_t parseFlags,UErrorCode & status)267 void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
268                                                 MutableMatcherCollection& output,
269                                                 const IgnorablesMatcher& ignorables,
270                                                 parse_flags_t parseFlags, UErrorCode& status) {
271     if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
272         return;
273     }
274 
275     // The affixes have interesting characters, or we are in strict mode.
276     // Use initial capacity of 6, the highest possible number of AffixMatchers.
277     UnicodeString sb;
278     bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
279 
280     int32_t numAffixMatchers = 0;
281     int32_t numAffixPatternMatchers = 0;
282 
283     AffixPatternMatcher* posPrefix = nullptr;
284     AffixPatternMatcher* posSuffix = nullptr;
285 
286     // Pre-process the affix strings to resolve LDML rules like sign display.
287     for (int8_t typeInt = 0; typeInt < PATTERN_SIGN_TYPE_COUNT * 2; typeInt++) {
288         auto type = static_cast<PatternSignType>(typeInt / 2);
289         bool dropCurrencySymbols = (typeInt % 2) == 1;
290 
291         if (dropCurrencySymbols && !patternInfo.hasCurrencySign()) {
292             continue;
293         }
294         if (dropCurrencySymbols && !fTokenWarehouse->hasEmptyCurrencySymbol()) {
295             continue;
296         }
297 
298         // Skip affixes in some cases
299         if (type == PATTERN_SIGN_TYPE_POS
300                 && 0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) {
301             continue;
302         }
303         if (type == PATTERN_SIGN_TYPE_POS_SIGN
304                 && 0 == (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) {
305             continue;
306         }
307 
308         // Generate Prefix
309         // TODO: Handle approximately sign?
310         bool hasPrefix = false;
311         PatternStringUtils::patternInfoToStringBuilder(
312                 patternInfo, true, type, false, StandardPlural::OTHER, false, dropCurrencySymbols, sb);
313         fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
314                 sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
315         AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
316                                                 : nullptr;
317 
318         // Generate Suffix
319         // TODO: Handle approximately sign?
320         bool hasSuffix = false;
321         PatternStringUtils::patternInfoToStringBuilder(
322                 patternInfo, false, type, false, StandardPlural::OTHER, false, dropCurrencySymbols, sb);
323         fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
324                 sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
325         AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
326                                                 : nullptr;
327 
328         if (type == PATTERN_SIGN_TYPE_POS) {
329             posPrefix = prefix;
330             posSuffix = suffix;
331         } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) {
332             // Skip adding these matchers (we already have equivalents)
333             continue;
334         }
335 
336         // Flags for setting in the ParsedNumber; the token matchers may add more.
337         int flags = (type == PATTERN_SIGN_TYPE_NEG) ? FLAG_NEGATIVE : 0;
338 
339         // Note: it is indeed possible for posPrefix and posSuffix to both be null.
340         // We still need to add that matcher for strict mode to work.
341         fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
342         if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
343             // The following if statements are designed to prevent adding two identical matchers.
344             if (type == PATTERN_SIGN_TYPE_POS || !equals(prefix, posPrefix)) {
345                 fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
346             }
347             if (type == PATTERN_SIGN_TYPE_POS || !equals(suffix, posSuffix)) {
348                 fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
349             }
350         }
351     }
352 
353     // Put the AffixMatchers in order, and then add them to the output.
354     // Since there are at most 9 elements, do a simple-to-implement bubble sort.
355     bool madeChanges;
356     do {
357         madeChanges = false;
358         for (int32_t i = 1; i < numAffixMatchers; i++) {
359             if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
360                 madeChanges = true;
361                 AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
362                 fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
363                 fAffixMatchers[i] = std::move(temp);
364             }
365         }
366     } while (madeChanges);
367 
368     for (int32_t i = 0; i < numAffixMatchers; i++) {
369         // Enable the following line to debug affixes
370         //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
371         output.addMatcher(fAffixMatchers[i]);
372     }
373 }
374 
375 
AffixMatcher(AffixPatternMatcher * prefix,AffixPatternMatcher * suffix,result_flags_t flags)376 AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags)
377         : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {}
378 
match(StringSegment & segment,ParsedNumber & result,UErrorCode & status) const379 bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
380     if (!result.seenNumber()) {
381         // Prefix
382         // Do not match if:
383         // 1. We have already seen a prefix (result.prefix != null)
384         // 2. The prefix in this AffixMatcher is empty (prefix == null)
385         if (!result.prefix.isBogus() || fPrefix == nullptr) {
386             return false;
387         }
388 
389         // Attempt to match the prefix.
390         int initialOffset = segment.getOffset();
391         bool maybeMore = fPrefix->match(segment, result, status);
392         if (initialOffset != segment.getOffset()) {
393             result.prefix = fPrefix->getPattern();
394         }
395         return maybeMore;
396 
397     } else {
398         // Suffix
399         // Do not match if:
400         // 1. We have already seen a suffix (result.suffix != null)
401         // 2. The suffix in this AffixMatcher is empty (suffix == null)
402         // 3. The matched prefix does not equal this AffixMatcher's prefix
403         if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) {
404             return false;
405         }
406 
407         // Attempt to match the suffix.
408         int initialOffset = segment.getOffset();
409         bool maybeMore = fSuffix->match(segment, result, status);
410         if (initialOffset != segment.getOffset()) {
411             result.suffix = fSuffix->getPattern();
412         }
413         return maybeMore;
414     }
415 }
416 
smokeTest(const StringSegment & segment) const417 bool AffixMatcher::smokeTest(const StringSegment& segment) const {
418     return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
419            (fSuffix != nullptr && fSuffix->smokeTest(segment));
420 }
421 
postProcess(ParsedNumber & result) const422 void AffixMatcher::postProcess(ParsedNumber& result) const {
423     // Check to see if our affix is the one that was matched. If so, set the flags in the result.
424     if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) {
425         // Fill in the result prefix and suffix with non-null values (empty string).
426         // Used by strict mode to determine whether an entire affix pair was matched.
427         if (result.prefix.isBogus()) {
428             result.prefix = UnicodeString();
429         }
430         if (result.suffix.isBogus()) {
431             result.suffix = UnicodeString();
432         }
433         result.flags |= fFlags;
434         if (fPrefix != nullptr) {
435             fPrefix->postProcess(result);
436         }
437         if (fSuffix != nullptr) {
438             fSuffix->postProcess(result);
439         }
440     }
441 }
442 
compareTo(const AffixMatcher & rhs) const443 int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
444     const AffixMatcher& lhs = *this;
445     if (length(lhs.fPrefix) != length(rhs.fPrefix)) {
446         return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1;
447     } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) {
448         return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1;
449     } else {
450         return 0;
451     }
452 }
453 
toString() const454 UnicodeString AffixMatcher::toString() const {
455     bool isNegative = 0 != (fFlags & FLAG_NEGATIVE);
456     return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") +
457            (fPrefix ? fPrefix->getPattern() : u"null") + u"#" +
458            (fSuffix ? fSuffix->getPattern() : u"null") + u">";
459 
460 }
461 
462 
463 #endif /* #if !UCONFIG_NO_FORMATTING */
464