• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 
4 // locdistance.cpp
5 // created: 2019may08 Markus W. Scherer
6 
7 #include "unicode/utypes.h"
8 #include "unicode/bytestrie.h"
9 #include "unicode/localematcher.h"
10 #include "unicode/locid.h"
11 #include "unicode/uobject.h"
12 #include "unicode/ures.h"
13 #include "cstring.h"
14 #include "locdistance.h"
15 #include "loclikelysubtags.h"
16 #include "uassert.h"
17 #include "ucln_cmn.h"
18 #include "uinvchar.h"
19 #include "umutex.h"
20 
21 U_NAMESPACE_BEGIN
22 
23 namespace {
24 
25 /**
26  * Bit flag used on the last character of a subtag in the trie.
27  * Must be set consistently by the builder and the lookup code.
28  */
29 constexpr int32_t END_OF_SUBTAG = 0x80;
30 /** Distance value bit flag, set by the builder. */
31 constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
32 /** Distance value bit flag, set by trieNext(). */
33 constexpr int32_t DISTANCE_IS_FINAL = 0x100;
34 constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
35 
36 constexpr int32_t ABOVE_THRESHOLD = 100;
37 
38 // Indexes into array of distances.
39 enum {
40     IX_DEF_LANG_DISTANCE,
41     IX_DEF_SCRIPT_DISTANCE,
42     IX_DEF_REGION_DISTANCE,
43     IX_MIN_REGION_DISTANCE,
44     IX_LIMIT
45 };
46 
47 LocaleDistance *gLocaleDistance = nullptr;
48 UInitOnce gInitOnce = U_INITONCE_INITIALIZER;
49 
cleanup()50 UBool U_CALLCONV cleanup() {
51     delete gLocaleDistance;
52     gLocaleDistance = nullptr;
53     gInitOnce.reset();
54     return TRUE;
55 }
56 
57 }  // namespace
58 
initLocaleDistance(UErrorCode & errorCode)59 void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
60     // This function is invoked only via umtx_initOnce().
61     U_ASSERT(gLocaleDistance == nullptr);
62     const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode);
63     if (U_FAILURE(errorCode)) { return; }
64     const LocaleDistanceData &data = likely.getDistanceData();
65     if (data.distanceTrieBytes == nullptr ||
66             data.regionToPartitions == nullptr || data.partitions == nullptr ||
67             // ok if no paradigms
68             data.distances == nullptr) {
69         errorCode = U_MISSING_RESOURCE_ERROR;
70         return;
71     }
72     gLocaleDistance = new LocaleDistance(data);
73     if (gLocaleDistance == nullptr) {
74         errorCode = U_MEMORY_ALLOCATION_ERROR;
75         return;
76     }
77     ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
78 }
79 
getSingleton(UErrorCode & errorCode)80 const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
81     if (U_FAILURE(errorCode)) { return nullptr; }
82     umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
83     return gLocaleDistance;
84 }
85 
LocaleDistance(const LocaleDistanceData & data)86 LocaleDistance::LocaleDistance(const LocaleDistanceData &data) :
87         trie(data.distanceTrieBytes),
88         regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
89         paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
90         defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
91         defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
92         defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
93         minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
94     // For the default demotion value, use the
95     // default region distance between unrelated Englishes.
96     // Thus, unless demotion is turned off,
97     // a mere region difference for one desired locale
98     // is as good as a perfect match for the next following desired locale.
99     // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
100     LSR en("en", "Latn", "US");
101     LSR enGB("en", "Latn", "GB");
102     const LSR *p_enGB = &enGB;
103     defaultDemotionPerDesiredLocale = getBestIndexAndDistance(en, &p_enGB, 1,
104             50, ULOCMATCH_FAVOR_LANGUAGE) & 0xff;
105 }
106 
getBestIndexAndDistance(const LSR & desired,const LSR ** supportedLSRs,int32_t supportedLSRsLength,int32_t threshold,ULocMatchFavorSubtag favorSubtag) const107 int32_t LocaleDistance::getBestIndexAndDistance(
108         const LSR &desired,
109         const LSR **supportedLSRs, int32_t supportedLSRsLength,
110         int32_t threshold, ULocMatchFavorSubtag favorSubtag) const {
111     BytesTrie iter(trie);
112     // Look up the desired language only once for all supported LSRs.
113     // Its "distance" is either a match point value of 0, or a non-match negative value.
114     // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
115     int32_t desLangDistance = trieNext(iter, desired.language, false);
116     uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
117     // Index of the supported LSR with the lowest distance.
118     int32_t bestIndex = -1;
119     for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
120         const LSR &supported = *supportedLSRs[slIndex];
121         bool star = false;
122         int32_t distance = desLangDistance;
123         if (distance >= 0) {
124             U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
125             if (slIndex != 0) {
126                 iter.resetToState64(desLangState);
127             }
128             distance = trieNext(iter, supported.language, true);
129         }
130         // Note: The data builder verifies that there are no rules with "any" (*) language and
131         // real (non *) script or region subtags.
132         // This means that if the lookup for either language fails we can use
133         // the default distances without further lookups.
134         int32_t flags;
135         if (distance >= 0) {
136             flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
137             distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
138         } else {  // <*, *>
139             if (uprv_strcmp(desired.language, supported.language) == 0) {
140                 distance = 0;
141             } else {
142                 distance = defaultLanguageDistance;
143             }
144             flags = 0;
145             star = true;
146         }
147         U_ASSERT(0 <= distance && distance <= 100);
148         // We implement "favor subtag" by reducing the language subtag distance
149         // (unscientifically reducing it to a quarter of the normal value),
150         // so that the script distance is relatively more important.
151         // For example, given a default language distance of 80, we reduce it to 20,
152         // which is below the default threshold of 50, which is the default script distance.
153         if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
154             distance >>= 2;
155         }
156         if (distance >= threshold) {
157             continue;
158         }
159 
160         int32_t scriptDistance;
161         if (star || flags != 0) {
162             if (uprv_strcmp(desired.script, supported.script) == 0) {
163                 scriptDistance = 0;
164             } else {
165                 scriptDistance = defaultScriptDistance;
166             }
167         } else {
168             scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
169                     desired.script, supported.script);
170             flags = scriptDistance & DISTANCE_IS_FINAL;
171             scriptDistance &= ~DISTANCE_IS_FINAL;
172         }
173         distance += scriptDistance;
174         if (distance >= threshold) {
175             continue;
176         }
177 
178         if (uprv_strcmp(desired.region, supported.region) == 0) {
179             // regionDistance = 0
180         } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
181             distance += defaultRegionDistance;
182         } else {
183             int32_t remainingThreshold = threshold - distance;
184             if (minRegionDistance >= remainingThreshold) {
185                 continue;
186             }
187 
188             // From here on we know the regions are not equal.
189             // Map each region to zero or more partitions. (zero = one non-matching string)
190             // (Each array of single-character partition strings is encoded as one string.)
191             // If either side has more than one, then we find the maximum distance.
192             // This could be optimized by adding some more structure, but probably not worth it.
193             distance += getRegionPartitionsDistance(
194                     iter, iter.getState64(),
195                     partitionsForRegion(desired),
196                     partitionsForRegion(supported),
197                     remainingThreshold);
198         }
199         if (distance < threshold) {
200             if (distance == 0) {
201                 return slIndex << 8;
202             }
203             bestIndex = slIndex;
204             threshold = distance;
205         }
206     }
207     return bestIndex >= 0 ? (bestIndex << 8) | threshold : 0xffffff00 | ABOVE_THRESHOLD;
208 }
209 
getDesSuppScriptDistance(BytesTrie & iter,uint64_t startState,const char * desired,const char * supported)210 int32_t LocaleDistance::getDesSuppScriptDistance(
211         BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
212     // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
213     int32_t distance = trieNext(iter, desired, false);
214     if (distance >= 0) {
215         distance = trieNext(iter, supported, true);
216     }
217     if (distance < 0) {
218         UStringTrieResult result = iter.resetToState64(startState).next(u'*');  // <*, *>
219         U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
220         if (uprv_strcmp(desired, supported) == 0) {
221             distance = 0;  // same script
222         } else {
223             distance = iter.getValue();
224             U_ASSERT(distance >= 0);
225         }
226         if (result == USTRINGTRIE_FINAL_VALUE) {
227             distance |= DISTANCE_IS_FINAL;
228         }
229     }
230     return distance;
231 }
232 
getRegionPartitionsDistance(BytesTrie & iter,uint64_t startState,const char * desiredPartitions,const char * supportedPartitions,int32_t threshold)233 int32_t LocaleDistance::getRegionPartitionsDistance(
234         BytesTrie &iter, uint64_t startState,
235         const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
236     char desired = *desiredPartitions++;
237     char supported = *supportedPartitions++;
238     U_ASSERT(desired != 0 && supported != 0);
239     // See if we have single desired/supported partitions, from NUL-terminated
240     // partition strings without explicit length.
241     bool suppLengthGt1 = *supportedPartitions != 0;  // gt1: more than 1 character
242     // equivalent to: if (desLength == 1 && suppLength == 1)
243     if (*desiredPartitions == 0 && !suppLengthGt1) {
244         // Fastpath for single desired/supported partitions.
245         UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
246         if (USTRINGTRIE_HAS_NEXT(result)) {
247             result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
248             if (USTRINGTRIE_HAS_VALUE(result)) {
249                 return iter.getValue();
250             }
251         }
252         return getFallbackRegionDistance(iter, startState);
253     }
254 
255     const char *supportedStart = supportedPartitions - 1;  // for restart of inner loop
256     int32_t regionDistance = 0;
257     // Fall back to * only once, not for each pair of partition strings.
258     bool star = false;
259     for (;;) {
260         // Look up each desired-partition string only once,
261         // not for each (desired, supported) pair.
262         UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
263         if (USTRINGTRIE_HAS_NEXT(result)) {
264             uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
265             for (;;) {
266                 result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
267                 int32_t d;
268                 if (USTRINGTRIE_HAS_VALUE(result)) {
269                     d = iter.getValue();
270                 } else if (star) {
271                     d = 0;
272                 } else {
273                     d = getFallbackRegionDistance(iter, startState);
274                     star = true;
275                 }
276                 if (d >= threshold) {
277                     return d;
278                 } else if (regionDistance < d) {
279                     regionDistance = d;
280                 }
281                 if ((supported = *supportedPartitions++) != 0) {
282                     iter.resetToState64(desState);
283                 } else {
284                     break;
285                 }
286             }
287         } else if (!star) {
288             int32_t d = getFallbackRegionDistance(iter, startState);
289             if (d >= threshold) {
290                 return d;
291             } else if (regionDistance < d) {
292                 regionDistance = d;
293             }
294             star = true;
295         }
296         if ((desired = *desiredPartitions++) != 0) {
297             iter.resetToState64(startState);
298             supportedPartitions = supportedStart;
299             supported = *supportedPartitions++;
300         } else {
301             break;
302         }
303     }
304     return regionDistance;
305 }
306 
getFallbackRegionDistance(BytesTrie & iter,uint64_t startState)307 int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
308 #if U_DEBUG
309     UStringTrieResult result =
310 #endif
311     iter.resetToState64(startState).next(u'*');  // <*, *>
312     U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
313     int32_t distance = iter.getValue();
314     U_ASSERT(distance >= 0);
315     return distance;
316 }
317 
trieNext(BytesTrie & iter,const char * s,bool wantValue)318 int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
319     uint8_t c;
320     if ((c = *s) == 0) {
321         return -1;  // no empty subtags in the distance data
322     }
323     for (;;) {
324         c = uprv_invCharToAscii(c);
325         // EBCDIC: If *s is not an invariant character,
326         // then c is now 0 and will simply not match anything, which is harmless.
327         uint8_t next = *++s;
328         if (next != 0) {
329             if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
330                 return -1;
331             }
332         } else {
333             // last character of this subtag
334             UStringTrieResult result = iter.next(c | END_OF_SUBTAG);
335             if (wantValue) {
336                 if (USTRINGTRIE_HAS_VALUE(result)) {
337                     int32_t value = iter.getValue();
338                     if (result == USTRINGTRIE_FINAL_VALUE) {
339                         value |= DISTANCE_IS_FINAL;
340                     }
341                     return value;
342                 }
343             } else {
344                 if (USTRINGTRIE_HAS_NEXT(result)) {
345                     return 0;
346                 }
347             }
348             return -1;
349         }
350         c = next;
351     }
352 }
353 
isParadigmLSR(const LSR & lsr) const354 UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
355     // Linear search for a very short list (length 6 as of 2019).
356     // If there are many paradigm LSRs we should use a hash set.
357     U_ASSERT(paradigmLSRsLength <= 15);
358     for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
359         if (lsr == paradigmLSRs[i]) { return true; }
360     }
361     return false;
362 }
363 
364 U_NAMESPACE_END
365