1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // locdistance.cpp
5 // created: 2019may08 Markus W. Scherer
6
7 #include "unicode/utypes.h"
8 #include "unicode/bytestrie.h"
9 #include "unicode/localematcher.h"
10 #include "unicode/locid.h"
11 #include "unicode/uobject.h"
12 #include "unicode/ures.h"
13 #include "cstring.h"
14 #include "locdistance.h"
15 #include "loclikelysubtags.h"
16 #include "uassert.h"
17 #include "ucln_cmn.h"
18 #include "uinvchar.h"
19 #include "umutex.h"
20
21 U_NAMESPACE_BEGIN
22
23 namespace {
24
25 /**
26 * Bit flag used on the last character of a subtag in the trie.
27 * Must be set consistently by the builder and the lookup code.
28 */
29 constexpr int32_t END_OF_SUBTAG = 0x80;
30 /** Distance value bit flag, set by the builder. */
31 constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80;
32 /** Distance value bit flag, set by trieNext(). */
33 constexpr int32_t DISTANCE_IS_FINAL = 0x100;
34 constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT;
35
36 constexpr int32_t ABOVE_THRESHOLD = 100;
37
38 // Indexes into array of distances.
39 enum {
40 IX_DEF_LANG_DISTANCE,
41 IX_DEF_SCRIPT_DISTANCE,
42 IX_DEF_REGION_DISTANCE,
43 IX_MIN_REGION_DISTANCE,
44 IX_LIMIT
45 };
46
47 LocaleDistance *gLocaleDistance = nullptr;
48 UInitOnce gInitOnce {};
49
cleanup()50 UBool U_CALLCONV cleanup() {
51 delete gLocaleDistance;
52 gLocaleDistance = nullptr;
53 gInitOnce.reset();
54 return true;
55 }
56
57 } // namespace
58
initLocaleDistance(UErrorCode & errorCode)59 void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) {
60 // This function is invoked only via umtx_initOnce().
61 U_ASSERT(gLocaleDistance == nullptr);
62 const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode);
63 if (U_FAILURE(errorCode)) { return; }
64 const LocaleDistanceData &data = likely.getDistanceData();
65 if (data.distanceTrieBytes == nullptr ||
66 data.regionToPartitions == nullptr || data.partitions == nullptr ||
67 // ok if no paradigms
68 data.distances == nullptr) {
69 errorCode = U_MISSING_RESOURCE_ERROR;
70 return;
71 }
72 gLocaleDistance = new LocaleDistance(data, likely);
73 if (gLocaleDistance == nullptr) {
74 errorCode = U_MEMORY_ALLOCATION_ERROR;
75 return;
76 }
77 ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup);
78 }
79
getSingleton(UErrorCode & errorCode)80 const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) {
81 if (U_FAILURE(errorCode)) { return nullptr; }
82 umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode);
83 return gLocaleDistance;
84 }
85
LocaleDistance(const LocaleDistanceData & data,const XLikelySubtags & likely)86 LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) :
87 likelySubtags(likely),
88 trie(data.distanceTrieBytes),
89 regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions),
90 paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength),
91 defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]),
92 defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]),
93 defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]),
94 minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) {
95 // For the default demotion value, use the
96 // default region distance between unrelated Englishes.
97 // Thus, unless demotion is turned off,
98 // a mere region difference for one desired locale
99 // is as good as a perfect match for the next following desired locale.
100 // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>.
101 LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR);
102 LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR);
103 const LSR *p_enGB = &enGB;
104 int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1,
105 shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY);
106 defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance);
107 }
108
getBestIndexAndDistance(const LSR & desired,const LSR ** supportedLSRs,int32_t supportedLSRsLength,int32_t shiftedThreshold,ULocMatchFavorSubtag favorSubtag,ULocMatchDirection direction) const109 int32_t LocaleDistance::getBestIndexAndDistance(
110 const LSR &desired,
111 const LSR **supportedLSRs, int32_t supportedLSRsLength,
112 int32_t shiftedThreshold,
113 ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const {
114 BytesTrie iter(trie);
115 // Look up the desired language only once for all supported LSRs.
116 // Its "distance" is either a match point value of 0, or a non-match negative value.
117 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
118 int32_t desLangDistance = trieNext(iter, desired.language, false);
119 uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0;
120 // Index of the supported LSR with the lowest distance.
121 int32_t bestIndex = -1;
122 // Cached lookup info from XLikelySubtags.compareLikely().
123 int32_t bestLikelyInfo = -1;
124 for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) {
125 const LSR &supported = *supportedLSRs[slIndex];
126 bool star = false;
127 int32_t distance = desLangDistance;
128 if (distance >= 0) {
129 U_ASSERT((distance & DISTANCE_IS_FINAL) == 0);
130 if (slIndex != 0) {
131 iter.resetToState64(desLangState);
132 }
133 distance = trieNext(iter, supported.language, true);
134 }
135 // Note: The data builder verifies that there are no rules with "any" (*) language and
136 // real (non *) script or region subtags.
137 // This means that if the lookup for either language fails we can use
138 // the default distances without further lookups.
139 int32_t flags;
140 if (distance >= 0) {
141 flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
142 distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT;
143 } else { // <*, *>
144 if (uprv_strcmp(desired.language, supported.language) == 0) {
145 distance = 0;
146 } else {
147 distance = defaultLanguageDistance;
148 }
149 flags = 0;
150 star = true;
151 }
152 U_ASSERT(0 <= distance && distance <= 100);
153 // Round up the shifted threshold (if fraction bits are not 0)
154 // for comparison with un-shifted distances until we need fraction bits.
155 // (If we simply shifted non-zero fraction bits away, then we might ignore a language
156 // when it's really still a micro distance below the threshold.)
157 int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT;
158 // We implement "favor subtag" by reducing the language subtag distance
159 // (unscientifically reducing it to a quarter of the normal value),
160 // so that the script distance is relatively more important.
161 // For example, given a default language distance of 80, we reduce it to 20,
162 // which is below the default threshold of 50, which is the default script distance.
163 if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) {
164 distance >>= 2;
165 }
166 // Let distance == roundedThreshold pass until the tie-breaker logic
167 // at the end of the loop.
168 if (distance > roundedThreshold) {
169 continue;
170 }
171
172 int32_t scriptDistance;
173 if (star || flags != 0) {
174 if (uprv_strcmp(desired.script, supported.script) == 0) {
175 scriptDistance = 0;
176 } else {
177 scriptDistance = defaultScriptDistance;
178 }
179 } else {
180 scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(),
181 desired.script, supported.script);
182 flags = scriptDistance & DISTANCE_IS_FINAL;
183 scriptDistance &= ~DISTANCE_IS_FINAL;
184 }
185 distance += scriptDistance;
186 if (distance > roundedThreshold) {
187 continue;
188 }
189
190 if (uprv_strcmp(desired.region, supported.region) == 0) {
191 // regionDistance = 0
192 } else if (star || (flags & DISTANCE_IS_FINAL) != 0) {
193 distance += defaultRegionDistance;
194 } else {
195 int32_t remainingThreshold = roundedThreshold - distance;
196 if (minRegionDistance > remainingThreshold) {
197 continue;
198 }
199
200 // From here on we know the regions are not equal.
201 // Map each region to zero or more partitions. (zero = one non-matching string)
202 // (Each array of single-character partition strings is encoded as one string.)
203 // If either side has more than one, then we find the maximum distance.
204 // This could be optimized by adding some more structure, but probably not worth it.
205 distance += getRegionPartitionsDistance(
206 iter, iter.getState64(),
207 partitionsForRegion(desired),
208 partitionsForRegion(supported),
209 remainingThreshold);
210 }
211 int32_t shiftedDistance = shiftDistance(distance);
212 if (shiftedDistance == 0) {
213 // Distinguish between equivalent but originally unequal locales via an
214 // additional micro distance.
215 shiftedDistance |= (desired.flags ^ supported.flags);
216 if (shiftedDistance < shiftedThreshold) {
217 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
218 // Is there also a match when we swap desired/supported?
219 isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
220 if (shiftedDistance == 0) {
221 return slIndex << INDEX_SHIFT;
222 }
223 bestIndex = slIndex;
224 shiftedThreshold = shiftedDistance;
225 bestLikelyInfo = -1;
226 }
227 }
228 } else {
229 if (shiftedDistance < shiftedThreshold) {
230 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
231 // Is there also a match when we swap desired/supported?
232 isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
233 bestIndex = slIndex;
234 shiftedThreshold = shiftedDistance;
235 bestLikelyInfo = -1;
236 }
237 } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) {
238 if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY ||
239 // Is there also a match when we swap desired/supported?
240 isMatch(supported, desired, shiftedThreshold, favorSubtag)) {
241 bestLikelyInfo = likelySubtags.compareLikely(
242 supported, *supportedLSRs[bestIndex], bestLikelyInfo);
243 if ((bestLikelyInfo & 1) != 0) {
244 // This supported locale matches as well as the previous best match,
245 // and neither matches perfectly,
246 // but this one is "more likely" (has more-default subtags).
247 bestIndex = slIndex;
248 }
249 }
250 }
251 }
252 }
253 return bestIndex >= 0 ?
254 (bestIndex << INDEX_SHIFT) | shiftedThreshold :
255 INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD);
256 }
257
getDesSuppScriptDistance(BytesTrie & iter,uint64_t startState,const char * desired,const char * supported)258 int32_t LocaleDistance::getDesSuppScriptDistance(
259 BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) {
260 // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules.
261 int32_t distance = trieNext(iter, desired, false);
262 if (distance >= 0) {
263 distance = trieNext(iter, supported, true);
264 }
265 if (distance < 0) {
266 UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *>
267 U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
268 if (uprv_strcmp(desired, supported) == 0) {
269 distance = 0; // same script
270 } else {
271 distance = iter.getValue();
272 U_ASSERT(distance >= 0);
273 }
274 if (result == USTRINGTRIE_FINAL_VALUE) {
275 distance |= DISTANCE_IS_FINAL;
276 }
277 }
278 return distance;
279 }
280
getRegionPartitionsDistance(BytesTrie & iter,uint64_t startState,const char * desiredPartitions,const char * supportedPartitions,int32_t threshold)281 int32_t LocaleDistance::getRegionPartitionsDistance(
282 BytesTrie &iter, uint64_t startState,
283 const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) {
284 char desired = *desiredPartitions++;
285 char supported = *supportedPartitions++;
286 U_ASSERT(desired != 0 && supported != 0);
287 // See if we have single desired/supported partitions, from NUL-terminated
288 // partition strings without explicit length.
289 bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character
290 // equivalent to: if (desLength == 1 && suppLength == 1)
291 if (*desiredPartitions == 0 && !suppLengthGt1) {
292 // Fastpath for single desired/supported partitions.
293 UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
294 if (USTRINGTRIE_HAS_NEXT(result)) {
295 result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
296 if (USTRINGTRIE_HAS_VALUE(result)) {
297 return iter.getValue();
298 }
299 }
300 return getFallbackRegionDistance(iter, startState);
301 }
302
303 const char *supportedStart = supportedPartitions - 1; // for restart of inner loop
304 int32_t regionDistance = 0;
305 // Fall back to * only once, not for each pair of partition strings.
306 bool star = false;
307 for (;;) {
308 // Look up each desired-partition string only once,
309 // not for each (desired, supported) pair.
310 UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG);
311 if (USTRINGTRIE_HAS_NEXT(result)) {
312 uint64_t desState = suppLengthGt1 ? iter.getState64() : 0;
313 for (;;) {
314 result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG);
315 int32_t d;
316 if (USTRINGTRIE_HAS_VALUE(result)) {
317 d = iter.getValue();
318 } else if (star) {
319 d = 0;
320 } else {
321 d = getFallbackRegionDistance(iter, startState);
322 star = true;
323 }
324 if (d > threshold) {
325 return d;
326 } else if (regionDistance < d) {
327 regionDistance = d;
328 }
329 if ((supported = *supportedPartitions++) != 0) {
330 iter.resetToState64(desState);
331 } else {
332 break;
333 }
334 }
335 } else if (!star) {
336 int32_t d = getFallbackRegionDistance(iter, startState);
337 if (d > threshold) {
338 return d;
339 } else if (regionDistance < d) {
340 regionDistance = d;
341 }
342 star = true;
343 }
344 if ((desired = *desiredPartitions++) != 0) {
345 iter.resetToState64(startState);
346 supportedPartitions = supportedStart;
347 supported = *supportedPartitions++;
348 } else {
349 break;
350 }
351 }
352 return regionDistance;
353 }
354
getFallbackRegionDistance(BytesTrie & iter,uint64_t startState)355 int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) {
356 #if U_DEBUG
357 UStringTrieResult result =
358 #endif
359 iter.resetToState64(startState).next(u'*'); // <*, *>
360 U_ASSERT(USTRINGTRIE_HAS_VALUE(result));
361 int32_t distance = iter.getValue();
362 U_ASSERT(distance >= 0);
363 return distance;
364 }
365
trieNext(BytesTrie & iter,const char * s,bool wantValue)366 int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) {
367 uint8_t c;
368 if ((c = *s) == 0) {
369 return -1; // no empty subtags in the distance data
370 }
371 for (;;) {
372 c = uprv_invCharToAscii(c);
373 // EBCDIC: If *s is not an invariant character,
374 // then c is now 0 and will simply not match anything, which is harmless.
375 uint8_t next = *++s;
376 if (next != 0) {
377 if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) {
378 return -1;
379 }
380 } else {
381 // last character of this subtag
382 UStringTrieResult result = iter.next(c | END_OF_SUBTAG);
383 if (wantValue) {
384 if (USTRINGTRIE_HAS_VALUE(result)) {
385 int32_t value = iter.getValue();
386 if (result == USTRINGTRIE_FINAL_VALUE) {
387 value |= DISTANCE_IS_FINAL;
388 }
389 return value;
390 }
391 } else {
392 if (USTRINGTRIE_HAS_NEXT(result)) {
393 return 0;
394 }
395 }
396 return -1;
397 }
398 c = next;
399 }
400 }
401
isParadigmLSR(const LSR & lsr) const402 UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const {
403 // Linear search for a very short list (length 6 as of 2019),
404 // because we look for equivalence not equality, and
405 // because it's easy.
406 // If there are many paradigm LSRs we should use a hash set
407 // with custom comparator and hasher.
408 U_ASSERT(paradigmLSRsLength <= 15);
409 for (int32_t i = 0; i < paradigmLSRsLength; ++i) {
410 if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; }
411 }
412 return false;
413 }
414
415 U_NAMESPACE_END
416