1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // locdistance.h 5 // created: 2019may08 Markus W. Scherer 6 7 #ifndef __LOCDISTANCE_H__ 8 #define __LOCDISTANCE_H__ 9 10 #include "unicode/utypes.h" 11 #include "unicode/bytestrie.h" 12 #include "unicode/localematcher.h" 13 #include "unicode/locid.h" 14 #include "unicode/uobject.h" 15 #include "lsr.h" 16 17 U_NAMESPACE_BEGIN 18 19 struct LocaleDistanceData; 20 21 /** 22 * Offline-built data for LocaleMatcher. 23 * Mostly but not only the data for mapping locales to their maximized forms. 24 */ 25 class LocaleDistance final : public UMemory { 26 public: 27 static const LocaleDistance *getSingleton(UErrorCode &errorCode); 28 shiftDistance(int32_t distance)29 static int32_t shiftDistance(int32_t distance) { 30 return distance << DISTANCE_SHIFT; 31 } 32 getShiftedDistance(int32_t indexAndDistance)33 static int32_t getShiftedDistance(int32_t indexAndDistance) { 34 return indexAndDistance & DISTANCE_MASK; 35 } 36 getDistanceDouble(int32_t indexAndDistance)37 static double getDistanceDouble(int32_t indexAndDistance) { 38 double shiftedDistance = getShiftedDistance(indexAndDistance); 39 return shiftedDistance / (1 << DISTANCE_SHIFT); 40 } 41 getDistanceFloor(int32_t indexAndDistance)42 static int32_t getDistanceFloor(int32_t indexAndDistance) { 43 return (indexAndDistance & DISTANCE_MASK) >> DISTANCE_SHIFT; 44 } 45 getIndex(int32_t indexAndDistance)46 static int32_t getIndex(int32_t indexAndDistance) { 47 // assert indexAndDistance >= 0; 48 return indexAndDistance >> INDEX_SHIFT; 49 } 50 51 /** 52 * Finds the supported LSR with the smallest distance from the desired one. 53 * Equivalent LSR subtags must be normalized into a canonical form. 54 * 55 * <p>Returns the index of the lowest-distance supported LSR in the high bits 56 * (negative if none has a distance below the threshold), 57 * and its distance (0..ABOVE_THRESHOLD) in the low bits. 58 */ 59 int32_t getBestIndexAndDistance(const LSR &desired, 60 const LSR **supportedLSRs, int32_t supportedLSRsLength, 61 int32_t shiftedThreshold, 62 ULocMatchFavorSubtag favorSubtag, 63 ULocMatchDirection direction) const; 64 65 UBool isParadigmLSR(const LSR &lsr) const; 66 getDefaultScriptDistance()67 int32_t getDefaultScriptDistance() const { 68 return defaultScriptDistance; 69 } 70 getDefaultDemotionPerDesiredLocale()71 int32_t getDefaultDemotionPerDesiredLocale() const { 72 return defaultDemotionPerDesiredLocale; 73 } 74 75 private: 76 // The distance is shifted left to gain some fraction bits. 77 static constexpr int32_t DISTANCE_SHIFT = 3; 78 static constexpr int32_t DISTANCE_FRACTION_MASK = 7; 79 // 7 bits for 0..100 80 static constexpr int32_t DISTANCE_INT_SHIFT = 7; 81 static constexpr int32_t INDEX_SHIFT = DISTANCE_INT_SHIFT + DISTANCE_SHIFT; 82 static constexpr int32_t DISTANCE_MASK = 0x3ff; 83 // tic constexpr int32_t MAX_INDEX = 0x1fffff; // avoids sign bit 84 static constexpr int32_t INDEX_NEG_1 = 0xfffffc00; 85 86 LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely); 87 LocaleDistance(const LocaleDistance &other) = delete; 88 LocaleDistance &operator=(const LocaleDistance &other) = delete; 89 90 static void initLocaleDistance(UErrorCode &errorCode); 91 isMatch(const LSR & desired,const LSR & supported,int32_t shiftedThreshold,ULocMatchFavorSubtag favorSubtag)92 UBool isMatch(const LSR &desired, const LSR &supported, 93 int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag) const { 94 const LSR *pSupp = &supported; 95 return getBestIndexAndDistance( 96 desired, &pSupp, 1, 97 shiftedThreshold, favorSubtag, ULOCMATCH_DIRECTION_WITH_ONE_WAY) >= 0; 98 } 99 100 static int32_t getDesSuppScriptDistance(BytesTrie &iter, uint64_t startState, 101 const char *desired, const char *supported); 102 103 static int32_t getRegionPartitionsDistance( 104 BytesTrie &iter, uint64_t startState, 105 const char *desiredPartitions, const char *supportedPartitions, 106 int32_t threshold); 107 108 static int32_t getFallbackRegionDistance(BytesTrie &iter, uint64_t startState); 109 110 static int32_t trieNext(BytesTrie &iter, const char *s, bool wantValue); 111 partitionsForRegion(const LSR & lsr)112 const char *partitionsForRegion(const LSR &lsr) const { 113 // ill-formed region -> one non-matching string 114 int32_t pIndex = regionToPartitionsIndex[lsr.regionIndex]; 115 return partitionArrays[pIndex]; 116 } 117 getDefaultRegionDistance()118 int32_t getDefaultRegionDistance() const { 119 return defaultRegionDistance; 120 } 121 122 const XLikelySubtags &likelySubtags; 123 124 // The trie maps each dlang+slang+dscript+sscript+dregion+sregion 125 // (encoded in ASCII with bit 7 set on the last character of each subtag) to a distance. 126 // There is also a trie value for each subsequence of whole subtags. 127 // One '*' is used for a (desired, supported) pair of "und", "Zzzz"/"", or "ZZ"/"". 128 BytesTrie trie; 129 130 /** 131 * Maps each region to zero or more single-character partitions. 132 */ 133 const uint8_t *regionToPartitionsIndex; 134 const char **partitionArrays; 135 136 /** 137 * Used to get the paradigm region for a cluster, if there is one. 138 */ 139 const LSR *paradigmLSRs; 140 int32_t paradigmLSRsLength; 141 142 int32_t defaultLanguageDistance; 143 int32_t defaultScriptDistance; 144 int32_t defaultRegionDistance; 145 int32_t minRegionDistance; 146 int32_t defaultDemotionPerDesiredLocale; 147 }; 148 149 U_NAMESPACE_END 150 151 #endif // __LOCDISTANCE_H__ 152