1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationsettings.h 9 * 10 * created on: 2013feb07 11 * created by: Markus W. Scherer 12 */ 13 14 #ifndef __COLLATIONSETTINGS_H__ 15 #define __COLLATIONSETTINGS_H__ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION 20 21 #include "unicode/ucol.h" 22 #include "collation.h" 23 #include "sharedobject.h" 24 #include "umutex.h" 25 26 U_NAMESPACE_BEGIN 27 28 struct CollationData; 29 30 /** 31 * Collation settings/options/attributes. 32 * These are the values that can be changed via API. 33 */ 34 struct U_I18N_API CollationSettings : public SharedObject { 35 /** 36 * Options bit 0: Perform the FCD check on the input text and deliver normalized text. 37 */ 38 static const int32_t CHECK_FCD = 1; 39 /** 40 * Options bit 1: Numeric collation. 41 * Also known as CODAN = COllate Digits As Numbers. 42 * 43 * Treat digit sequences as numbers with CE sequences in numeric order, 44 * rather than returning a normal CE for each digit. 45 */ 46 static const int32_t NUMERIC = 2; 47 /** 48 * "Shifted" alternate handling, see ALTERNATE_MASK. 49 */ 50 static const int32_t SHIFTED = 4; 51 /** 52 * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable. 53 * Reserve values 8 and 0xc for shift-trimmed and blanked. 54 */ 55 static const int32_t ALTERNATE_MASK = 0xc; 56 /** 57 * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value. 58 */ 59 static const int32_t MAX_VARIABLE_SHIFT = 4; 60 /** maxVariable options bit mask before shifting. */ 61 static const int32_t MAX_VARIABLE_MASK = 0x70; 62 /** Options bit 7: Reserved/unused/0. */ 63 /** 64 * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on. 65 */ 66 static const int32_t UPPER_FIRST = 0x100; 67 /** 68 * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values) 69 * unless case level is on (when they are *moved* into the separate case level). 70 * By default, the case bits are removed from the tertiary weight (ignored). 71 * 72 * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to 73 * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST. 74 */ 75 static const int32_t CASE_FIRST = 0x200; 76 /** 77 * Options bit mask for caseFirst and upperFirst, before shifting. 78 * Same value as caseFirst==upperFirst. 79 */ 80 static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST; 81 /** 82 * Options bit 10: Insert the case level between the secondary and tertiary levels. 83 */ 84 static const int32_t CASE_LEVEL = 0x400; 85 /** 86 * Options bit 11: Compare secondary weights backwards. ("French secondary") 87 */ 88 static const int32_t BACKWARD_SECONDARY = 0x800; 89 /** 90 * Options bits 15..12: The 4-bit strength value bit field is shifted by this value. 91 * It is the top used bit field in the options. (No need to mask after shifting.) 92 */ 93 static const int32_t STRENGTH_SHIFT = 12; 94 /** Strength options bit mask before shifting. */ 95 static const int32_t STRENGTH_MASK = 0xf000; 96 97 /** maxVariable values */ 98 enum MaxVariable { 99 MAX_VAR_SPACE, 100 MAX_VAR_PUNCT, 101 MAX_VAR_SYMBOL, 102 MAX_VAR_CURRENCY 103 }; 104 CollationSettingsCollationSettings105 CollationSettings() 106 : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) | 107 (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)), 108 variableTop(0), 109 reorderTable(NULL), 110 minHighNoReorder(0), 111 reorderRanges(NULL), reorderRangesLength(0), 112 reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0), 113 fastLatinOptions(-1) {} 114 115 CollationSettings(const CollationSettings &other); 116 virtual ~CollationSettings(); 117 118 UBool operator==(const CollationSettings &other) const; 119 120 inline UBool operator!=(const CollationSettings &other) const { 121 return !operator==(other); 122 } 123 124 int32_t hashCode() const; 125 126 void resetReordering(); 127 void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length, 128 const uint32_t *ranges, int32_t rangesLength, 129 const uint8_t *table, UErrorCode &errorCode); 130 void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength, 131 UErrorCode &errorCode); 132 void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode); 133 hasReorderingCollationSettings134 inline UBool hasReordering() const { return reorderTable != NULL; } 135 static UBool reorderTableHasSplitBytes(const uint8_t table[256]); reorderCollationSettings136 inline uint32_t reorder(uint32_t p) const { 137 uint8_t b = reorderTable[p >> 24]; 138 if(b != 0 || p <= Collation::NO_CE_PRIMARY) { 139 return ((uint32_t)b << 24) | (p & 0xffffff); 140 } else { 141 return reorderEx(p); 142 } 143 } 144 145 void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); 146 getStrengthCollationSettings147 static int32_t getStrength(int32_t options) { 148 return options >> STRENGTH_SHIFT; 149 } 150 getStrengthCollationSettings151 int32_t getStrength() const { 152 return getStrength(options); 153 } 154 155 /** Sets the options bit for an on/off attribute. */ 156 void setFlag(int32_t bit, UColAttributeValue value, 157 int32_t defaultOptions, UErrorCode &errorCode); 158 getFlagCollationSettings159 UColAttributeValue getFlag(int32_t bit) const { 160 return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF; 161 } 162 163 void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode); 164 getCaseFirstCollationSettings165 UColAttributeValue getCaseFirst() const { 166 int32_t option = options & CASE_FIRST_AND_UPPER_MASK; 167 return (option == 0) ? UCOL_OFF : 168 (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST; 169 } 170 171 void setAlternateHandling(UColAttributeValue value, 172 int32_t defaultOptions, UErrorCode &errorCode); 173 getAlternateHandlingCollationSettings174 UColAttributeValue getAlternateHandling() const { 175 return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED; 176 } 177 178 void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); 179 getMaxVariableCollationSettings180 MaxVariable getMaxVariable() const { 181 return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT); 182 } 183 184 /** 185 * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off. 186 */ isTertiaryWithCaseBitsCollationSettings187 static inline UBool isTertiaryWithCaseBits(int32_t options) { 188 return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST; 189 } getTertiaryMaskCollationSettings190 static uint32_t getTertiaryMask(int32_t options) { 191 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 192 return isTertiaryWithCaseBits(options) ? 193 Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK; 194 } 195 sortsTertiaryUpperCaseFirstCollationSettings196 static UBool sortsTertiaryUpperCaseFirst(int32_t options) { 197 // On tertiary level, consider case bits and sort uppercase first 198 // if caseLevel is off and caseFirst==upperFirst. 199 return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK; 200 } 201 dontCheckFCDCollationSettings202 inline UBool dontCheckFCD() const { 203 return (options & CHECK_FCD) == 0; 204 } 205 hasBackwardSecondaryCollationSettings206 inline UBool hasBackwardSecondary() const { 207 return (options & BACKWARD_SECONDARY) != 0; 208 } 209 isNumericCollationSettings210 inline UBool isNumeric() const { 211 return (options & NUMERIC) != 0; 212 } 213 214 /** CHECK_FCD etc. */ 215 int32_t options; 216 /** Variable-top primary weight. */ 217 uint32_t variableTop; 218 /** 219 * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering. 220 * A 0 entry at a non-zero index means that the primary lead byte is "split" 221 * (there are different offsets for primaries that share that lead byte) 222 * and the reordering offset must be determined via the reorderRanges. 223 */ 224 const uint8_t *reorderTable; 225 /** Limit of last reordered range. 0 if no reordering or no split bytes. */ 226 uint32_t minHighNoReorder; 227 /** 228 * Primary-weight ranges for script reordering, 229 * to be used by reorder(p) for split-reordered primary lead bytes. 230 * 231 * Each entry is a (limit, offset) pair. 232 * The upper 16 bits of the entry are the upper 16 bits of the 233 * exclusive primary limit of a range. 234 * Primaries between the previous limit and this one have their lead bytes 235 * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits. 236 * 237 * CollationData::makeReorderRanges() writes a full list where the first range 238 * (at least for terminators and separators) has a 0 offset. 239 * The last range has a non-zero offset. 240 * minHighNoReorder is set to the limit of that last range. 241 * 242 * In the settings object, the initial ranges before the first split lead byte 243 * are omitted for efficiency; they are handled by reorder(p) via the reorderTable. 244 * If there are no split-reordered lead bytes, then no ranges are needed. 245 */ 246 const uint32_t *reorderRanges; 247 int32_t reorderRangesLength; 248 /** Array of reorder codes; ignored if reorderCodesLength == 0. */ 249 const int32_t *reorderCodes; 250 /** Number of reorder codes; 0 if no reordering. */ 251 int32_t reorderCodesLength; 252 /** 253 * Capacity of reorderCodes. 254 * If 0, then the codes, the ranges, and the table are aliases. 255 * Otherwise, this object owns the memory via the reorderCodes pointer; 256 * the codes, the ranges, and the table are in the same memory block, in that order. 257 */ 258 int32_t reorderCodesCapacity; 259 260 /** Options for CollationFastLatin. Negative if disabled. */ 261 int32_t fastLatinOptions; 262 uint16_t fastLatinPrimaries[0x180]; 263 264 private: 265 void setReorderArrays(const int32_t *codes, int32_t codesLength, 266 const uint32_t *ranges, int32_t rangesLength, 267 const uint8_t *table, UErrorCode &errorCode); 268 uint32_t reorderEx(uint32_t p) const; 269 }; 270 271 U_NAMESPACE_END 272 273 #endif // !UCONFIG_NO_COLLATION 274 #endif // __COLLATIONSETTINGS_H__ 275