1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationsettings.h 7 * 8 * created on: 2013feb07 9 * created by: Markus W. Scherer 10 */ 11 12 #ifndef __COLLATIONSETTINGS_H__ 13 #define __COLLATIONSETTINGS_H__ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_COLLATION 18 19 #include "unicode/ucol.h" 20 #include "collation.h" 21 #include "sharedobject.h" 22 #include "umutex.h" 23 24 U_NAMESPACE_BEGIN 25 26 struct CollationData; 27 28 /** 29 * Collation settings/options/attributes. 30 * These are the values that can be changed via API. 31 */ 32 struct U_I18N_API CollationSettings : public SharedObject { 33 /** 34 * Options bit 0: Perform the FCD check on the input text and deliver normalized text. 35 */ 36 static const int32_t CHECK_FCD = 1; 37 /** 38 * Options bit 1: Numeric collation. 39 * Also known as CODAN = COllate Digits As Numbers. 40 * 41 * Treat digit sequences as numbers with CE sequences in numeric order, 42 * rather than returning a normal CE for each digit. 43 */ 44 static const int32_t NUMERIC = 2; 45 /** 46 * "Shifted" alternate handling, see ALTERNATE_MASK. 47 */ 48 static const int32_t SHIFTED = 4; 49 /** 50 * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable. 51 * Reserve values 8 and 0xc for shift-trimmed and blanked. 52 */ 53 static const int32_t ALTERNATE_MASK = 0xc; 54 /** 55 * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value. 56 */ 57 static const int32_t MAX_VARIABLE_SHIFT = 4; 58 /** maxVariable options bit mask before shifting. */ 59 static const int32_t MAX_VARIABLE_MASK = 0x70; 60 /** Options bit 7: Reserved/unused/0. */ 61 /** 62 * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on. 63 */ 64 static const int32_t UPPER_FIRST = 0x100; 65 /** 66 * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values) 67 * unless case level is on (when they are *moved* into the separate case level). 68 * By default, the case bits are removed from the tertiary weight (ignored). 69 * 70 * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to 71 * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST. 72 */ 73 static const int32_t CASE_FIRST = 0x200; 74 /** 75 * Options bit mask for caseFirst and upperFirst, before shifting. 76 * Same value as caseFirst==upperFirst. 77 */ 78 static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST; 79 /** 80 * Options bit 10: Insert the case level between the secondary and tertiary levels. 81 */ 82 static const int32_t CASE_LEVEL = 0x400; 83 /** 84 * Options bit 11: Compare secondary weights backwards. ("French secondary") 85 */ 86 static const int32_t BACKWARD_SECONDARY = 0x800; 87 /** 88 * Options bits 15..12: The 4-bit strength value bit field is shifted by this value. 89 * It is the top used bit field in the options. (No need to mask after shifting.) 90 */ 91 static const int32_t STRENGTH_SHIFT = 12; 92 /** Strength options bit mask before shifting. */ 93 static const int32_t STRENGTH_MASK = 0xf000; 94 95 /** maxVariable values */ 96 enum MaxVariable { 97 MAX_VAR_SPACE, 98 MAX_VAR_PUNCT, 99 MAX_VAR_SYMBOL, 100 MAX_VAR_CURRENCY 101 }; 102 CollationSettingsCollationSettings103 CollationSettings() 104 : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) | 105 (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)), 106 variableTop(0), 107 reorderTable(NULL), 108 minHighNoReorder(0), 109 reorderRanges(NULL), reorderRangesLength(0), 110 reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0), 111 fastLatinOptions(-1) {} 112 113 CollationSettings(const CollationSettings &other); 114 virtual ~CollationSettings(); 115 116 UBool operator==(const CollationSettings &other) const; 117 118 inline UBool operator!=(const CollationSettings &other) const { 119 return !operator==(other); 120 } 121 122 int32_t hashCode() const; 123 124 void resetReordering(); 125 void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length, 126 const uint32_t *ranges, int32_t rangesLength, 127 const uint8_t *table, UErrorCode &errorCode); 128 void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength, 129 UErrorCode &errorCode); 130 void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode); 131 hasReorderingCollationSettings132 inline UBool hasReordering() const { return reorderTable != NULL; } 133 static UBool reorderTableHasSplitBytes(const uint8_t table[256]); reorderCollationSettings134 inline uint32_t reorder(uint32_t p) const { 135 uint8_t b = reorderTable[p >> 24]; 136 if(b != 0 || p <= Collation::NO_CE_PRIMARY) { 137 return ((uint32_t)b << 24) | (p & 0xffffff); 138 } else { 139 return reorderEx(p); 140 } 141 } 142 143 void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); 144 getStrengthCollationSettings145 static int32_t getStrength(int32_t options) { 146 return options >> STRENGTH_SHIFT; 147 } 148 getStrengthCollationSettings149 int32_t getStrength() const { 150 return getStrength(options); 151 } 152 153 /** Sets the options bit for an on/off attribute. */ 154 void setFlag(int32_t bit, UColAttributeValue value, 155 int32_t defaultOptions, UErrorCode &errorCode); 156 getFlagCollationSettings157 UColAttributeValue getFlag(int32_t bit) const { 158 return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF; 159 } 160 161 void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode); 162 getCaseFirstCollationSettings163 UColAttributeValue getCaseFirst() const { 164 int32_t option = options & CASE_FIRST_AND_UPPER_MASK; 165 return (option == 0) ? UCOL_OFF : 166 (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST; 167 } 168 169 void setAlternateHandling(UColAttributeValue value, 170 int32_t defaultOptions, UErrorCode &errorCode); 171 getAlternateHandlingCollationSettings172 UColAttributeValue getAlternateHandling() const { 173 return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED; 174 } 175 176 void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode); 177 getMaxVariableCollationSettings178 MaxVariable getMaxVariable() const { 179 return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT); 180 } 181 182 /** 183 * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off. 184 */ isTertiaryWithCaseBitsCollationSettings185 static inline UBool isTertiaryWithCaseBits(int32_t options) { 186 return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST; 187 } getTertiaryMaskCollationSettings188 static uint32_t getTertiaryMask(int32_t options) { 189 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 190 return isTertiaryWithCaseBits(options) ? 191 Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK; 192 } 193 sortsTertiaryUpperCaseFirstCollationSettings194 static UBool sortsTertiaryUpperCaseFirst(int32_t options) { 195 // On tertiary level, consider case bits and sort uppercase first 196 // if caseLevel is off and caseFirst==upperFirst. 197 return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK; 198 } 199 dontCheckFCDCollationSettings200 inline UBool dontCheckFCD() const { 201 return (options & CHECK_FCD) == 0; 202 } 203 hasBackwardSecondaryCollationSettings204 inline UBool hasBackwardSecondary() const { 205 return (options & BACKWARD_SECONDARY) != 0; 206 } 207 isNumericCollationSettings208 inline UBool isNumeric() const { 209 return (options & NUMERIC) != 0; 210 } 211 212 /** CHECK_FCD etc. */ 213 int32_t options; 214 /** Variable-top primary weight. */ 215 uint32_t variableTop; 216 /** 217 * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering. 218 * A 0 entry at a non-zero index means that the primary lead byte is "split" 219 * (there are different offsets for primaries that share that lead byte) 220 * and the reordering offset must be determined via the reorderRanges. 221 */ 222 const uint8_t *reorderTable; 223 /** Limit of last reordered range. 0 if no reordering or no split bytes. */ 224 uint32_t minHighNoReorder; 225 /** 226 * Primary-weight ranges for script reordering, 227 * to be used by reorder(p) for split-reordered primary lead bytes. 228 * 229 * Each entry is a (limit, offset) pair. 230 * The upper 16 bits of the entry are the upper 16 bits of the 231 * exclusive primary limit of a range. 232 * Primaries between the previous limit and this one have their lead bytes 233 * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits. 234 * 235 * CollationData::makeReorderRanges() writes a full list where the first range 236 * (at least for terminators and separators) has a 0 offset. 237 * The last range has a non-zero offset. 238 * minHighNoReorder is set to the limit of that last range. 239 * 240 * In the settings object, the initial ranges before the first split lead byte 241 * are omitted for efficiency; they are handled by reorder(p) via the reorderTable. 242 * If there are no split-reordered lead bytes, then no ranges are needed. 243 */ 244 const uint32_t *reorderRanges; 245 int32_t reorderRangesLength; 246 /** Array of reorder codes; ignored if reorderCodesLength == 0. */ 247 const int32_t *reorderCodes; 248 /** Number of reorder codes; 0 if no reordering. */ 249 int32_t reorderCodesLength; 250 /** 251 * Capacity of reorderCodes. 252 * If 0, then the codes, the ranges, and the table are aliases. 253 * Otherwise, this object owns the memory via the reorderCodes pointer; 254 * the codes, the ranges, and the table are in the same memory block, in that order. 255 */ 256 int32_t reorderCodesCapacity; 257 258 /** Options for CollationFastLatin. Negative if disabled. */ 259 int32_t fastLatinOptions; 260 uint16_t fastLatinPrimaries[0x180]; 261 262 private: 263 void setReorderArrays(const int32_t *codes, int32_t codesLength, 264 const uint32_t *ranges, int32_t rangesLength, 265 const uint8_t *table, UErrorCode &errorCode); 266 uint32_t reorderEx(uint32_t p) const; 267 }; 268 269 U_NAMESPACE_END 270 271 #endif // !UCONFIG_NO_COLLATION 272 #endif // __COLLATIONSETTINGS_H__ 273