1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationruleparser.h 9 * 10 * created on: 2013apr10 11 * created by: Markus W. Scherer 12 */ 13 14 #ifndef __COLLATIONRULEPARSER_H__ 15 #define __COLLATIONRULEPARSER_H__ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION 20 21 #include "unicode/ucol.h" 22 #include "unicode/uniset.h" 23 #include "unicode/unistr.h" 24 25 struct UParseError; 26 27 U_NAMESPACE_BEGIN 28 29 struct CollationData; 30 struct CollationTailoring; 31 32 class Locale; 33 class Normalizer2; 34 35 struct CollationSettings; 36 37 class U_I18N_API CollationRuleParser : public UMemory { 38 public: 39 /** Special reset positions. */ 40 enum Position { 41 FIRST_TERTIARY_IGNORABLE, 42 LAST_TERTIARY_IGNORABLE, 43 FIRST_SECONDARY_IGNORABLE, 44 LAST_SECONDARY_IGNORABLE, 45 FIRST_PRIMARY_IGNORABLE, 46 LAST_PRIMARY_IGNORABLE, 47 FIRST_VARIABLE, 48 LAST_VARIABLE, 49 FIRST_REGULAR, 50 LAST_REGULAR, 51 FIRST_IMPLICIT, 52 LAST_IMPLICIT, 53 FIRST_TRAILING, 54 LAST_TRAILING 55 }; 56 57 /** 58 * First character of contractions that encode special reset positions. 59 * U+FFFE cannot be tailored via rule syntax. 60 * 61 * The second contraction character is POS_BASE + Position. 62 */ 63 static const char16_t POS_LEAD = 0xfffe; 64 /** 65 * Base for the second character of contractions that encode special reset positions. 66 * Braille characters U+28xx are printable and normalization-inert. 67 * @see POS_LEAD 68 */ 69 static const char16_t POS_BASE = 0x2800; 70 71 class U_I18N_API Sink : public UObject { 72 public: 73 virtual ~Sink(); 74 /** 75 * Adds a reset. 76 * strength=UCOL_IDENTICAL for &str. 77 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. 78 */ 79 virtual void addReset(int32_t strength, const UnicodeString &str, 80 const char *&errorReason, UErrorCode &errorCode) = 0; 81 /** 82 * Adds a relation with strength and prefix | str / extension. 83 */ 84 virtual void addRelation(int32_t strength, const UnicodeString &prefix, 85 const UnicodeString &str, const UnicodeString &extension, 86 const char *&errorReason, UErrorCode &errorCode) = 0; 87 88 virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason, 89 UErrorCode &errorCode); 90 91 virtual void optimize(const UnicodeSet &set, const char *&errorReason, 92 UErrorCode &errorCode); 93 }; 94 95 class U_I18N_API Importer : public UObject { 96 public: 97 virtual ~Importer(); 98 virtual void getRules( 99 const char *localeID, const char *collationType, 100 UnicodeString &rules, 101 const char *&errorReason, UErrorCode &errorCode) = 0; 102 }; 103 104 /** 105 * Constructor. 106 * The Sink must be set before parsing. 107 * The Importer can be set, otherwise [import locale] syntax is not supported. 108 */ 109 CollationRuleParser(const CollationData *base, UErrorCode &errorCode); 110 ~CollationRuleParser(); 111 112 /** 113 * Sets the pointer to a Sink object. 114 * The pointer is aliased: Pointer copy without cloning or taking ownership. 115 */ setSink(Sink * sinkAlias)116 void setSink(Sink *sinkAlias) { 117 sink = sinkAlias; 118 } 119 120 /** 121 * Sets the pointer to an Importer object. 122 * The pointer is aliased: Pointer copy without cloning or taking ownership. 123 */ setImporter(Importer * importerAlias)124 void setImporter(Importer *importerAlias) { 125 importer = importerAlias; 126 } 127 128 void parse(const UnicodeString &ruleString, 129 CollationSettings &outSettings, 130 UParseError *outParseError, 131 UErrorCode &errorCode); 132 getErrorReason()133 const char *getErrorReason() const { return errorReason; } 134 135 /** 136 * Gets a script or reorder code from its string representation. 137 * @return the script/reorder code, or 138 * -1 if not recognized 139 */ 140 static int32_t getReorderCode(const char *word); 141 142 private: 143 /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */ 144 static const int32_t STRENGTH_MASK = 0xf; 145 static const int32_t STARRED_FLAG = 0x10; 146 static const int32_t OFFSET_SHIFT = 8; 147 148 void parse(const UnicodeString &ruleString, UErrorCode &errorCode); 149 void parseRuleChain(UErrorCode &errorCode); 150 int32_t parseResetAndPosition(UErrorCode &errorCode); 151 int32_t parseRelationOperator(UErrorCode &errorCode); 152 void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode); 153 void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode); 154 int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); 155 int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); 156 157 /** 158 * Sets str to a contraction of U+FFFE and (U+2800 + Position). 159 * @return rule index after the special reset position 160 */ 161 int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode); 162 void parseSetting(UErrorCode &errorCode); 163 void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); 164 static UColAttributeValue getOnOffValue(const UnicodeString &s); 165 166 int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); 167 int32_t readWords(int32_t i, UnicodeString &raw) const; 168 int32_t skipComment(int32_t i) const; 169 170 void setParseError(const char *reason, UErrorCode &errorCode); 171 void setErrorContext(); 172 173 /** 174 * ASCII [:P:] and [:S:]: 175 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] 176 */ 177 static UBool isSyntaxChar(UChar32 c); 178 int32_t skipWhiteSpace(int32_t i) const; 179 180 const Normalizer2 &nfd, &nfc; 181 182 const UnicodeString *rules; 183 const CollationData *const baseData; 184 CollationSettings *settings; 185 UParseError *parseError; 186 const char *errorReason; 187 188 Sink *sink; 189 Importer *importer; 190 191 int32_t ruleIndex; 192 }; 193 194 U_NAMESPACE_END 195 196 #endif // !UCONFIG_NO_COLLATION 197 #endif // __COLLATIONRULEPARSER_H__ 198