1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationruleparser.h 7 * 8 * created on: 2013apr10 9 * created by: Markus W. Scherer 10 */ 11 12 #ifndef __COLLATIONRULEPARSER_H__ 13 #define __COLLATIONRULEPARSER_H__ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_COLLATION 18 19 #include "unicode/ucol.h" 20 #include "unicode/uniset.h" 21 #include "unicode/unistr.h" 22 23 struct UParseError; 24 25 U_NAMESPACE_BEGIN 26 27 struct CollationData; 28 struct CollationTailoring; 29 30 class Locale; 31 class Normalizer2; 32 33 struct CollationSettings; 34 35 class U_I18N_API CollationRuleParser : public UMemory { 36 public: 37 /** Special reset positions. */ 38 enum Position { 39 FIRST_TERTIARY_IGNORABLE, 40 LAST_TERTIARY_IGNORABLE, 41 FIRST_SECONDARY_IGNORABLE, 42 LAST_SECONDARY_IGNORABLE, 43 FIRST_PRIMARY_IGNORABLE, 44 LAST_PRIMARY_IGNORABLE, 45 FIRST_VARIABLE, 46 LAST_VARIABLE, 47 FIRST_REGULAR, 48 LAST_REGULAR, 49 FIRST_IMPLICIT, 50 LAST_IMPLICIT, 51 FIRST_TRAILING, 52 LAST_TRAILING 53 }; 54 55 /** 56 * First character of contractions that encode special reset positions. 57 * U+FFFE cannot be tailored via rule syntax. 58 * 59 * The second contraction character is POS_BASE + Position. 60 */ 61 static const UChar POS_LEAD = 0xfffe; 62 /** 63 * Base for the second character of contractions that encode special reset positions. 64 * Braille characters U+28xx are printable and normalization-inert. 65 * @see POS_LEAD 66 */ 67 static const UChar POS_BASE = 0x2800; 68 69 class U_I18N_API Sink : public UObject { 70 public: 71 virtual ~Sink(); 72 /** 73 * Adds a reset. 74 * strength=UCOL_IDENTICAL for &str. 75 * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. 76 */ 77 virtual void addReset(int32_t strength, const UnicodeString &str, 78 const char *&errorReason, UErrorCode &errorCode) = 0; 79 /** 80 * Adds a relation with strength and prefix | str / extension. 81 */ 82 virtual void addRelation(int32_t strength, const UnicodeString &prefix, 83 const UnicodeString &str, const UnicodeString &extension, 84 const char *&errorReason, UErrorCode &errorCode) = 0; 85 86 virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason, 87 UErrorCode &errorCode); 88 89 virtual void optimize(const UnicodeSet &set, const char *&errorReason, 90 UErrorCode &errorCode); 91 }; 92 93 class U_I18N_API Importer : public UObject { 94 public: 95 virtual ~Importer(); 96 virtual const UnicodeString *getRules( 97 const char *localeID, const char *collationType, 98 const char *&errorReason, UErrorCode &errorCode) = 0; 99 }; 100 101 /** 102 * Constructor. 103 * The Sink must be set before parsing. 104 * The Importer can be set, otherwise [import locale] syntax is not supported. 105 */ 106 CollationRuleParser(const CollationData *base, UErrorCode &errorCode); 107 ~CollationRuleParser(); 108 109 /** 110 * Sets the pointer to a Sink object. 111 * The pointer is aliased: Pointer copy without cloning or taking ownership. 112 */ setSink(Sink * sinkAlias)113 void setSink(Sink *sinkAlias) { 114 sink = sinkAlias; 115 } 116 117 /** 118 * Sets the pointer to an Importer object. 119 * The pointer is aliased: Pointer copy without cloning or taking ownership. 120 */ setImporter(Importer * importerAlias)121 void setImporter(Importer *importerAlias) { 122 importer = importerAlias; 123 } 124 125 void parse(const UnicodeString &ruleString, 126 CollationSettings &outSettings, 127 UParseError *outParseError, 128 UErrorCode &errorCode); 129 getErrorReason()130 const char *getErrorReason() const { return errorReason; } 131 132 /** 133 * Gets a script or reorder code from its string representation. 134 * @return the script/reorder code, or 135 * -1==UCOL_REORDER_CODE_DEFAULT, or 136 * -2 if not recognized 137 */ 138 static int32_t getReorderCode(const char *word); 139 140 private: 141 /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */ 142 static const int32_t STRENGTH_MASK = 0xf; 143 static const int32_t STARRED_FLAG = 0x10; 144 static const int32_t OFFSET_SHIFT = 8; 145 146 void parse(const UnicodeString &ruleString, UErrorCode &errorCode); 147 void parseRuleChain(UErrorCode &errorCode); 148 int32_t parseResetAndPosition(UErrorCode &errorCode); 149 int32_t parseRelationOperator(UErrorCode &errorCode); 150 void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode); 151 void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode); 152 int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); 153 int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); 154 155 /** 156 * Sets str to a contraction of U+FFFE and (U+2800 + Position). 157 * @return rule index after the special reset position 158 */ 159 int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode); 160 void parseSetting(UErrorCode &errorCode); 161 void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); 162 static UColAttributeValue getOnOffValue(const UnicodeString &s); 163 164 int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); 165 int32_t readWords(int32_t i, UnicodeString &raw) const; 166 int32_t skipComment(int32_t i) const; 167 168 void setParseError(const char *reason, UErrorCode &errorCode); 169 void setErrorContext(); 170 171 /** 172 * ASCII [:P:] and [:S:]: 173 * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] 174 */ 175 static UBool isSyntaxChar(UChar32 c); 176 int32_t skipWhiteSpace(int32_t i) const; 177 178 const Normalizer2 &nfd, &nfc; 179 180 const UnicodeString *rules; 181 const CollationData *const baseData; 182 CollationSettings *settings; 183 UParseError *parseError; 184 const char *errorReason; 185 186 Sink *sink; 187 Importer *importer; 188 189 int32_t ruleIndex; 190 }; 191 192 U_NAMESPACE_END 193 194 #endif // !UCONFIG_NO_COLLATION 195 #endif // __COLLATIONRULEPARSER_H__ 196