• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationruleparser.h
*
* created on: 2013apr10
* created by: Markus W. Scherer
*/
11 
12 #ifndef __COLLATIONRULEPARSER_H__
13 #define __COLLATIONRULEPARSER_H__
14 
15 #include "unicode/utypes.h"
16 
17 #if !UCONFIG_NO_COLLATION
18 
19 #include "unicode/ucol.h"
20 #include "unicode/uniset.h"
21 #include "unicode/unistr.h"
22 
23 struct UParseError;
24 
25 U_NAMESPACE_BEGIN
26 
27 struct CollationData;
28 struct CollationTailoring;
29 
30 class Locale;
31 class Normalizer2;
32 
33 struct CollationSettings;
34 
35 class U_I18N_API CollationRuleParser : public UMemory {
36 public:
37     /** Special reset positions. */
38     enum Position {
39         FIRST_TERTIARY_IGNORABLE,
40         LAST_TERTIARY_IGNORABLE,
41         FIRST_SECONDARY_IGNORABLE,
42         LAST_SECONDARY_IGNORABLE,
43         FIRST_PRIMARY_IGNORABLE,
44         LAST_PRIMARY_IGNORABLE,
45         FIRST_VARIABLE,
46         LAST_VARIABLE,
47         FIRST_REGULAR,
48         LAST_REGULAR,
49         FIRST_IMPLICIT,
50         LAST_IMPLICIT,
51         FIRST_TRAILING,
52         LAST_TRAILING
53     };
54 
55     /**
56      * First character of contractions that encode special reset positions.
57      * U+FFFE cannot be tailored via rule syntax.
58      *
59      * The second contraction character is POS_BASE + Position.
60      */
61     static const UChar POS_LEAD = 0xfffe;
62     /**
63      * Base for the second character of contractions that encode special reset positions.
64      * Braille characters U+28xx are printable and normalization-inert.
65      * @see POS_LEAD
66      */
67     static const UChar POS_BASE = 0x2800;
68 
69     class U_I18N_API Sink : public UObject {
70     public:
71         virtual ~Sink();
72         /**
73          * Adds a reset.
74          * strength=UCOL_IDENTICAL for &str.
75          * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
76          */
77         virtual void addReset(int32_t strength, const UnicodeString &str,
78                               const char *&errorReason, UErrorCode &errorCode) = 0;
79         /**
80          * Adds a relation with strength and prefix | str / extension.
81          */
82         virtual void addRelation(int32_t strength, const UnicodeString &prefix,
83                                  const UnicodeString &str, const UnicodeString &extension,
84                                  const char *&errorReason, UErrorCode &errorCode) = 0;
85 
86         virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
87                                           UErrorCode &errorCode);
88 
89         virtual void optimize(const UnicodeSet &set, const char *&errorReason,
90                               UErrorCode &errorCode);
91     };
92 
93     class U_I18N_API Importer : public UObject {
94     public:
95         virtual ~Importer();
96         virtual const UnicodeString *getRules(
97                 const char *localeID, const char *collationType,
98                 const char *&errorReason, UErrorCode &errorCode) = 0;
99     };
100 
101     /**
102      * Constructor.
103      * The Sink must be set before parsing.
104      * The Importer can be set, otherwise [import locale] syntax is not supported.
105      */
106     CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
107     ~CollationRuleParser();
108 
109     /**
110      * Sets the pointer to a Sink object.
111      * The pointer is aliased: Pointer copy without cloning or taking ownership.
112      */
setSink(Sink * sinkAlias)113     void setSink(Sink *sinkAlias) {
114         sink = sinkAlias;
115     }
116 
117     /**
118      * Sets the pointer to an Importer object.
119      * The pointer is aliased: Pointer copy without cloning or taking ownership.
120      */
setImporter(Importer * importerAlias)121     void setImporter(Importer *importerAlias) {
122         importer = importerAlias;
123     }
124 
125     void parse(const UnicodeString &ruleString,
126                CollationSettings &outSettings,
127                UParseError *outParseError,
128                UErrorCode &errorCode);
129 
getErrorReason()130     const char *getErrorReason() const { return errorReason; }
131 
132     /**
133      * Gets a script or reorder code from its string representation.
134      * @return the script/reorder code, or
135      * -1==UCOL_REORDER_CODE_DEFAULT, or
136      * -2 if not recognized
137      */
138     static int32_t getReorderCode(const char *word);
139 
140 private:
141     /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
142     static const int32_t STRENGTH_MASK = 0xf;
143     static const int32_t STARRED_FLAG = 0x10;
144     static const int32_t OFFSET_SHIFT = 8;
145 
146     void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
147     void parseRuleChain(UErrorCode &errorCode);
148     int32_t parseResetAndPosition(UErrorCode &errorCode);
149     int32_t parseRelationOperator(UErrorCode &errorCode);
150     void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
151     void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
152     int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
153     int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
154 
155     /**
156      * Sets str to a contraction of U+FFFE and (U+2800 + Position).
157      * @return rule index after the special reset position
158      */
159     int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
160     void parseSetting(UErrorCode &errorCode);
161     void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
162     static UColAttributeValue getOnOffValue(const UnicodeString &s);
163 
164     int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
165     int32_t readWords(int32_t i, UnicodeString &raw) const;
166     int32_t skipComment(int32_t i) const;
167 
168     void setParseError(const char *reason, UErrorCode &errorCode);
169     void setErrorContext();
170 
171     /**
172      * ASCII [:P:] and [:S:]:
173      * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
174      */
175     static UBool isSyntaxChar(UChar32 c);
176     int32_t skipWhiteSpace(int32_t i) const;
177 
178     const Normalizer2 &nfd, &nfc;
179 
180     const UnicodeString *rules;
181     const CollationData *const baseData;
182     CollationSettings *settings;
183     UParseError *parseError;
184     const char *errorReason;
185 
186     Sink *sink;
187     Importer *importer;
188 
189     int32_t ruleIndex;
190 };
191 
192 U_NAMESPACE_END
193 
194 #endif  // !UCONFIG_NO_COLLATION
195 #endif  // __COLLATIONRULEPARSER_H__
196