1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/17/99 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/rep.h"
16 #include "unicode/uniset.h"
17 #include "rbt_pars.h"
18 #include "rbt_data.h"
19 #include "rbt_rule.h"
20 #include "rbt.h"
21 #include "mutex.h"
22 #include "umutex.h"
23
24 U_NAMESPACE_BEGIN
25
// ICU boilerplate macro for this class.
// NOTE(review): presumably supplies the class-ID based RTTI functions
// (getStaticClassID / getDynamicClassID) -- confirm against uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)

// Held by handleTransliterate() while it runs rules from data that this
// object does not own (isDataOwned == FALSE).
static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER;
// Address of the text being transliterated by the call that currently holds
// transliteratorDataMutex, or NULL when no call holds it.  Read and written
// only while a default-constructed Mutex is in scope.
static Replaceable *gLockedText = NULL;
30
_construct(const UnicodeString & rules,UTransDirection direction,UParseError & parseError,UErrorCode & status)31 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
32 UTransDirection direction,
33 UParseError& parseError,
34 UErrorCode& status) {
35 fData = 0;
36 isDataOwned = TRUE;
37 if (U_FAILURE(status)) {
38 return;
39 }
40
41 TransliteratorParser parser(status);
42 parser.parse(rules, direction, parseError, status);
43 if (U_FAILURE(status)) {
44 return;
45 }
46
47 if (parser.idBlockVector.size() != 0 ||
48 parser.compoundFilter != NULL ||
49 parser.dataVector.size() == 0) {
50 status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
51 return;
52 }
53
54 fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
55 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
56 }
57
58 /**
59 * Constructs a new transliterator from the given rules.
60 * @param id the id for the transliterator.
61 * @param rules rules, separated by ';'
62 * @param direction either FORWARD or REVERSE.
63 * @param adoptedFilter the filter for this transliterator.
64 * @param parseError Struct to recieve information on position
65 * of error if an error is encountered
66 * @param status Output param set to success/failure code.
67 * @exception IllegalArgumentException if rules are malformed
68 * or direction is invalid.
69 */
RuleBasedTransliterator(const UnicodeString & id,const UnicodeString & rules,UTransDirection direction,UnicodeFilter * adoptedFilter,UParseError & parseError,UErrorCode & status)70 RuleBasedTransliterator::RuleBasedTransliterator(
71 const UnicodeString& id,
72 const UnicodeString& rules,
73 UTransDirection direction,
74 UnicodeFilter* adoptedFilter,
75 UParseError& parseError,
76 UErrorCode& status) :
77 Transliterator(id, adoptedFilter) {
78 _construct(rules, direction,parseError,status);
79 }
80
81 /**
82 * Constructs a new transliterator from the given rules.
83 * @param id the id for the transliterator.
84 * @param rules rules, separated by ';'
85 * @param direction either FORWARD or REVERSE.
86 * @param adoptedFilter the filter for this transliterator.
87 * @param status Output param set to success/failure code.
88 * @exception IllegalArgumentException if rules are malformed
89 * or direction is invalid.
90 */
91 /*RuleBasedTransliterator::RuleBasedTransliterator(
92 const UnicodeString& id,
93 const UnicodeString& rules,
94 UTransDirection direction,
95 UnicodeFilter* adoptedFilter,
96 UErrorCode& status) :
97 Transliterator(id, adoptedFilter) {
98 UParseError parseError;
99 _construct(rules, direction,parseError, status);
100 }*/
101
102 /**
 * Convenience constructor with no filter.
104 */
105 /*RuleBasedTransliterator::RuleBasedTransliterator(
106 const UnicodeString& id,
107 const UnicodeString& rules,
108 UTransDirection direction,
109 UErrorCode& status) :
110 Transliterator(id, 0) {
111 UParseError parseError;
112 _construct(rules, direction,parseError, status);
113 }*/
114
115 /**
 * Convenience constructor with no filter and FORWARD direction.
117 */
118 /*RuleBasedTransliterator::RuleBasedTransliterator(
119 const UnicodeString& id,
120 const UnicodeString& rules,
121 UErrorCode& status) :
122 Transliterator(id, 0) {
123 UParseError parseError;
124 _construct(rules, UTRANS_FORWARD, parseError, status);
125 }*/
126
127 /**
 * Convenience constructor with FORWARD direction.
129 */
130 /*RuleBasedTransliterator::RuleBasedTransliterator(
131 const UnicodeString& id,
132 const UnicodeString& rules,
133 UnicodeFilter* adoptedFilter,
134 UErrorCode& status) :
135 Transliterator(id, adoptedFilter) {
136 UParseError parseError;
137 _construct(rules, UTRANS_FORWARD,parseError, status);
138 }*/
139
RuleBasedTransliterator(const UnicodeString & id,const TransliterationRuleData * theData,UnicodeFilter * adoptedFilter)140 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
141 const TransliterationRuleData* theData,
142 UnicodeFilter* adoptedFilter) :
143 Transliterator(id, adoptedFilter),
144 fData((TransliterationRuleData*)theData), // cast away const
145 isDataOwned(FALSE) {
146 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
147 }
148
149 /**
150 * Internal constructor.
151 */
RuleBasedTransliterator(const UnicodeString & id,TransliterationRuleData * theData,UBool isDataAdopted)152 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
153 TransliterationRuleData* theData,
154 UBool isDataAdopted) :
155 Transliterator(id, 0),
156 fData(theData),
157 isDataOwned(isDataAdopted) {
158 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
159 }
160
161 /**
162 * Copy constructor.
163 */
RuleBasedTransliterator(const RuleBasedTransliterator & other)164 RuleBasedTransliterator::RuleBasedTransliterator(
165 const RuleBasedTransliterator& other) :
166 Transliterator(other), fData(other.fData),
167 isDataOwned(other.isDataOwned) {
168
169 // The data object may or may not be owned. If it is not owned we
170 // share it; it is invariant. If it is owned, it's still
171 // invariant, but we need to copy it to prevent double-deletion.
172 // If this becomes a performance issue (if people do a lot of RBT
173 // copying -- unlikely) we can reference count the data object.
174
175 // Only do a deep copy if this is owned data, that is, data that
176 // will be later deleted. System transliterators contain
177 // non-owned data.
178 if (isDataOwned) {
179 fData = new TransliterationRuleData(*other.fData);
180 }
181 }
182
183 /**
184 * Destructor.
185 */
~RuleBasedTransliterator()186 RuleBasedTransliterator::~RuleBasedTransliterator() {
187 // Delete the data object only if we own it.
188 if (isDataOwned) {
189 delete fData;
190 }
191 }
192
193 Transliterator* // Covariant return NOT ALLOWED (for portability)
clone(void) const194 RuleBasedTransliterator::clone(void) const {
195 return new RuleBasedTransliterator(*this);
196 }
197
198 /**
199 * Implements {@link Transliterator#handleTransliterate}.
200 */
201 void
handleTransliterate(Replaceable & text,UTransPosition & index,UBool isIncremental) const202 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
203 UBool isIncremental) const {
204 /* We keep contextStart and contextLimit fixed the entire time,
205 * relative to the text -- contextLimit may move numerically if
206 * text is inserted or removed. The start offset moves toward
207 * limit, with replacements happening under it.
208 *
209 * Example: rules 1. ab>x|y
210 * 2. yc>z
211 *
212 * |eabcd begin - no match, advance start
213 * e|abcd match rule 1 - change text & adjust start
214 * ex|ycd match rule 2 - change text & adjust start
215 * exz|d no match, advance start
216 * exzd| done
217 */
218
219 /* A rule like
220 * a>b|a
221 * creates an infinite loop. To prevent that, we put an arbitrary
222 * limit on the number of iterations that we take, one that is
223 * high enough that any reasonable rules are ok, but low enough to
224 * prevent a server from hanging. The limit is 16 times the
225 * number of characters n, unless n is so large that 16n exceeds a
226 * uint32_t.
227 */
228 uint32_t loopCount = 0;
229 uint32_t loopLimit = index.limit - index.start;
230 if (loopLimit >= 0x10000000) {
231 loopLimit = 0xFFFFFFFF;
232 } else {
233 loopLimit <<= 4;
234 }
235
236 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
237 // operations must be prevented.
238 // A Complication: compound transliterators can result in recursive entries to this
239 // function, sometimes with different "This" objects, always with the same text.
240 // Double-locking must be prevented in these cases.
241 //
242
243 // If the transliteration data is exclusively owned by this transliterator object,
244 // we don't need to do any locking. No sharing between transliterators is possible,
245 // so no concurrent access from multiple threads is possible.
246 UBool lockedMutexAtThisLevel = FALSE;
247 if (isDataOwned == FALSE) {
248 // Test whether this request is operating on the same text string as
249 // some other transliteration that is still in progress and holding the
250 // transliteration mutex. If so, do not lock the transliteration
251 // mutex again.
252 //
253 // gLockedText variable is protected by the global ICU mutex.
254 // Shared RBT data protected by transliteratorDataMutex.
255 //
256 // TODO(andy): Need a better scheme for handling this.
257 UBool needToLock;
258 {
259 Mutex m;
260 needToLock = (&text != gLockedText);
261 }
262 if (needToLock) {
263 umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here.
264 Mutex m;
265 gLockedText = &text;
266 lockedMutexAtThisLevel = TRUE;
267 }
268 }
269
270 // Check to make sure we don't dereference a null pointer.
271 if (fData != NULL) {
272 while (index.start < index.limit &&
273 loopCount <= loopLimit &&
274 fData->ruleSet.transliterate(text, index, isIncremental)) {
275 ++loopCount;
276 }
277 }
278 if (lockedMutexAtThisLevel) {
279 {
280 Mutex m;
281 gLockedText = NULL;
282 }
283 umtx_unlock(&transliteratorDataMutex);
284 }
285 }
286
toRules(UnicodeString & rulesSource,UBool escapeUnprintable) const287 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
288 UBool escapeUnprintable) const {
289 return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
290 }
291
292 /**
293 * Implement Transliterator framework
294 */
handleGetSourceSet(UnicodeSet & result) const295 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
296 fData->ruleSet.getSourceTargetSet(result, FALSE);
297 }
298
299 /**
300 * Override Transliterator framework
301 */
getTargetSet(UnicodeSet & result) const302 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
303 return fData->ruleSet.getSourceTargetSet(result, TRUE);
304 }
305
306 U_NAMESPACE_END
307
308 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
309