1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 11/17/99 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "unicode/rep.h"
18 #include "unicode/uniset.h"
19 #include "rbt_pars.h"
20 #include "rbt_data.h"
21 #include "rbt_rule.h"
22 #include "rbt.h"
23 #include "mutex.h"
24 #include "umutex.h"
25
26 U_NAMESPACE_BEGIN
27
28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
29
30 static Replaceable *gLockedText = NULL;
31
_construct(const UnicodeString & rules,UTransDirection direction,UParseError & parseError,UErrorCode & status)32 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
33 UTransDirection direction,
34 UParseError& parseError,
35 UErrorCode& status) {
36 fData = 0;
37 isDataOwned = true;
38 if (U_FAILURE(status)) {
39 return;
40 }
41
42 TransliteratorParser parser(status);
43 parser.parse(rules, direction, parseError, status);
44 if (U_FAILURE(status)) {
45 return;
46 }
47
48 if (parser.idBlockVector.size() != 0 ||
49 parser.compoundFilter != NULL ||
50 parser.dataVector.size() == 0) {
51 status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
52 return;
53 }
54
55 fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
56 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
57 }
58
59 /**
60 * Constructs a new transliterator from the given rules.
61 * @param id the id for the transliterator.
62 * @param rules rules, separated by ';'
63 * @param direction either FORWARD or REVERSE.
64 * @param adoptedFilter the filter for this transliterator.
65 * @param parseError Struct to receive information on position
66 * of error if an error is encountered
67 * @param status Output param set to success/failure code.
68 * @exception IllegalArgumentException if rules are malformed
69 * or direction is invalid.
70 */
RuleBasedTransliterator(const UnicodeString & id,const UnicodeString & rules,UTransDirection direction,UnicodeFilter * adoptedFilter,UParseError & parseError,UErrorCode & status)71 RuleBasedTransliterator::RuleBasedTransliterator(
72 const UnicodeString& id,
73 const UnicodeString& rules,
74 UTransDirection direction,
75 UnicodeFilter* adoptedFilter,
76 UParseError& parseError,
77 UErrorCode& status) :
78 Transliterator(id, adoptedFilter) {
79 _construct(rules, direction,parseError,status);
80 }
81
82 /**
83 * Constructs a new transliterator from the given rules.
84 * @param id the id for the transliterator.
85 * @param rules rules, separated by ';'
86 * @param direction either FORWARD or REVERSE.
87 * @param adoptedFilter the filter for this transliterator.
88 * @param status Output param set to success/failure code.
89 * @exception IllegalArgumentException if rules are malformed
90 * or direction is invalid.
91 */
92 /*RuleBasedTransliterator::RuleBasedTransliterator(
93 const UnicodeString& id,
94 const UnicodeString& rules,
95 UTransDirection direction,
96 UnicodeFilter* adoptedFilter,
97 UErrorCode& status) :
98 Transliterator(id, adoptedFilter) {
99 UParseError parseError;
100 _construct(rules, direction,parseError, status);
101 }*/
102
103 /**
104 * Convenience constructor with no filter.
105 */
106 /*RuleBasedTransliterator::RuleBasedTransliterator(
107 const UnicodeString& id,
108 const UnicodeString& rules,
109 UTransDirection direction,
110 UErrorCode& status) :
111 Transliterator(id, 0) {
112 UParseError parseError;
113 _construct(rules, direction,parseError, status);
114 }*/
115
116 /**
117 * Convenience constructor with no filter and FORWARD direction.
118 */
119 /*RuleBasedTransliterator::RuleBasedTransliterator(
120 const UnicodeString& id,
121 const UnicodeString& rules,
122 UErrorCode& status) :
123 Transliterator(id, 0) {
124 UParseError parseError;
125 _construct(rules, UTRANS_FORWARD, parseError, status);
126 }*/
127
128 /**
129 * Convenience constructor with FORWARD direction.
130 */
131 /*RuleBasedTransliterator::RuleBasedTransliterator(
132 const UnicodeString& id,
133 const UnicodeString& rules,
134 UnicodeFilter* adoptedFilter,
135 UErrorCode& status) :
136 Transliterator(id, adoptedFilter) {
137 UParseError parseError;
138 _construct(rules, UTRANS_FORWARD,parseError, status);
139 }*/
140
RuleBasedTransliterator(const UnicodeString & id,const TransliterationRuleData * theData,UnicodeFilter * adoptedFilter)141 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
142 const TransliterationRuleData* theData,
143 UnicodeFilter* adoptedFilter) :
144 Transliterator(id, adoptedFilter),
145 fData((TransliterationRuleData*)theData), // cast away const
146 isDataOwned(false) {
147 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
148 }
149
150 /**
151 * Internal constructor.
152 */
RuleBasedTransliterator(const UnicodeString & id,TransliterationRuleData * theData,UBool isDataAdopted)153 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
154 TransliterationRuleData* theData,
155 UBool isDataAdopted) :
156 Transliterator(id, 0),
157 fData(theData),
158 isDataOwned(isDataAdopted) {
159 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
160 }
161
162 /**
163 * Copy constructor.
164 */
RuleBasedTransliterator(const RuleBasedTransliterator & other)165 RuleBasedTransliterator::RuleBasedTransliterator(
166 const RuleBasedTransliterator& other) :
167 Transliterator(other), fData(other.fData),
168 isDataOwned(other.isDataOwned) {
169
170 // The data object may or may not be owned. If it is not owned we
171 // share it; it is invariant. If it is owned, it's still
172 // invariant, but we need to copy it to prevent double-deletion.
173 // If this becomes a performance issue (if people do a lot of RBT
174 // copying -- unlikely) we can reference count the data object.
175
176 // Only do a deep copy if this is owned data, that is, data that
177 // will be later deleted. System transliterators contain
178 // non-owned data.
179 if (isDataOwned) {
180 fData = new TransliterationRuleData(*other.fData);
181 }
182 }
183
184 /**
185 * Destructor.
186 */
~RuleBasedTransliterator()187 RuleBasedTransliterator::~RuleBasedTransliterator() {
188 // Delete the data object only if we own it.
189 if (isDataOwned) {
190 delete fData;
191 }
192 }
193
194 RuleBasedTransliterator*
clone() const195 RuleBasedTransliterator::clone() const {
196 return new RuleBasedTransliterator(*this);
197 }
198
199 /**
200 * Implements {@link Transliterator#handleTransliterate}.
201 */
202 void
handleTransliterate(Replaceable & text,UTransPosition & index,UBool isIncremental) const203 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
204 UBool isIncremental) const {
205 /* We keep contextStart and contextLimit fixed the entire time,
206 * relative to the text -- contextLimit may move numerically if
207 * text is inserted or removed. The start offset moves toward
208 * limit, with replacements happening under it.
209 *
210 * Example: rules 1. ab>x|y
211 * 2. yc>z
212 *
213 * |eabcd begin - no match, advance start
214 * e|abcd match rule 1 - change text & adjust start
215 * ex|ycd match rule 2 - change text & adjust start
216 * exz|d no match, advance start
217 * exzd| done
218 */
219
220 /* A rule like
221 * a>b|a
222 * creates an infinite loop. To prevent that, we put an arbitrary
223 * limit on the number of iterations that we take, one that is
224 * high enough that any reasonable rules are ok, but low enough to
225 * prevent a server from hanging. The limit is 16 times the
226 * number of characters n, unless n is so large that 16n exceeds a
227 * uint32_t.
228 */
229 uint32_t loopCount = 0;
230 uint32_t loopLimit = index.limit - index.start;
231 if (loopLimit >= 0x10000000) {
232 loopLimit = 0xFFFFFFFF;
233 } else {
234 loopLimit <<= 4;
235 }
236
237 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
238 // operations must be prevented.
239 // A Complication: compound transliterators can result in recursive entries to this
240 // function, sometimes with different "This" objects, always with the same text.
241 // Double-locking must be prevented in these cases.
242 //
243
244 UBool lockedMutexAtThisLevel = false;
245
246 // Test whether this request is operating on the same text string as
247 // some other transliteration that is still in progress and holding the
248 // transliteration mutex. If so, do not lock the transliteration
249 // mutex again.
250 //
251 // gLockedText variable is protected by the global ICU mutex.
252 // Shared RBT data protected by transliteratorDataMutex.
253 //
254 // TODO(andy): Need a better scheme for handling this.
255
256 static UMutex transliteratorDataMutex;
257 UBool needToLock;
258 {
259 Mutex m;
260 needToLock = (&text != gLockedText);
261 }
262 if (needToLock) {
263 umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here.
264 Mutex m;
265 gLockedText = &text;
266 lockedMutexAtThisLevel = true;
267 }
268
269 // Check to make sure we don't dereference a null pointer.
270 if (fData != NULL) {
271 while (index.start < index.limit &&
272 loopCount <= loopLimit &&
273 fData->ruleSet.transliterate(text, index, isIncremental)) {
274 ++loopCount;
275 }
276 }
277 if (lockedMutexAtThisLevel) {
278 {
279 Mutex m;
280 gLockedText = NULL;
281 }
282 umtx_unlock(&transliteratorDataMutex);
283 }
284 }
285
toRules(UnicodeString & rulesSource,UBool escapeUnprintable) const286 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
287 UBool escapeUnprintable) const {
288 return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
289 }
290
291 /**
292 * Implement Transliterator framework
293 */
handleGetSourceSet(UnicodeSet & result) const294 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
295 fData->ruleSet.getSourceTargetSet(result, false);
296 }
297
298 /**
299 * Override Transliterator framework
300 */
getTargetSet(UnicodeSet & result) const301 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
302 return fData->ruleSet.getSourceTargetSet(result, true);
303 }
304
305 U_NAMESPACE_END
306
307 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
308