1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 11/17/99 aliu Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "unicode/rep.h"
18 #include "unicode/uniset.h"
19 #include "rbt_pars.h"
20 #include "rbt_data.h"
21 #include "rbt_rule.h"
22 #include "rbt.h"
23 #include "mutex.h"
24 #include "umutex.h"
25
26 U_NAMESPACE_BEGIN
27
28 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
29
// Mutex that handleTransliterate() holds while it applies the rule set to a
// text; see the locking discussion in that function.
30 static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER;
// Address of the text being transliterated by the current holder of
// transliteratorDataMutex, or NULL when handleTransliterate() has released
// it. Read and written only while holding the global ICU mutex (Mutex m).
31 static Replaceable *gLockedText = NULL;
32
_construct(const UnicodeString & rules,UTransDirection direction,UParseError & parseError,UErrorCode & status)33 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
34 UTransDirection direction,
35 UParseError& parseError,
36 UErrorCode& status) {
37 fData = 0;
38 isDataOwned = TRUE;
39 if (U_FAILURE(status)) {
40 return;
41 }
42
43 TransliteratorParser parser(status);
44 parser.parse(rules, direction, parseError, status);
45 if (U_FAILURE(status)) {
46 return;
47 }
48
49 if (parser.idBlockVector.size() != 0 ||
50 parser.compoundFilter != NULL ||
51 parser.dataVector.size() == 0) {
52 status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
53 return;
54 }
55
56 fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
57 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
58 }
59
60 /**
61 * Constructs a new transliterator from the given rules.
62 * @param id the id for the transliterator.
63 * @param rules rules, separated by ';'
64 * @param direction either FORWARD or REVERSE.
65 * @param adoptedFilter the filter for this transliterator.
66 * @param parseError Struct to recieve information on position
67 * of error if an error is encountered
68 * @param status Output param set to success/failure code.
69 * @exception IllegalArgumentException if rules are malformed
70 * or direction is invalid.
71 */
RuleBasedTransliterator(const UnicodeString & id,const UnicodeString & rules,UTransDirection direction,UnicodeFilter * adoptedFilter,UParseError & parseError,UErrorCode & status)72 RuleBasedTransliterator::RuleBasedTransliterator(
73 const UnicodeString& id,
74 const UnicodeString& rules,
75 UTransDirection direction,
76 UnicodeFilter* adoptedFilter,
77 UParseError& parseError,
78 UErrorCode& status) :
79 Transliterator(id, adoptedFilter) {
80 _construct(rules, direction,parseError,status);
81 }
82
83 /**
84 * Constructs a new transliterator from the given rules.
85 * @param id the id for the transliterator.
86 * @param rules rules, separated by ';'
87 * @param direction either FORWARD or REVERSE.
88 * @param adoptedFilter the filter for this transliterator.
89 * @param status Output param set to success/failure code.
90 * @exception IllegalArgumentException if rules are malformed
91 * or direction is invalid.
92 */
93 /*RuleBasedTransliterator::RuleBasedTransliterator(
94 const UnicodeString& id,
95 const UnicodeString& rules,
96 UTransDirection direction,
97 UnicodeFilter* adoptedFilter,
98 UErrorCode& status) :
99 Transliterator(id, adoptedFilter) {
100 UParseError parseError;
101 _construct(rules, direction,parseError, status);
102 }*/
103
104 /**
105 * Convenience constructor with no filter.
106 */
107 /*RuleBasedTransliterator::RuleBasedTransliterator(
108 const UnicodeString& id,
109 const UnicodeString& rules,
110 UTransDirection direction,
111 UErrorCode& status) :
112 Transliterator(id, 0) {
113 UParseError parseError;
114 _construct(rules, direction,parseError, status);
115 }*/
116
117 /**
118 * Convenience constructor with no filter and FORWARD direction.
119 */
120 /*RuleBasedTransliterator::RuleBasedTransliterator(
121 const UnicodeString& id,
122 const UnicodeString& rules,
123 UErrorCode& status) :
124 Transliterator(id, 0) {
125 UParseError parseError;
126 _construct(rules, UTRANS_FORWARD, parseError, status);
127 }*/
128
129 /**
130 * Convenience constructor with FORWARD direction.
131 */
132 /*RuleBasedTransliterator::RuleBasedTransliterator(
133 const UnicodeString& id,
134 const UnicodeString& rules,
135 UnicodeFilter* adoptedFilter,
136 UErrorCode& status) :
137 Transliterator(id, adoptedFilter) {
138 UParseError parseError;
139 _construct(rules, UTRANS_FORWARD,parseError, status);
140 }*/
141
RuleBasedTransliterator(const UnicodeString & id,const TransliterationRuleData * theData,UnicodeFilter * adoptedFilter)142 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
143 const TransliterationRuleData* theData,
144 UnicodeFilter* adoptedFilter) :
145 Transliterator(id, adoptedFilter),
146 fData((TransliterationRuleData*)theData), // cast away const
147 isDataOwned(FALSE) {
148 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
149 }
150
151 /**
152 * Internal constructor.
153 */
RuleBasedTransliterator(const UnicodeString & id,TransliterationRuleData * theData,UBool isDataAdopted)154 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
155 TransliterationRuleData* theData,
156 UBool isDataAdopted) :
157 Transliterator(id, 0),
158 fData(theData),
159 isDataOwned(isDataAdopted) {
160 setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
161 }
162
163 /**
164 * Copy constructor.
165 */
RuleBasedTransliterator(const RuleBasedTransliterator & other)166 RuleBasedTransliterator::RuleBasedTransliterator(
167 const RuleBasedTransliterator& other) :
168 Transliterator(other), fData(other.fData),
169 isDataOwned(other.isDataOwned) {
170
171 // The data object may or may not be owned. If it is not owned we
172 // share it; it is invariant. If it is owned, it's still
173 // invariant, but we need to copy it to prevent double-deletion.
174 // If this becomes a performance issue (if people do a lot of RBT
175 // copying -- unlikely) we can reference count the data object.
176
177 // Only do a deep copy if this is owned data, that is, data that
178 // will be later deleted. System transliterators contain
179 // non-owned data.
180 if (isDataOwned) {
181 fData = new TransliterationRuleData(*other.fData);
182 }
183 }
184
185 /**
186 * Destructor.
187 */
~RuleBasedTransliterator()188 RuleBasedTransliterator::~RuleBasedTransliterator() {
189 // Delete the data object only if we own it.
190 if (isDataOwned) {
191 delete fData;
192 }
193 }
194
195 Transliterator* // Covariant return NOT ALLOWED (for portability)
clone(void) const196 RuleBasedTransliterator::clone(void) const {
197 return new RuleBasedTransliterator(*this);
198 }
199
200 /**
201 * Implements {@link Transliterator#handleTransliterate}.
202 */
203 void
handleTransliterate(Replaceable & text,UTransPosition & index,UBool isIncremental) const204 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
205 UBool isIncremental) const {
206 /* We keep contextStart and contextLimit fixed the entire time,
207 * relative to the text -- contextLimit may move numerically if
208 * text is inserted or removed. The start offset moves toward
209 * limit, with replacements happening under it.
210 *
211 * Example: rules 1. ab>x|y
212 * 2. yc>z
213 *
214 * |eabcd begin - no match, advance start
215 * e|abcd match rule 1 - change text & adjust start
216 * ex|ycd match rule 2 - change text & adjust start
217 * exz|d no match, advance start
218 * exzd| done
219 */
220
221 /* A rule like
222 * a>b|a
223 * creates an infinite loop. To prevent that, we put an arbitrary
224 * limit on the number of iterations that we take, one that is
225 * high enough that any reasonable rules are ok, but low enough to
226 * prevent a server from hanging. The limit is 16 times the
227 * number of characters n, unless n is so large that 16n exceeds a
228 * uint32_t.
229 */
230 uint32_t loopCount = 0;
231 uint32_t loopLimit = index.limit - index.start;
232 if (loopLimit >= 0x10000000) {
233 loopLimit = 0xFFFFFFFF;
234 } else {
235 loopLimit <<= 4;
236 }
237
238 // Transliterator locking. Rule-based Transliterators are not thread safe; concurrent
239 // operations must be prevented.
240 // A Complication: compound transliterators can result in recursive entries to this
241 // function, sometimes with different "This" objects, always with the same text.
242 // Double-locking must be prevented in these cases.
243 //
244
245 UBool lockedMutexAtThisLevel = FALSE;
246
247 // Test whether this request is operating on the same text string as
248 // some other transliteration that is still in progress and holding the
249 // transliteration mutex. If so, do not lock the transliteration
250 // mutex again.
251 //
252 // gLockedText variable is protected by the global ICU mutex.
253 // Shared RBT data protected by transliteratorDataMutex.
254 //
255 // TODO(andy): Need a better scheme for handling this.
256 UBool needToLock;
257 {
258 Mutex m;
259 needToLock = (&text != gLockedText);
260 }
261 if (needToLock) {
262 umtx_lock(&transliteratorDataMutex); // Contention, longish waits possible here.
263 Mutex m;
264 gLockedText = &text;
265 lockedMutexAtThisLevel = TRUE;
266 }
267
268 // Check to make sure we don't dereference a null pointer.
269 if (fData != NULL) {
270 while (index.start < index.limit &&
271 loopCount <= loopLimit &&
272 fData->ruleSet.transliterate(text, index, isIncremental)) {
273 ++loopCount;
274 }
275 }
276 if (lockedMutexAtThisLevel) {
277 {
278 Mutex m;
279 gLockedText = NULL;
280 }
281 umtx_unlock(&transliteratorDataMutex);
282 }
283 }
284
toRules(UnicodeString & rulesSource,UBool escapeUnprintable) const285 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
286 UBool escapeUnprintable) const {
287 return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
288 }
289
290 /**
291 * Implement Transliterator framework
292 */
handleGetSourceSet(UnicodeSet & result) const293 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
294 fData->ruleSet.getSourceTargetSet(result, FALSE);
295 }
296
297 /**
298 * Override Transliterator framework
299 */
getTargetSet(UnicodeSet & result) const300 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
301 return fData->ruleSet.getSourceTargetSet(result, TRUE);
302 }
303
304 U_NAMESPACE_END
305
306 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
307