• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 1999-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   11/17/99    aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/rep.h"
16 #include "unicode/uniset.h"
17 #include "rbt_pars.h"
18 #include "rbt_data.h"
19 #include "rbt_rule.h"
20 #include "rbt.h"
21 #include "mutex.h"
22 #include "umutex.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedTransliterator)
27 
// Lock taken by handleTransliterate() while it runs rules from rule data that
// the transliterator does not own (isDataOwned == FALSE).
28 static UMutex transliteratorDataMutex = U_MUTEX_INITIALIZER;
// Address of the text being processed by the handleTransliterate() call that
// currently holds transliteratorDataMutex, or NULL when none does.
// handleTransliterate() compares the incoming text against it so that a nested
// call on the same text does not lock again. It is read and written only while
// a default-constructed Mutex guard is held.
29 static Replaceable *gLockedText = NULL;
30 
_construct(const UnicodeString & rules,UTransDirection direction,UParseError & parseError,UErrorCode & status)31 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
32                                          UTransDirection direction,
33                                          UParseError& parseError,
34                                          UErrorCode& status) {
35     fData = 0;
36     isDataOwned = TRUE;
37     if (U_FAILURE(status)) {
38         return;
39     }
40 
41     TransliteratorParser parser(status);
42     parser.parse(rules, direction, parseError, status);
43     if (U_FAILURE(status)) {
44         return;
45     }
46 
47     if (parser.idBlockVector.size() != 0 ||
48         parser.compoundFilter != NULL ||
49         parser.dataVector.size() == 0) {
50         status = U_INVALID_RBT_SYNTAX; // ::ID blocks disallowed in RBT
51         return;
52     }
53 
54     fData = (TransliterationRuleData*)parser.dataVector.orphanElementAt(0);
55     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
56 }
57 
58 /**
59  * Constructs a new transliterator from the given rules.
60  * @param id            the id for the transliterator.
61  * @param rules         rules, separated by ';'
62  * @param direction     either FORWARD or REVERSE.
63  * @param adoptedFilter the filter for this transliterator.
64  * @param parseError    Struct to recieve information on position
65  *                      of error if an error is encountered
66  * @param status        Output param set to success/failure code.
67  * @exception IllegalArgumentException if rules are malformed
68  * or direction is invalid.
69  */
RuleBasedTransliterator(const UnicodeString & id,const UnicodeString & rules,UTransDirection direction,UnicodeFilter * adoptedFilter,UParseError & parseError,UErrorCode & status)70 RuleBasedTransliterator::RuleBasedTransliterator(
71                             const UnicodeString& id,
72                             const UnicodeString& rules,
73                             UTransDirection direction,
74                             UnicodeFilter* adoptedFilter,
75                             UParseError& parseError,
76                             UErrorCode& status) :
77     Transliterator(id, adoptedFilter) {
78     _construct(rules, direction,parseError,status);
79 }
80 
81 /**
82  * Constructs a new transliterator from the given rules.
83  * @param id            the id for the transliterator.
84  * @param rules         rules, separated by ';'
85  * @param direction     either FORWARD or REVERSE.
86  * @param adoptedFilter the filter for this transliterator.
87  * @param status        Output param set to success/failure code.
88  * @exception IllegalArgumentException if rules are malformed
89  * or direction is invalid.
90  */
91 /*RuleBasedTransliterator::RuleBasedTransliterator(
92                             const UnicodeString& id,
93                             const UnicodeString& rules,
94                             UTransDirection direction,
95                             UnicodeFilter* adoptedFilter,
96                             UErrorCode& status) :
97     Transliterator(id, adoptedFilter) {
98     UParseError parseError;
99     _construct(rules, direction,parseError, status);
100 }*/
101 
102 /**
103  * Convenience constructor with no filter.
104  */
105 /*RuleBasedTransliterator::RuleBasedTransliterator(
106                             const UnicodeString& id,
107                             const UnicodeString& rules,
108                             UTransDirection direction,
109                             UErrorCode& status) :
110     Transliterator(id, 0) {
111     UParseError parseError;
112     _construct(rules, direction,parseError, status);
113 }*/
114 
115 /**
116  * Convenience constructor with no filter and FORWARD direction.
117  */
118 /*RuleBasedTransliterator::RuleBasedTransliterator(
119                             const UnicodeString& id,
120                             const UnicodeString& rules,
121                             UErrorCode& status) :
122     Transliterator(id, 0) {
123     UParseError parseError;
124     _construct(rules, UTRANS_FORWARD, parseError, status);
125 }*/
126 
127 /**
128  * Convenience constructor with FORWARD direction.
129  */
130 /*RuleBasedTransliterator::RuleBasedTransliterator(
131                             const UnicodeString& id,
132                             const UnicodeString& rules,
133                             UnicodeFilter* adoptedFilter,
134                             UErrorCode& status) :
135     Transliterator(id, adoptedFilter) {
136     UParseError parseError;
137     _construct(rules, UTRANS_FORWARD,parseError, status);
138 }*/
139 
RuleBasedTransliterator(const UnicodeString & id,const TransliterationRuleData * theData,UnicodeFilter * adoptedFilter)140 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
141                                  const TransliterationRuleData* theData,
142                                  UnicodeFilter* adoptedFilter) :
143     Transliterator(id, adoptedFilter),
144     fData((TransliterationRuleData*)theData), // cast away const
145     isDataOwned(FALSE) {
146     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
147 }
148 
149 /**
150  * Internal constructor.
151  */
RuleBasedTransliterator(const UnicodeString & id,TransliterationRuleData * theData,UBool isDataAdopted)152 RuleBasedTransliterator::RuleBasedTransliterator(const UnicodeString& id,
153                                                  TransliterationRuleData* theData,
154                                                  UBool isDataAdopted) :
155     Transliterator(id, 0),
156     fData(theData),
157     isDataOwned(isDataAdopted) {
158     setMaximumContextLength(fData->ruleSet.getMaximumContextLength());
159 }
160 
161 /**
162  * Copy constructor.
163  */
RuleBasedTransliterator(const RuleBasedTransliterator & other)164 RuleBasedTransliterator::RuleBasedTransliterator(
165         const RuleBasedTransliterator& other) :
166     Transliterator(other), fData(other.fData),
167     isDataOwned(other.isDataOwned) {
168 
169     // The data object may or may not be owned.  If it is not owned we
170     // share it; it is invariant.  If it is owned, it's still
171     // invariant, but we need to copy it to prevent double-deletion.
172     // If this becomes a performance issue (if people do a lot of RBT
173     // copying -- unlikely) we can reference count the data object.
174 
175     // Only do a deep copy if this is owned data, that is, data that
176     // will be later deleted.  System transliterators contain
177     // non-owned data.
178     if (isDataOwned) {
179         fData = new TransliterationRuleData(*other.fData);
180     }
181 }
182 
183 /**
184  * Destructor.
185  */
~RuleBasedTransliterator()186 RuleBasedTransliterator::~RuleBasedTransliterator() {
187     // Delete the data object only if we own it.
188     if (isDataOwned) {
189         delete fData;
190     }
191 }
192 
193 Transliterator* // Covariant return NOT ALLOWED (for portability)
clone(void) const194 RuleBasedTransliterator::clone(void) const {
195     return new RuleBasedTransliterator(*this);
196 }
197 
198 /**
199  * Implements {@link Transliterator#handleTransliterate}.
200  */
201 void
handleTransliterate(Replaceable & text,UTransPosition & index,UBool isIncremental) const202 RuleBasedTransliterator::handleTransliterate(Replaceable& text, UTransPosition& index,
203                                              UBool isIncremental) const {
204     /* We keep contextStart and contextLimit fixed the entire time,
205      * relative to the text -- contextLimit may move numerically if
206      * text is inserted or removed.  The start offset moves toward
207      * limit, with replacements happening under it.
208      *
209      * Example: rules 1. ab>x|y
210      *                2. yc>z
211      *
212      * |eabcd   begin - no match, advance start
213      * e|abcd   match rule 1 - change text & adjust start
214      * ex|ycd   match rule 2 - change text & adjust start
215      * exz|d    no match, advance start
216      * exzd|    done
217      */
218 
219     /* A rule like
220      *   a>b|a
221      * creates an infinite loop. To prevent that, we put an arbitrary
222      * limit on the number of iterations that we take, one that is
223      * high enough that any reasonable rules are ok, but low enough to
224      * prevent a server from hanging.  The limit is 16 times the
225      * number of characters n, unless n is so large that 16n exceeds a
226      * uint32_t.
227      */
228     uint32_t loopCount = 0;
229     uint32_t loopLimit = index.limit - index.start;
230     if (loopLimit >= 0x10000000) {
231         loopLimit = 0xFFFFFFFF;
232     } else {
233         loopLimit <<= 4;
234     }
235 
236     // Transliterator locking.  Rule-based Transliterators are not thread safe; concurrent
237     //   operations must be prevented.
238     // A Complication: compound transliterators can result in recursive entries to this
239     //   function, sometimes with different "This" objects, always with the same text.
240     //   Double-locking must be prevented in these cases.
241     //
242 
243     // If the transliteration data is exclusively owned by this transliterator object,
244     //   we don't need to do any locking.  No sharing between transliterators is possible,
245     //   so no concurrent access from multiple threads is possible.
246     UBool    lockedMutexAtThisLevel = FALSE;
247     if (isDataOwned == FALSE) {
248         // Test whether this request is operating on the same text string as
249         //   some other transliteration that is still in progress and holding the
250         //   transliteration mutex.  If so, do not lock the transliteration
251         //    mutex again.
252         //
253         //  gLockedText variable is protected by the global ICU mutex.
254         //  Shared RBT data protected by transliteratorDataMutex.
255         //
256         // TODO(andy): Need a better scheme for handling this.
257         UBool needToLock;
258         {
259             Mutex m;
260             needToLock = (&text != gLockedText);
261         }
262         if (needToLock) {
263             umtx_lock(&transliteratorDataMutex);  // Contention, longish waits possible here.
264             Mutex m;
265             gLockedText = &text;
266             lockedMutexAtThisLevel = TRUE;
267         }
268     }
269 
270     // Check to make sure we don't dereference a null pointer.
271     if (fData != NULL) {
272 	    while (index.start < index.limit &&
273 	           loopCount <= loopLimit &&
274 	           fData->ruleSet.transliterate(text, index, isIncremental)) {
275 	        ++loopCount;
276 	    }
277     }
278     if (lockedMutexAtThisLevel) {
279         {
280             Mutex m;
281             gLockedText = NULL;
282         }
283         umtx_unlock(&transliteratorDataMutex);
284     }
285 }
286 
toRules(UnicodeString & rulesSource,UBool escapeUnprintable) const287 UnicodeString& RuleBasedTransliterator::toRules(UnicodeString& rulesSource,
288                                                 UBool escapeUnprintable) const {
289     return fData->ruleSet.toRules(rulesSource, escapeUnprintable);
290 }
291 
292 /**
293  * Implement Transliterator framework
294  */
handleGetSourceSet(UnicodeSet & result) const295 void RuleBasedTransliterator::handleGetSourceSet(UnicodeSet& result) const {
296     fData->ruleSet.getSourceTargetSet(result, FALSE);
297 }
298 
299 /**
300  * Override Transliterator framework
301  */
getTargetSet(UnicodeSet & result) const302 UnicodeSet& RuleBasedTransliterator::getTargetSet(UnicodeSet& result) const {
303     return fData->ruleSet.getSourceTargetSet(result, TRUE);
304 }
305 
306 U_NAMESPACE_END
307 
308 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
309