1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 11/17/99 aliu Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/unistr.h"
16 #include "unicode/uniset.h"
17 #include "unicode/utf16.h"
18 #include "rbt_set.h"
19 #include "rbt_rule.h"
20 #include "cmemory.h"
21 #include "putilimp.h"
22
23 U_CDECL_BEGIN
_deleteRule(void * rule)24 static void U_CALLCONV _deleteRule(void *rule) {
25 delete (icu::TransliterationRule *)rule;
26 }
27 U_CDECL_END
28
29 //----------------------------------------------------------------------
30 // BEGIN Debugging support
31 //----------------------------------------------------------------------
32
33 // #define DEBUG_RBT
34
35 #ifdef DEBUG_RBT
36 #include <stdio.h>
37 #include "charstr.h"
38
39 /**
40 * @param appendTo result is appended to this param.
41 * @param input the string being transliterated
42 * @param pos the index struct
43 */
_formatInput(UnicodeString & appendTo,const UnicodeString & input,const UTransPosition & pos)44 static UnicodeString& _formatInput(UnicodeString &appendTo,
45 const UnicodeString& input,
46 const UTransPosition& pos) {
47 // Output a string of the form aaa{bbb|ccc|ddd}eee, where
48 // the {} indicate the context start and limit, and the ||
49 // indicate the start and limit.
50 if (0 <= pos.contextStart &&
51 pos.contextStart <= pos.start &&
52 pos.start <= pos.limit &&
53 pos.limit <= pos.contextLimit &&
54 pos.contextLimit <= input.length()) {
55
56 UnicodeString a, b, c, d, e;
57 input.extractBetween(0, pos.contextStart, a);
58 input.extractBetween(pos.contextStart, pos.start, b);
59 input.extractBetween(pos.start, pos.limit, c);
60 input.extractBetween(pos.limit, pos.contextLimit, d);
61 input.extractBetween(pos.contextLimit, input.length(), e);
62 appendTo.append(a).append((UChar)123/*{*/).append(b).
63 append((UChar)124/*|*/).append(c).append((UChar)124/*|*/).append(d).
64 append((UChar)125/*}*/).append(e);
65 } else {
66 appendTo.append("INVALID UTransPosition");
67 //appendTo.append((UnicodeString)"INVALID UTransPosition {cs=" +
68 // pos.contextStart + ", s=" + pos.start + ", l=" +
69 // pos.limit + ", cl=" + pos.contextLimit + "} on " +
70 // input);
71 }
72 return appendTo;
73 }
74
75 // Append a hex string to the target
_appendHex(uint32_t number,int32_t digits,UnicodeString & target)76 UnicodeString& _appendHex(uint32_t number,
77 int32_t digits,
78 UnicodeString& target) {
79 static const UChar digitString[] = {
80 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
81 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0
82 };
83 while (digits--) {
84 target += digitString[(number >> (digits*4)) & 0xF];
85 }
86 return target;
87 }
88
89 // Replace nonprintable characters with unicode escapes
_escape(const UnicodeString & source,UnicodeString & target)90 UnicodeString& _escape(const UnicodeString &source,
91 UnicodeString &target) {
92 for (int32_t i = 0; i < source.length(); ) {
93 UChar32 ch = source.char32At(i);
94 i += U16_LENGTH(ch);
95 if (ch < 0x09 || (ch > 0x0A && ch < 0x20)|| ch > 0x7E) {
96 if (ch <= 0xFFFF) {
97 target += "\\u";
98 _appendHex(ch, 4, target);
99 } else {
100 target += "\\U";
101 _appendHex(ch, 8, target);
102 }
103 } else {
104 target += ch;
105 }
106 }
107 return target;
108 }
109
_debugOut(const char * msg,TransliterationRule * rule,const Replaceable & theText,UTransPosition & pos)110 inline void _debugOut(const char* msg, TransliterationRule* rule,
111 const Replaceable& theText, UTransPosition& pos) {
112 UnicodeString buf(msg, "");
113 if (rule) {
114 UnicodeString r;
115 rule->toRule(r, TRUE);
116 buf.append((UChar)32).append(r);
117 }
118 buf.append(UnicodeString(" => ", ""));
119 UnicodeString* text = (UnicodeString*)&theText;
120 _formatInput(buf, *text, pos);
121 UnicodeString esc;
122 _escape(buf, esc);
123 CharString cbuf(esc);
124 printf("%s\n", (const char*) cbuf);
125 }
126
127 #else
128 #define _debugOut(msg, rule, theText, pos)
129 #endif
130
131 //----------------------------------------------------------------------
132 // END Debugging support
133 //----------------------------------------------------------------------
134
135 // Fill the precontext and postcontext with the patterns of the rules
136 // that are masking one another.
maskingError(const icu::TransliterationRule & rule1,const icu::TransliterationRule & rule2,UParseError & parseError)137 static void maskingError(const icu::TransliterationRule& rule1,
138 const icu::TransliterationRule& rule2,
139 UParseError& parseError) {
140 icu::UnicodeString r;
141 int32_t len;
142
143 parseError.line = parseError.offset = -1;
144
145 // for pre-context
146 rule1.toRule(r, FALSE);
147 len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
148 r.extract(0, len, parseError.preContext);
149 parseError.preContext[len] = 0;
150
151 //for post-context
152 r.truncate(0);
153 rule2.toRule(r, FALSE);
154 len = uprv_min(r.length(), U_PARSE_CONTEXT_LEN-1);
155 r.extract(0, len, parseError.postContext);
156 parseError.postContext[len] = 0;
157 }
158
159 U_NAMESPACE_BEGIN
160
161 /**
162 * Construct a new empty rule set.
163 */
TransliterationRuleSet(UErrorCode & status)164 TransliterationRuleSet::TransliterationRuleSet(UErrorCode& status) : UMemory() {
165 ruleVector = new UVector(&_deleteRule, NULL, status);
166 if (U_FAILURE(status)) {
167 return;
168 }
169 if (ruleVector == NULL) {
170 status = U_MEMORY_ALLOCATION_ERROR;
171 }
172 rules = NULL;
173 maxContextLength = 0;
174 }
175
176 /**
177 * Copy constructor.
178 */
TransliterationRuleSet(const TransliterationRuleSet & other)179 TransliterationRuleSet::TransliterationRuleSet(const TransliterationRuleSet& other) :
180 UMemory(other),
181 ruleVector(0),
182 rules(0),
183 maxContextLength(other.maxContextLength) {
184
185 int32_t i, len;
186 uprv_memcpy(index, other.index, sizeof(index));
187 UErrorCode status = U_ZERO_ERROR;
188 ruleVector = new UVector(&_deleteRule, NULL, status);
189 if (other.ruleVector != 0 && ruleVector != 0 && U_SUCCESS(status)) {
190 len = other.ruleVector->size();
191 for (i=0; i<len && U_SUCCESS(status); ++i) {
192 TransliterationRule *tempTranslitRule = new TransliterationRule(*(TransliterationRule*)other.ruleVector->elementAt(i));
193 // Null pointer test
194 if (tempTranslitRule == NULL) {
195 status = U_MEMORY_ALLOCATION_ERROR;
196 break;
197 }
198 ruleVector->addElement(tempTranslitRule, status);
199 if (U_FAILURE(status)) {
200 break;
201 }
202 }
203 }
204 if (other.rules != 0 && U_SUCCESS(status)) {
205 UParseError p;
206 freeze(p, status);
207 }
208 }
209
210 /**
211 * Destructor.
212 */
~TransliterationRuleSet()213 TransliterationRuleSet::~TransliterationRuleSet() {
214 delete ruleVector; // This deletes the contained rules
215 uprv_free(rules);
216 }
217
setData(const TransliterationRuleData * d)218 void TransliterationRuleSet::setData(const TransliterationRuleData* d) {
219 /**
220 * We assume that the ruleset has already been frozen.
221 */
222 int32_t len = index[256]; // see freeze()
223 for (int32_t i=0; i<len; ++i) {
224 rules[i]->setData(d);
225 }
226 }
227
228 /**
229 * Return the maximum context length.
230 * @return the length of the longest preceding context.
231 */
getMaximumContextLength(void) const232 int32_t TransliterationRuleSet::getMaximumContextLength(void) const {
233 return maxContextLength;
234 }
235
236 /**
237 * Add a rule to this set. Rules are added in order, and order is
238 * significant. The last call to this method must be followed by
239 * a call to <code>freeze()</code> before the rule set is used.
240 *
241 * <p>If freeze() has already been called, calling addRule()
242 * unfreezes the rules, and freeze() must be called again.
243 *
244 * @param adoptedRule the rule to add
245 */
addRule(TransliterationRule * adoptedRule,UErrorCode & status)246 void TransliterationRuleSet::addRule(TransliterationRule* adoptedRule,
247 UErrorCode& status) {
248 if (U_FAILURE(status)) {
249 delete adoptedRule;
250 return;
251 }
252 ruleVector->addElement(adoptedRule, status);
253
254 int32_t len;
255 if ((len = adoptedRule->getContextLength()) > maxContextLength) {
256 maxContextLength = len;
257 }
258
259 uprv_free(rules);
260 rules = 0;
261 }
262
263 /**
264 * Check this for masked rules and index it to optimize performance.
265 * The sequence of operations is: (1) add rules to a set using
266 * <code>addRule()</code>; (2) freeze the set using
267 * <code>freeze()</code>; (3) use the rule set. If
268 * <code>addRule()</code> is called after calling this method, it
269 * invalidates this object, and this method must be called again.
270 * That is, <code>freeze()</code> may be called multiple times,
271 * although for optimal performance it shouldn't be.
272 */
freeze(UParseError & parseError,UErrorCode & status)273 void TransliterationRuleSet::freeze(UParseError& parseError,UErrorCode& status) {
274 /* Construct the rule array and index table. We reorder the
275 * rules by sorting them into 256 bins. Each bin contains all
276 * rules matching the index value for that bin. A rule
277 * matches an index value if string whose first key character
278 * has a low byte equal to the index value can match the rule.
279 *
280 * Each bin contains zero or more rules, in the same order
281 * they were found originally. However, the total rules in
282 * the bins may exceed the number in the original vector,
283 * since rules that have a variable as their first key
284 * character will generally fall into more than one bin.
285 *
286 * That is, each bin contains all rules that either have that
287 * first index value as their first key character, or have
288 * a set containing the index value as their first character.
289 */
290 int32_t n = ruleVector->size();
291 int32_t j;
292 int16_t x;
293 UVector v(2*n, status); // heuristic; adjust as needed
294
295 if (U_FAILURE(status)) {
296 return;
297 }
298
299 /* Precompute the index values. This saves a LOT of time.
300 * Be careful not to call malloc(0).
301 */
302 int16_t* indexValue = (int16_t*) uprv_malloc( sizeof(int16_t) * (n > 0 ? n : 1) );
303 /* test for NULL */
304 if (indexValue == 0) {
305 status = U_MEMORY_ALLOCATION_ERROR;
306 return;
307 }
308 for (j=0; j<n; ++j) {
309 TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
310 indexValue[j] = r->getIndexValue();
311 }
312 for (x=0; x<256; ++x) {
313 index[x] = v.size();
314 for (j=0; j<n; ++j) {
315 if (indexValue[j] >= 0) {
316 if (indexValue[j] == x) {
317 v.addElement(ruleVector->elementAt(j), status);
318 }
319 } else {
320 // If the indexValue is < 0, then the first key character is
321 // a set, and we must use the more time-consuming
322 // matchesIndexValue check. In practice this happens
323 // rarely, so we seldom tread this code path.
324 TransliterationRule* r = (TransliterationRule*) ruleVector->elementAt(j);
325 if (r->matchesIndexValue((uint8_t)x)) {
326 v.addElement(r, status);
327 }
328 }
329 }
330 }
331 uprv_free(indexValue);
332 index[256] = v.size();
333
334 /* Freeze things into an array.
335 */
336 uprv_free(rules); // Contains alias pointers
337
338 /* You can't do malloc(0)! */
339 if (v.size() == 0) {
340 rules = NULL;
341 return;
342 }
343 rules = (TransliterationRule **)uprv_malloc(v.size() * sizeof(TransliterationRule *));
344 /* test for NULL */
345 if (rules == 0) {
346 status = U_MEMORY_ALLOCATION_ERROR;
347 return;
348 }
349 for (j=0; j<v.size(); ++j) {
350 rules[j] = (TransliterationRule*) v.elementAt(j);
351 }
352
353 // TODO Add error reporting that indicates the rules that
354 // are being masked.
355 //UnicodeString errors;
356
357 /* Check for masking. This is MUCH faster than our old check,
358 * which was each rule against each following rule, since we
359 * only have to check for masking within each bin now. It's
360 * 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
361 * count, and n2 is the per-bin rule count. But n2<<n1, so
362 * it's a big win.
363 */
364 for (x=0; x<256; ++x) {
365 for (j=index[x]; j<index[x+1]-1; ++j) {
366 TransliterationRule* r1 = rules[j];
367 for (int32_t k=j+1; k<index[x+1]; ++k) {
368 TransliterationRule* r2 = rules[k];
369 if (r1->masks(*r2)) {
370 //| if (errors == null) {
371 //| errors = new StringBuffer();
372 //| } else {
373 //| errors.append("\n");
374 //| }
375 //| errors.append("Rule " + r1 + " masks " + r2);
376 status = U_RULE_MASK_ERROR;
377 maskingError(*r1, *r2, parseError);
378 return;
379 }
380 }
381 }
382 }
383
384 //if (errors != null) {
385 // throw new IllegalArgumentException(errors.toString());
386 //}
387 }
388
389 /**
390 * Transliterate the given text with the given UTransPosition
391 * indices. Return TRUE if the transliteration should continue
392 * or FALSE if it should halt (because of a U_PARTIAL_MATCH match).
393 * Note that FALSE is only ever returned if isIncremental is TRUE.
394 * @param text the text to be transliterated
395 * @param pos the position indices, which will be updated
396 * @param incremental if TRUE, assume new text may be inserted
397 * at index.limit, and return FALSE if thre is a partial match.
398 * @return TRUE unless a U_PARTIAL_MATCH has been obtained,
399 * indicating that transliteration should stop until more text
400 * arrives.
401 */
transliterate(Replaceable & text,UTransPosition & pos,UBool incremental)402 UBool TransliterationRuleSet::transliterate(Replaceable& text,
403 UTransPosition& pos,
404 UBool incremental) {
405 int16_t indexByte = (int16_t) (text.char32At(pos.start) & 0xFF);
406 for (int32_t i=index[indexByte]; i<index[indexByte+1]; ++i) {
407 UMatchDegree m = rules[i]->matchAndReplace(text, pos, incremental);
408 switch (m) {
409 case U_MATCH:
410 _debugOut("match", rules[i], text, pos);
411 return TRUE;
412 case U_PARTIAL_MATCH:
413 _debugOut("partial match", rules[i], text, pos);
414 return FALSE;
415 default: /* Ram: added default to make GCC happy */
416 break;
417 }
418 }
419 // No match or partial match from any rule
420 pos.start += U16_LENGTH(text.char32At(pos.start));
421 _debugOut("no match", NULL, text, pos);
422 return TRUE;
423 }
424
425 /**
426 * Create rule strings that represents this rule set.
427 */
toRules(UnicodeString & ruleSource,UBool escapeUnprintable) const428 UnicodeString& TransliterationRuleSet::toRules(UnicodeString& ruleSource,
429 UBool escapeUnprintable) const {
430 int32_t i;
431 int32_t count = ruleVector->size();
432 ruleSource.truncate(0);
433 for (i=0; i<count; ++i) {
434 if (i != 0) {
435 ruleSource.append((UChar) 0x000A /*\n*/);
436 }
437 TransliterationRule *r =
438 (TransliterationRule*) ruleVector->elementAt(i);
439 r->toRule(ruleSource, escapeUnprintable);
440 }
441 return ruleSource;
442 }
443
444 /**
445 * Return the set of all characters that may be modified
446 * (getTarget=false) or emitted (getTarget=true) by this set.
447 */
getSourceTargetSet(UnicodeSet & result,UBool getTarget) const448 UnicodeSet& TransliterationRuleSet::getSourceTargetSet(UnicodeSet& result,
449 UBool getTarget) const
450 {
451 result.clear();
452 int32_t count = ruleVector->size();
453 for (int32_t i=0; i<count; ++i) {
454 TransliterationRule* r =
455 (TransliterationRule*) ruleVector->elementAt(i);
456 if (getTarget) {
457 r->addTargetSetTo(result);
458 } else {
459 r->addSourceSetTo(result);
460 }
461 }
462 return result;
463 }
464
465 U_NAMESPACE_END
466
467 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
468