1 /*
2 *****************************************************************
3 * Copyright (c) 2002-2006, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *****************************************************************
6 * Date Name Description
7 * 06/06/2002 aliu Creation.
8 *****************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_TRANSLITERATION
14
15 #include "unicode/uobject.h"
16 #include "unicode/uscript.h"
17 #include "nultrans.h"
18 #include "anytrans.h"
19 #include "uvector.h"
20 #include "tridpars.h"
21 #include "hash.h"
22 #include "putilimp.h"
23 #include "uinvchar.h"
24
25 //------------------------------------------------------------
26 // Constants
27
28 static const UChar TARGET_SEP = 45; // '-'
29 static const UChar VARIANT_SEP = 47; // '/'
30 static const UChar ANY[] = {65,110,121,0}; // "Any"
31 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
32 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
33
34 //------------------------------------------------------------
35
36 U_CDECL_BEGIN
37 /**
38 * Deleter function for Transliterator*.
39 */
40 static void U_CALLCONV
_deleteTransliterator(void * obj)41 _deleteTransliterator(void *obj) {
42 delete (U_NAMESPACE_QUALIFIER Transliterator*) obj;
43 }
44 U_CDECL_END
45
46 //------------------------------------------------------------
47
48 U_NAMESPACE_BEGIN
49
50 //------------------------------------------------------------
51 // ScriptRunIterator
52
53 /**
54 * Returns a series of ranges corresponding to scripts. They will be
55 * of the form:
56 *
57 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
58 * | | - first run (start, limit)
59 * | | - second run (start, limit)
60 *
61 * That is, the runs will overlap. The reason for this is so that a
62 * transliterator can consider common characters both before and after
63 * the scripts.
64 */
65 class ScriptRunIterator : public UMemory {
66 private:
67 const Replaceable& text;
68 int32_t textStart;
69 int32_t textLimit;
70
71 public:
72 /**
73 * The code of the current run, valid after next() returns. May
74 * be USCRIPT_INVALID_CODE if and only if the entire text is
75 * COMMON/INHERITED.
76 */
77 UScriptCode scriptCode;
78
79 /**
80 * The start of the run, inclusive, valid after next() returns.
81 */
82 int32_t start;
83
84 /**
85 * The end of the run, exclusive, valid after next() returns.
86 */
87 int32_t limit;
88
89 /**
90 * Constructs a run iterator over the given text from start
91 * (inclusive) to limit (exclusive).
92 */
93 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
94
95 /**
96 * Returns TRUE if there are any more runs. TRUE is always
97 * returned at least once. Upon return, the caller should
98 * examine scriptCode, start, and limit.
99 */
100 UBool next();
101
102 /**
103 * Adjusts internal indices for a change in the limit index of the
104 * given delta. A positive delta means the limit has increased.
105 */
106 void adjustLimit(int32_t delta);
107
108 private:
109 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
110 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
111 };
112
ScriptRunIterator(const Replaceable & theText,int32_t myStart,int32_t myLimit)113 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
114 int32_t myStart, int32_t myLimit) :
115 text(theText)
116 {
117 textStart = myStart;
118 textLimit = myLimit;
119 limit = myStart;
120 }
121
next()122 UBool ScriptRunIterator::next() {
123 UChar32 ch;
124 UScriptCode s;
125 UErrorCode ec = U_ZERO_ERROR;
126
127 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
128 start = limit;
129
130 // Are we done?
131 if (start == textLimit) {
132 return FALSE;
133 }
134
135 // Move start back to include adjacent COMMON or INHERITED
136 // characters
137 while (start > textStart) {
138 ch = text.char32At(start - 1); // look back
139 s = uscript_getScript(ch, &ec);
140 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
141 --start;
142 } else {
143 break;
144 }
145 }
146
147 // Move limit ahead to include COMMON, INHERITED, and characters
148 // of the current script.
149 while (limit < textLimit) {
150 ch = text.char32At(limit); // look ahead
151 s = uscript_getScript(ch, &ec);
152 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
153 if (scriptCode == USCRIPT_INVALID_CODE) {
154 scriptCode = s;
155 } else if (s != scriptCode) {
156 break;
157 }
158 }
159 ++limit;
160 }
161
162 // Return TRUE even if the entire text is COMMON / INHERITED, in
163 // which case scriptCode will be USCRIPT_INVALID_CODE.
164 return TRUE;
165 }
166
adjustLimit(int32_t delta)167 void ScriptRunIterator::adjustLimit(int32_t delta) {
168 limit += delta;
169 textLimit += delta;
170 }
171
172 //------------------------------------------------------------
173 // AnyTransliterator
174
// Expands to the RTTI support definitions for AnyTransliterator
// (see the macro's definition in unicode/uobject.h for what it emits).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
176
177 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
178 const UnicodeString& theTarget,
179 const UnicodeString& theVariant,
180 UScriptCode theTargetScript,
181 UErrorCode& ec) :
182 Transliterator(id, NULL),
183 targetScript(theTargetScript)
184 {
185 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
186 uhash_setValueDeleter(cache, _deleteTransliterator);
187
188 target = theTarget;
189 if (theVariant.length() > 0) {
190 target.append(VARIANT_SEP).append(theVariant);
191 }
192 }
193
~AnyTransliterator()194 AnyTransliterator::~AnyTransliterator() {
195 uhash_close(cache);
196 }
197
198 /**
199 * Copy constructor.
200 */
AnyTransliterator(const AnyTransliterator & o)201 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
202 Transliterator(o),
203 target(o.target),
204 targetScript(o.targetScript)
205 {
206 // Don't copy the cache contents
207 UErrorCode ec = U_ZERO_ERROR;
208 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
209 uhash_setValueDeleter(cache, _deleteTransliterator);
210 }
211
212 /**
213 * Transliterator API.
214 */
clone() const215 Transliterator* AnyTransliterator::clone() const {
216 return new AnyTransliterator(*this);
217 }
218
219 /**
220 * Implements {@link Transliterator#handleTransliterate}.
221 */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const222 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
223 UBool isIncremental) const {
224 int32_t allStart = pos.start;
225 int32_t allLimit = pos.limit;
226
227 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
228
229 while (it.next()) {
230 // Ignore runs in the ante context
231 if (it.limit <= allStart) continue;
232
233 // Try to instantiate transliterator from it.scriptCode to
234 // our target or target/variant
235 Transliterator* t = getTransliterator(it.scriptCode);
236
237 if (t == NULL) {
238 // We have no transliterator. Do nothing, but keep
239 // pos.start up to date.
240 pos.start = it.limit;
241 continue;
242 }
243
244 // If the run end is before the transliteration limit, do
245 // a non-incremental transliteration. Otherwise do an
246 // incremental one.
247 UBool incremental = isIncremental && (it.limit >= allLimit);
248
249 pos.start = uprv_max(allStart, it.start);
250 pos.limit = uprv_min(allLimit, it.limit);
251 int32_t limit = pos.limit;
252 t->filteredTransliterate(text, pos, incremental);
253 int32_t delta = pos.limit - limit;
254 allLimit += delta;
255 it.adjustLimit(delta);
256
257 // We're done if we enter the post context
258 if (it.limit >= allLimit) break;
259 }
260
261 // Restore limit. pos.start is fine where the last transliterator
262 // left it, or at the end of the last run.
263 pos.limit = allLimit;
264 }
265
getTransliterator(UScriptCode source) const266 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
267
268 if (source == targetScript || source == USCRIPT_INVALID_CODE) {
269 return NULL;
270 }
271
272 Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
273 if (t == NULL) {
274 UErrorCode ec = U_ZERO_ERROR;
275 UnicodeString sourceName(uscript_getName(source), -1, US_INV);
276 UnicodeString id(sourceName);
277 id.append(TARGET_SEP).append(target);
278
279 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
280 if (U_FAILURE(ec) || t == NULL) {
281 delete t;
282
283 // Try to pivot around Latin, our most common script
284 id = sourceName;
285 id.append(LATIN_PIVOT).append(target);
286 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
287 if (U_FAILURE(ec) || t == NULL) {
288 delete t;
289 t = NULL;
290 }
291 }
292
293 if (t != NULL) {
294 uhash_iput(cache, (int32_t) source, t, &ec);
295 }
296 }
297
298 return t;
299 }
300
301 /**
302 * Return the script code for a given name, or -1 if not found.
303 */
scriptNameToCode(const UnicodeString & name)304 static UScriptCode scriptNameToCode(const UnicodeString& name) {
305 char buf[128];
306 UScriptCode code;
307 UErrorCode ec = U_ZERO_ERROR;
308 int32_t nameLen = name.length();
309 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
310
311 if (isInvariant) {
312 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
313 buf[127] = 0; // Make sure that we NULL terminate the string.
314 }
315 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
316 {
317 code = USCRIPT_INVALID_CODE;
318 }
319 return code;
320 }
321
322 /**
323 * Registers standard transliterators with the system. Called by
324 * Transliterator during initialization. Scan all current targets and
325 * register those that are scripts T as Any-T/V.
326 */
registerIDs()327 void AnyTransliterator::registerIDs() {
328
329 UErrorCode ec = U_ZERO_ERROR;
330 Hashtable seen(TRUE, ec);
331
332 int32_t sourceCount = Transliterator::_countAvailableSources();
333 for (int32_t s=0; s<sourceCount; ++s) {
334 UnicodeString source;
335 Transliterator::_getAvailableSource(s, source);
336
337 // Ignore the "Any" source
338 if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
339
340 int32_t targetCount = Transliterator::_countAvailableTargets(source);
341 for (int32_t t=0; t<targetCount; ++t) {
342 UnicodeString target;
343 Transliterator::_getAvailableTarget(t, source, target);
344
345 // Only process each target once
346 if (seen.geti(target) != 0) continue;
347 ec = U_ZERO_ERROR;
348 seen.puti(target, 1, ec);
349
350 // Get the script code for the target. If not a script, ignore.
351 UScriptCode targetScript = scriptNameToCode(target);
352 if (targetScript == USCRIPT_INVALID_CODE) continue;
353
354 int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
355 // assert(variantCount >= 1);
356 for (int32_t v=0; v<variantCount; ++v) {
357 UnicodeString variant;
358 Transliterator::_getAvailableVariant(v, source, target, variant);
359
360 UnicodeString id;
361 TransliteratorIDParser::STVtoID(ANY, target, variant, id);
362 ec = U_ZERO_ERROR;
363 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
364 targetScript, ec);
365 if (U_FAILURE(ec)) {
366 delete t;
367 } else {
368 Transliterator::_registerInstance(t);
369 Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE);
370 }
371 }
372 }
373 }
374 }
375
376 U_NAMESPACE_END
377
378 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
379
380 //eof
381