• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *****************************************************************
3 * Copyright (c) 2002-2008, International Business Machines Corporation
4 * and others.  All Rights Reserved.
5 *****************************************************************
6 * Date        Name        Description
7 * 06/06/2002  aliu        Creation.
8 *****************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/uobject.h"
16 #include "unicode/uscript.h"
17 #include "nultrans.h"
18 #include "anytrans.h"
19 #include "uvector.h"
20 #include "tridpars.h"
21 #include "hash.h"
22 #include "putilimp.h"
23 #include "uinvchar.h"
24 
25 //------------------------------------------------------------
26 // Constants
27 
28 static const UChar TARGET_SEP = 45; // '-'
29 static const UChar VARIANT_SEP = 47; // '/'
30 static const UChar ANY[] = {65,110,121,0}; // "Any"
31 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
32 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
33 
34 //------------------------------------------------------------
35 
36 U_CDECL_BEGIN
37 /**
38  * Deleter function for Transliterator*.
39  */
40 static void U_CALLCONV
_deleteTransliterator(void * obj)41 _deleteTransliterator(void *obj) {
42     delete (U_NAMESPACE_QUALIFIER Transliterator*) obj;
43 }
44 U_CDECL_END
45 
46 //------------------------------------------------------------
47 
48 U_NAMESPACE_BEGIN
49 
50 //------------------------------------------------------------
51 // ScriptRunIterator
52 
53 /**
54  * Returns a series of ranges corresponding to scripts. They will be
55  * of the form:
56  *
57  * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
58  * |            |          - first run (start, limit)
59  *          |           |  - second run (start, limit)
60  *
61  * That is, the runs will overlap. The reason for this is so that a
62  * transliterator can consider common characters both before and after
63  * the scripts.
64  */
65 class ScriptRunIterator : public UMemory {
66 private:
67     const Replaceable& text;
68     int32_t textStart;
69     int32_t textLimit;
70 
71 public:
72     /**
73      * The code of the current run, valid after next() returns.  May
74      * be USCRIPT_INVALID_CODE if and only if the entire text is
75      * COMMON/INHERITED.
76      */
77     UScriptCode scriptCode;
78 
79     /**
80      * The start of the run, inclusive, valid after next() returns.
81      */
82     int32_t start;
83 
84     /**
85      * The end of the run, exclusive, valid after next() returns.
86      */
87     int32_t limit;
88 
89     /**
90      * Constructs a run iterator over the given text from start
91      * (inclusive) to limit (exclusive).
92      */
93     ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
94 
95     /**
96      * Returns TRUE if there are any more runs.  TRUE is always
97      * returned at least once.  Upon return, the caller should
98      * examine scriptCode, start, and limit.
99      */
100     UBool next();
101 
102     /**
103      * Adjusts internal indices for a change in the limit index of the
104      * given delta.  A positive delta means the limit has increased.
105      */
106     void adjustLimit(int32_t delta);
107 
108 private:
109     ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
110     ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
111 };
112 
ScriptRunIterator(const Replaceable & theText,int32_t myStart,int32_t myLimit)113 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
114                                      int32_t myStart, int32_t myLimit) :
115     text(theText)
116 {
117     textStart = myStart;
118     textLimit = myLimit;
119     limit = myStart;
120 }
121 
next()122 UBool ScriptRunIterator::next() {
123     UChar32 ch;
124     UScriptCode s;
125     UErrorCode ec = U_ZERO_ERROR;
126 
127     scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
128     start = limit;
129 
130     // Are we done?
131     if (start == textLimit) {
132         return FALSE;
133     }
134 
135     // Move start back to include adjacent COMMON or INHERITED
136     // characters
137     while (start > textStart) {
138         ch = text.char32At(start - 1); // look back
139         s = uscript_getScript(ch, &ec);
140         if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
141             --start;
142         } else {
143             break;
144         }
145     }
146 
147     // Move limit ahead to include COMMON, INHERITED, and characters
148     // of the current script.
149     while (limit < textLimit) {
150         ch = text.char32At(limit); // look ahead
151         s = uscript_getScript(ch, &ec);
152         if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
153             if (scriptCode == USCRIPT_INVALID_CODE) {
154                 scriptCode = s;
155             } else if (s != scriptCode) {
156                 break;
157             }
158         }
159         ++limit;
160     }
161 
162     // Return TRUE even if the entire text is COMMON / INHERITED, in
163     // which case scriptCode will be USCRIPT_INVALID_CODE.
164     return TRUE;
165 }
166 
adjustLimit(int32_t delta)167 void ScriptRunIterator::adjustLimit(int32_t delta) {
168     limit += delta;
169     textLimit += delta;
170 }
171 
172 //------------------------------------------------------------
173 // AnyTransliterator
174 
// ICU macro; presumably emits the class-ID (RTTI) boilerplate for
// AnyTransliterator -- see unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)

/**
 * Constructs a transliterator with the given ID that converts to
 * theTarget, optionally qualified by theVariant.
 *
 * @param id               ID passed on to the Transliterator base class
 * @param theTarget        target name
 * @param theVariant       variant name; when non-empty it is appended
 *                         to the stored target after VARIANT_SEP
 * @param theTargetScript  script code corresponding to theTarget
 * @param ec               receives a failure code if the cache table
 *                         cannot be opened; in that case the stored
 *                         target is left unset
 */
AnyTransliterator::AnyTransliterator(const UnicodeString& id,
                                     const UnicodeString& theTarget,
                                     const UnicodeString& theVariant,
                                     UScriptCode theTargetScript,
                                     UErrorCode& ec) :
    Transliterator(id, NULL),
    targetScript(theTargetScript)
{
    // Table of per-source-script transliterators, keyed by script code.
    cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
    if (U_SUCCESS(ec)) {
        // Values stored in the table are deleted through this callback.
        uhash_setValueDeleter(cache, _deleteTransliterator);

        target = theTarget;
        if (0 < theVariant.length()) {
            target.append(VARIANT_SEP).append(theVariant);
        }
    }
}
196 
~AnyTransliterator()197 AnyTransliterator::~AnyTransliterator() {
198     uhash_close(cache);
199 }
200 
201 /**
202  * Copy constructor.
203  */
AnyTransliterator(const AnyTransliterator & o)204 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
205     Transliterator(o),
206     target(o.target),
207     targetScript(o.targetScript)
208 {
209     // Don't copy the cache contents
210     UErrorCode ec = U_ZERO_ERROR;
211     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
212     if (U_FAILURE(ec)) {
213         return;
214     }
215     uhash_setValueDeleter(cache, _deleteTransliterator);
216 }
217 
218 /**
219  * Transliterator API.
220  */
clone() const221 Transliterator* AnyTransliterator::clone() const {
222     return new AnyTransliterator(*this);
223 }
224 
225 /**
226  * Implements {@link Transliterator#handleTransliterate}.
227  */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const228 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
229                                             UBool isIncremental) const {
230     int32_t allStart = pos.start;
231     int32_t allLimit = pos.limit;
232 
233     ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
234 
235     while (it.next()) {
236         // Ignore runs in the ante context
237         if (it.limit <= allStart) continue;
238 
239         // Try to instantiate transliterator from it.scriptCode to
240         // our target or target/variant
241         Transliterator* t = getTransliterator(it.scriptCode);
242 
243         if (t == NULL) {
244             // We have no transliterator.  Do nothing, but keep
245             // pos.start up to date.
246             pos.start = it.limit;
247             continue;
248         }
249 
250         // If the run end is before the transliteration limit, do
251         // a non-incremental transliteration.  Otherwise do an
252         // incremental one.
253         UBool incremental = isIncremental && (it.limit >= allLimit);
254 
255         pos.start = uprv_max(allStart, it.start);
256         pos.limit = uprv_min(allLimit, it.limit);
257         int32_t limit = pos.limit;
258         t->filteredTransliterate(text, pos, incremental);
259         int32_t delta = pos.limit - limit;
260         allLimit += delta;
261         it.adjustLimit(delta);
262 
263         // We're done if we enter the post context
264         if (it.limit >= allLimit) break;
265     }
266 
267     // Restore limit.  pos.start is fine where the last transliterator
268     // left it, or at the end of the last run.
269     pos.limit = allLimit;
270 }
271 
getTransliterator(UScriptCode source) const272 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
273 
274     if (source == targetScript || source == USCRIPT_INVALID_CODE) {
275         return NULL;
276     }
277 
278     Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
279     if (t == NULL) {
280         UErrorCode ec = U_ZERO_ERROR;
281         UnicodeString sourceName(uscript_getName(source), -1, US_INV);
282         UnicodeString id(sourceName);
283         id.append(TARGET_SEP).append(target);
284 
285         t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
286         if (U_FAILURE(ec) || t == NULL) {
287             delete t;
288 
289             // Try to pivot around Latin, our most common script
290             id = sourceName;
291             id.append(LATIN_PIVOT).append(target);
292             t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
293             if (U_FAILURE(ec) || t == NULL) {
294                 delete t;
295                 t = NULL;
296             }
297         }
298 
299         if (t != NULL) {
300             uhash_iput(cache, (int32_t) source, t, &ec);
301         }
302     }
303 
304     return t;
305 }
306 
307 /**
308  * Return the script code for a given name, or -1 if not found.
309  */
scriptNameToCode(const UnicodeString & name)310 static UScriptCode scriptNameToCode(const UnicodeString& name) {
311     char buf[128];
312     UScriptCode code;
313     UErrorCode ec = U_ZERO_ERROR;
314     int32_t nameLen = name.length();
315     UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
316 
317     if (isInvariant) {
318         name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
319         buf[127] = 0;   // Make sure that we NULL terminate the string.
320     }
321     if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
322     {
323         code = USCRIPT_INVALID_CODE;
324     }
325     return code;
326 }
327 
328 /**
329  * Registers standard transliterators with the system.  Called by
330  * Transliterator during initialization.  Scan all current targets and
331  * register those that are scripts T as Any-T/V.
332  */
registerIDs()333 void AnyTransliterator::registerIDs() {
334 
335     UErrorCode ec = U_ZERO_ERROR;
336     Hashtable seen(TRUE, ec);
337 
338     int32_t sourceCount = Transliterator::_countAvailableSources();
339     for (int32_t s=0; s<sourceCount; ++s) {
340         UnicodeString source;
341         Transliterator::_getAvailableSource(s, source);
342 
343         // Ignore the "Any" source
344         if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
345 
346         int32_t targetCount = Transliterator::_countAvailableTargets(source);
347         for (int32_t t=0; t<targetCount; ++t) {
348             UnicodeString target;
349             Transliterator::_getAvailableTarget(t, source, target);
350 
351             // Only process each target once
352             if (seen.geti(target) != 0) continue;
353             ec = U_ZERO_ERROR;
354             seen.puti(target, 1, ec);
355 
356             // Get the script code for the target.  If not a script, ignore.
357             UScriptCode targetScript = scriptNameToCode(target);
358             if (targetScript == USCRIPT_INVALID_CODE) continue;
359 
360             int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
361             // assert(variantCount >= 1);
362             for (int32_t v=0; v<variantCount; ++v) {
363                 UnicodeString variant;
364                 Transliterator::_getAvailableVariant(v, source, target, variant);
365 
366                 UnicodeString id;
367                 TransliteratorIDParser::STVtoID(ANY, target, variant, id);
368                 ec = U_ZERO_ERROR;
369                 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
370                                                              targetScript, ec);
371                 if (U_FAILURE(ec)) {
372                     delete t;
373                 } else {
374                     Transliterator::_registerInstance(t);
375                     Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE);
376                 }
377             }
378         }
379     }
380 }
381 
382 U_NAMESPACE_END
383 
384 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
385 
386 //eof
387