• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *****************************************************************
5 * Copyright (c) 2002-2014, International Business Machines Corporation
6 * and others.  All Rights Reserved.
7 *****************************************************************
8 * Date        Name        Description
9 * 06/06/2002  aliu        Creation.
10 *****************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_TRANSLITERATION
16 
17 #include "unicode/uobject.h"
18 #include "unicode/uscript.h"
19 
20 #include "anytrans.h"
21 #include "hash.h"
22 #include "mutex.h"
23 #include "nultrans.h"
24 #include "putilimp.h"
25 #include "tridpars.h"
26 #include "uinvchar.h"
27 #include "uvector.h"
28 
29 //------------------------------------------------------------
30 // Constants
31 
// ID punctuation and fixed ID fragments, spelled as UTF-16 code units.
static const char16_t TARGET_SEP    = 0x2D; // '-'
static const char16_t VARIANT_SEP   = 0x2F; // '/'
static const char16_t ANY[]         = {0x41, 0x6E, 0x79, 0}; // "Any"
static const char16_t NULL_ID[]     = {0x4E, 0x75, 0x6C, 0x6C, 0}; // "Null"
static const char16_t LATIN_PIVOT[] = {0x2D, 0x4C, 0x61, 0x74, 0x6E, 0x3B,
                                       0x4C, 0x61, 0x74, 0x6E, 0x2D, 0}; // "-Latn;Latn-"

// Starting capacity of the per-instance cache that maps a source script to
// its script-to-target transform.  The table grows on demand; text mixing
// more than 7 scripts is not expected.
#define ANY_TRANS_CACHE_INIT_SIZE 7
41 
42 //------------------------------------------------------------
43 
44 U_CDECL_BEGIN
45 /**
46  * Deleter function for Transliterator*.
47  */
48 static void U_CALLCONV
_deleteTransliterator(void * obj)49 _deleteTransliterator(void *obj) {
50     delete (icu::Transliterator*) obj;
51 }
52 U_CDECL_END
53 
54 //------------------------------------------------------------
55 
56 U_NAMESPACE_BEGIN
57 
58 //------------------------------------------------------------
59 // ScriptRunIterator
60 
61 /**
62  * Returns a series of ranges corresponding to scripts. They will be
63  * of the form:
64  *
65  * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
66  * |            |          - first run (start, limit)
67  *          |           |  - second run (start, limit)
68  *
69  * That is, the runs will overlap. The reason for this is so that a
70  * transliterator can consider common characters both before and after
71  * the scripts.
72  */
73 class ScriptRunIterator : public UMemory {
74 private:
75     const Replaceable& text;
76     int32_t textStart;
77     int32_t textLimit;
78 
79 public:
80     /**
81      * The code of the current run, valid after next() returns.  May
82      * be USCRIPT_INVALID_CODE if and only if the entire text is
83      * COMMON/INHERITED.
84      */
85     UScriptCode scriptCode;
86 
87     /**
88      * The start of the run, inclusive, valid after next() returns.
89      */
90     int32_t start;
91 
92     /**
93      * The end of the run, exclusive, valid after next() returns.
94      */
95     int32_t limit;
96 
97     /**
98      * Constructs a run iterator over the given text from start
99      * (inclusive) to limit (exclusive).
100      */
101     ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
102 
103     /**
104      * Returns true if there are any more runs.  true is always
105      * returned at least once.  Upon return, the caller should
106      * examine scriptCode, start, and limit.
107      */
108     UBool next();
109 
110     /**
111      * Adjusts internal indices for a change in the limit index of the
112      * given delta.  A positive delta means the limit has increased.
113      */
114     void adjustLimit(int32_t delta);
115 
116 private:
117     ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
118     ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
119 };
120 
ScriptRunIterator(const Replaceable & theText,int32_t myStart,int32_t myLimit)121 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
122                                      int32_t myStart, int32_t myLimit) :
123     text(theText)
124 {
125     textStart = myStart;
126     textLimit = myLimit;
127     limit = myStart;
128 }
129 
next()130 UBool ScriptRunIterator::next() {
131     UChar32 ch;
132     UScriptCode s;
133     UErrorCode ec = U_ZERO_ERROR;
134 
135     scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
136     start = limit;
137 
138     // Are we done?
139     if (start == textLimit) {
140         return false;
141     }
142 
143     // Move start back to include adjacent COMMON or INHERITED
144     // characters
145     while (start > textStart) {
146         ch = text.char32At(start - 1); // look back
147         s = uscript_getScript(ch, &ec);
148         if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
149             --start;
150         } else {
151             break;
152         }
153     }
154 
155     // Move limit ahead to include COMMON, INHERITED, and characters
156     // of the current script.
157     while (limit < textLimit) {
158         ch = text.char32At(limit); // look ahead
159         s = uscript_getScript(ch, &ec);
160         if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
161             if (scriptCode == USCRIPT_INVALID_CODE) {
162                 scriptCode = s;
163             } else if (s != scriptCode) {
164                 break;
165             }
166         }
167         ++limit;
168     }
169 
170     // Return true even if the entire text is COMMON / INHERITED, in
171     // which case scriptCode will be USCRIPT_INVALID_CODE.
172     return true;
173 }
174 
adjustLimit(int32_t delta)175 void ScriptRunIterator::adjustLimit(int32_t delta) {
176     limit += delta;
177     textLimit += delta;
178 }
179 
180 //------------------------------------------------------------
181 // AnyTransliterator
182 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)183 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
184 
185 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
186                                      const UnicodeString& theTarget,
187                                      const UnicodeString& theVariant,
188                                      UScriptCode theTargetScript,
189                                      UErrorCode& ec) :
190     Transliterator(id, nullptr),
191     targetScript(theTargetScript)
192 {
193     cache = uhash_openSize(uhash_hashLong, uhash_compareLong, nullptr, ANY_TRANS_CACHE_INIT_SIZE, &ec);
194     if (U_FAILURE(ec)) {
195         return;
196     }
197     uhash_setValueDeleter(cache, _deleteTransliterator);
198 
199     target = theTarget;
200     if (theVariant.length() > 0) {
201         target.append(VARIANT_SEP).append(theVariant);
202     }
203 }
204 
~AnyTransliterator()205 AnyTransliterator::~AnyTransliterator() {
206     uhash_close(cache);
207 }
208 
209 /**
210  * Copy constructor.
211  */
AnyTransliterator(const AnyTransliterator & o)212 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
213     Transliterator(o),
214     target(o.target),
215     targetScript(o.targetScript)
216 {
217     // Don't copy the cache contents
218     UErrorCode ec = U_ZERO_ERROR;
219     cache = uhash_openSize(uhash_hashLong, uhash_compareLong, nullptr, ANY_TRANS_CACHE_INIT_SIZE, &ec);
220     if (U_FAILURE(ec)) {
221         return;
222     }
223     uhash_setValueDeleter(cache, _deleteTransliterator);
224 }
225 
226 /**
227  * Transliterator API.
228  */
clone() const229 AnyTransliterator* AnyTransliterator::clone() const {
230     return new AnyTransliterator(*this);
231 }
232 
233 /**
234  * Implements {@link Transliterator#handleTransliterate}.
235  */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const236 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
237                                             UBool isIncremental) const {
238     int32_t allStart = pos.start;
239     int32_t allLimit = pos.limit;
240 
241     ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
242 
243     while (it.next()) {
244         // Ignore runs in the ante context
245         if (it.limit <= allStart) continue;
246 
247         // Try to instantiate transliterator from it.scriptCode to
248         // our target or target/variant
249         Transliterator* t = getTransliterator(it.scriptCode);
250 
251         if (t == nullptr) {
252             // We have no transliterator.  Do nothing, but keep
253             // pos.start up to date.
254             pos.start = it.limit;
255             continue;
256         }
257 
258         // If the run end is before the transliteration limit, do
259         // a non-incremental transliteration.  Otherwise do an
260         // incremental one.
261         UBool incremental = isIncremental && (it.limit >= allLimit);
262 
263         pos.start = uprv_max(allStart, it.start);
264         pos.limit = uprv_min(allLimit, it.limit);
265         int32_t limit = pos.limit;
266         t->filteredTransliterate(text, pos, incremental);
267         int32_t delta = pos.limit - limit;
268         allLimit += delta;
269         it.adjustLimit(delta);
270 
271         // We're done if we enter the post context
272         if (it.limit >= allLimit) break;
273     }
274 
275     // Restore limit.  pos.start is fine where the last transliterator
276     // left it, or at the end of the last run.
277     pos.limit = allLimit;
278 }
279 
getTransliterator(UScriptCode source) const280 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
281 
282     if (source == targetScript || source == USCRIPT_INVALID_CODE) {
283         return nullptr;
284     }
285 
286     Transliterator* t = nullptr;
287     {
288         Mutex m(nullptr);
289         t = (Transliterator*) uhash_iget(cache, (int32_t) source);
290     }
291     if (t == nullptr) {
292         UErrorCode ec = U_ZERO_ERROR;
293         UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
294         UnicodeString id(sourceName);
295         id.append(TARGET_SEP).append(target);
296 
297         t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
298         if (U_FAILURE(ec) || t == nullptr) {
299             delete t;
300 
301             // Try to pivot around Latin, our most common script
302             id = sourceName;
303             id.append(LATIN_PIVOT, -1).append(target);
304             t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
305             if (U_FAILURE(ec) || t == nullptr) {
306                 delete t;
307                 t = nullptr;
308             }
309         }
310 
311         if (t != nullptr) {
312             Transliterator *rt = nullptr;
313             {
314                 Mutex m(nullptr);
315                 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
316                 if (rt == nullptr) {
317                     // Common case, no race to cache this new transliterator.
318                     uhash_iput(cache, (int32_t) source, t, &ec);
319                 } else {
320                     // Race case, some other thread beat us to caching this transliterator.
321                     Transliterator *temp = rt;
322                     rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
323                     t  = temp; // The transliterator from the cache that we will return.
324                 }
325             }
326             delete rt;    // will be non-null only in case of races.
327         }
328     }
329     return t;
330 }
331 
332 /**
333  * Return the script code for a given name, or -1 if not found.
334  */
scriptNameToCode(const UnicodeString & name)335 static UScriptCode scriptNameToCode(const UnicodeString& name) {
336     char buf[128];
337     UScriptCode code;
338     UErrorCode ec = U_ZERO_ERROR;
339     int32_t nameLen = name.length();
340     UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
341 
342     if (isInvariant) {
343         name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
344         buf[127] = 0;   // Make sure that we nullptr terminate the string.
345     }
346     if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
347     {
348         code = USCRIPT_INVALID_CODE;
349     }
350     return code;
351 }
352 
353 /**
354  * Registers standard transliterators with the system.  Called by
355  * Transliterator during initialization.  Scan all current targets and
356  * register those that are scripts T as Any-T/V.
357  */
registerIDs()358 void AnyTransliterator::registerIDs() {
359 
360     UErrorCode ec = U_ZERO_ERROR;
361     Hashtable seen(true, ec);
362 
363     int32_t sourceCount = Transliterator::_countAvailableSources();
364     for (int32_t s=0; s<sourceCount; ++s) {
365         UnicodeString source;
366         Transliterator::_getAvailableSource(s, source);
367 
368         // Ignore the "Any" source
369         if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
370 
371         int32_t targetCount = Transliterator::_countAvailableTargets(source);
372         for (int32_t t=0; t<targetCount; ++t) {
373             UnicodeString target;
374             Transliterator::_getAvailableTarget(t, source, target);
375 
376             // Only process each target once
377             if (seen.geti(target) != 0) continue;
378             ec = U_ZERO_ERROR;
379             seen.puti(target, 1, ec);
380 
381             // Get the script code for the target.  If not a script, ignore.
382             UScriptCode targetScript = scriptNameToCode(target);
383             if (targetScript == USCRIPT_INVALID_CODE) continue;
384 
385             int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
386             // assert(variantCount >= 1);
387             for (int32_t v=0; v<variantCount; ++v) {
388                 UnicodeString variant;
389                 Transliterator::_getAvailableVariant(v, source, target, variant);
390 
391                 UnicodeString id;
392                 TransliteratorIDParser::STVtoID(UnicodeString(true, ANY, 3), target, variant, id);
393                 ec = U_ZERO_ERROR;
394                 AnyTransliterator* tl = new AnyTransliterator(id, target, variant,
395                                                              targetScript, ec);
396                 if (U_FAILURE(ec)) {
397                     delete tl;
398                 } else {
399                     Transliterator::_registerInstance(tl);
400                     Transliterator::_registerSpecialInverse(target, UnicodeString(true, NULL_ID, 4), false);
401                 }
402             }
403         }
404     }
405 }
406 
407 U_NAMESPACE_END
408 
409 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
410 
411 //eof
412