1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *****************************************************************
5 * Copyright (c) 2002-2014, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 *****************************************************************
8 * Date Name Description
9 * 06/06/2002 aliu Creation.
10 *****************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "unicode/uobject.h"
18 #include "unicode/uscript.h"
19
20 #include "anytrans.h"
21 #include "hash.h"
22 #include "mutex.h"
23 #include "nultrans.h"
24 #include "putilimp.h"
25 #include "tridpars.h"
26 #include "uinvchar.h"
27 #include "uvector.h"
28
29 //------------------------------------------------------------
30 // Constants
31
// Separators and fixed ID fragments used when composing transliterator IDs.
// All values are UTF-16 code units; the arrays are NUL-terminated.
static const char16_t TARGET_SEP    = u'-';            // U+002D, between source and target
static const char16_t VARIANT_SEP   = u'/';            // U+002F, between target and variant
static const char16_t ANY[]         = u"Any";          // source name of the Any-XXXX transforms
static const char16_t NULL_ID[]     = u"Null";         // ID of the null transliterator
static const char16_t LATIN_PIVOT[] = u"-Latn;Latn-";  // glue for pivoting through Latin

// Initial size of an Any-XXXX transform's cache of script-XXXX transforms.
// The cache grows as needed, but source text with more than 7 scripts is
// not expected.
#define ANY_TRANS_CACHE_INIT_SIZE 7
41
42 //------------------------------------------------------------
43
44 U_CDECL_BEGIN
45 /**
46 * Deleter function for Transliterator*.
47 */
48 static void U_CALLCONV
_deleteTransliterator(void * obj)49 _deleteTransliterator(void *obj) {
50 delete (icu::Transliterator*) obj;
51 }
52 U_CDECL_END
53
54 //------------------------------------------------------------
55
56 U_NAMESPACE_BEGIN
57
58 //------------------------------------------------------------
59 // ScriptRunIterator
60
61 /**
62 * Returns a series of ranges corresponding to scripts. They will be
63 * of the form:
64 *
65 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second
66 * | | - first run (start, limit)
67 * | | - second run (start, limit)
68 *
69 * That is, the runs will overlap. The reason for this is so that a
70 * transliterator can consider common characters both before and after
71 * the scripts.
72 */
73 class ScriptRunIterator : public UMemory {
74 private:
75 const Replaceable& text;
76 int32_t textStart;
77 int32_t textLimit;
78
79 public:
80 /**
81 * The code of the current run, valid after next() returns. May
82 * be USCRIPT_INVALID_CODE if and only if the entire text is
83 * COMMON/INHERITED.
84 */
85 UScriptCode scriptCode;
86
87 /**
88 * The start of the run, inclusive, valid after next() returns.
89 */
90 int32_t start;
91
92 /**
93 * The end of the run, exclusive, valid after next() returns.
94 */
95 int32_t limit;
96
97 /**
98 * Constructs a run iterator over the given text from start
99 * (inclusive) to limit (exclusive).
100 */
101 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
102
103 /**
104 * Returns true if there are any more runs. true is always
105 * returned at least once. Upon return, the caller should
106 * examine scriptCode, start, and limit.
107 */
108 UBool next();
109
110 /**
111 * Adjusts internal indices for a change in the limit index of the
112 * given delta. A positive delta means the limit has increased.
113 */
114 void adjustLimit(int32_t delta);
115
116 private:
117 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
118 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
119 };
120
ScriptRunIterator(const Replaceable & theText,int32_t myStart,int32_t myLimit)121 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
122 int32_t myStart, int32_t myLimit) :
123 text(theText)
124 {
125 textStart = myStart;
126 textLimit = myLimit;
127 limit = myStart;
128 }
129
next()130 UBool ScriptRunIterator::next() {
131 UChar32 ch;
132 UScriptCode s;
133 UErrorCode ec = U_ZERO_ERROR;
134
135 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
136 start = limit;
137
138 // Are we done?
139 if (start == textLimit) {
140 return false;
141 }
142
143 // Move start back to include adjacent COMMON or INHERITED
144 // characters
145 while (start > textStart) {
146 ch = text.char32At(start - 1); // look back
147 s = uscript_getScript(ch, &ec);
148 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
149 --start;
150 } else {
151 break;
152 }
153 }
154
155 // Move limit ahead to include COMMON, INHERITED, and characters
156 // of the current script.
157 while (limit < textLimit) {
158 ch = text.char32At(limit); // look ahead
159 s = uscript_getScript(ch, &ec);
160 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
161 if (scriptCode == USCRIPT_INVALID_CODE) {
162 scriptCode = s;
163 } else if (s != scriptCode) {
164 break;
165 }
166 }
167 ++limit;
168 }
169
170 // Return true even if the entire text is COMMON / INHERITED, in
171 // which case scriptCode will be USCRIPT_INVALID_CODE.
172 return true;
173 }
174
adjustLimit(int32_t delta)175 void ScriptRunIterator::adjustLimit(int32_t delta) {
176 limit += delta;
177 textLimit += delta;
178 }
179
180 //------------------------------------------------------------
181 // AnyTransliterator
182
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)183 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
184
185 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
186 const UnicodeString& theTarget,
187 const UnicodeString& theVariant,
188 UScriptCode theTargetScript,
189 UErrorCode& ec) :
190 Transliterator(id, nullptr),
191 targetScript(theTargetScript)
192 {
193 cache = uhash_openSize(uhash_hashLong, uhash_compareLong, nullptr, ANY_TRANS_CACHE_INIT_SIZE, &ec);
194 if (U_FAILURE(ec)) {
195 return;
196 }
197 uhash_setValueDeleter(cache, _deleteTransliterator);
198
199 target = theTarget;
200 if (theVariant.length() > 0) {
201 target.append(VARIANT_SEP).append(theVariant);
202 }
203 }
204
/**
 * Destructor. Closes the cache of per-script transliterators. The
 * constructors install _deleteTransliterator as the cache's value
 * deleter, so the cached transliterators are presumably destroyed as
 * part of the close.
 */
AnyTransliterator::~AnyTransliterator() {
    uhash_close(cache);
}
208
209 /**
210 * Copy constructor.
211 */
AnyTransliterator(const AnyTransliterator & o)212 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
213 Transliterator(o),
214 target(o.target),
215 targetScript(o.targetScript)
216 {
217 // Don't copy the cache contents
218 UErrorCode ec = U_ZERO_ERROR;
219 cache = uhash_openSize(uhash_hashLong, uhash_compareLong, nullptr, ANY_TRANS_CACHE_INIT_SIZE, &ec);
220 if (U_FAILURE(ec)) {
221 return;
222 }
223 uhash_setValueDeleter(cache, _deleteTransliterator);
224 }
225
226 /**
227 * Transliterator API.
228 */
clone() const229 AnyTransliterator* AnyTransliterator::clone() const {
230 return new AnyTransliterator(*this);
231 }
232
233 /**
234 * Implements {@link Transliterator#handleTransliterate}.
235 */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const236 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
237 UBool isIncremental) const {
238 int32_t allStart = pos.start;
239 int32_t allLimit = pos.limit;
240
241 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
242
243 while (it.next()) {
244 // Ignore runs in the ante context
245 if (it.limit <= allStart) continue;
246
247 // Try to instantiate transliterator from it.scriptCode to
248 // our target or target/variant
249 Transliterator* t = getTransliterator(it.scriptCode);
250
251 if (t == nullptr) {
252 // We have no transliterator. Do nothing, but keep
253 // pos.start up to date.
254 pos.start = it.limit;
255 continue;
256 }
257
258 // If the run end is before the transliteration limit, do
259 // a non-incremental transliteration. Otherwise do an
260 // incremental one.
261 UBool incremental = isIncremental && (it.limit >= allLimit);
262
263 pos.start = uprv_max(allStart, it.start);
264 pos.limit = uprv_min(allLimit, it.limit);
265 int32_t limit = pos.limit;
266 t->filteredTransliterate(text, pos, incremental);
267 int32_t delta = pos.limit - limit;
268 allLimit += delta;
269 it.adjustLimit(delta);
270
271 // We're done if we enter the post context
272 if (it.limit >= allLimit) break;
273 }
274
275 // Restore limit. pos.start is fine where the last transliterator
276 // left it, or at the end of the last run.
277 pos.limit = allLimit;
278 }
279
getTransliterator(UScriptCode source) const280 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
281
282 if (source == targetScript || source == USCRIPT_INVALID_CODE) {
283 return nullptr;
284 }
285
286 Transliterator* t = nullptr;
287 {
288 Mutex m(nullptr);
289 t = (Transliterator*) uhash_iget(cache, (int32_t) source);
290 }
291 if (t == nullptr) {
292 UErrorCode ec = U_ZERO_ERROR;
293 UnicodeString sourceName(uscript_getShortName(source), -1, US_INV);
294 UnicodeString id(sourceName);
295 id.append(TARGET_SEP).append(target);
296
297 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
298 if (U_FAILURE(ec) || t == nullptr) {
299 delete t;
300
301 // Try to pivot around Latin, our most common script
302 id = sourceName;
303 id.append(LATIN_PIVOT, -1).append(target);
304 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
305 if (U_FAILURE(ec) || t == nullptr) {
306 delete t;
307 t = nullptr;
308 }
309 }
310
311 if (t != nullptr) {
312 Transliterator *rt = nullptr;
313 {
314 Mutex m(nullptr);
315 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
316 if (rt == nullptr) {
317 // Common case, no race to cache this new transliterator.
318 uhash_iput(cache, (int32_t) source, t, &ec);
319 } else {
320 // Race case, some other thread beat us to caching this transliterator.
321 Transliterator *temp = rt;
322 rt = t; // Our newly created transliterator that lost the race & now needs deleting.
323 t = temp; // The transliterator from the cache that we will return.
324 }
325 }
326 delete rt; // will be non-null only in case of races.
327 }
328 }
329 return t;
330 }
331
332 /**
333 * Return the script code for a given name, or -1 if not found.
334 */
scriptNameToCode(const UnicodeString & name)335 static UScriptCode scriptNameToCode(const UnicodeString& name) {
336 char buf[128];
337 UScriptCode code;
338 UErrorCode ec = U_ZERO_ERROR;
339 int32_t nameLen = name.length();
340 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
341
342 if (isInvariant) {
343 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
344 buf[127] = 0; // Make sure that we nullptr terminate the string.
345 }
346 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
347 {
348 code = USCRIPT_INVALID_CODE;
349 }
350 return code;
351 }
352
353 /**
354 * Registers standard transliterators with the system. Called by
355 * Transliterator during initialization. Scan all current targets and
356 * register those that are scripts T as Any-T/V.
357 */
registerIDs()358 void AnyTransliterator::registerIDs() {
359
360 UErrorCode ec = U_ZERO_ERROR;
361 Hashtable seen(true, ec);
362
363 int32_t sourceCount = Transliterator::_countAvailableSources();
364 for (int32_t s=0; s<sourceCount; ++s) {
365 UnicodeString source;
366 Transliterator::_getAvailableSource(s, source);
367
368 // Ignore the "Any" source
369 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
370
371 int32_t targetCount = Transliterator::_countAvailableTargets(source);
372 for (int32_t t=0; t<targetCount; ++t) {
373 UnicodeString target;
374 Transliterator::_getAvailableTarget(t, source, target);
375
376 // Only process each target once
377 if (seen.geti(target) != 0) continue;
378 ec = U_ZERO_ERROR;
379 seen.puti(target, 1, ec);
380
381 // Get the script code for the target. If not a script, ignore.
382 UScriptCode targetScript = scriptNameToCode(target);
383 if (targetScript == USCRIPT_INVALID_CODE) continue;
384
385 int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
386 // assert(variantCount >= 1);
387 for (int32_t v=0; v<variantCount; ++v) {
388 UnicodeString variant;
389 Transliterator::_getAvailableVariant(v, source, target, variant);
390
391 UnicodeString id;
392 TransliteratorIDParser::STVtoID(UnicodeString(true, ANY, 3), target, variant, id);
393 ec = U_ZERO_ERROR;
394 AnyTransliterator* tl = new AnyTransliterator(id, target, variant,
395 targetScript, ec);
396 if (U_FAILURE(ec)) {
397 delete tl;
398 } else {
399 Transliterator::_registerInstance(tl);
400 Transliterator::_registerSpecialInverse(target, UnicodeString(true, NULL_ID, 4), false);
401 }
402 }
403 }
404 }
405 }
406
407 U_NAMESPACE_END
408
409 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
410
411 //eof
412