1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *****************************************************************
5 * Copyright (c) 2002-2014, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 *****************************************************************
8 * Date Name Description
9 * 06/06/2002 aliu Creation.
10 *****************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_TRANSLITERATION
16
17 #include "unicode/uobject.h"
18 #include "unicode/uscript.h"
19
20 #include "anytrans.h"
21 #include "hash.h"
22 #include "mutex.h"
23 #include "nultrans.h"
24 #include "putilimp.h"
25 #include "tridpars.h"
26 #include "uinvchar.h"
27 #include "uvector.h"
28
29 //------------------------------------------------------------
30 // Constants
31
// ID-building constants. Each is spelled as numeric UTF-16 code units
// rather than as a character/string literal; the trailing comment on
// each line shows the text it encodes.
// NOTE(review): presumably written numerically so the values do not depend
// on the compiler's source/execution charset -- confirm.

// Separator between the source and target parts of a transliterator ID.
static const UChar TARGET_SEP = 45; // '-'
// Separator between the target and variant parts of a transliterator ID.
static const UChar VARIANT_SEP = 47; // '/'
// Source name of the script-agnostic transliterators handled in this file (NUL-terminated, 3 code units).
static const UChar ANY[] = {65,110,121,0}; // "Any"
// Name registered as the special inverse of each target (NUL-terminated, 4 code units).
static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
// Glue inserted between a source script name and the target when building a
// compound ID that goes through Latin: <source> + "-Latin;Latin-" + <target>.
static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
37
38 //------------------------------------------------------------
39
40 U_CDECL_BEGIN
41 /**
42 * Deleter function for Transliterator*.
43 */
44 static void U_CALLCONV
_deleteTransliterator(void * obj)45 _deleteTransliterator(void *obj) {
46 delete (icu::Transliterator*) obj;
47 }
48 U_CDECL_END
49
50 //------------------------------------------------------------
51
52 U_NAMESPACE_BEGIN
53
54 //------------------------------------------------------------
55 // ScriptRunIterator
56
/**
 * Returns a series of ranges corresponding to scripts.  They will be
 * of the form:
 *
 *    ccccSScSSccccTTcTcccc     - c = common, S = first script, T = second
 *    |            |            - first run  (start, limit)
 *             |           |    - second run (start, limit)
 *
 * That is, the runs will overlap.  The reason for this is so that a
 * transliterator can consider common characters both before and after
 * the scripts.
 *
 * The public fields scriptCode, start and limit describe the current run
 * and are refreshed by each call to next().
 */
class ScriptRunIterator : public UMemory {
private:
    // The text being scanned; held by reference, not copied.
    const Replaceable& text;
    // Inclusive lower bound of the scanned range.
    int32_t textStart;
    // Exclusive upper bound of the scanned range; moved by adjustLimit().
    int32_t textLimit;

public:
    /**
     * The code of the current run, valid after next() returns.  It is
     * USCRIPT_INVALID_CODE when no character with a specific script
     * (i.e. other than COMMON/INHERITED) was seen in the run, which
     * happens when the entire text is COMMON/INHERITED.
     */
    UScriptCode scriptCode;

    /**
     * The start of the run, inclusive, valid after next() returns.
     */
    int32_t start;

    /**
     * The end of the run, exclusive, valid after next() returns.
     */
    int32_t limit;

    /**
     * Constructs a run iterator over the given text from start
     * (inclusive) to limit (exclusive).
     */
    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);

    /**
     * Advances to the next run.  Returns TRUE if a run was produced, in
     * which case the caller should examine scriptCode, start, and limit.
     * Returns FALSE once the end of the previous run has reached the end
     * of the text; for an empty range this happens on the very first call.
     */
    UBool next();

    /**
     * Adjusts internal indices for a change in the limit index of the
     * given delta.  A positive delta means the limit has increased.
     * Both the current run's limit and the overall text limit are shifted.
     */
    void adjustLimit(int32_t delta);

private:
    // Declared but not defined here: copying is intentionally unavailable.
    ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
    ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
};
116
ScriptRunIterator(const Replaceable & theText,int32_t myStart,int32_t myLimit)117 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
118 int32_t myStart, int32_t myLimit) :
119 text(theText)
120 {
121 textStart = myStart;
122 textLimit = myLimit;
123 limit = myStart;
124 }
125
next()126 UBool ScriptRunIterator::next() {
127 UChar32 ch;
128 UScriptCode s;
129 UErrorCode ec = U_ZERO_ERROR;
130
131 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
132 start = limit;
133
134 // Are we done?
135 if (start == textLimit) {
136 return FALSE;
137 }
138
139 // Move start back to include adjacent COMMON or INHERITED
140 // characters
141 while (start > textStart) {
142 ch = text.char32At(start - 1); // look back
143 s = uscript_getScript(ch, &ec);
144 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
145 --start;
146 } else {
147 break;
148 }
149 }
150
151 // Move limit ahead to include COMMON, INHERITED, and characters
152 // of the current script.
153 while (limit < textLimit) {
154 ch = text.char32At(limit); // look ahead
155 s = uscript_getScript(ch, &ec);
156 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
157 if (scriptCode == USCRIPT_INVALID_CODE) {
158 scriptCode = s;
159 } else if (s != scriptCode) {
160 break;
161 }
162 }
163 ++limit;
164 }
165
166 // Return TRUE even if the entire text is COMMON / INHERITED, in
167 // which case scriptCode will be USCRIPT_INVALID_CODE.
168 return TRUE;
169 }
170
adjustLimit(int32_t delta)171 void ScriptRunIterator::adjustLimit(int32_t delta) {
172 limit += delta;
173 textLimit += delta;
174 }
175
176 //------------------------------------------------------------
177 // AnyTransliterator
178
// NOTE(review): presumably expands to the class-ID (ICU "poor man's RTTI")
// member definitions for AnyTransliterator; the macro is defined outside
// this file -- confirm against unicode/uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
180
181 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
182 const UnicodeString& theTarget,
183 const UnicodeString& theVariant,
184 UScriptCode theTargetScript,
185 UErrorCode& ec) :
186 Transliterator(id, NULL),
187 targetScript(theTargetScript)
188 {
189 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
190 if (U_FAILURE(ec)) {
191 return;
192 }
193 uhash_setValueDeleter(cache, _deleteTransliterator);
194
195 target = theTarget;
196 if (theVariant.length() > 0) {
197 target.append(VARIANT_SEP).append(theVariant);
198 }
199 }
200
/**
 * Destructor.  Closes the transliterator cache; the constructors install
 * _deleteTransliterator as the table's value deleter, so cached
 * transliterators are expected to be destroyed along with the table.
 */
AnyTransliterator::~AnyTransliterator() {
    uhash_close(cache);
}
204
205 /**
206 * Copy constructor.
207 */
AnyTransliterator(const AnyTransliterator & o)208 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
209 Transliterator(o),
210 target(o.target),
211 targetScript(o.targetScript)
212 {
213 // Don't copy the cache contents
214 UErrorCode ec = U_ZERO_ERROR;
215 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
216 if (U_FAILURE(ec)) {
217 return;
218 }
219 uhash_setValueDeleter(cache, _deleteTransliterator);
220 }
221
/**
 * Transliterator API.
 *
 * Returns a newly allocated copy of this object made with the copy
 * constructor.
 */
Transliterator* AnyTransliterator::clone() const {
    return new AnyTransliterator(*this);
}
228
229 /**
230 * Implements {@link Transliterator#handleTransliterate}.
231 */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const232 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
233 UBool isIncremental) const {
234 int32_t allStart = pos.start;
235 int32_t allLimit = pos.limit;
236
237 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
238
239 while (it.next()) {
240 // Ignore runs in the ante context
241 if (it.limit <= allStart) continue;
242
243 // Try to instantiate transliterator from it.scriptCode to
244 // our target or target/variant
245 Transliterator* t = getTransliterator(it.scriptCode);
246
247 if (t == NULL) {
248 // We have no transliterator. Do nothing, but keep
249 // pos.start up to date.
250 pos.start = it.limit;
251 continue;
252 }
253
254 // If the run end is before the transliteration limit, do
255 // a non-incremental transliteration. Otherwise do an
256 // incremental one.
257 UBool incremental = isIncremental && (it.limit >= allLimit);
258
259 pos.start = uprv_max(allStart, it.start);
260 pos.limit = uprv_min(allLimit, it.limit);
261 int32_t limit = pos.limit;
262 t->filteredTransliterate(text, pos, incremental);
263 int32_t delta = pos.limit - limit;
264 allLimit += delta;
265 it.adjustLimit(delta);
266
267 // We're done if we enter the post context
268 if (it.limit >= allLimit) break;
269 }
270
271 // Restore limit. pos.start is fine where the last transliterator
272 // left it, or at the end of the last run.
273 pos.limit = allLimit;
274 }
275
getTransliterator(UScriptCode source) const276 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
277
278 if (source == targetScript || source == USCRIPT_INVALID_CODE) {
279 return NULL;
280 }
281
282 Transliterator* t = NULL;
283 {
284 Mutex m(NULL);
285 t = (Transliterator*) uhash_iget(cache, (int32_t) source);
286 }
287 if (t == NULL) {
288 UErrorCode ec = U_ZERO_ERROR;
289 UnicodeString sourceName(uscript_getName(source), -1, US_INV);
290 UnicodeString id(sourceName);
291 id.append(TARGET_SEP).append(target);
292
293 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
294 if (U_FAILURE(ec) || t == NULL) {
295 delete t;
296
297 // Try to pivot around Latin, our most common script
298 id = sourceName;
299 id.append(LATIN_PIVOT, -1).append(target);
300 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
301 if (U_FAILURE(ec) || t == NULL) {
302 delete t;
303 t = NULL;
304 }
305 }
306
307 if (t != NULL) {
308 Transliterator *rt = NULL;
309 {
310 Mutex m(NULL);
311 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
312 if (rt == NULL) {
313 // Common case, no race to cache this new transliterator.
314 uhash_iput(cache, (int32_t) source, t, &ec);
315 } else {
316 // Race case, some other thread beat us to caching this transliterator.
317 Transliterator *temp = rt;
318 rt = t; // Our newly created transliterator that lost the race & now needs deleting.
319 t = temp; // The transliterator from the cache that we will return.
320 }
321 }
322 delete rt; // will be non-null only in case of races.
323 }
324 }
325 return t;
326 }
327
328 /**
329 * Return the script code for a given name, or -1 if not found.
330 */
scriptNameToCode(const UnicodeString & name)331 static UScriptCode scriptNameToCode(const UnicodeString& name) {
332 char buf[128];
333 UScriptCode code;
334 UErrorCode ec = U_ZERO_ERROR;
335 int32_t nameLen = name.length();
336 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
337
338 if (isInvariant) {
339 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
340 buf[127] = 0; // Make sure that we NULL terminate the string.
341 }
342 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
343 {
344 code = USCRIPT_INVALID_CODE;
345 }
346 return code;
347 }
348
349 /**
350 * Registers standard transliterators with the system. Called by
351 * Transliterator during initialization. Scan all current targets and
352 * register those that are scripts T as Any-T/V.
353 */
registerIDs()354 void AnyTransliterator::registerIDs() {
355
356 UErrorCode ec = U_ZERO_ERROR;
357 Hashtable seen(TRUE, ec);
358
359 int32_t sourceCount = Transliterator::_countAvailableSources();
360 for (int32_t s=0; s<sourceCount; ++s) {
361 UnicodeString source;
362 Transliterator::_getAvailableSource(s, source);
363
364 // Ignore the "Any" source
365 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
366
367 int32_t targetCount = Transliterator::_countAvailableTargets(source);
368 for (int32_t t=0; t<targetCount; ++t) {
369 UnicodeString target;
370 Transliterator::_getAvailableTarget(t, source, target);
371
372 // Only process each target once
373 if (seen.geti(target) != 0) continue;
374 ec = U_ZERO_ERROR;
375 seen.puti(target, 1, ec);
376
377 // Get the script code for the target. If not a script, ignore.
378 UScriptCode targetScript = scriptNameToCode(target);
379 if (targetScript == USCRIPT_INVALID_CODE) continue;
380
381 int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
382 // assert(variantCount >= 1);
383 for (int32_t v=0; v<variantCount; ++v) {
384 UnicodeString variant;
385 Transliterator::_getAvailableVariant(v, source, target, variant);
386
387 UnicodeString id;
388 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
389 ec = U_ZERO_ERROR;
390 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
391 targetScript, ec);
392 if (U_FAILURE(ec)) {
393 delete t;
394 } else {
395 Transliterator::_registerInstance(t);
396 Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
397 }
398 }
399 }
400 }
401 }
402
403 U_NAMESPACE_END
404
405 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
406
407 //eof
408