• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 2012-2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7 
8 #include "unicode/utypes.h"
9 
10 #include "unicode/uchar.h"
11 #include "unicode/utf16.h"
12 
13 #include "identifier_info.h"
14 #include "mutex.h"
15 #include "scriptset.h"
16 #include "ucln_in.h"
17 #include "uvector.h"
18 
19 U_NAMESPACE_BEGIN
20 
21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
22 
23 static UnicodeSet *ASCII;
24 static ScriptSet *JAPANESE;
25 static ScriptSet *CHINESE;
26 static ScriptSet *KOREAN;
27 static ScriptSet *CONFUSABLE_WITH_LATIN;
28 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
29 
30 
31 U_CDECL_BEGIN
32 static UBool U_CALLCONV
IdentifierInfo_cleanup(void)33 IdentifierInfo_cleanup(void) {
34     delete ASCII;
35     ASCII = NULL;
36     delete JAPANESE;
37     JAPANESE = NULL;
38     delete CHINESE;
39     CHINESE = NULL;
40     delete KOREAN;
41     KOREAN = NULL;
42     delete CONFUSABLE_WITH_LATIN;
43     CONFUSABLE_WITH_LATIN = NULL;
44     gIdentifierInfoInitOnce.reset();
45     return TRUE;
46 }
47 
48 static void U_CALLCONV
IdentifierInfo_init(UErrorCode & status)49 IdentifierInfo_init(UErrorCode &status) {
50     ASCII    = new UnicodeSet(0, 0x7f);
51     JAPANESE = new ScriptSet();
52     CHINESE  = new ScriptSet();
53     KOREAN   = new ScriptSet();
54     CONFUSABLE_WITH_LATIN = new ScriptSet();
55     if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
56             || CONFUSABLE_WITH_LATIN == NULL) {
57         status = U_MEMORY_ALLOCATION_ERROR;
58         return;
59     }
60     ASCII->freeze();
61     JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
62              .set(USCRIPT_KATAKANA, status);
63     CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
64     KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
65     CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
66               .set(USCRIPT_CHEROKEE, status);
67     ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
68 }
69 U_CDECL_END
70 
71 
IdentifierInfo(UErrorCode & status)72 IdentifierInfo::IdentifierInfo(UErrorCode &status):
73          fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
74          fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
75     umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
76     if (U_FAILURE(status)) {
77         return;
78     }
79 
80     fIdentifier = new UnicodeString();
81     fRequiredScripts = new ScriptSet();
82     fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
83     uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
84     fCommonAmongAlternates = new ScriptSet();
85     fNumerics = new UnicodeSet();
86     fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
87 
88     if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
89                               fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
90         status = U_MEMORY_ALLOCATION_ERROR;
91     }
92 }
93 
~IdentifierInfo()94 IdentifierInfo::~IdentifierInfo() {
95     delete fIdentifier;
96     delete fRequiredScripts;
97     uhash_close(fScriptSetSet);
98     delete fCommonAmongAlternates;
99     delete fNumerics;
100     delete fIdentifierProfile;
101 }
102 
103 
clear()104 IdentifierInfo &IdentifierInfo::clear() {
105     fRequiredScripts->resetAll();
106     uhash_removeAll(fScriptSetSet);
107     fNumerics->clear();
108     fCommonAmongAlternates->resetAll();
109     return *this;
110 }
111 
112 
setIdentifierProfile(const UnicodeSet & identifierProfile)113 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
114     *fIdentifierProfile = identifierProfile;
115     return *this;
116 }
117 
118 
getIdentifierProfile() const119 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
120     return *fIdentifierProfile;
121 }
122 
123 
setIdentifier(const UnicodeString & identifier,UErrorCode & status)124 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
125     if (U_FAILURE(status)) {
126         return *this;
127     }
128     *fIdentifier = identifier;
129     clear();
130     ScriptSet scriptsForCP;
131     UChar32 cp;
132     for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
133         cp = identifier.char32At(i);
134         // Store a representative character for each kind of decimal digit
135         if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
136             // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
137             fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
138         }
139         UScriptCode extensions[500];
140         int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
141         if (U_FAILURE(status)) {
142             return *this;
143         }
144         scriptsForCP.resetAll();
145         for (int32_t j=0; j<extensionsCount; j++) {
146             scriptsForCP.set(extensions[j], status);
147         }
148         scriptsForCP.reset(USCRIPT_COMMON, status);
149         scriptsForCP.reset(USCRIPT_INHERITED, status);
150         switch (scriptsForCP.countMembers()) {
151           case 0: break;
152           case 1:
153             // Single script, record it.
154             fRequiredScripts->Union(scriptsForCP);
155             break;
156           default:
157             if (!fRequiredScripts->intersects(scriptsForCP)
158                     && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
159                 // If the set hasn't been added already, add it
160                 //    (Add a copy, fScriptSetSet takes ownership of the copy.)
161                 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
162             }
163             break;
164         }
165     }
166     // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
167     // [Kana], [Kana Hira] => [Kana]
168     // This is relatively infrequent, so doesn't have to be optimized.
169     // We also compute any commonalities among the alternates.
170     if (uhash_count(fScriptSetSet) > 0) {
171         fCommonAmongAlternates->setAll();
172         for (int32_t it = -1;;) {
173             const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
174             if (nextHashEl == NULL) {
175                 break;
176             }
177             ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
178             // [Kana], [Kana Hira] => [Kana]
179             if (fRequiredScripts->intersects(*next)) {
180                 uhash_removeElement(fScriptSetSet, nextHashEl);
181             } else {
182                 fCommonAmongAlternates->intersect(*next);
183                 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
184                 for (int32_t otherIt = -1;;) {
185                     const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
186                     if (otherHashEl == NULL) {
187                         break;
188                     }
189                     ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
190                     if (next != other && next->contains(*other)) {
191                         uhash_removeElement(fScriptSetSet, nextHashEl);
192                         break;
193                     }
194                 }
195             }
196         }
197     }
198     if (uhash_count(fScriptSetSet) == 0) {
199         fCommonAmongAlternates->resetAll();
200     }
201     return *this;
202 }
203 
204 
getIdentifier() const205 const UnicodeString *IdentifierInfo::getIdentifier() const {
206     return fIdentifier;
207 }
208 
getScripts() const209 const ScriptSet *IdentifierInfo::getScripts() const {
210     return fRequiredScripts;
211 }
212 
getAlternates() const213 const UHashtable *IdentifierInfo::getAlternates() const {
214     return fScriptSetSet;
215 }
216 
217 
getNumerics() const218 const UnicodeSet *IdentifierInfo::getNumerics() const {
219     return fNumerics;
220 }
221 
getCommonAmongAlternates() const222 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
223     return fCommonAmongAlternates;
224 }
225 
226 #if !UCONFIG_NO_NORMALIZATION
227 
getRestrictionLevel(UErrorCode & status) const228 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
229     if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
230         return USPOOF_UNRESTRICTIVE;
231     }
232     if (ASCII->containsAll(*fIdentifier)) {
233         return USPOOF_ASCII;
234     }
235     // This is a bit tricky. We look at a number of factors.
236     // The number of scripts in the text.
237     // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
238     // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
239 
240     // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
241     //       time it is created, in setIdentifier().
242     int32_t cardinalityPlus = fRequiredScripts->countMembers() +
243             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
244     if (cardinalityPlus < 2) {
245         return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
246     }
247     if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
248             || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
249         return USPOOF_HIGHLY_RESTRICTIVE;
250     }
251     if (cardinalityPlus == 2 &&
252             fRequiredScripts->test(USCRIPT_LATIN, status) &&
253             !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
254         return USPOOF_MODERATELY_RESTRICTIVE;
255     }
256     return USPOOF_MINIMALLY_RESTRICTIVE;
257 }
258 
259 #endif /* !UCONFIG_NO_NORMALIZATION */
260 
getScriptCount() const261 int32_t IdentifierInfo::getScriptCount() const {
262     // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
263     int32_t count = fRequiredScripts->countMembers() +
264             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
265     return count;
266 }
267 
268 
269 
containsWithAlternates(const ScriptSet & container,const ScriptSet & containee) const270 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
271     if (!container.contains(containee)) {
272         return FALSE;
273     }
274     for (int32_t iter = -1; ;) {
275         const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
276         if (hashEl == NULL) {
277             break;
278         }
279         ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
280         if (!container.intersects(*alternatives)) {
281             return false;
282         }
283     }
284     return true;
285 }
286 
displayAlternates(UnicodeString & dest,const UHashtable * alternates,UErrorCode & status)287 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
288     UVector sorted(status);
289     if (U_FAILURE(status)) {
290         return dest;
291     }
292     for (int32_t pos = -1; ;) {
293         const UHashElement *el = uhash_nextElement(alternates, &pos);
294         if (el == NULL) {
295             break;
296         }
297         ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
298         sorted.addElement(ss, status);
299     }
300     sorted.sort(uhash_compareScriptSet, status);
301     UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
302     for (int32_t i=0; i<sorted.size(); i++) {
303         if (i>0) {
304             dest.append(separator);
305         }
306         ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
307         ss->displayScripts(dest);
308     }
309     return dest;
310 }
311 
312 U_NAMESPACE_END
313 
314