• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 2002-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************
8  *
9  * @author Mark E. Davis
10  * @author Vladimir Weinstein
11  */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_NORMALIZATION
16 
17 #include "intltest.h"
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "canittst.h"
21 #include "unicode/caniter.h"
22 #include "unicode/normlzr.h"
23 #include "unicode/uchar.h"
24 #include "hash.h"
25 
/**
 * Expands to one `case` label of the runIndexedTest() switch.
 *
 * Stores the stringified method name in `name`; when `exec` is set it also
 * logs "<method>---" followed by an empty line and then calls the method.
 * Relies on `name`, `exec` and logln() being visible at the expansion site.
 */
#define CASE(caseIndex, method)          \
    case caseIndex:                      \
        name = #method;                  \
        if (exec) {                      \
            logln(#method "---");        \
            logln(UnicodeString());      \
            method();                    \
        }                                \
        break
34 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)35 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
36                                          const char* &name, char* /*par*/) {
37     switch (index) {
38         CASE(0, TestBasic);
39         CASE(1, TestExhaustive);
40         CASE(2, TestAPI);
41       default: name = ""; break;
42     }
43 }
44 
/*
 * Disabled helper, kept for reference only: converts Java-style strings with
 * \u Unicode escapes into UnicodeString objects.
 *
 * static UnicodeString str(const char *input)
 * {
 *     UnicodeString str(input, ""); // Invariant conversion
 *     return str.unescape();
 * }
 */
53 
54 
CanonicalIteratorTest()55 CanonicalIteratorTest::CanonicalIteratorTest() :
56 nameTrans(nullptr), hexTrans(nullptr)
57 {
58 }
59 
~CanonicalIteratorTest()60 CanonicalIteratorTest::~CanonicalIteratorTest()
61 {
62 #if !UCONFIG_NO_TRANSLITERATION
63     delete(nameTrans);
64     delete(hexTrans);
65 #endif
66 }
67 
TestExhaustive()68 void CanonicalIteratorTest::TestExhaustive() {
69     UErrorCode status = U_ZERO_ERROR;
70     CanonicalIterator it("", status);
71     if (U_FAILURE(status)) {
72         dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
73         return;
74     }
75     UChar32 i = 0;
76     UnicodeString s;
77     // Test static and dynamic class IDs
78     if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
79         errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
80     }
81     for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
82         //for (i = 0xae00; i < 0xaf00; ++i) {
83 
84         if ((i % 0x100) == 0) {
85             logln("Testing U+%06X", i);
86         }
87 
88         // skip characters we know don't have decomps
89         int8_t type = u_charType(i);
90         if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
91             || type == U_SURROGATE) continue;
92 
93         s = i;
94         characterTest(s, i, it);
95 
96         s += static_cast<UChar32>(0x0345); //"\\u0345";
97         characterTest(s, i, it);
98     }
99 }
100 
TestBasic()101 void CanonicalIteratorTest::TestBasic() {
102 
103     UErrorCode status = U_ZERO_ERROR;
104 
105     static const char * const testArray[][2] = {
106         {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
107             "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
108             "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
109             "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
110         {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
111         {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
112     };
113 
114 #if 0
115     // This is not interesting for C/C++ as the data is already built beforehand
116     // check build
117     UnicodeSet ss = CanonicalIterator.getSafeStart();
118     logln("Safe Start: " + ss.toPattern(true));
119     ss = CanonicalIterator.getStarts('a');
120     expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
121         new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
122         + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
123             );
124 #endif
125 
126     // check permute
127     // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
128 
129     Hashtable *permutations = new Hashtable(false, status);
130     permutations->setValueDeleter(uprv_deleteUObject);
131     UnicodeString toPermute("ABC");
132 
133     CanonicalIterator::permute(toPermute, false, permutations, status);
134 
135     logln("testing permutation");
136 
137     expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
138 
139     delete permutations;
140 
141     // try samples
142     logln("testing samples");
143     Hashtable *set = new Hashtable(false, status);
144     set->setValueDeleter(uprv_deleteUObject);
145     int32_t i = 0;
146     CanonicalIterator it("", status);
147     if(U_SUCCESS(status)) {
148       for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) {
149           //logln("Results for: " + name.transliterate(testArray[i]));
150           UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
151           it.setSource(testStr, status);
152           set->removeAll();
153           for (;;) {
154               //UnicodeString *result = new UnicodeString(it.next());
155               UnicodeString result(it.next());
156               if (result.isBogus()) {
157                   break;
158               }
159               set->put(result, new UnicodeString(result), status); // Add result to the table
160               //logln(++counter + ": " + hex.transliterate(result));
161               //logln(" = " + name.transliterate(result));
162           }
163           expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
164 
165       }
166     } else {
167       dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
168     }
169     delete set;
170 }
171 
characterTest(UnicodeString & s,UChar32 ch,CanonicalIterator & it)172 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it)
173 {
174     UErrorCode status = U_ZERO_ERROR;
175     UnicodeString decomp, comp;
176     UBool gotDecomp = false;
177     UBool gotComp = false;
178     UBool gotSource = false;
179 
180     Normalizer::decompose(s, false, 0, decomp, status);
181     Normalizer::compose(s, false, 0, comp, status);
182 
183     // skip characters that don't have either decomp.
184     // need quick test for this!
185     if (s == decomp && s == comp) {
186         return;
187     }
188 
189     it.setSource(s, status);
190 
191     for (;;) {
192         UnicodeString item = it.next();
193         if (item.isBogus()) break;
194         if (item == s) gotSource = true;
195         if (item == decomp) gotDecomp = true;
196         if (item == comp) gotComp = true;
197     }
198 
199     if (!gotSource || !gotDecomp || !gotComp) {
200         errln("FAIL CanonicalIterator: " + s + static_cast<int>(ch));
201     }
202 }
203 
expectEqual(const UnicodeString & message,const UnicodeString & item,const UnicodeString & a,const UnicodeString & b)204 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
205     if (!(a==b)) {
206         errln("FAIL: " + message + getReadable(item));
207         errln("\t" + getReadable(a));
208         errln("\t" + getReadable(b));
209     } else {
210         logln("Checked: " + message + getReadable(item));
211         logln("\t" + getReadable(a));
212         logln("\t" + getReadable(b));
213     }
214 }
215 
/**
 * Builds a bracketed, log-friendly rendering of `s`.
 *
 * An empty input yields an empty string. Otherwise the result is
 * "[" + hex form + "]", and in verbose mode the name form followed by ";" is
 * inserted before the hex form. The two transliterators are created on first
 * use and cached in nameTrans / hexTrans. If one cannot be created (for
 * instance because its data is missing) that part degrades gracefully: the
 * name form is left out, and the hex form falls back to the raw string.
 * When transliteration is compiled out the raw string is used.
 */
UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
    if (s.length() == 0) return "";

    UnicodeString result = "[";
    // set up for readable display
#if !UCONFIG_NO_TRANSLITERATION
    if (verbose) {
        if (nameTrans == nullptr) {
            // Separate error code so a failure here cannot block the hex
            // transliterator below.
            UErrorCode nameStatus = U_ZERO_ERROR;
            nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, nameStatus);
        }
        if (nameTrans != nullptr) { // maybe there is no data and transliterator cannot be instantiated
            UnicodeString sName = s;
            nameTrans->transliterate(sName);
            result += sName;
            result += ";";
        }
    }
    if (hexTrans == nullptr) {
        UErrorCode hexStatus = U_ZERO_ERROR;
        hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, hexStatus);
    }
#endif
    UnicodeString sHex = s;
#if !UCONFIG_NO_TRANSLITERATION
    if (hexTrans != nullptr) { // maybe there is no data and transliterator cannot be instantiated
        hexTrans->transliterate(sHex);
    }
#endif
    result += sHex;
    result += "]";
    return result;
}
244 
245 U_CFUNC int U_CALLCONV
compareUnicodeStrings(const void * s1,const void * s2)246 compareUnicodeStrings(const void *s1, const void *s2) {
247   UnicodeString **st1 = static_cast<UnicodeString **>(const_cast<void*>(s1));
248   UnicodeString **st2 = static_cast<UnicodeString **>(const_cast<void*>(s2));
249 
250   return (*st1)->compare(**st2);
251 }
252 
253 
collectionToString(Hashtable * col)254 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
255     UnicodeString result;
256 
257     // Iterate over the Hashtable, then qsort.
258 
259     UnicodeString **resArray = new UnicodeString*[col->count()];
260     int32_t i = 0;
261 
262     const UHashElement *ne = nullptr;
263     int32_t el = UHASH_FIRST;
264     //Iterator it = basic.iterator();
265     ne = col->nextElement(el);
266     //while (it.hasNext())
267     while (ne != nullptr) {
268       //String item = (String) it.next();
269       UnicodeString *item = static_cast<UnicodeString *>(ne->value.pointer);
270       resArray[i++] = item;
271       ne = col->nextElement(el);
272     }
273 
274     for(i = 0; i<col->count(); ++i) {
275       logln(*resArray[i]);
276     }
277 
278     qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
279 
280     result = *resArray[0];
281 
282     for(i = 1; i<col->count(); ++i) {
283       result += ", ";
284       result += *resArray[i];
285     }
286 
287 /*
288     Iterator it = col.iterator();
289     while (it.hasNext()) {
290         if (result.length() != 0) result.append(", ");
291         result.append(it.next().toString());
292     }
293 */
294 
295     delete [] resArray;
296 
297     return result;
298 }
299 
TestAPI()300 void CanonicalIteratorTest::TestAPI() {
301   UErrorCode status = U_ZERO_ERROR;
302   // Test reset and getSource
303   UnicodeString start("ljubav");
304   logln("Testing CanonicalIterator::getSource");
305   logln("Instantiating canonical iterator with string "+start);
306   CanonicalIterator can(start, status);
307   if (U_FAILURE(status)) {
308       dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
309       return;
310   }
311   UnicodeString source = can.getSource();
312   logln("CanonicalIterator::getSource returned "+source);
313   if(start != source) {
314     errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
315   }
316   logln("Testing CanonicalIterator::reset");
317   UnicodeString next = can.next();
318   logln("CanonicalIterator::next returned "+next);
319 
320   can.reset();
321 
322   UnicodeString afterReset = can.next();
323   logln("After reset, CanonicalIterator::next returned "+afterReset);
324 
325   if(next != afterReset) {
326     errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
327   }
328 
329   logln("Testing getStaticClassID and getDynamicClassID");
330   if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
331       errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
332   }
333 }
334 
335 #endif /* #if !UCONFIG_NO_NORMALIZATION */
336