1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************
8 *
9 * @author Mark E. Davis
10 * @author Vladimir Weinstein
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_NORMALIZATION
16
17 #include "intltest.h"
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "canittst.h"
21 #include "unicode/caniter.h"
22 #include "unicode/normlzr.h"
23 #include "unicode/uchar.h"
24 #include "hash.h"
25
/*
 * Expands to one `case` label of the runIndexedTest() switch: it stores the
 * stringified test name in `name` and, when `exec` is true, logs a banner
 * line plus an empty line and then calls the test method.
 * The expansion deliberately ends without a semicolon; the caller supplies it.
 */
#define CASE(id,test)                   \
    case id:                            \
        name = #test;                   \
        if (exec) {                     \
            logln(#test "---");         \
            logln(UnicodeString(""));   \
            test();                     \
        }                               \
        break
34
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)35 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
36 const char* &name, char* /*par*/) {
37 switch (index) {
38 CASE(0, TestBasic);
39 CASE(1, TestExhaustive);
40 CASE(2, TestAPI);
41 default: name = ""; break;
42 }
43 }
44
45 /**
46 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
47 static UnicodeString str(const char *input)
48 {
49 UnicodeString str(input, ""); // Invariant conversion
50 return str.unescape();
51 }
52 */
53
54
CanonicalIteratorTest()55 CanonicalIteratorTest::CanonicalIteratorTest() :
56 nameTrans(NULL), hexTrans(NULL)
57 {
58 }
59
~CanonicalIteratorTest()60 CanonicalIteratorTest::~CanonicalIteratorTest()
61 {
62 #if !UCONFIG_NO_TRANSLITERATION
63 if(nameTrans != NULL) {
64 delete(nameTrans);
65 }
66 if(hexTrans != NULL) {
67 delete(hexTrans);
68 }
69 #endif
70 }
71
TestExhaustive()72 void CanonicalIteratorTest::TestExhaustive() {
73 UErrorCode status = U_ZERO_ERROR;
74 CanonicalIterator it("", status);
75 if (U_FAILURE(status)) {
76 dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
77 return;
78 }
79 UChar32 i = 0;
80 UnicodeString s;
81 // Test static and dynamic class IDs
82 if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
83 errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
84 }
85 for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
86 //for (i = 0xae00; i < 0xaf00; ++i) {
87
88 if ((i % 0x100) == 0) {
89 logln("Testing U+%06X", i);
90 }
91
92 // skip characters we know don't have decomps
93 int8_t type = u_charType(i);
94 if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
95 || type == U_SURROGATE) continue;
96
97 s = i;
98 characterTest(s, i, it);
99
100 s += (UChar32)0x0345; //"\\u0345";
101 characterTest(s, i, it);
102 }
103 }
104
TestBasic()105 void CanonicalIteratorTest::TestBasic() {
106
107 UErrorCode status = U_ZERO_ERROR;
108
109 static const char * const testArray[][2] = {
110 {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
111 "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
112 "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
113 "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
114 {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
115 {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
116 };
117
118 #if 0
119 // This is not interesting for C/C++ as the data is already built beforehand
120 // check build
121 UnicodeSet ss = CanonicalIterator.getSafeStart();
122 logln("Safe Start: " + ss.toPattern(true));
123 ss = CanonicalIterator.getStarts('a');
124 expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
125 new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
126 + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
127 );
128 #endif
129
130 // check permute
131 // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
132
133 Hashtable *permutations = new Hashtable(FALSE, status);
134 permutations->setValueDeleter(uprv_deleteUObject);
135 UnicodeString toPermute("ABC");
136
137 CanonicalIterator::permute(toPermute, FALSE, permutations, status);
138
139 logln("testing permutation");
140
141 expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
142
143 delete permutations;
144
145 // try samples
146 logln("testing samples");
147 Hashtable *set = new Hashtable(FALSE, status);
148 set->setValueDeleter(uprv_deleteUObject);
149 int32_t i = 0;
150 CanonicalIterator it("", status);
151 if(U_SUCCESS(status)) {
152 for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) {
153 //logln("Results for: " + name.transliterate(testArray[i]));
154 UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
155 it.setSource(testStr, status);
156 set->removeAll();
157 for (;;) {
158 //UnicodeString *result = new UnicodeString(it.next());
159 UnicodeString result(it.next());
160 if (result.isBogus()) {
161 break;
162 }
163 set->put(result, new UnicodeString(result), status); // Add result to the table
164 //logln(++counter + ": " + hex.transliterate(result));
165 //logln(" = " + name.transliterate(result));
166 }
167 expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
168
169 }
170 } else {
171 dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
172 }
173 delete set;
174 }
175
characterTest(UnicodeString & s,UChar32 ch,CanonicalIterator & it)176 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it)
177 {
178 UErrorCode status = U_ZERO_ERROR;
179 UnicodeString decomp, comp;
180 UBool gotDecomp = FALSE;
181 UBool gotComp = FALSE;
182 UBool gotSource = FALSE;
183
184 Normalizer::decompose(s, FALSE, 0, decomp, status);
185 Normalizer::compose(s, FALSE, 0, comp, status);
186
187 // skip characters that don't have either decomp.
188 // need quick test for this!
189 if (s == decomp && s == comp) {
190 return;
191 }
192
193 it.setSource(s, status);
194
195 for (;;) {
196 UnicodeString item = it.next();
197 if (item.isBogus()) break;
198 if (item == s) gotSource = TRUE;
199 if (item == decomp) gotDecomp = TRUE;
200 if (item == comp) gotComp = TRUE;
201 }
202
203 if (!gotSource || !gotDecomp || !gotComp) {
204 errln("FAIL CanonicalIterator: " + s + (int)ch);
205 }
206 }
207
expectEqual(const UnicodeString & message,const UnicodeString & item,const UnicodeString & a,const UnicodeString & b)208 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
209 if (!(a==b)) {
210 errln("FAIL: " + message + getReadable(item));
211 errln("\t" + getReadable(a));
212 errln("\t" + getReadable(b));
213 } else {
214 logln("Checked: " + message + getReadable(item));
215 logln("\t" + getReadable(a));
216 logln("\t" + getReadable(b));
217 }
218 }
219
/**
 * Returns a bracketed, printable rendering of `s` for log output.
 *
 * When transliteration is available, the hex transliterator is applied to
 * the text; in verbose mode the name-transliterated form is emitted first,
 * followed by ";". The transliterators are created on first use and cached
 * in nameTrans / hexTrans. If a transliterator cannot be created (e.g.
 * missing data) the corresponding text is emitted untransliterated.
 *
 * @param s string to render
 * @return "" for an empty input, otherwise "[" ... "]"
 */
UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
    UnicodeString result = "[";
    if (s.length() == 0) return "";
    // set up for readable display
#if !UCONFIG_NO_TRANSLITERATION
    if(verbose) {
        if (nameTrans == NULL) {
            UErrorCode nameStatus = U_ZERO_ERROR;
            nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, nameStatus);
        }
        UnicodeString sName = s;
        if (nameTrans != NULL) { // maybe there is no data and transliterator cannot be instantiated
            nameTrans->transliterate(sName);
        }
        result += sName;
        result += ";";
    }
    if (hexTrans == NULL) {
        // Use a fresh status so that a failure to create the name
        // transliterator does not prevent creating the hex one.
        UErrorCode hexStatus = U_ZERO_ERROR;
        hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, hexStatus);
    }
#endif
    UnicodeString sHex = s;
#if !UCONFIG_NO_TRANSLITERATION
    if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated
        hexTrans->transliterate(sHex);
    }
#endif
    result += sHex;
    result += "]";
    return result;
}
248
249 U_CFUNC int U_CALLCONV
compareUnicodeStrings(const void * s1,const void * s2)250 compareUnicodeStrings(const void *s1, const void *s2) {
251 UnicodeString **st1 = (UnicodeString **)s1;
252 UnicodeString **st2 = (UnicodeString **)s2;
253
254 return (*st1)->compare(**st2);
255 }
256
257
collectionToString(Hashtable * col)258 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
259 UnicodeString result;
260
261 // Iterate over the Hashtable, then qsort.
262
263 UnicodeString **resArray = new UnicodeString*[col->count()];
264 int32_t i = 0;
265
266 const UHashElement *ne = NULL;
267 int32_t el = UHASH_FIRST;
268 //Iterator it = basic.iterator();
269 ne = col->nextElement(el);
270 //while (it.hasNext())
271 while (ne != NULL) {
272 //String item = (String) it.next();
273 UnicodeString *item = (UnicodeString *)(ne->value.pointer);
274 resArray[i++] = item;
275 ne = col->nextElement(el);
276 }
277
278 for(i = 0; i<col->count(); ++i) {
279 logln(*resArray[i]);
280 }
281
282 qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
283
284 result = *resArray[0];
285
286 for(i = 1; i<col->count(); ++i) {
287 result += ", ";
288 result += *resArray[i];
289 }
290
291 /*
292 Iterator it = col.iterator();
293 while (it.hasNext()) {
294 if (result.length() != 0) result.append(", ");
295 result.append(it.next().toString());
296 }
297 */
298
299 delete [] resArray;
300
301 return result;
302 }
303
TestAPI()304 void CanonicalIteratorTest::TestAPI() {
305 UErrorCode status = U_ZERO_ERROR;
306 // Test reset and getSource
307 UnicodeString start("ljubav");
308 logln("Testing CanonicalIterator::getSource");
309 logln("Instantiating canonical iterator with string "+start);
310 CanonicalIterator can(start, status);
311 if (U_FAILURE(status)) {
312 dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
313 return;
314 }
315 UnicodeString source = can.getSource();
316 logln("CanonicalIterator::getSource returned "+source);
317 if(start != source) {
318 errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
319 }
320 logln("Testing CanonicalIterator::reset");
321 UnicodeString next = can.next();
322 logln("CanonicalIterator::next returned "+next);
323
324 can.reset();
325
326 UnicodeString afterReset = can.next();
327 logln("After reset, CanonicalIterator::next returned "+afterReset);
328
329 if(next != afterReset) {
330 errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
331 }
332
333 logln("Testing getStaticClassID and getDynamicClassID");
334 if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
335 errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
336 }
337 }
338
339 #endif /* #if !UCONFIG_NO_NORMALIZATION */
340