1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************
8 *
9 * @author Mark E. Davis
10 * @author Vladimir Weinstein
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_NORMALIZATION
16
17 #include "intltest.h"
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "canittst.h"
21 #include "unicode/caniter.h"
22 #include "unicode/normlzr.h"
23 #include "unicode/uchar.h"
24 #include "hash.h"
25
// Expands to one `case` label inside runIndexedTest(): it always stores the
// stringized test name in `name`, and only when `exec` is set does it log a
// banner plus an empty line and then invoke the test method.
// The expansion ends in `break` without a semicolon; the call site adds it.
#define CASE(id, test)                  \
    case id: {                          \
        name = #test;                   \
        if (exec) {                     \
            logln(#test "---");         \
            logln(UnicodeString(""));   \
            test();                     \
        }                               \
    }                                   \
    break
34
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)35 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
36 const char* &name, char* /*par*/) {
37 switch (index) {
38 CASE(0, TestBasic);
39 CASE(1, TestExhaustive);
40 CASE(2, TestAPI);
41 default: name = ""; break;
42 }
43 }
44
45 /**
46 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
47 static UnicodeString str(const char *input)
48 {
49 UnicodeString str(input, ""); // Invariant conversion
50 return str.unescape();
51 }
52 */
53
54
CanonicalIteratorTest()55 CanonicalIteratorTest::CanonicalIteratorTest() :
56 nameTrans(nullptr), hexTrans(nullptr)
57 {
58 }
59
~CanonicalIteratorTest()60 CanonicalIteratorTest::~CanonicalIteratorTest()
61 {
62 #if !UCONFIG_NO_TRANSLITERATION
63 delete(nameTrans);
64 delete(hexTrans);
65 #endif
66 }
67
TestExhaustive()68 void CanonicalIteratorTest::TestExhaustive() {
69 UErrorCode status = U_ZERO_ERROR;
70 CanonicalIterator it("", status);
71 if (U_FAILURE(status)) {
72 dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
73 return;
74 }
75 UChar32 i = 0;
76 UnicodeString s;
77 // Test static and dynamic class IDs
78 if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
79 errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
80 }
81 for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
82 //for (i = 0xae00; i < 0xaf00; ++i) {
83
84 if ((i % 0x100) == 0) {
85 logln("Testing U+%06X", i);
86 }
87
88 // skip characters we know don't have decomps
89 int8_t type = u_charType(i);
90 if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
91 || type == U_SURROGATE) continue;
92
93 s = i;
94 characterTest(s, i, it);
95
96 s += static_cast<UChar32>(0x0345); //"\\u0345";
97 characterTest(s, i, it);
98 }
99 }
100
TestBasic()101 void CanonicalIteratorTest::TestBasic() {
102
103 UErrorCode status = U_ZERO_ERROR;
104
105 static const char * const testArray[][2] = {
106 {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
107 "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
108 "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
109 "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
110 {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
111 {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
112 };
113
114 #if 0
115 // This is not interesting for C/C++ as the data is already built beforehand
116 // check build
117 UnicodeSet ss = CanonicalIterator.getSafeStart();
118 logln("Safe Start: " + ss.toPattern(true));
119 ss = CanonicalIterator.getStarts('a');
120 expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
121 new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
122 + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
123 );
124 #endif
125
126 // check permute
127 // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
128
129 Hashtable *permutations = new Hashtable(false, status);
130 permutations->setValueDeleter(uprv_deleteUObject);
131 UnicodeString toPermute("ABC");
132
133 CanonicalIterator::permute(toPermute, false, permutations, status);
134
135 logln("testing permutation");
136
137 expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
138
139 delete permutations;
140
141 // try samples
142 logln("testing samples");
143 Hashtable *set = new Hashtable(false, status);
144 set->setValueDeleter(uprv_deleteUObject);
145 int32_t i = 0;
146 CanonicalIterator it("", status);
147 if(U_SUCCESS(status)) {
148 for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) {
149 //logln("Results for: " + name.transliterate(testArray[i]));
150 UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
151 it.setSource(testStr, status);
152 set->removeAll();
153 for (;;) {
154 //UnicodeString *result = new UnicodeString(it.next());
155 UnicodeString result(it.next());
156 if (result.isBogus()) {
157 break;
158 }
159 set->put(result, new UnicodeString(result), status); // Add result to the table
160 //logln(++counter + ": " + hex.transliterate(result));
161 //logln(" = " + name.transliterate(result));
162 }
163 expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
164
165 }
166 } else {
167 dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
168 }
169 delete set;
170 }
171
characterTest(UnicodeString & s,UChar32 ch,CanonicalIterator & it)172 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it)
173 {
174 UErrorCode status = U_ZERO_ERROR;
175 UnicodeString decomp, comp;
176 UBool gotDecomp = false;
177 UBool gotComp = false;
178 UBool gotSource = false;
179
180 Normalizer::decompose(s, false, 0, decomp, status);
181 Normalizer::compose(s, false, 0, comp, status);
182
183 // skip characters that don't have either decomp.
184 // need quick test for this!
185 if (s == decomp && s == comp) {
186 return;
187 }
188
189 it.setSource(s, status);
190
191 for (;;) {
192 UnicodeString item = it.next();
193 if (item.isBogus()) break;
194 if (item == s) gotSource = true;
195 if (item == decomp) gotDecomp = true;
196 if (item == comp) gotComp = true;
197 }
198
199 if (!gotSource || !gotDecomp || !gotComp) {
200 errln("FAIL CanonicalIterator: " + s + static_cast<int>(ch));
201 }
202 }
203
expectEqual(const UnicodeString & message,const UnicodeString & item,const UnicodeString & a,const UnicodeString & b)204 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
205 if (!(a==b)) {
206 errln("FAIL: " + message + getReadable(item));
207 errln("\t" + getReadable(a));
208 errln("\t" + getReadable(b));
209 } else {
210 logln("Checked: " + message + getReadable(item));
211 logln("\t" + getReadable(a));
212 logln("\t" + getReadable(b));
213 }
214 }
215
/**
 * Produces a bracketed, printable rendering of `s` for test output.
 *
 * Returns "" for an empty string. Otherwise the result is "[" + body + "]",
 * where the body is the hex-transliterated string, preceded in verbose mode
 * by the name-transliterated string and ";". Transliterators are created on
 * first use and cached in nameTrans / hexTrans. If one cannot be created
 * (e.g. missing data), its part degrades gracefully: the name part is
 * omitted and the hex part falls back to the untransliterated string.
 */
UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
    if (s.length() == 0) return "";
    UnicodeString result = "[";
    // set up for readable display
#if !UCONFIG_NO_TRANSLITERATION
    if (verbose) {
        if (nameTrans == nullptr) {
            // Separate status so that a failure here cannot block the hex
            // transliterator below.
            UErrorCode nameStatus = U_ZERO_ERROR;
            nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, nameStatus);
        }
        if (nameTrans != nullptr) { // creation may fail; never dereference null
            UnicodeString sName = s;
            nameTrans->transliterate(sName);
            result += sName;
            result += ";";
        }
    }
    if (hexTrans == nullptr) {
        UErrorCode hexStatus = U_ZERO_ERROR;
        hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, hexStatus);
    }
#endif
    UnicodeString sHex = s;
#if !UCONFIG_NO_TRANSLITERATION
    if (hexTrans != nullptr) { // maybe there is no data and transliterator cannot be instantiated
        hexTrans->transliterate(sHex);
    }
#endif
    result += sHex;
    result += "]";
    return result;
}
244
245 U_CFUNC int U_CALLCONV
compareUnicodeStrings(const void * s1,const void * s2)246 compareUnicodeStrings(const void *s1, const void *s2) {
247 UnicodeString **st1 = static_cast<UnicodeString **>(const_cast<void*>(s1));
248 UnicodeString **st2 = static_cast<UnicodeString **>(const_cast<void*>(s2));
249
250 return (*st1)->compare(**st2);
251 }
252
253
collectionToString(Hashtable * col)254 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
255 UnicodeString result;
256
257 // Iterate over the Hashtable, then qsort.
258
259 UnicodeString **resArray = new UnicodeString*[col->count()];
260 int32_t i = 0;
261
262 const UHashElement *ne = nullptr;
263 int32_t el = UHASH_FIRST;
264 //Iterator it = basic.iterator();
265 ne = col->nextElement(el);
266 //while (it.hasNext())
267 while (ne != nullptr) {
268 //String item = (String) it.next();
269 UnicodeString *item = static_cast<UnicodeString *>(ne->value.pointer);
270 resArray[i++] = item;
271 ne = col->nextElement(el);
272 }
273
274 for(i = 0; i<col->count(); ++i) {
275 logln(*resArray[i]);
276 }
277
278 qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
279
280 result = *resArray[0];
281
282 for(i = 1; i<col->count(); ++i) {
283 result += ", ";
284 result += *resArray[i];
285 }
286
287 /*
288 Iterator it = col.iterator();
289 while (it.hasNext()) {
290 if (result.length() != 0) result.append(", ");
291 result.append(it.next().toString());
292 }
293 */
294
295 delete [] resArray;
296
297 return result;
298 }
299
TestAPI()300 void CanonicalIteratorTest::TestAPI() {
301 UErrorCode status = U_ZERO_ERROR;
302 // Test reset and getSource
303 UnicodeString start("ljubav");
304 logln("Testing CanonicalIterator::getSource");
305 logln("Instantiating canonical iterator with string "+start);
306 CanonicalIterator can(start, status);
307 if (U_FAILURE(status)) {
308 dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
309 return;
310 }
311 UnicodeString source = can.getSource();
312 logln("CanonicalIterator::getSource returned "+source);
313 if(start != source) {
314 errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
315 }
316 logln("Testing CanonicalIterator::reset");
317 UnicodeString next = can.next();
318 logln("CanonicalIterator::next returned "+next);
319
320 can.reset();
321
322 UnicodeString afterReset = can.next();
323 logln("After reset, CanonicalIterator::next returned "+afterReset);
324
325 if(next != afterReset) {
326 errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
327 }
328
329 logln("Testing getStaticClassID and getDynamicClassID");
330 if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
331 errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
332 }
333 }
334
335 #endif /* #if !UCONFIG_NO_NORMALIZATION */
336