1 /*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "PhoneticStringUtils.h"
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include <utils/String8.h>
24
25 using namespace android;
26
// Minimal self-contained test harness. DoAllTests() runs every test method
// once through DoOneTest() and keeps a tally of how many tests ran and how
// many of them finished without reporting a failure.
class TestExecutor {
 public:
  TestExecutor()
      : m_total_count(0),
        m_success_count(0),
        m_success(true) {}

  // Runs all tests, prints a summary and returns true when every test passed.
  bool DoAllTests();

 private:
  // Runs one test method and updates the counters from m_success.
  void DoOneTest(void (TestExecutor::*test)());

  // Individual test cases.
  // Note: when adding a test, do not forget to register it in DoAllTests().
  void testUtf32At();
  void testGetPhoneticallySortableCodePointAscii();
  void testGetPhoneticallySortableCodePointKana();
  void testGetPhoneticallySortableCodePointWhitespaceOnly();
  void testGetPhoneticallySortableCodePointSimpleCompare();
  void testGetUtf8FromUtf32();
  void testGetPhoneticallySortableString();
  void testGetNormalizedString();
  void testLongString();

  int m_total_count;    // Number of tests executed so far.
  int m_success_count;  // Number of executed tests that left m_success true.

  // Outcome of the test that is currently running. DoOneTest() sets it to
  // true before each test; the checking macros set it to false on failure.
  bool m_success;
};
51
// Compares |input| (the actual result) with |expected|. On mismatch it prints
// both values in hexadecimal, sets m_success to false and returns from the
// enclosing function, so it can only be used inside a function returning void
// that has m_success in scope.
// Written as do/while(0) so that it is standard C++ and behaves like a single
// statement. Both arguments are evaluated more than once: do not pass
// expressions with side effects.
#define ASSERT_EQ_VALUE(input, expected)                  \
  do {                                                    \
    if ((expected) != (input)) {                          \
      printf("0x%X(result) != 0x%X(expected)\n",          \
             static_cast<unsigned int>(input),            \
             static_cast<unsigned int>(expected));        \
      m_success = false;                                  \
      return;                                             \
    }                                                     \
  } while (0)
60
// Compares |input| (the actual result) with |expected|. On mismatch it prints
// both values in hexadecimal and sets m_success to false, but lets the
// enclosing function continue. Requires m_success to be in scope.
// Written as do/while(0) so that it is standard C++ and behaves like a single
// statement. Both arguments are evaluated more than once: do not pass
// expressions with side effects.
#define EXPECT_EQ_VALUE(input, expected)                  \
  do {                                                    \
    if ((expected) != (input)) {                          \
      printf("0x%X(result) != 0x%X(expected)\n",          \
             static_cast<unsigned int>(input),            \
             static_cast<unsigned int>(expected));        \
      m_success = false;                                  \
    }                                                     \
  } while (0)
68
69
DoAllTests()70 bool TestExecutor::DoAllTests() {
71 DoOneTest(&TestExecutor::testUtf32At);
72 DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii);
73 DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana);
74 DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly);
75 DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare);
76 DoOneTest(&TestExecutor::testGetUtf8FromUtf32);
77 DoOneTest(&TestExecutor::testGetPhoneticallySortableString);
78 DoOneTest(&TestExecutor::testGetNormalizedString);
79 DoOneTest(&TestExecutor::testLongString);
80
81 printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
82 m_total_count, m_success_count, m_total_count - m_success_count);
83
84 bool success = m_total_count == m_success_count;
85 printf("\n%s\n", success ? "Success" : "Failure");
86
87 return success;
88 }
89
DoOneTest(void (TestExecutor::* test)())90 void TestExecutor::DoOneTest(void (TestExecutor::*test)()) {
91 m_success = true;
92
93 (this->*test)();
94
95 ++m_total_count;
96 m_success_count += m_success ? 1 : 0;
97 }
98
// Calls utf32_at() on the C string |src| at byte offset |index| and verifies
// the result:
//  - a negative return value fails the test,
//  - the value stored through the "next" out-parameter must equal
//    |expected_next|,
//  - the returned code point must equal |expected_value|.
// Every mismatch sets m_success to false, including a wrong "next" offset.
// Requires m_success and EXPECT_EQ_VALUE to be available at the point of use.
#define TEST_GET_UTF32AT(src, index, expected_next, expected_value)            \
  do {                                                                         \
    size_t next;                                                               \
    int32_t ret = utf32_at((src), strlen(src), (index), &next);                \
    if (ret < 0) {                                                             \
      printf("getUtf32At() returned negative value (src: %s, index: %d)\n",    \
             (src), static_cast<int>(index));                                  \
      m_success = false;                                                       \
    } else if (next != static_cast<size_t>(expected_next)) {                   \
      printf("next is unexpected value (src: %s, actual: %zu, expected: %zu)\n", \
             (src), next, static_cast<size_t>(expected_next));                 \
      /* A wrong offset must fail the test, not merely be logged. */           \
      m_success = false;                                                       \
    } else {                                                                   \
      EXPECT_EQ_VALUE(ret, (expected_value));                                  \
    }                                                                          \
  } while (0)
114
testUtf32At()115 void TestExecutor::testUtf32At() {
116 printf("testUtf32At()\n");
117
118 TEST_GET_UTF32AT("a", 0, 1, 97);
119 // Japanese hiragana "a"
120 TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042);
121 // Japanese fullwidth katakana "a" with ascii a
122 TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2);
123
124 // 2 PUA
125 TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000);
126 TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008);
127 }
128
testGetPhoneticallySortableCodePointAscii()129 void TestExecutor::testGetPhoneticallySortableCodePointAscii() {
130 printf("testGetPhoneticallySortableCodePoint()\n");
131 int halfwidth[94];
132 int fullwidth[94];
133 int i;
134 char32_t codepoint;
135 bool next_is_consumed;
136 for (i = 0, codepoint = 0x0021; codepoint <= 0x007E; ++i, ++codepoint) {
137 halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
138 &next_is_consumed);
139 if (halfwidth[i] < 0) {
140 printf("returned value become negative at 0x%04X", codepoint);
141 m_success = false;
142 return;
143 }
144 if (next_is_consumed) {
145 printf("next_is_consumed become true at 0x%04X", codepoint);
146 m_success = false;
147 return;
148 }
149 }
150 for (i = 0, codepoint = 0xFF01; codepoint <= 0xFF5E; ++i, ++codepoint) {
151 fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
152 &next_is_consumed);
153 if (fullwidth[i] < 0) {
154 printf("returned value become negative at 0x%04X", codepoint);
155 m_success = false;
156 return;
157 }
158 if (next_is_consumed) {
159 printf("next_is_consumed become true at 0x%04X", codepoint);
160 m_success = false;
161 return;
162 }
163 }
164
165 for (i = 0; i < 94; i++) {
166 EXPECT_EQ_VALUE(halfwidth[i], fullwidth[i]);
167 }
168 }
169
testGetPhoneticallySortableCodePointKana()170 void TestExecutor::testGetPhoneticallySortableCodePointKana() {
171 printf("testGetPhoneticallySortableCodePointKana()\n");
172 int hiragana[86];
173 int fullwidth_katakana[86];
174 int i;
175 char32_t codepoint;
176 bool next_is_consumed;
177
178 for (i = 0, codepoint = 0x3041; codepoint <= 0x3096; ++i, ++codepoint) {
179 hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
180 &next_is_consumed);
181 if (hiragana[i] < 0) {
182 printf("returned value become negative at 0x%04X", codepoint);
183 m_success = false;
184 return;
185 }
186 if (next_is_consumed) {
187 printf("next_is_consumed become true at 0x%04X", codepoint);
188 m_success = false;
189 return;
190 }
191 }
192
193 for (i = 0, codepoint = 0x30A1; codepoint <= 0x30F6; ++i, ++codepoint) {
194 fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
195 &next_is_consumed);
196 if (fullwidth_katakana[i] < 0) {
197 printf("returned value become negative at 0x%04X", codepoint);
198 m_success = false;
199 return;
200 }
201 if (next_is_consumed) {
202 printf("next_is_consumed become true at 0x%04X", codepoint);
203 m_success = false;
204 return;
205 }
206 }
207
208 // hankaku-katakana space do not have some characters corresponding to
209 // zenkaku-hiragana (e.g. xwa, xka, xku). To make test easier, insert
210 // zenkaku-katakana version of them into this array (See the value 0x30??).
211 char32_t halfwidth_katakana[] = {
212 0xFF67, 0xFF71, 0xFF68, 0xFF72, 0xFF69, 0xFF73, 0xFF6A, 0xFF74, 0xFF6B,
213 0xFF75, 0xFF76, 0xFF76, 0xFF9E, 0xFF77, 0xFF77, 0xFF9E, 0xFF78, 0xFF78,
214 0xFF9E, 0xFF79, 0xFF79, 0xFF9E, 0xFF7A, 0xFF7A, 0xFF9E, 0xFF7B, 0xFF7B,
215 0xFF9E, 0xFF7C, 0xFF7C, 0xFF9E, 0xFF7D, 0xFF7D, 0xFF9E, 0xFF7E, 0xFF7E,
216 0xFF9E, 0xFF7F, 0xFF7F, 0xFF9E, 0xFF80, 0xFF80, 0xFF9E, 0xFF81, 0xFF81,
217 0xFF9E, 0xFF6F, 0xFF82, 0xFF82, 0xFF9E, 0xFF83, 0xFF83, 0xFF9E, 0xFF84,
218 0xFF84, 0xFF9E, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0xFF8A, 0xFF8A,
219 0xFF9E, 0xFF8A, 0xFF9F, 0xFF8B, 0xFF8B, 0xFF9E, 0xFF8B, 0xFF9F, 0xFF8C,
220 0xFF8C, 0xFF9E, 0xFF8C, 0xFF9F, 0xFF8D, 0xFF8D, 0xFF9E, 0xFF8D, 0xFF9F,
221 0xFF8E, 0xFF8E, 0xFF9E, 0xFF8E, 0xFF9F, 0xFF8F, 0xFF90, 0xFF91, 0xFF92,
222 0xFF93, 0xFF6C, 0xFF94, 0xFF6D, 0xFF95, 0xFF6E, 0xFF96, 0xFF97, 0xFF98,
223 0xFF99, 0xFF9A, 0xFF9B, 0x30EE, 0xFF9C, 0x30F0, 0x30F1, 0xFF66, 0xFF9D,
224 0xFF73, 0xFF9E, 0x30F5, 0x30F6};
225 int len = sizeof(halfwidth_katakana)/sizeof(int);
226
227 int halfwidth_katakana_result[86];
228
229 int j;
230 for (i = 0, j = 0; i < len && j < 86; ++i, ++j) {
231 char32_t codepoint = halfwidth_katakana[i];
232 char32_t next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : 0;
233 halfwidth_katakana_result[j] =
234 GetPhoneticallySortableCodePoint(codepoint, next_codepoint,
235 &next_is_consumed);
236 // Consume voiced mark/half-voiced mark.
237 if (next_is_consumed) {
238 ++i;
239 }
240 }
241 ASSERT_EQ_VALUE(i, len);
242 ASSERT_EQ_VALUE(j, 86);
243
244 for (i = 0; i < 86; ++i) {
245 EXPECT_EQ_VALUE(fullwidth_katakana[i], hiragana[i]);
246 EXPECT_EQ_VALUE(halfwidth_katakana_result[i], hiragana[i]);
247 }
248 }
249
testGetPhoneticallySortableCodePointWhitespaceOnly()250 void TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly() {
251 printf("testGetPhoneticallySortableCodePointWhitespaceOnly()\n");
252 // Halfwidth space
253 int result = GetPhoneticallySortableCodePoint(0x0020, 0x0061, NULL);
254 ASSERT_EQ_VALUE(result, -1);
255 // Fullwidth space
256 result = GetPhoneticallySortableCodePoint(0x3000, 0x0062, NULL);
257 ASSERT_EQ_VALUE(result, -1);
258 // tab
259 result = GetPhoneticallySortableCodePoint(0x0009, 0x0062, NULL);
260 ASSERT_EQ_VALUE(result, -1);
261 }
262
testGetPhoneticallySortableCodePointSimpleCompare()263 void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() {
264 printf("testGetPhoneticallySortableCodePointSimpleCompare()\n");
265
266 char32_t codepoints[] = {
267 0x3042, 0x30AB, 0xFF7B, 0x305F, 0x30CA, 0xFF8A, 0x30D0, 0x3071,
268 0x307E, 0x30E4, 0xFF97, 0x308F, 0x3093, 0x3094, 'A', 'Z',
269 '0', '9', '!', '/', ':', '?', '[', '`', '{', '~'};
270 size_t len = sizeof(codepoints)/sizeof(int);
271 bool next_is_consumed;
272 for (size_t i = 0; i < len - 1; ++i) {
273 int codepoint_a =
274 GetPhoneticallySortableCodePoint(codepoints[i], 0,
275 &next_is_consumed);
276 if (next_is_consumed) {
277 printf("next_is_consumed become true at 0x%04X", codepoint_a);
278 m_success = false;
279 return;
280 }
281 int codepoint_b =
282 GetPhoneticallySortableCodePoint(codepoints[i + 1], 0,
283 &next_is_consumed);
284 if (next_is_consumed) {
285 printf("next_is_consumed become true at 0x%04X", codepoint_b);
286 m_success = false;
287 return;
288 }
289
290 if (codepoint_a >= codepoint_b) {
291 printf("0x%04X (from 0x%04X) >= 0x%04X (from 0x%04X)\n",
292 codepoint_a, codepoints[i], codepoint_b, codepoints[i + 1]);
293 m_success = false;
294 return;
295 }
296 }
297 }
298
// Stores the single code point |codepoint| into the variable "string8" with
// setTo() and compares the resulting bytes with the C string |expected|.
// A setTo() error or a byte mismatch sets m_success to false; on mismatch both
// byte sequences are printed in hexadecimal.
// Requires "string8" and m_success to be in scope at the point of use.
// Bytes are converted to unsigned char before printing so that non-ASCII
// bytes show as 0xE3 rather than a sign-extended 0xFFFFFFE3.
#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)                         \
  do {                                                                        \
    char32_t codepoints[1] = {(codepoint)};                                   \
    status_t ret = string8.setTo(codepoints, 1);                              \
    if (ret != NO_ERROR) {                                                    \
      printf("GetUtf8FromCodePoint() returned false at 0x%04X\n",             \
             static_cast<unsigned int>(codepoints[0]));                       \
      m_success = false;                                                      \
    } else {                                                                  \
      const char* string = string8.string();                                  \
      if (strcmp(string, (expected)) != 0) {                                  \
        printf("Failed at codepoint 0x%04X\n",                                \
               static_cast<unsigned int>(codepoints[0]));                     \
        for (const char* ch = string; *ch != '\0'; ++ch) {                    \
          printf("0x%X ", static_cast<unsigned char>(*ch));                   \
        }                                                                     \
        printf("!= ");                                                        \
        for (const char* ch = (expected); *ch != '\0'; ++ch) {                \
          printf("0x%X ", static_cast<unsigned char>(*ch));                   \
        }                                                                     \
        printf("\n");                                                         \
        m_success = false;                                                    \
      }                                                                       \
    }                                                                         \
  } while (0)
322
testGetUtf8FromUtf32()323 void TestExecutor::testGetUtf8FromUtf32() {
324 printf("testGetUtf8FromUtf32()\n");
325 String8 string8;
326
327 EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
328 // Armenian capital letter AYB (2 bytes in UTF8)
329 EXPECT_EQ_CODEPOINT_UTF8(0x0530, "\xD4\xB0");
330 // Japanese 'a' (3 bytes in UTF8)
331 EXPECT_EQ_CODEPOINT_UTF8(0x3042, "\xE3\x81\x82");
332 // Kanji
333 EXPECT_EQ_CODEPOINT_UTF8(0x65E5, "\xE6\x97\xA5");
334 // PUA (4 byets in UTF8)
335 EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
336 EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
337 }
338
// Runs GetPhoneticallySortableString() on |src| and compares the produced
// string with the C string |expected|. A false return value or a mismatch
// sets m_success to false; on mismatch both byte sequences are printed in
// hexadecimal. The produced buffer is released with free().
// Requires "char *dst", "size_t len" and m_success to be in scope.
// Bytes are converted to unsigned char before printing so that non-ASCII
// bytes show as 0xE3 rather than a sign-extended 0xFFFFFFE3.
#define EXPECT_EQ_UTF8_UTF8(src, expected)                           \
  do {                                                               \
    if (!GetPhoneticallySortableString((src), &dst, &len)) {         \
      printf("GetPhoneticallySortableString() returned false.\n");   \
      m_success = false;                                             \
    } else {                                                         \
      if (strcmp(dst, (expected)) != 0) {                            \
        for (const char* ch = dst; *ch != '\0'; ++ch) {              \
          printf("0x%X ", static_cast<unsigned char>(*ch));          \
        }                                                            \
        printf("!= ");                                               \
        for (const char* ch = (expected); *ch != '\0'; ++ch) {       \
          printf("0x%X ", static_cast<unsigned char>(*ch));          \
        }                                                            \
        printf("\n");                                                \
        m_success = false;                                           \
      }                                                              \
      free(dst);                                                     \
    }                                                                \
  } while (0)
359
testGetPhoneticallySortableString()360 void TestExecutor::testGetPhoneticallySortableString() {
361 printf("testGetPhoneticallySortableString()\n");
362 char *dst;
363 size_t len;
364
365 // halfwidth alphabets -> fullwidth alphabets.
366 EXPECT_EQ_UTF8_UTF8("ABCD",
367 "\xEF\xBC\xA1\xEF\xBC\xA2\xEF\xBC\xA3\xEF\xBC\xA4");
368 // halfwidth/fullwidth-katakana -> hiragana
369 EXPECT_EQ_UTF8_UTF8(
370 "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
371 "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
372
373 // whitespace -> string which should be placed at last
374 EXPECT_EQ_UTF8_UTF8(" \t", "\xF0\x9F\xBF\xBD");
375 }
376
#undef EXPECT_EQ_UTF8_UTF8

// Redefinition for the tests that follow: runs GetNormalizedString() on |src|
// and compares the produced string with the C string |expected|. A false
// return value or a mismatch sets m_success to false; on mismatch both byte
// sequences are printed in hexadecimal. The produced buffer is released with
// free().
// Requires "char *dst", "size_t len" and m_success to be in scope.
// Bytes are converted to unsigned char before printing so that non-ASCII
// bytes show as 0xE3 rather than a sign-extended 0xFFFFFFE3.
#define EXPECT_EQ_UTF8_UTF8(src, expected)                           \
  do {                                                               \
    if (!GetNormalizedString((src), &dst, &len)) {                   \
      printf("GetNormalizedString() returned false.\n");             \
      m_success = false;                                             \
    } else {                                                         \
      if (strcmp(dst, (expected)) != 0) {                            \
        for (const char* ch = dst; *ch != '\0'; ++ch) {              \
          printf("0x%X ", static_cast<unsigned char>(*ch));          \
        }                                                            \
        printf("!= ");                                               \
        for (const char* ch = (expected); *ch != '\0'; ++ch) {       \
          printf("0x%X ", static_cast<unsigned char>(*ch));          \
        }                                                            \
        printf("\n");                                                \
        m_success = false;                                           \
      }                                                              \
      free(dst);                                                     \
    }                                                                \
  } while (0)
399
testGetNormalizedString()400 void TestExecutor::testGetNormalizedString() {
401 printf("testGetNormalizedString()\n");
402 char *dst;
403 size_t len;
404
405 // halfwidth alphabets/symbols -> keep it as is.
406 EXPECT_EQ_UTF8_UTF8("ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()",
407 "ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()");
408 EXPECT_EQ_UTF8_UTF8("abcdefghijklmnopqrstuvwxyz[]{}\\@/",
409 "abcdefghijklmnopqrstuvwxyz[]{}\\@/");
410
411 // halfwidth/fullwidth-katakana -> hiragana
412 EXPECT_EQ_UTF8_UTF8(
413 "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
414 "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
415
416 // whitespace -> keep it as is.
417 EXPECT_EQ_UTF8_UTF8(" \t", " \t");
418 }
419
testLongString()420 void TestExecutor::testLongString() {
421 printf("testLongString()\n");
422 char * dst;
423 size_t len;
424 EXPECT_EQ_UTF8_UTF8("Qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqtttttttttttttttttttttttttttttttttttttttttttttttttgggggggggggggggggggggggggggggggggggggggbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
425 "Qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqtttttttttttttttttttttttttttttttttttttttttttttttttggggggggggggggggggggggggggggggggggg");
426 }
427
428
main()429 int main() {
430 TestExecutor executor;
431 if(executor.DoAllTests()) {
432 return 0;
433 } else {
434 return 1;
435 }
436 }
437