• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "PhoneticStringUtils.h"
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 
23 #include <utils/String8.h>
24 
25 using namespace android;
26 
// Minimal self-contained test harness for the PhoneticStringUtils helpers.
//
// Every test case is a private member function. DoAllTests() feeds each case
// to DoOneTest(), which counts how many cases ran and how many of them left
// m_success set to true.
class TestExecutor {
 public:
  TestExecutor() : m_total_count(0), m_success_count(0), m_success(true) {}

  // Runs every registered test case, prints a summary, and returns true only
  // when the number of successful cases equals the number of cases run.
  bool DoAllTests();

 private:
  // Runs a single test case and updates the counters below.
  void DoOneTest(void (TestExecutor::*test)());

  void testUtf32At();
  void testGetPhoneticallySortableCodePointAscii();
  void testGetPhoneticallySortableCodePointKana();
  void testGetPhoneticallySortableCodePointWhitespaceOnly();
  void testGetPhoneticallySortableCodePointSimpleCompare();
  void testGetUtf8FromUtf32();
  void testGetPhoneticallySortableString();
  void testGetNormalizedString();
  void testLongString();

  // Note: When adding a test, do not forget to add it to DoAllTests();
  // a case that is only declared here is never executed.

  int m_total_count;    // Number of test cases executed so far.
  int m_success_count;  // Number of executed cases that succeeded.

  // Outcome of the case currently running. DoOneTest() resets it to true
  // before each case; the ASSERT_*/EXPECT_* macros clear it on failure.
  bool m_success;
};
51 
// Compares |input| (the actual value) with |expected|. On mismatch it prints
// both values in hexadecimal, sets m_success to false, and returns from the
// enclosing function, so it may only be used as a statement inside a function
// returning void where m_success is assignable.
//
// Each argument is evaluated exactly once. Plain do/while(0) is used instead
// of a GNU statement expression so the macro is portable and still behaves as
// a single statement.
#define ASSERT_EQ_VALUE(input, expected)                                \
  do {                                                                  \
    const auto assert_eq_actual_ = (input);                             \
    const auto assert_eq_expected_ = (expected);                        \
    if (assert_eq_expected_ != assert_eq_actual_) {                     \
      /* %X expects unsigned int; convert explicitly. */                \
      printf("0x%X(result) != 0x%X(expected)\n",                        \
             static_cast<unsigned int>(assert_eq_actual_),              \
             static_cast<unsigned int>(assert_eq_expected_));           \
      m_success = false;                                                \
      return;                                                           \
    }                                                                   \
  } while (0)
60 
// Compares |input| (the actual value) with |expected|. On mismatch it prints
// both values in hexadecimal and sets m_success to false, then lets execution
// continue with the next statement (unlike ASSERT_EQ_VALUE, it does not
// return). m_success must be assignable at the point of use.
//
// Each argument is evaluated exactly once. Plain do/while(0) is used instead
// of a GNU statement expression so the macro is portable and still behaves as
// a single statement.
#define EXPECT_EQ_VALUE(input, expected)                                \
  do {                                                                  \
    const auto expect_eq_actual_ = (input);                             \
    const auto expect_eq_expected_ = (expected);                        \
    if (expect_eq_expected_ != expect_eq_actual_) {                     \
      /* %X expects unsigned int; convert explicitly. */                \
      printf("0x%X(result) != 0x%X(expected)\n",                        \
             static_cast<unsigned int>(expect_eq_actual_),              \
             static_cast<unsigned int>(expect_eq_expected_));           \
      m_success = false;                                                \
    }                                                                   \
  } while (0)
68 
69 
DoAllTests()70 bool TestExecutor::DoAllTests() {
71   DoOneTest(&TestExecutor::testUtf32At);
72   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointAscii);
73   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointKana);
74   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly);
75   DoOneTest(&TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare);
76   DoOneTest(&TestExecutor::testGetUtf8FromUtf32);
77   DoOneTest(&TestExecutor::testGetPhoneticallySortableString);
78   DoOneTest(&TestExecutor::testGetNormalizedString);
79   DoOneTest(&TestExecutor::testLongString);
80 
81   printf("Test total: %d\nSuccess: %d\nFailure: %d\n",
82          m_total_count, m_success_count, m_total_count - m_success_count);
83 
84   bool success = m_total_count == m_success_count;
85   printf("\n%s\n", success ? "Success" : "Failure");
86 
87   return success;
88 }
89 
DoOneTest(void (TestExecutor::* test)())90 void TestExecutor::DoOneTest(void (TestExecutor::*test)()) {
91   m_success = true;
92 
93   (this->*test)();
94 
95   ++m_total_count;
96   m_success_count += m_success ? 1 : 0;
97 }
98 
// Calls utf32_at(src, strlen(src), index, &next) and verifies both results:
// the value stored in |next| must equal |expected_next| and the returned
// value must equal |expected_value|. A negative return value, an unexpected
// |next|, or an unexpected return value each print a diagnostic and mark the
// running test as failed through m_success; execution then continues with the
// following statement.
//
// Requires utf32_at(), EXPECT_EQ_VALUE and an assignable m_success at the
// point of use. |src| must be a NUL-terminated string.
#define TEST_GET_UTF32AT(src, index, expected_next, expected_value)     \
  do {                                                                  \
    size_t next;                                                        \
    int32_t ret = utf32_at(src, strlen(src), index, &next);             \
    if (ret < 0) {                                                      \
      printf("utf32_at() returned negative value (src: %s, index: %d)\n", \
             (src), static_cast<int>(index));                           \
      m_success = false;                                                \
    } else if (next != static_cast<size_t>(expected_next)) {            \
      printf("next is unexpected value (src: %s, actual: %zu, expected: %zu)\n", \
             (src), next, static_cast<size_t>(expected_next));          \
      /* A wrong offset is a test failure, not just a diagnostic. */    \
      m_success = false;                                                \
    } else {                                                            \
      EXPECT_EQ_VALUE(ret, (expected_value));                           \
    }                                                                   \
  } while (0)
114 
testUtf32At()115 void TestExecutor::testUtf32At() {
116   printf("testUtf32At()\n");
117 
118   TEST_GET_UTF32AT("a", 0, 1, 97);
119   // Japanese hiragana "a"
120   TEST_GET_UTF32AT("\xE3\x81\x82", 0, 3, 0x3042);
121   // Japanese fullwidth katakana "a" with ascii a
122   TEST_GET_UTF32AT("a\xE3\x82\xA2", 1, 4, 0x30A2);
123 
124   // 2 PUA
125   TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 0, 4, 0xFE000);
126   TEST_GET_UTF32AT("\xF3\xBE\x80\x80\xF3\xBE\x80\x88", 4, 8, 0xFE008);
127 }
128 
testGetPhoneticallySortableCodePointAscii()129 void TestExecutor::testGetPhoneticallySortableCodePointAscii() {
130   printf("testGetPhoneticallySortableCodePoint()\n");
131   int halfwidth[94];
132   int fullwidth[94];
133   int i;
134   char32_t codepoint;
135   bool next_is_consumed;
136   for (i = 0, codepoint = 0x0021; codepoint <= 0x007E; ++i, ++codepoint) {
137     halfwidth[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
138                                                     &next_is_consumed);
139     if (halfwidth[i] < 0) {
140       printf("returned value become negative at 0x%04X", codepoint);
141       m_success = false;
142       return;
143     }
144     if (next_is_consumed) {
145       printf("next_is_consumed become true at 0x%04X", codepoint);
146       m_success = false;
147       return;
148     }
149   }
150   for (i = 0, codepoint = 0xFF01; codepoint <= 0xFF5E; ++i, ++codepoint) {
151     fullwidth[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
152                                                     &next_is_consumed);
153     if (fullwidth[i] < 0) {
154       printf("returned value become negative at 0x%04X", codepoint);
155       m_success = false;
156       return;
157     }
158     if (next_is_consumed) {
159       printf("next_is_consumed become true at 0x%04X", codepoint);
160       m_success = false;
161       return;
162     }
163   }
164 
165   for (i = 0; i < 94; i++) {
166     EXPECT_EQ_VALUE(halfwidth[i], fullwidth[i]);
167   }
168 }
169 
testGetPhoneticallySortableCodePointKana()170 void TestExecutor::testGetPhoneticallySortableCodePointKana() {
171   printf("testGetPhoneticallySortableCodePointKana()\n");
172   int hiragana[86];
173   int fullwidth_katakana[86];
174   int i;
175   char32_t codepoint;
176   bool next_is_consumed;
177 
178   for (i = 0, codepoint = 0x3041; codepoint <= 0x3096; ++i, ++codepoint) {
179     hiragana[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
180                                                    &next_is_consumed);
181     if (hiragana[i] < 0) {
182       printf("returned value become negative at 0x%04X", codepoint);
183       m_success = false;
184       return;
185     }
186     if (next_is_consumed) {
187       printf("next_is_consumed become true at 0x%04X", codepoint);
188       m_success = false;
189       return;
190     }
191   }
192 
193   for (i = 0, codepoint = 0x30A1; codepoint <= 0x30F6; ++i, ++codepoint) {
194     fullwidth_katakana[i] = GetPhoneticallySortableCodePoint(codepoint, 0,
195                                                    &next_is_consumed);
196     if (fullwidth_katakana[i] < 0) {
197       printf("returned value become negative at 0x%04X", codepoint);
198       m_success = false;
199       return;
200     }
201     if (next_is_consumed) {
202       printf("next_is_consumed become true at 0x%04X", codepoint);
203       m_success = false;
204       return;
205     }
206   }
207 
208   // hankaku-katakana space do not have some characters corresponding to
209   // zenkaku-hiragana (e.g. xwa, xka, xku). To make test easier, insert
210   // zenkaku-katakana version of them into this array (See the value 0x30??).
211   char32_t halfwidth_katakana[] = {
212     0xFF67, 0xFF71, 0xFF68, 0xFF72, 0xFF69, 0xFF73, 0xFF6A, 0xFF74, 0xFF6B,
213     0xFF75, 0xFF76, 0xFF76, 0xFF9E, 0xFF77, 0xFF77, 0xFF9E, 0xFF78, 0xFF78,
214     0xFF9E, 0xFF79, 0xFF79, 0xFF9E, 0xFF7A, 0xFF7A, 0xFF9E, 0xFF7B, 0xFF7B,
215     0xFF9E, 0xFF7C, 0xFF7C, 0xFF9E, 0xFF7D, 0xFF7D, 0xFF9E, 0xFF7E, 0xFF7E,
216     0xFF9E, 0xFF7F, 0xFF7F, 0xFF9E, 0xFF80, 0xFF80, 0xFF9E, 0xFF81, 0xFF81,
217     0xFF9E, 0xFF6F, 0xFF82, 0xFF82, 0xFF9E, 0xFF83, 0xFF83, 0xFF9E, 0xFF84,
218     0xFF84, 0xFF9E, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0xFF8A, 0xFF8A,
219     0xFF9E, 0xFF8A, 0xFF9F, 0xFF8B, 0xFF8B, 0xFF9E, 0xFF8B, 0xFF9F, 0xFF8C,
220     0xFF8C, 0xFF9E, 0xFF8C, 0xFF9F, 0xFF8D, 0xFF8D, 0xFF9E, 0xFF8D, 0xFF9F,
221     0xFF8E, 0xFF8E, 0xFF9E, 0xFF8E, 0xFF9F, 0xFF8F, 0xFF90, 0xFF91, 0xFF92,
222     0xFF93, 0xFF6C, 0xFF94, 0xFF6D, 0xFF95, 0xFF6E, 0xFF96, 0xFF97, 0xFF98,
223     0xFF99, 0xFF9A, 0xFF9B, 0x30EE, 0xFF9C, 0x30F0, 0x30F1, 0xFF66, 0xFF9D,
224     0xFF73, 0xFF9E, 0x30F5, 0x30F6};
225   int len = sizeof(halfwidth_katakana)/sizeof(int);
226 
227   int halfwidth_katakana_result[86];
228 
229   int j;
230   for (i = 0, j = 0; i < len && j < 86; ++i, ++j) {
231     char32_t codepoint = halfwidth_katakana[i];
232     char32_t next_codepoint = i + 1 < len ? halfwidth_katakana[i + 1] : 0;
233     halfwidth_katakana_result[j] =
234         GetPhoneticallySortableCodePoint(codepoint, next_codepoint,
235                                          &next_is_consumed);
236     // Consume voiced mark/half-voiced mark.
237     if (next_is_consumed) {
238       ++i;
239     }
240   }
241   ASSERT_EQ_VALUE(i, len);
242   ASSERT_EQ_VALUE(j, 86);
243 
244   for (i = 0; i < 86; ++i) {
245     EXPECT_EQ_VALUE(fullwidth_katakana[i], hiragana[i]);
246     EXPECT_EQ_VALUE(halfwidth_katakana_result[i], hiragana[i]);
247   }
248 }
249 
testGetPhoneticallySortableCodePointWhitespaceOnly()250 void TestExecutor::testGetPhoneticallySortableCodePointWhitespaceOnly() {
251   printf("testGetPhoneticallySortableCodePointWhitespaceOnly()\n");
252   // Halfwidth space
253   int result = GetPhoneticallySortableCodePoint(0x0020, 0x0061, NULL);
254   ASSERT_EQ_VALUE(result, -1);
255   // Fullwidth space
256   result = GetPhoneticallySortableCodePoint(0x3000, 0x0062, NULL);
257   ASSERT_EQ_VALUE(result, -1);
258   // tab
259   result = GetPhoneticallySortableCodePoint(0x0009, 0x0062, NULL);
260   ASSERT_EQ_VALUE(result, -1);
261 }
262 
testGetPhoneticallySortableCodePointSimpleCompare()263 void TestExecutor::testGetPhoneticallySortableCodePointSimpleCompare() {
264   printf("testGetPhoneticallySortableCodePointSimpleCompare()\n");
265 
266   char32_t codepoints[] = {
267     0x3042, 0x30AB, 0xFF7B, 0x305F, 0x30CA, 0xFF8A, 0x30D0, 0x3071,
268     0x307E, 0x30E4, 0xFF97, 0x308F, 0x3093, 0x3094, 'A', 'Z',
269     '0', '9', '!', '/', ':', '?', '[', '`', '{', '~'};
270   size_t len = sizeof(codepoints)/sizeof(int);
271   bool next_is_consumed;
272   for (size_t i = 0; i < len - 1; ++i) {
273     int codepoint_a =
274         GetPhoneticallySortableCodePoint(codepoints[i], 0,
275                                          &next_is_consumed);
276     if (next_is_consumed) {
277       printf("next_is_consumed become true at 0x%04X", codepoint_a);
278       m_success = false;
279       return;
280     }
281     int codepoint_b =
282         GetPhoneticallySortableCodePoint(codepoints[i + 1], 0,
283                                          &next_is_consumed);
284     if (next_is_consumed) {
285       printf("next_is_consumed become true at 0x%04X", codepoint_b);
286       m_success = false;
287       return;
288     }
289 
290     if (codepoint_a >= codepoint_b) {
291       printf("0x%04X (from 0x%04X) >= 0x%04X (from 0x%04X)\n",
292              codepoint_a, codepoints[i], codepoint_b, codepoints[i + 1]);
293       m_success = false;
294       return;
295     }
296   }
297 }
298 
// Converts the single code point |codepoint| to UTF-8 by calling
// string8.setTo(codepoints, 1) and compares the resulting C string with
// |expected|. A non-NO_ERROR status or a differing string prints a diagnostic
// (including a byte-by-byte hex dump of both strings) and sets m_success to
// false; execution then continues with the following statement.
//
// Requires a variable named string8 offering setTo()/string(), plus status_t,
// NO_ERROR and an assignable m_success at the point of use.
#define EXPECT_EQ_CODEPOINT_UTF8(codepoint, expected)                   \
  do {                                                                  \
    char32_t codepoints[1] = {codepoint};                               \
    status_t ret = string8.setTo(codepoints, 1);                        \
    if (ret != NO_ERROR) {                                              \
      printf("String8::setTo() failed at 0x%04X\n",                     \
             static_cast<unsigned int>(codepoints[0]));                 \
      m_success = false;                                                \
    } else {                                                            \
      const char* string = string8.string();                            \
      if (strcmp(string, expected) != 0) {                              \
        printf("Failed at codepoint 0x%04X\n",                          \
               static_cast<unsigned int>(codepoints[0]));               \
        /* Dump bytes as unsigned so values >= 0x80 are not printed */  \
        /* sign-extended (e.g. 0xFFFFFFE3 instead of 0xE3).         */  \
        for (const char *ch = string; *ch != '\0'; ++ch) {              \
          printf("0x%X ", static_cast<unsigned char>(*ch));             \
        }                                                               \
        printf("!= ");                                                  \
        for (const char *ch = expected; *ch != '\0'; ++ch) {            \
          printf("0x%X ", static_cast<unsigned char>(*ch));             \
        }                                                               \
        printf("\n");                                                   \
        m_success = false;                                              \
      }                                                                 \
    }                                                                   \
  } while (0)
322 
testGetUtf8FromUtf32()323 void TestExecutor::testGetUtf8FromUtf32() {
324   printf("testGetUtf8FromUtf32()\n");
325   String8 string8;
326 
327   EXPECT_EQ_CODEPOINT_UTF8('a', "\x61");
328   // Armenian capital letter AYB (2 bytes in UTF8)
329   EXPECT_EQ_CODEPOINT_UTF8(0x0530, "\xD4\xB0");
330   // Japanese 'a' (3 bytes in UTF8)
331   EXPECT_EQ_CODEPOINT_UTF8(0x3042, "\xE3\x81\x82");
332   // Kanji
333   EXPECT_EQ_CODEPOINT_UTF8(0x65E5, "\xE6\x97\xA5");
334   // PUA (4 byets in UTF8)
335   EXPECT_EQ_CODEPOINT_UTF8(0xFE016, "\xF3\xBE\x80\x96");
336   EXPECT_EQ_CODEPOINT_UTF8(0xFE972, "\xF3\xBE\xA5\xB2");
337 }
338 
// Runs GetPhoneticallySortableString(src, &dst, &len) and compares the
// produced C string with |expected|. If the call returns false, or the
// strings differ (both are then dumped byte by byte in hex), m_success is set
// to false. On a successful call the returned buffer is released with free().
//
// Requires variables named dst (char*) and len (size_t) and an assignable
// m_success at the point of use.
#define EXPECT_EQ_UTF8_UTF8(src, expected)                              \
  do {                                                                  \
    if (!GetPhoneticallySortableString(src, &dst, &len)) {              \
      printf("GetPhoneticallySortableString() returned false.\n");      \
      m_success = false;                                                \
    } else {                                                            \
      if (strcmp(dst, expected) != 0) {                                 \
        /* Dump bytes as unsigned so values >= 0x80 are not printed */  \
        /* sign-extended (e.g. 0xFFFFFFE3 instead of 0xE3).         */  \
        for (const char *ch = dst; *ch != '\0'; ++ch) {                 \
          printf("0x%X ", static_cast<unsigned char>(*ch));             \
        }                                                               \
        printf("!= ");                                                  \
        for (const char *ch = expected; *ch != '\0'; ++ch) {            \
          printf("0x%X ", static_cast<unsigned char>(*ch));             \
        }                                                               \
        printf("\n");                                                   \
        m_success = false;                                              \
      }                                                                 \
      free(dst);                                                        \
    }                                                                   \
  } while (0)
359 
testGetPhoneticallySortableString()360 void TestExecutor::testGetPhoneticallySortableString() {
361   printf("testGetPhoneticallySortableString()\n");
362   char *dst;
363   size_t len;
364 
365   // halfwidth alphabets -> fullwidth alphabets.
366   EXPECT_EQ_UTF8_UTF8("ABCD",
367                       "\xEF\xBC\xA1\xEF\xBC\xA2\xEF\xBC\xA3\xEF\xBC\xA4");
368   // halfwidth/fullwidth-katakana -> hiragana
369   EXPECT_EQ_UTF8_UTF8(
370       "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
371       "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
372 
373   // whitespace -> string which should be placed at last
374   EXPECT_EQ_UTF8_UTF8("    \t", "\xF0\x9F\xBF\xBD");
375 }
376 
// Redefinition of EXPECT_EQ_UTF8_UTF8 for the tests that follow: same checks
// as the earlier variant, but the string is produced by
// GetNormalizedString(src, &dst, &len). If the call returns false, or the
// result differs from |expected| (both are then dumped byte by byte in hex),
// m_success is set to false. On a successful call the returned buffer is
// released with free().
//
// Requires variables named dst (char*) and len (size_t) and an assignable
// m_success at the point of use.
#undef EXPECT_EQ_UTF8_UTF8

#define EXPECT_EQ_UTF8_UTF8(src, expected)                              \
  do {                                                                  \
    if (!GetNormalizedString(src, &dst, &len)) {                        \
      printf("GetNormalizedString() returned false.\n");                \
      m_success = false;                                                \
    } else {                                                            \
      if (strcmp(dst, expected) != 0) {                                 \
        /* Dump bytes as unsigned so values >= 0x80 are not printed */  \
        /* sign-extended (e.g. 0xFFFFFFE3 instead of 0xE3).         */  \
        for (const char *ch = dst; *ch != '\0'; ++ch) {                 \
          printf("0x%X ", static_cast<unsigned char>(*ch));             \
        }                                                               \
        printf("!= ");                                                  \
        for (const char *ch = expected; *ch != '\0'; ++ch) {            \
          printf("0x%X ", static_cast<unsigned char>(*ch));             \
        }                                                               \
        printf("\n");                                                   \
        m_success = false;                                              \
      }                                                                 \
      free(dst);                                                        \
    }                                                                   \
  } while (0)
399 
testGetNormalizedString()400 void TestExecutor::testGetNormalizedString() {
401   printf("testGetNormalizedString()\n");
402   char *dst;
403   size_t len;
404 
405   // halfwidth alphabets/symbols -> keep it as is.
406   EXPECT_EQ_UTF8_UTF8("ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()",
407                       "ABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%^&'()");
408   EXPECT_EQ_UTF8_UTF8("abcdefghijklmnopqrstuvwxyz[]{}\\@/",
409                       "abcdefghijklmnopqrstuvwxyz[]{}\\@/");
410 
411   // halfwidth/fullwidth-katakana -> hiragana
412   EXPECT_EQ_UTF8_UTF8(
413       "\xE3\x81\x82\xE3\x82\xA4\xE3\x81\x86\xEF\xBD\xB4\xE3\x82\xAA",
414       "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A");
415 
416   // whitespace -> keep it as is.
417   EXPECT_EQ_UTF8_UTF8("    \t", "    \t");
418 }
419 
testLongString()420 void TestExecutor::testLongString() {
421   printf("testLongString()\n");
422   char * dst;
423   size_t len;
424   EXPECT_EQ_UTF8_UTF8("Qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqtttttttttttttttttttttttttttttttttttttttttttttttttgggggggggggggggggggggggggggggggggggggggbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
425       "Qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqtttttttttttttttttttttttttttttttttttttttttttttttttggggggggggggggggggggggggggggggggggg");
426 }
427 
428 
main()429 int main() {
430   TestExecutor executor;
431   if(executor.DoAllTests()) {
432     return 0;
433   } else {
434     return 1;
435   }
436 }
437