1 /*
2 * Copyright 2010, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <ctype.h>
18 #include <string.h>
19
20 #include <unicode/ucol.h>
21 #include <unicode/uiter.h>
22 #include <unicode/ustring.h>
23 #include <unicode/utypes.h>
24
25 #include "PhonebookIndex.h"
26 #include "PhoneticStringUtils.h"
27
// Minimum capacity the caller must provide for the output buffer of GetPhonebookIndex().
// NOTE(review): this was originally documented as a size in bytes, but the value is compared
// against `size`, which is also handed to unorm_next() as the destination capacity -- confirm
// which unit callers actually pass.
static const int32_t MIN_OUTPUT_SIZE = 6;
29
30 namespace android {
31
32 // IMPORTANT! Keep the codes below SORTED. We are doing a binary search on the array
33 static UChar DEFAULT_CHAR_MAP[] = {
34 0x00C6, 'A', // AE
35 0x00DF, 'S', // Etzett
36 0x1100, 0x3131, // HANGUL LETTER KIYEOK
37 0x1101, 0x3132, // HANGUL LETTER SSANGKIYEOK
38 0x1102, 0x3134, // HANGUL LETTER NIEUN
39 0x1103, 0x3137, // HANGUL LETTER TIKEUT
40 0x1104, 0x3138, // HANGUL LETTER SSANGTIKEUT
41 0x1105, 0x3139, // HANGUL LETTER RIEUL
42 0x1106, 0x3141, // HANGUL LETTER MIEUM
43 0x1107, 0x3142, // HANGUL LETTER PIEUP
44 0x1108, 0x3143, // HANGUL LETTER SSANGPIEUP
45 0x1109, 0x3145, // HANGUL LETTER SIOS
46 0x110A, 0x3146, // HANGUL LETTER SSANGSIOS
47 0x110B, 0x3147, // HANGUL LETTER IEUNG
48 0x110C, 0x3148, // HANGUL LETTER CIEUC
49 0x110D, 0x3149, // HANGUL LETTER SSANGCIEUC
50 0x110E, 0x314A, // HANGUL LETTER CHIEUCH
51 0x110F, 0x314B, // HANGUL LETTER KHIEUKH
52 0x1110, 0x314C, // HANGUL LETTER THIEUTH
53 0x1111, 0x314D, // HANGUL LETTER PHIEUPH
54 0x1112, 0x314E, // HANGUL LETTER HIEUH
55 0x111A, 0x3140, // HANGUL LETTER RIEUL-HIEUH
56 0x1121, 0x3144, // HANGUL LETTER PIEUP-SIOS
57 0x1161, 0x314F, // HANGUL LETTER A
58 0x1162, 0x3150, // HANGUL LETTER AE
59 0x1163, 0x3151, // HANGUL LETTER YA
60 0x1164, 0x3152, // HANGUL LETTER YAE
61 0x1165, 0x3153, // HANGUL LETTER EO
62 0x1166, 0x3154, // HANGUL LETTER E
63 0x1167, 0x3155, // HANGUL LETTER YEO
64 0x1168, 0x3156, // HANGUL LETTER YE
65 0x1169, 0x3157, // HANGUL LETTER O
66 0x116A, 0x3158, // HANGUL LETTER WA
67 0x116B, 0x3159, // HANGUL LETTER WAE
68 0x116C, 0x315A, // HANGUL LETTER OE
69 0x116D, 0x315B, // HANGUL LETTER YO
70 0x116E, 0x315C, // HANGUL LETTER U
71 0x116F, 0x315D, // HANGUL LETTER WEO
72 0x1170, 0x315E, // HANGUL LETTER WE
73 0x1171, 0x315F, // HANGUL LETTER WI
74 0x1172, 0x3160, // HANGUL LETTER YU
75 0x1173, 0x3161, // HANGUL LETTER EU
76 0x1174, 0x3162, // HANGUL LETTER YI
77 0x1175, 0x3163, // HANGUL LETTER I
78 0x11AA, 0x3133, // HANGUL LETTER KIYEOK-SIOS
79 0x11AC, 0x3135, // HANGUL LETTER NIEUN-CIEUC
80 0x11AD, 0x3136, // HANGUL LETTER NIEUN-HIEUH
81 0x11B0, 0x313A, // HANGUL LETTER RIEUL-KIYEOK
82 0x11B1, 0x313B, // HANGUL LETTER RIEUL-MIEUM
83 0x11B3, 0x313D, // HANGUL LETTER RIEUL-SIOS
84 0x11B4, 0x313E, // HANGUL LETTER RIEUL-THIEUTH
85 0x11B5, 0x313F, // HANGUL LETTER RIEUL-PHIEUPH
86 };
87
88 /**
89 * Binary search to map an individual character to the corresponding phone book index.
90 */
map_character(UChar c,UChar * char_map,int32_t length)91 static UChar map_character(UChar c, UChar * char_map, int32_t length) {
92 int from = 0, to = length;
93 while (from < to) {
94 int m = ((to + from) >> 1) & ~0x1; // Only consider even positions
95 UChar cm = char_map[m];
96 if (cm == c) {
97 return char_map[m + 1];
98 } else if (cm < c) {
99 from = m + 2;
100 } else {
101 to = m;
102 }
103 }
104 return 0;
105 }
106
107 /**
108 * Returns TRUE if the character belongs to a Hanzi unicode block
109 */
is_CJK(UChar c)110 static bool is_CJK(UChar c) {
111 return
112 (0x4e00 <= c && c <= 0x9fff) // CJK_UNIFIED_IDEOGRAPHS
113 || (0x3400 <= c && c <= 0x4dbf) // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
114 || (0x3000 <= c && c <= 0x303f) // CJK_SYMBOLS_AND_PUNCTUATION
115 || (0x2e80 <= c && c <= 0x2eff) // CJK_RADICALS_SUPPLEMENT
116 || (0x3300 <= c && c <= 0x33ff) // CJK_COMPATIBILITY
117 || (0xfe30 <= c && c <= 0xfe4f) // CJK_COMPATIBILITY_FORMS
118 || (0xf900 <= c && c <= 0xfaff); // CJK_COMPATIBILITY_IDEOGRAPHS
119 }
120
GetPhonebookIndex(UCharIterator * iter,const char * locale,UChar * out,int32_t size,UBool * isError)121 int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
122 UBool * isError)
123 {
124 if (size < MIN_OUTPUT_SIZE) {
125 *isError = TRUE;
126 return 0;
127 }
128
129 *isError = FALSE;
130
131 // Normalize the first character to remove accents using the NFD normalization
132 UErrorCode errorCode = U_ZERO_ERROR;
133 int32_t len = unorm_next(iter, out, size, UNORM_NFD,
134 0 /* options */, TRUE /* normalize */, NULL, &errorCode);
135 if (U_FAILURE(errorCode)) {
136 *isError = TRUE;
137 return 0;
138 }
139
140 if (len == 0) { // Empty input string
141 return 0;
142 }
143
144 UChar c = out[0];
145
146 // We are only interested in letters
147 if (!u_isalpha(c)) {
148 return 0;
149 }
150
151 c = u_toupper(c);
152
153 // Check for explicitly mapped characters
154 UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
155 if (c_mapped != 0) {
156 out[0] = c_mapped;
157 return 1;
158 }
159
160 // Convert Kanas to Hiragana
161 UChar next = len > 2 ? out[1] : 0;
162 c = android::GetNormalizedCodePoint(c, next, NULL);
163
164 // Traditional grouping of Hiragana characters
165 if (0x3042 <= c && c <= 0x309F) {
166 if (c < 0x304B) c = 0x3042; // a
167 else if (c < 0x3055) c = 0x304B; // ka
168 else if (c < 0x305F) c = 0x3055; // sa
169 else if (c < 0x306A) c = 0x305F; // ta
170 else if (c < 0x306F) c = 0x306A; // na
171 else if (c < 0x307E) c = 0x306F; // ha
172 else if (c < 0x3084) c = 0x307E; // ma
173 else if (c < 0x3089) c = 0x3084; // ya
174 else if (c < 0x308F) c = 0x3089; // ra
175 else c = 0x308F; // wa
176 out[0] = c;
177 return 1;
178 }
179
180 if (is_CJK(c)) {
181 if (strncmp(locale, "ja", 2) == 0) {
182 // Japanese word meaning "misc" or "other"
183 out[0] = 0x4ED6;
184 return 1;
185 } else {
186 return 0;
187 }
188 }
189
190 out[0] = c;
191 return 1;
192 }
193
194 } // namespace android
195