1
2 ///////////////////////////////////////////////////////////////////////
3 // File: unicharset.cpp
4 // Description: Unicode character/ligature set class.
5 // Author: Thomas Kielbus
6 // Created: Wed Jun 28 17:05:01 PDT 2006
7 //
8 // (C) Copyright 2006, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19 ///////////////////////////////////////////////////////////////////////
20
21 #include <assert.h>
22 #include <stdio.h>
23 #include <string.h>
24
25 #include "tprintf.h"
26 #include "unichar.h"
27 #include "unicharset.h"
28 #include "varable.h"
29
// Bit flags used for the hexadecimal properties field of a unicharset file.
// Each character property occupies one bit.
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
static const int ISPUNCTUATION_MASK = 1 << 4;
35
UNICHARSET()36 UNICHARSET::UNICHARSET() :
37 unichars(NULL),
38 ids(),
39 size_used(0),
40 size_reserved(0),
41 script_table(0),
42 script_table_size_used(0),
43 script_table_size_reserved(0),
44 null_script("NULL"),
45 null_sid_(0),
46 common_sid_(0),
47 latin_sid_(0),
48 cyrillic_sid_(0),
49 greek_sid_(0),
50 han_sid_(0) {}
51
~UNICHARSET()52 UNICHARSET::~UNICHARSET() {
53 if (size_reserved > 0) {
54 for (int i = 0; i < script_table_size_used; ++i)
55 delete[] script_table[i];
56 delete[] script_table;
57 delete_pointers_in_unichars();
58 delete[] unichars;
59 }
60 }
61
reserve(int unichars_number)62 void UNICHARSET::reserve(int unichars_number) {
63 if (unichars_number > size_reserved) {
64 UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
65 for (int i = 0; i < size_used; ++i)
66 memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
67 for (int j = size_used; j < unichars_number; ++j) {
68 unichars_new[j].properties.script_id = add_script(null_script);
69 unichars_new[j].properties.fragment = NULL;
70 }
71 delete[] unichars;
72 unichars = unichars_new;
73 size_reserved = unichars_number;
74 }
75 }
76
77 const UNICHAR_ID
unichar_to_id(const char * const unichar_repr) const78 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
79 return ids.contains(unichar_repr) ?
80 ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
81 }
82
unichar_to_id(const char * const unichar_repr,int length) const83 const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
84 int length) const {
85 assert(length > 0 && length <= UNICHAR_LEN);
86 return ids.contains(unichar_repr, length) ?
87 ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
88 }
89
90 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
91 // while leaving a legal UNICHAR_ID afterwards. In other words, if there
92 // is both a short and a long match to the string, return the length that
93 // ensures there is a legal match after it.
step(const char * str) const94 int UNICHARSET::step(const char* str) const {
95 // Find the length of the first matching unicharset member.
96 int minlength = ids.minmatch(str);
97 if (minlength == 0)
98 return 0; // Empty string or illegal char.
99
100 int goodlength = minlength;
101 while (goodlength <= UNICHAR_LEN) {
102 if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
103 return goodlength; // This length works!
104 // The next char is illegal so find the next usable length.
105 do {
106 ++goodlength;
107 } while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
108 !ids.contains(str, goodlength));
109 }
110 // Search to find a subsequent legal char failed so return the minlength.
111 return minlength;
112 }
113
id_to_unichar(UNICHAR_ID id) const114 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
115 if (id == INVALID_UNICHAR_ID) {
116 return INVALID_UNICHAR;
117 }
118 assert(id < this->size());
119 return unichars[id].representation;
120 }
121
122 // Return a STRING that reformats the utf8 str into the str followed
123 // by its hex unicodes.
debug_utf8_str(const char * str)124 STRING UNICHARSET::debug_utf8_str(const char* str) {
125 STRING result = str;
126 result += " [";
127 int step = 1;
128 // Chop into unicodes and code each as hex.
129 for (int i = 0; str[i] != '\0'; i += step) {
130 char hex[sizeof(int) * 2 + 1];
131 step = UNICHAR::utf8_step(str + i);
132 if (step == 0) {
133 step = 1;
134 sprintf(hex, "%x", str[i]);
135 } else {
136 UNICHAR ch(str + i, step);
137 sprintf(hex, "%x", ch.first_uni());
138 }
139 result += hex;
140 result += " ";
141 }
142 result += "]";
143 return result;
144 }
145
146 // Return a STRING containing debug information on the unichar, including
147 // the id_to_unichar, its hex unicodes and the properties.
debug_str(UNICHAR_ID id) const148 STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
149 const CHAR_FRAGMENT *fragment = this->get_fragment(id);
150 if (fragment) {
151 STRING base = debug_str(fragment->get_unichar());
152 return CHAR_FRAGMENT::to_string(base.string(), fragment->get_pos(),
153 fragment->get_total());
154 }
155 const char* str = id_to_unichar(id);
156 if (id == INVALID_UNICHAR_ID) return STRING(str);
157 STRING result = debug_utf8_str(str);
158 // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
159 if (get_isalpha(id)) {
160 if (get_islower(id))
161 result += "a";
162 else if (get_isupper(id))
163 result += "A";
164 else
165 result += "x";
166 }
167 // Append 0 if a digit.
168 if (get_isdigit(id)) {
169 result += "0";
170 }
171 // Append p is a punctuation symbol.
172 if (get_ispunctuation(id)) {
173 result += "p";
174 }
175 return result;
176 }
177
178
179
unichar_insert(const char * const unichar_repr)180 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
181 if (!ids.contains(unichar_repr)) {
182 if (strlen(unichar_repr) > UNICHAR_LEN) {
183 fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
184 int(strlen(unichar_repr)), unichar_repr);
185 return;
186 }
187 if (size_used == size_reserved) {
188 if (size_used == 0)
189 reserve(8);
190 else
191 reserve(2 * size_used);
192 }
193
194 strcpy(unichars[size_used].representation, unichar_repr);
195 this->set_isalpha(size_used, false);
196 this->set_islower(size_used, false);
197 this->set_isupper(size_used, false);
198 this->set_isdigit(size_used, false);
199 this->set_ispunctuation(size_used, false);
200 this->set_isngram(size_used, false);
201 this->set_script(size_used, null_script);
202 // If the given unichar_repr represents a fragmented character, set
203 // fragment property to a pointer to CHAR_FRAGMENT class instance with
204 // information parsed from the unichar representation. Use the script
205 // of the base unichar for the fragmented character if possible.
206 CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
207 this->unichars[size_used].properties.fragment = frag;
208 if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
209 this->unichars[size_used].properties.script_id =
210 this->get_script(frag->get_unichar());
211 }
212 this->unichars[size_used].properties.enabled = true;
213 ids.insert(unichar_repr, size_used);
214 ++size_used;
215 }
216 }
217
contains_unichar(const char * const unichar_repr) const218 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
219 return ids.contains(unichar_repr);
220 }
221
contains_unichar(const char * const unichar_repr,int length) const222 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
223 int length) const {
224 if (length == 0) {
225 return false;
226 }
227 return ids.contains(unichar_repr, length);
228 }
229
eq(UNICHAR_ID unichar_id,const char * const unichar_repr) const230 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
231 const char* const unichar_repr) const {
232 return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
233 }
234
save_to_file(FILE * file) const235 bool UNICHARSET::save_to_file(FILE *file) const {
236 fprintf(file, "%d\n", this->size());
237 for (UNICHAR_ID id = 0; id < this->size(); ++id) {
238 unsigned int properties = 0;
239
240 if (this->get_isalpha(id))
241 properties |= ISALPHA_MASK;
242 if (this->get_islower(id))
243 properties |= ISLOWER_MASK;
244 if (this->get_isupper(id))
245 properties |= ISUPPER_MASK;
246 if (this->get_isdigit(id))
247 properties |= ISDIGIT_MASK;
248 if (this->get_ispunctuation(id))
249 properties |= ISPUNCTUATION_MASK;
250
251 if (strcmp(this->id_to_unichar(id), " ") == 0)
252 fprintf(file, "%s %x %s %d\n", "NULL", properties,
253 this->get_script_from_script_id(this->get_script(id)),
254 this->get_other_case(id));
255 else
256 fprintf(file, "%s %x %s %d\n", this->id_to_unichar(id), properties,
257 this->get_script_from_script_id(this->get_script(id)),
258 this->get_other_case(id));
259 }
260 return true;
261 }
262
load_from_file(FILE * file)263 bool UNICHARSET::load_from_file(FILE *file) {
264 int unicharset_size;
265 char buffer[256];
266
267 this->clear();
268 if (fgets(buffer, sizeof (buffer), file) == NULL ||
269 sscanf(buffer, "%d", &unicharset_size) != 1) {
270 return false;
271 }
272 this->reserve(unicharset_size);
273 for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
274 char unichar[256];
275 unsigned int properties;
276 char script[64];
277
278 strcpy(script, null_script);
279 this->unichars[id].properties.other_case = id;
280 if (fgets(buffer, sizeof (buffer), file) == NULL ||
281 (sscanf(buffer, "%s %x %63s %d", unichar, &properties,
282 script, &(this->unichars[id].properties.other_case)) != 4 &&
283 sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
284 sscanf(buffer, "%s %x", unichar, &properties) != 2)) {
285 return false;
286 }
287 if (strcmp(unichar, "NULL") == 0)
288 this->unichar_insert(" ");
289 else
290 this->unichar_insert(unichar);
291
292 this->set_isalpha(id, (properties & ISALPHA_MASK) != 0);
293 this->set_islower(id, (properties & ISLOWER_MASK) != 0);
294 this->set_isupper(id, (properties & ISUPPER_MASK) != 0);
295 this->set_isdigit(id, (properties & ISDIGIT_MASK) != 0);
296 this->set_ispunctuation(id, (properties & ISPUNCTUATION_MASK) != 0);
297 this->set_isngram(id, false);
298 this->set_script(id, script);
299 this->unichars[id].properties.enabled = true;
300 }
301
302 null_sid_ = get_script_id_from_name(null_script);
303 ASSERT_HOST(null_sid_ == 0);
304 common_sid_ = get_script_id_from_name("Common");
305 latin_sid_ = get_script_id_from_name("Latin");
306 cyrillic_sid_ = get_script_id_from_name("Cyrillic");
307 greek_sid_ = get_script_id_from_name("Greek");
308 han_sid_ = get_script_id_from_name("Han");
309 return true;
310 }
311
312 // Set a whitelist and/or blacklist of characters to recognize.
313 // An empty or NULL whitelist enables everything (minus any blacklist).
314 // An empty or NULL blacklist disables nothing.
set_black_and_whitelist(const char * blacklist,const char * whitelist)315 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
316 const char* whitelist) {
317 bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
318 // Set everything to default
319 for (int ch = 0; ch < size_used; ++ch)
320 unichars[ch].properties.enabled = def_enabled;
321 int ch_step;
322 if (!def_enabled) {
323 // Enable the whitelist.
324 for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
325 ch_step = step(whitelist + w_ind);
326 if (ch_step > 0) {
327 UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
328 unichars[u_id].properties.enabled = true;
329 } else {
330 ch_step = 1;
331 }
332 }
333 }
334 if (blacklist != NULL && blacklist[0] != '\0') {
335 // Disable the blacklist.
336 for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
337 ch_step = step(blacklist + b_ind);
338 if (ch_step > 0) {
339 UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
340 unichars[u_id].properties.enabled = false;
341 } else {
342 ch_step = 1;
343 }
344 }
345 }
346 }
347
add_script(const char * script)348 int UNICHARSET::add_script(const char* script) {
349 for (int i = 0; i < script_table_size_used; ++i) {
350 if (strcmp(script, script_table[i]) == 0)
351 return i;
352 }
353 if (script_table_size_reserved == 0) {
354 script_table_size_reserved = 8;
355 script_table = new char*[script_table_size_reserved];
356 }
357 if (script_table_size_used + 1 >= script_table_size_reserved) {
358 char** new_script_table = new char*[script_table_size_reserved * 2];
359 memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
360 delete[] script_table;
361 script_table = new_script_table;
362 script_table_size_reserved = 2 * script_table_size_reserved;
363 }
364 script_table[script_table_size_used] = new char[strlen(script) + 1];
365 strcpy(script_table[script_table_size_used], script);
366 return script_table_size_used++;
367 }
368
parse_from_string(const char * string)369 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
370 const char *ptr = string;
371 int len = strlen(string);
372 if (len < kMinLen || *ptr != kSeparator) {
373 return NULL; // this string can not represent a fragment
374 }
375 ptr++; // move to the next character
376 int step = 0;
377 while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
378 step += UNICHAR::utf8_step(ptr + step);
379 }
380 if (step == 0 || step > UNICHAR_LEN) {
381 return NULL; // no character for unichar or the character is too long
382 }
383 char unichar[UNICHAR_LEN + 1];
384 strncpy(unichar, ptr, step);
385 unichar[step] = '\0'; // null terminate unichar
386 ptr += step; // move to the next fragment separator
387 int pos = 0;
388 int total = 0;
389 char *end_ptr = NULL;
390 for (int i = 0; i < 2; i++) {
391 if (ptr > string + len || *ptr != kSeparator) {
392 return NULL; // failed to parse fragment representation
393 }
394 ptr++; // move to the next character
395 i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
396 : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
397 ptr = end_ptr;
398 }
399 if (ptr != string + len) {
400 return NULL; // malformed fragment representation
401 }
402 CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
403 fragment->set_all(unichar, pos, total);
404 return fragment;
405 }
406
get_script_id_from_name(const char * script_name) const407 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
408 for (int i = 0; i < script_table_size_used; ++i) {
409 if (strcmp(script_name, script_table[i]) == 0)
410 return i;
411 }
412 return 0; // 0 is always the null_script
413 }
414