• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 ///////////////////////////////////////////////////////////////////////
3 // File:        unicharset.cpp
4 // Description: Unicode character/ligature set class.
5 // Author:      Thomas Kielbus
6 // Created:     Wed Jun 28 17:05:01 PDT 2006
7 //
8 // (C) Copyright 2006, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
19 ///////////////////////////////////////////////////////////////////////
20 
21 #include <assert.h>
22 #include <stdio.h>
23 #include <string.h>
24 
25 #include "tprintf.h"
26 #include "unichar.h"
27 #include "unicharset.h"
28 #include "varable.h"
29 
// Bit flags used to pack the boolean character properties into the single
// hexadecimal properties field of a saved unicharset line.
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
static const int ISPUNCTUATION_MASK = 1 << 4;
35 
UNICHARSET()36 UNICHARSET::UNICHARSET() :
37     unichars(NULL),
38     ids(),
39     size_used(0),
40     size_reserved(0),
41     script_table(0),
42     script_table_size_used(0),
43     script_table_size_reserved(0),
44     null_script("NULL"),
45     null_sid_(0),
46     common_sid_(0),
47     latin_sid_(0),
48     cyrillic_sid_(0),
49     greek_sid_(0),
50     han_sid_(0) {}
51 
~UNICHARSET()52 UNICHARSET::~UNICHARSET() {
53   if (size_reserved > 0) {
54     for (int i = 0; i < script_table_size_used; ++i)
55       delete[] script_table[i];
56     delete[] script_table;
57     delete_pointers_in_unichars();
58     delete[] unichars;
59   }
60 }
61 
reserve(int unichars_number)62 void UNICHARSET::reserve(int unichars_number) {
63   if (unichars_number > size_reserved) {
64     UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
65     for (int i = 0; i < size_used; ++i)
66       memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
67     for (int j = size_used; j < unichars_number; ++j) {
68       unichars_new[j].properties.script_id = add_script(null_script);
69       unichars_new[j].properties.fragment = NULL;
70     }
71     delete[] unichars;
72     unichars = unichars_new;
73     size_reserved = unichars_number;
74   }
75 }
76 
77 const UNICHAR_ID
unichar_to_id(const char * const unichar_repr) const78 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
79   return ids.contains(unichar_repr) ?
80     ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
81 }
82 
unichar_to_id(const char * const unichar_repr,int length) const83 const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
84                                            int length) const {
85   assert(length > 0 && length <= UNICHAR_LEN);
86   return ids.contains(unichar_repr, length) ?
87     ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
88 }
89 
90 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
91 // while leaving a legal UNICHAR_ID afterwards. In other words, if there
92 // is both a short and a long match to the string, return the length that
93 // ensures there is a legal match after it.
step(const char * str) const94 int UNICHARSET::step(const char* str) const {
95   // Find the length of the first matching unicharset member.
96   int minlength = ids.minmatch(str);
97   if (minlength == 0)
98     return 0;  // Empty string or illegal char.
99 
100   int goodlength = minlength;
101   while (goodlength <= UNICHAR_LEN) {
102     if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
103       return goodlength;  // This length works!
104     // The next char is illegal so find the next usable length.
105     do {
106       ++goodlength;
107     } while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
108              !ids.contains(str, goodlength));
109   }
110   // Search to find a subsequent legal char failed so return the minlength.
111   return minlength;
112 }
113 
id_to_unichar(UNICHAR_ID id) const114 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
115   if (id == INVALID_UNICHAR_ID) {
116     return INVALID_UNICHAR;
117   }
118   assert(id < this->size());
119   return unichars[id].representation;
120 }
121 
122 // Return a STRING that reformats the utf8 str into the str followed
123 // by its hex unicodes.
debug_utf8_str(const char * str)124 STRING UNICHARSET::debug_utf8_str(const char* str) {
125   STRING result = str;
126   result += " [";
127   int step = 1;
128   // Chop into unicodes and code each as hex.
129   for (int i = 0; str[i] != '\0'; i += step) {
130     char hex[sizeof(int) * 2 + 1];
131     step = UNICHAR::utf8_step(str + i);
132     if (step == 0) {
133       step = 1;
134       sprintf(hex, "%x", str[i]);
135     } else {
136       UNICHAR ch(str + i, step);
137       sprintf(hex, "%x", ch.first_uni());
138     }
139     result += hex;
140     result += " ";
141   }
142   result += "]";
143   return result;
144 }
145 
146 // Return a STRING containing debug information on the unichar, including
147 // the id_to_unichar, its hex unicodes and the properties.
debug_str(UNICHAR_ID id) const148 STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
149   const CHAR_FRAGMENT *fragment = this->get_fragment(id);
150   if (fragment) {
151     STRING base = debug_str(fragment->get_unichar());
152     return CHAR_FRAGMENT::to_string(base.string(), fragment->get_pos(),
153                                     fragment->get_total());
154   }
155   const char* str = id_to_unichar(id);
156   if (id == INVALID_UNICHAR_ID) return STRING(str);
157   STRING result = debug_utf8_str(str);
158   // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
159   if (get_isalpha(id)) {
160     if (get_islower(id))
161       result += "a";
162     else if (get_isupper(id))
163       result += "A";
164     else
165       result += "x";
166   }
167   // Append 0 if a digit.
168   if (get_isdigit(id)) {
169     result += "0";
170   }
171   // Append p is a punctuation symbol.
172   if (get_ispunctuation(id)) {
173     result += "p";
174   }
175   return result;
176 }
177 
178 
179 
unichar_insert(const char * const unichar_repr)180 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
181   if (!ids.contains(unichar_repr)) {
182     if (strlen(unichar_repr) > UNICHAR_LEN) {
183       fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
184               int(strlen(unichar_repr)), unichar_repr);
185       return;
186     }
187     if (size_used == size_reserved) {
188       if (size_used == 0)
189         reserve(8);
190       else
191         reserve(2 * size_used);
192     }
193 
194     strcpy(unichars[size_used].representation, unichar_repr);
195     this->set_isalpha(size_used, false);
196     this->set_islower(size_used, false);
197     this->set_isupper(size_used, false);
198     this->set_isdigit(size_used, false);
199     this->set_ispunctuation(size_used, false);
200     this->set_isngram(size_used, false);
201     this->set_script(size_used, null_script);
202     // If the given unichar_repr represents a fragmented character, set
203     // fragment property to a pointer to CHAR_FRAGMENT class instance with
204     // information parsed from the unichar representation. Use the script
205     // of the base unichar for the fragmented character if possible.
206     CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
207     this->unichars[size_used].properties.fragment = frag;
208     if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
209       this->unichars[size_used].properties.script_id =
210         this->get_script(frag->get_unichar());
211     }
212     this->unichars[size_used].properties.enabled = true;
213     ids.insert(unichar_repr, size_used);
214     ++size_used;
215   }
216 }
217 
contains_unichar(const char * const unichar_repr) const218 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
219   return ids.contains(unichar_repr);
220 }
221 
contains_unichar(const char * const unichar_repr,int length) const222 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
223                                   int length) const {
224   if (length == 0) {
225     return false;
226   }
227   return ids.contains(unichar_repr, length);
228 }
229 
eq(UNICHAR_ID unichar_id,const char * const unichar_repr) const230 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
231                     const char* const unichar_repr) const {
232   return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
233 }
234 
save_to_file(FILE * file) const235 bool UNICHARSET::save_to_file(FILE *file) const {
236   fprintf(file, "%d\n", this->size());
237   for (UNICHAR_ID id = 0; id < this->size(); ++id) {
238     unsigned int properties = 0;
239 
240     if (this->get_isalpha(id))
241       properties |= ISALPHA_MASK;
242     if (this->get_islower(id))
243       properties |= ISLOWER_MASK;
244     if (this->get_isupper(id))
245       properties |= ISUPPER_MASK;
246     if (this->get_isdigit(id))
247       properties |= ISDIGIT_MASK;
248     if (this->get_ispunctuation(id))
249       properties |= ISPUNCTUATION_MASK;
250 
251     if (strcmp(this->id_to_unichar(id), " ") == 0)
252       fprintf(file, "%s %x %s %d\n", "NULL", properties,
253               this->get_script_from_script_id(this->get_script(id)),
254               this->get_other_case(id));
255     else
256       fprintf(file, "%s %x %s %d\n", this->id_to_unichar(id), properties,
257               this->get_script_from_script_id(this->get_script(id)),
258               this->get_other_case(id));
259   }
260   return true;
261 }
262 
load_from_file(FILE * file)263 bool UNICHARSET::load_from_file(FILE *file) {
264   int unicharset_size;
265   char buffer[256];
266 
267   this->clear();
268   if (fgets(buffer, sizeof (buffer), file) == NULL ||
269       sscanf(buffer, "%d", &unicharset_size) != 1) {
270     return false;
271   }
272   this->reserve(unicharset_size);
273   for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
274     char unichar[256];
275     unsigned int properties;
276     char script[64];
277 
278     strcpy(script, null_script);
279     this->unichars[id].properties.other_case = id;
280     if (fgets(buffer, sizeof (buffer), file) == NULL ||
281         (sscanf(buffer, "%s %x %63s %d", unichar, &properties,
282                 script, &(this->unichars[id].properties.other_case)) != 4 &&
283          sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
284          sscanf(buffer, "%s %x", unichar, &properties) != 2)) {
285       return false;
286     }
287     if (strcmp(unichar, "NULL") == 0)
288       this->unichar_insert(" ");
289     else
290       this->unichar_insert(unichar);
291 
292     this->set_isalpha(id, (properties & ISALPHA_MASK) != 0);
293     this->set_islower(id, (properties & ISLOWER_MASK) != 0);
294     this->set_isupper(id, (properties & ISUPPER_MASK) != 0);
295     this->set_isdigit(id, (properties & ISDIGIT_MASK) != 0);
296     this->set_ispunctuation(id, (properties & ISPUNCTUATION_MASK) != 0);
297     this->set_isngram(id, false);
298     this->set_script(id, script);
299     this->unichars[id].properties.enabled = true;
300   }
301 
302   null_sid_ = get_script_id_from_name(null_script);
303   ASSERT_HOST(null_sid_ == 0);
304   common_sid_ = get_script_id_from_name("Common");
305   latin_sid_ = get_script_id_from_name("Latin");
306   cyrillic_sid_ = get_script_id_from_name("Cyrillic");
307   greek_sid_ = get_script_id_from_name("Greek");
308   han_sid_ = get_script_id_from_name("Han");
309   return true;
310 }
311 
312 // Set a whitelist and/or blacklist of characters to recognize.
313 // An empty or NULL whitelist enables everything (minus any blacklist).
314 // An empty or NULL blacklist disables nothing.
set_black_and_whitelist(const char * blacklist,const char * whitelist)315 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
316                                          const char* whitelist) {
317   bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
318   // Set everything to default
319   for (int ch = 0; ch < size_used; ++ch)
320     unichars[ch].properties.enabled = def_enabled;
321   int ch_step;
322   if (!def_enabled) {
323     // Enable the whitelist.
324     for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
325       ch_step = step(whitelist + w_ind);
326       if (ch_step > 0) {
327         UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
328         unichars[u_id].properties.enabled = true;
329       } else {
330         ch_step = 1;
331       }
332     }
333   }
334   if (blacklist != NULL && blacklist[0] != '\0') {
335     // Disable the blacklist.
336     for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
337       ch_step = step(blacklist + b_ind);
338       if (ch_step > 0) {
339         UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
340         unichars[u_id].properties.enabled = false;
341       } else {
342         ch_step = 1;
343       }
344     }
345   }
346 }
347 
add_script(const char * script)348 int UNICHARSET::add_script(const char* script) {
349   for (int i = 0; i < script_table_size_used; ++i) {
350     if (strcmp(script, script_table[i]) == 0)
351       return i;
352   }
353   if (script_table_size_reserved == 0) {
354     script_table_size_reserved = 8;
355     script_table = new char*[script_table_size_reserved];
356   }
357   if (script_table_size_used + 1 >= script_table_size_reserved) {
358     char** new_script_table = new char*[script_table_size_reserved * 2];
359     memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
360     delete[] script_table;
361     script_table = new_script_table;
362       script_table_size_reserved = 2 * script_table_size_reserved;
363   }
364   script_table[script_table_size_used] = new char[strlen(script) + 1];
365   strcpy(script_table[script_table_size_used], script);
366   return script_table_size_used++;
367 }
368 
parse_from_string(const char * string)369 CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
370   const char *ptr = string;
371   int len = strlen(string);
372   if (len < kMinLen || *ptr != kSeparator) {
373     return NULL;  // this string can not represent a fragment
374   }
375   ptr++;  // move to the next character
376   int step = 0;
377   while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
378     step += UNICHAR::utf8_step(ptr + step);
379   }
380   if (step == 0 || step > UNICHAR_LEN) {
381     return NULL;  // no character for unichar or the character is too long
382   }
383   char unichar[UNICHAR_LEN + 1];
384   strncpy(unichar, ptr, step);
385   unichar[step] = '\0';  // null terminate unichar
386   ptr += step;  // move to the next fragment separator
387   int pos = 0;
388   int total = 0;
389   char *end_ptr = NULL;
390   for (int i = 0; i < 2; i++) {
391     if (ptr > string + len || *ptr != kSeparator) {
392       return NULL;  // failed to parse fragment representation
393     }
394     ptr++;  // move to the next character
395     i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
396       : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
397     ptr = end_ptr;
398   }
399   if (ptr != string + len) {
400     return NULL;  // malformed fragment representation
401   }
402   CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
403   fragment->set_all(unichar, pos, total);
404   return fragment;
405 }
406 
get_script_id_from_name(const char * script_name) const407 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
408   for (int i = 0; i < script_table_size_used; ++i) {
409     if (strcmp(script_name, script_table[i]) == 0)
410       return i;
411   }
412   return 0;  // 0 is always the null_script
413 }
414