• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        unicharset.h
3 // Description: Unicode character/ligature set class.
4 // Author:      Thomas Kielbus
5 // Created:     Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
21 #define TESSERACT_CCUTIL_UNICHARSET_H__
22 
23 #include "assert.h"
24 #include "strngs.h"
25 #include "unichar.h"
26 #include "unicharmap.h"
27 #include "varable.h"
28 
29 class CHAR_FRAGMENT {
30  public:
31   // Minimum number of characters used for fragment representation.
32   static const int kMinLen = 6;
33   // Maximum number of characters used for fragment representation.
34   static const int kMaxLen = 3 + UNICHAR_LEN + 2;
35   // Special character used in representing character fragments.
36   static const char kSeparator = '|';
37   // Maximum number of fragments per character.
38   static const int kMaxChunks = 3;
39 
40   // Setters and Getters.
set_all(const char * unichar,int pos,int total)41   inline void set_all(const char *unichar, int pos, int total) {
42     this->set_unichar(unichar);
43     this->set_pos(pos);
44     this->set_total(total);
45   }
set_unichar(const char * uch)46   inline void set_unichar(const char *uch) {
47     strncpy(this->unichar, uch, UNICHAR_LEN);
48     this->unichar[UNICHAR_LEN] = '\0';
49   }
set_pos(int p)50   inline void set_pos(int p) { this->pos = p; }
set_total(int t)51   inline void set_total(int t) { this->total = t; }
get_unichar()52   inline const char* get_unichar() const { return this->unichar; }
get_pos()53   inline int get_pos() const { return this->pos; }
get_total()54   inline int get_total() const { return this->total; }
55 
56   // Returns the string that represents a fragment
57   // with the given unichar, pos and total.
to_string(const char * unichar,int pos,int total)58   static STRING to_string(const char *unichar, int pos, int total) {
59     STRING result = "";
60     result += kSeparator;
61     result += unichar;
62     char buffer[kMaxLen];
63     snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
64     result += buffer;
65     return result;
66   }
67   // Returns the string that represents this fragment.
to_string()68   STRING to_string() const {
69     return to_string(this->unichar, this->pos, this->total);
70   }
71 
72   // Checks whether a fragment has the same unichar,
73   // position and total as the given inputs.
equals(const char * other_unichar,int other_pos,int other_total)74   inline bool equals(const char *other_unichar,
75                      int other_pos, int other_total) const {
76     return (strcmp(this->unichar, other_unichar) == 0 &&
77             this->pos == other_pos && this->total == other_total);
78   }
equals(const CHAR_FRAGMENT * other)79   inline bool equals(const CHAR_FRAGMENT *other) const {
80     return this->equals(other->get_unichar(),
81                         other->get_pos(),
82                         other->get_total());
83   }
84 
85   // Checks whether a given fragment is a continuation of this fragment.
86   // Assumes that the given fragment pointer is not NULL.
is_continuation_of(const CHAR_FRAGMENT * fragment)87   inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
88     return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
89             this->total == fragment->get_total() &&
90             this->pos == fragment->get_pos() + 1);
91   }
92 
93   // Returns true if this fragment is a beginning fragment.
is_beginning()94   inline bool is_beginning() const { return this->pos == 0; }
95 
96   // Returns true if this fragment is an ending fragment.
is_ending()97   inline bool is_ending() const { return this->pos == this->total-1; }
98 
99   // Parses the string to see whether it represents a character fragment
100   // (rather than a regular character). If so, allocates memory for a new
101   // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
102   // information. Fragments are of the form:
103   // |m|1|2, meaning chunk 1 of 2 of character m.
104   //
105   // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
106   // instance, otherwise (if the string does not represent a fragment or it
107   // looks like it does, but parsing it as a fragment fails) returns NULL.
108   //
109   // Note: The caller is responsible for deallocating memory
110   // associated with the returned pointer.
111   static CHAR_FRAGMENT *parse_from_string(const char *str);
112 
113  private:
114   char unichar[UNICHAR_LEN + 1];
115   inT16 pos;    // fragment position in the character
116   inT16 total;  // total number of fragments in the character
117 };
118 
119 // The UNICHARSET class is an utility class for Tesseract that holds the
120 // set of characters that are used by the engine. Each character is identified
121 // by a unique number, from 0 to (size - 1).
122 class UNICHARSET {
123  public:
124   // Create an empty UNICHARSET
125   UNICHARSET();
126 
127   ~UNICHARSET();
128 
129   // Return the UNICHAR_ID of a given unichar representation within the
130   // UNICHARSET.
131   const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
132 
133   // Return the UNICHAR_ID of a given unichar representation within the
134   // UNICHARSET. Only the first length characters from unichar_repr are used.
135   const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
136                                  int length) const;
137 
138   // Return the minimum number of bytes that matches a legal UNICHAR_ID,
139   // while leaving a legal UNICHAR_ID afterwards. In other words, if there
140   // is both a short and a long match to the string, return the length that
141   // ensures there is a legal match after it.
142   int step(const char* str) const;
143 
144   // Return the unichar representation corresponding to the given UNICHAR_ID
145   // within the UNICHARSET.
146   const char* const id_to_unichar(UNICHAR_ID id) const;
147 
148   // Return a STRING that reformats the utf8 str into the str followed
149   // by its hex unicodes.
150   static STRING debug_utf8_str(const char* str);
151 
152   // Return a STRING containing debug information on the unichar, including
153   // the id_to_unichar, its hex unicodes and the properties.
154   STRING debug_str(UNICHAR_ID id) const;
debug_str(const char * unichar_repr)155   STRING debug_str(const char * unichar_repr) const {
156     return debug_str(unichar_to_id(unichar_repr));
157   }
158 
159   // Add a unichar representation to the set.
160   void unichar_insert(const char* const unichar_repr);
161 
162   // Return true if the given unichar id exists within the set.
163   // Relies on the fact that unichar ids are contiguous in the unicharset.
contains_unichar_id(UNICHAR_ID unichar_id)164   bool contains_unichar_id(UNICHAR_ID unichar_id) const {
165     return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used;
166   }
167 
168   // Return true if the given unichar representation exists within the set.
169   bool contains_unichar(const char* const unichar_repr) const;
170   bool contains_unichar(const char* const unichar_repr, int length) const;
171 
172   // Return true if the given unichar representation corresponds to the given
173   // UNICHAR_ID within the set.
174   bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
175 
176   // Delete CHAR_FRAGMENTs stored in properties of unichars array.
delete_pointers_in_unichars()177   void delete_pointers_in_unichars() {
178     for (int i = 0; i < size_used; ++i) {
179       if (unichars[i].properties.fragment != NULL) {
180         delete unichars[i].properties.fragment;
181         unichars[i].properties.fragment = NULL;
182       }
183     }
184   }
185 
186   // Clear the UNICHARSET (all the previous data is lost).
clear()187   void clear() {
188     if (size_reserved > 0) {
189       for (int i = 0; i < script_table_size_used; ++i)
190         delete[] script_table[i];
191       delete[] script_table;
192       script_table = 0;
193       script_table_size_reserved = 0;
194       script_table_size_used = 0;
195       delete_pointers_in_unichars();
196       delete[] unichars;
197       unichars = 0;
198       size_reserved = 0;
199       size_used = 0;
200     }
201     ids.clear();
202   }
203 
204   // Return the size of the set (the number of different UNICHAR it holds).
size()205   int size() const {
206     return size_used;
207   }
208 
209   // Reserve enough memory space for the given number of UNICHARS
210   void reserve(int unichars_number);
211 
212   // Opens the file indicated by filename and saves unicharset to that file.
213   // Returns true if the operation is successful.
save_to_file(const char * const filename)214   bool save_to_file(const char * const filename) const {
215     FILE* file = fopen(filename, "w+");
216     if (file == NULL) return false;
217     bool result = save_to_file(file);
218     fclose(file);
219     return result;
220   }
221 
222   // Saves the content of the UNICHARSET to the given file.
223   // Returns true if the operation is successful.
224   bool save_to_file(FILE *file) const;
225 
226   // Opens the file indicated by filename and loads the UNICHARSET
227   // from the given file. The previous data is lost.
228   // Returns true if the operation is successful.
load_from_file(const char * const filename)229   bool load_from_file(const char* const filename) {
230     FILE* file = fopen(filename, "r");
231     if (file == NULL) return false;
232     bool result = load_from_file(file);
233     fclose(file);
234     return result;
235   }
236 
237   // Loads the UNICHARSET from the given file. The previous data is lost.
238   // Returns true if the operation is successful.
239   bool load_from_file(FILE *file);
240 
241   // Set a whitelist and/or blacklist of characters to recognize.
242   // An empty or NULL whitelist enables everything (minus any blacklist).
243   // An empty or NULL blacklist disables nothing.
244   // The blacklist overrides the whitelist.
245   // Each list is a string of utf8 character strings. Boundaries between
246   // unicharset units are worked out automatically, and characters not in
247   // the unicharset are silently ignored.
248   void set_black_and_whitelist(const char* blacklist, const char* whitelist);
249 
250   // Set the isalpha property of the given unichar to the given value.
set_isalpha(UNICHAR_ID unichar_id,bool value)251   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
252     unichars[unichar_id].properties.isalpha = value;
253   }
254 
255   // Set the islower property of the given unichar to the given value.
set_islower(UNICHAR_ID unichar_id,bool value)256   void set_islower(UNICHAR_ID unichar_id, bool value) {
257     unichars[unichar_id].properties.islower = value;
258   }
259 
260   // Set the isupper property of the given unichar to the given value.
set_isupper(UNICHAR_ID unichar_id,bool value)261   void set_isupper(UNICHAR_ID unichar_id, bool value) {
262     unichars[unichar_id].properties.isupper = value;
263   }
264 
265   // Set the isdigit property of the given unichar to the given value.
set_isdigit(UNICHAR_ID unichar_id,bool value)266   void set_isdigit(UNICHAR_ID unichar_id, bool value) {
267     unichars[unichar_id].properties.isdigit = value;
268   }
269 
270   // Set the ispunctuation property of the given unichar to the given value.
set_ispunctuation(UNICHAR_ID unichar_id,bool value)271   void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
272     unichars[unichar_id].properties.ispunctuation = value;
273   }
274 
275   // Set the isngram property of the given unichar to the given value.
set_isngram(UNICHAR_ID unichar_id,bool value)276   void set_isngram(UNICHAR_ID unichar_id, bool value) {
277     unichars[unichar_id].properties.isngram = value;
278   }
279 
280   // Set the script name of the given unichar to the given value.
281   // Value is copied and thus can be a temporary;
set_script(UNICHAR_ID unichar_id,const char * value)282   void set_script(UNICHAR_ID unichar_id, const char* value) {
283     unichars[unichar_id].properties.script_id = add_script(value);
284   }
285 
286   // Set other_case unichar id in the properties for the given unichar id.
set_other_case(UNICHAR_ID unichar_id,UNICHAR_ID other_case)287   void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
288     unichars[unichar_id].properties.other_case = other_case;
289   }
290 
291   // Return the isalpha property of the given unichar.
get_isalpha(UNICHAR_ID unichar_id)292   bool get_isalpha(UNICHAR_ID unichar_id) const {
293     return unichars[unichar_id].properties.isalpha;
294   }
295 
296   // Return the islower property of the given unichar.
get_islower(UNICHAR_ID unichar_id)297   bool get_islower(UNICHAR_ID unichar_id) const {
298     return unichars[unichar_id].properties.islower;
299   }
300 
301   // Return the isupper property of the given unichar.
get_isupper(UNICHAR_ID unichar_id)302   bool get_isupper(UNICHAR_ID unichar_id) const {
303     return unichars[unichar_id].properties.isupper;
304   }
305 
306   // Return the isdigit property of the given unichar.
get_isdigit(UNICHAR_ID unichar_id)307   bool get_isdigit(UNICHAR_ID unichar_id) const {
308     return unichars[unichar_id].properties.isdigit;
309   }
310 
311   // Return the ispunctuation property of the given unichar.
get_ispunctuation(UNICHAR_ID unichar_id)312   bool get_ispunctuation(UNICHAR_ID unichar_id) const {
313     return unichars[unichar_id].properties.ispunctuation;
314   }
315 
316   // Return the isngram property of the given unichar.
get_isngram(UNICHAR_ID unichar_id)317   bool get_isngram(UNICHAR_ID unichar_id) const {
318     return unichars[unichar_id].properties.isngram;
319   }
320 
321   // Return the script name of the given unichar.
322   // The returned pointer will always be the same for the same script, it's
323   // managed by unicharset and thus MUST NOT be deleted
get_script(UNICHAR_ID unichar_id)324   int get_script(UNICHAR_ID unichar_id) const {
325     return unichars[unichar_id].properties.script_id;
326   }
327 
328   // Get other_case unichar id in the properties for the given unichar id.
get_other_case(UNICHAR_ID unichar_id)329   UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
330     return unichars[unichar_id].properties.other_case;
331   }
332 
333   // Returns UNICHAR_ID of the corresponding lower-case unichar.
to_lower(UNICHAR_ID unichar_id)334   UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
335     if (unichars[unichar_id].properties.islower) return unichar_id;
336     return unichars[unichar_id].properties.other_case;
337   }
338 
339   // Returns UNICHAR_ID of the corresponding upper-case unichar.
to_upper(UNICHAR_ID unichar_id)340   UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
341     if (unichars[unichar_id].properties.isupper) return unichar_id;
342     return unichars[unichar_id].properties.other_case;
343   }
344 
345   // Return a pointer to the CHAR_FRAGMENT class if the given
346   // unichar id represents a character fragment.
get_fragment(UNICHAR_ID unichar_id)347   const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
348     return unichars[unichar_id].properties.fragment;
349   }
350 
351   // Return the isalpha property of the given unichar representation.
get_isalpha(const char * const unichar_repr)352   bool get_isalpha(const char* const unichar_repr) const {
353     return get_isalpha(unichar_to_id(unichar_repr));
354   }
355 
356   // Return the islower property of the given unichar representation.
get_islower(const char * const unichar_repr)357   bool get_islower(const char* const unichar_repr) const {
358     return get_islower(unichar_to_id(unichar_repr));
359   }
360 
361   // Return the isupper property of the given unichar representation.
get_isupper(const char * const unichar_repr)362   bool get_isupper(const char* const unichar_repr) const {
363     return get_isupper(unichar_to_id(unichar_repr));
364   }
365 
366   // Return the isdigit property of the given unichar representation.
get_isdigit(const char * const unichar_repr)367   bool get_isdigit(const char* const unichar_repr) const {
368     return get_isdigit(unichar_to_id(unichar_repr));
369   }
370 
371   // Return the ispunctuation property of the given unichar representation.
get_ispunctuation(const char * const unichar_repr)372   bool get_ispunctuation(const char* const unichar_repr) const {
373     return get_ispunctuation(unichar_to_id(unichar_repr));
374   }
375 
376   // Return the script name of the given unichar representation.
377   // The returned pointer will always be the same for the same script, it's
378   // managed by unicharset and thus MUST NOT be deleted
get_script(const char * const unichar_repr)379   int get_script(const char* const unichar_repr) const {
380     return get_script(unichar_to_id(unichar_repr));
381   }
382 
383   // Return a pointer to the CHAR_FRAGMENT class struct if the given
384   // unichar representation represents a character fragment.
get_fragment(const char * const unichar_repr)385   const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
386     if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
387         !ids.contains(unichar_repr)) {
388       return NULL;
389     }
390     return get_fragment(unichar_to_id(unichar_repr));
391   }
392 
393   // Return the isalpha property of the given unichar representation.
394   // Only the first length characters from unichar_repr are used.
get_isalpha(const char * const unichar_repr,int length)395   bool get_isalpha(const char* const unichar_repr,
396                int length) const {
397     return get_isalpha(unichar_to_id(unichar_repr, length));
398   }
399 
400   // Return the islower property of the given unichar representation.
401   // Only the first length characters from unichar_repr are used.
get_islower(const char * const unichar_repr,int length)402   bool get_islower(const char* const unichar_repr,
403                int length) const {
404     return get_islower(unichar_to_id(unichar_repr, length));
405   }
406 
407   // Return the isupper property of the given unichar representation.
408   // Only the first length characters from unichar_repr are used.
get_isupper(const char * const unichar_repr,int length)409   bool get_isupper(const char* const unichar_repr,
410                int length) const {
411     return get_isupper(unichar_to_id(unichar_repr, length));
412   }
413 
414   // Return the isdigit property of the given unichar representation.
415   // Only the first length characters from unichar_repr are used.
get_isdigit(const char * const unichar_repr,int length)416   bool get_isdigit(const char* const unichar_repr,
417                int length) const {
418     return get_isdigit(unichar_to_id(unichar_repr, length));
419   }
420 
421   // Return the ispunctuation property of the given unichar representation.
422   // Only the first length characters from unichar_repr are used.
get_ispunctuation(const char * const unichar_repr,int length)423   bool get_ispunctuation(const char* const unichar_repr,
424                           int length) const {
425     return get_ispunctuation(unichar_to_id(unichar_repr, length));
426   }
427 
428   // Return the script name of the given unichar representation.
429   // Only the first length characters from unichar_repr are used.
430   // The returned pointer will always be the same for the same script, it's
431   // managed by unicharset and thus MUST NOT be deleted
get_script(const char * const unichar_repr,int length)432   int get_script(const char* const unichar_repr,
433                int length) const {
434     return get_script(unichar_to_id(unichar_repr, length));
435   }
436 
437   // Return the (current) number of scripts in the script table
get_script_table_size()438   int get_script_table_size() const {
439     return script_table_size_used;
440   }
441 
442   // Return the script string from its id
get_script_from_script_id(int id)443   const char* get_script_from_script_id(int id) const {
444     if (id >= script_table_size_used || id < 0)
445       return null_script;
446     return script_table[id];
447   }
448 
449   // Returns the id from the name of the script, or 0 if script is not found.
450   // Note that this is an expensive operation since it involves iteratively
451   // comparing strings in the script table.  To avoid dependency on STL, we
452   // won't use a hash.  Instead, the calling function can use this to lookup
453   // and save the ID for relevant scripts for fast comparisons later.
454   int get_script_id_from_name(const char* script_name) const;
455 
456   // Return true if the given script is the null script
is_null_script(const char * script)457   bool is_null_script(const char* script) const {
458     return script == null_script;
459   }
460 
461   // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
462   // then the returned pointer will be the same.
463   // The script parameter is copied and thus can be a temporary.
464   int add_script(const char* script);
465 
466   // Return the enabled property of the given unichar.
get_enabled(UNICHAR_ID unichar_id)467   bool get_enabled(UNICHAR_ID unichar_id) const {
468     return unichars[unichar_id].properties.enabled;
469   }
470 
471 
null_sid()472   int null_sid() const { return null_sid_; }
common_sid()473   int common_sid() const { return common_sid_; }
latin_sid()474   int latin_sid() const { return latin_sid_; }
cyrillic_sid()475   int cyrillic_sid() const { return cyrillic_sid_; }
greek_sid()476   int greek_sid() const { return greek_sid_; }
han_sid()477   int han_sid() const { return han_sid_; }
478 
479  private:
480 
481   struct UNICHAR_PROPERTIES {
482     bool  isalpha;
483     bool  islower;
484     bool  isupper;
485     bool  isdigit;
486     bool  ispunctuation;
487     bool  isngram;
488     bool  enabled;
489     int   script_id;
490     UNICHAR_ID other_case;  // id of the corresponding upper/lower case unichar
491 
492     // Contains meta information about the fragment if a unichar represents
493     // a fragment of a character, otherwise should be set to NULL.
494     // It is assumed that character fragments are added to the unicharset
495     // after the corresponding 'base' characters.
496     CHAR_FRAGMENT *fragment;
497   };
498 
499   struct UNICHAR_SLOT {
500     char representation[UNICHAR_LEN + 1];
501     UNICHAR_PROPERTIES properties;
502   };
503 
504   UNICHAR_SLOT* unichars;
505   UNICHARMAP ids;
506   int size_used;
507   int size_reserved;
508   char** script_table;
509   int script_table_size_used;
510   int script_table_size_reserved;
511   const char* null_script;
512 
513   // A few convenient script name-to-id mapping without using hash.
514   // These are initialized when unicharset file is loaded.  Anything
515   // missing from this list can be looked up using get_script_id_from_name.
516   int null_sid_;
517   int common_sid_;
518   int latin_sid_;
519   int cyrillic_sid_;
520   int greek_sid_;
521   int han_sid_;
522 };
523 
524 #endif  // TESSERACT_CCUTIL_UNICHARSET_H__
525