1 /////////////////////////////////////////////////////////////////////// 2 // File: unicharset.h 3 // Description: Unicode character/ligature set class. 4 // Author: Thomas Kielbus 5 // Created: Wed Jun 28 17:05:01 PDT 2006 6 // 7 // (C) Copyright 2006, Google Inc. 8 // Licensed under the Apache License, Version 2.0 (the "License"); 9 // you may not use this file except in compliance with the License. 10 // You may obtain a copy of the License at 11 // http://www.apache.org/licenses/LICENSE-2.0 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 // 18 /////////////////////////////////////////////////////////////////////// 19 20 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__ 21 #define TESSERACT_CCUTIL_UNICHARSET_H__ 22 23 #include "assert.h" 24 #include "strngs.h" 25 #include "unichar.h" 26 #include "unicharmap.h" 27 #include "varable.h" 28 29 class CHAR_FRAGMENT { 30 public: 31 // Minimum number of characters used for fragment representation. 32 static const int kMinLen = 6; 33 // Maximum number of characters used for fragment representation. 34 static const int kMaxLen = 3 + UNICHAR_LEN + 2; 35 // Special character used in representing character fragments. 36 static const char kSeparator = '|'; 37 // Maximum number of fragments per character. 38 static const int kMaxChunks = 3; 39 40 // Setters and Getters. set_all(const char * unichar,int pos,int total)41 inline void set_all(const char *unichar, int pos, int total) { 42 this->set_unichar(unichar); 43 this->set_pos(pos); 44 this->set_total(total); 45 } set_unichar(const char * uch)46 inline void set_unichar(const char *uch) { 47 strncpy(this->unichar, uch, UNICHAR_LEN); 48 this->unichar[UNICHAR_LEN] = '\0'; 49 } set_pos(int p)50 inline void set_pos(int p) { this->pos = p; } set_total(int t)51 inline void set_total(int t) { this->total = t; } get_unichar()52 inline const char* get_unichar() const { return this->unichar; } get_pos()53 inline int get_pos() const { return this->pos; } get_total()54 inline int get_total() const { return this->total; } 55 56 // Returns the string that represents a fragment 57 // with the given unichar, pos and total. to_string(const char * unichar,int pos,int total)58 static STRING to_string(const char *unichar, int pos, int total) { 59 STRING result = ""; 60 result += kSeparator; 61 result += unichar; 62 char buffer[kMaxLen]; 63 snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total); 64 result += buffer; 65 return result; 66 } 67 // Returns the string that represents this fragment. to_string()68 STRING to_string() const { 69 return to_string(this->unichar, this->pos, this->total); 70 } 71 72 // Checks whether a fragment has the same unichar, 73 // position and total as the given inputs. equals(const char * other_unichar,int other_pos,int other_total)74 inline bool equals(const char *other_unichar, 75 int other_pos, int other_total) const { 76 return (strcmp(this->unichar, other_unichar) == 0 && 77 this->pos == other_pos && this->total == other_total); 78 } equals(const CHAR_FRAGMENT * other)79 inline bool equals(const CHAR_FRAGMENT *other) const { 80 return this->equals(other->get_unichar(), 81 other->get_pos(), 82 other->get_total()); 83 } 84 85 // Checks whether a given fragment is a continuation of this fragment. 86 // Assumes that the given fragment pointer is not NULL. is_continuation_of(const CHAR_FRAGMENT * fragment)87 inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const { 88 return (strcmp(this->unichar, fragment->get_unichar()) == 0 && 89 this->total == fragment->get_total() && 90 this->pos == fragment->get_pos() + 1); 91 } 92 93 // Returns true if this fragment is a beginning fragment. is_beginning()94 inline bool is_beginning() const { return this->pos == 0; } 95 96 // Returns true if this fragment is an ending fragment. is_ending()97 inline bool is_ending() const { return this->pos == this->total-1; } 98 99 // Parses the string to see whether it represents a character fragment 100 // (rather than a regular character). If so, allocates memory for a new 101 // CHAR_FRAGMENT instance and fills it in with the corresponding fragment 102 // information. Fragments are of the form: 103 // |m|1|2, meaning chunk 1 of 2 of character m. 104 // 105 // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT 106 // instance, otherwise (if the string does not represent a fragment or it 107 // looks like it does, but parsing it as a fragment fails) returns NULL. 108 // 109 // Note: The caller is responsible for deallocating memory 110 // associated with the returned pointer. 111 static CHAR_FRAGMENT *parse_from_string(const char *str); 112 113 private: 114 char unichar[UNICHAR_LEN + 1]; 115 inT16 pos; // fragment position in the character 116 inT16 total; // total number of fragments in the character 117 }; 118 119 // The UNICHARSET class is an utility class for Tesseract that holds the 120 // set of characters that are used by the engine. Each character is identified 121 // by a unique number, from 0 to (size - 1). 122 class UNICHARSET { 123 public: 124 // Create an empty UNICHARSET 125 UNICHARSET(); 126 127 ~UNICHARSET(); 128 129 // Return the UNICHAR_ID of a given unichar representation within the 130 // UNICHARSET. 131 const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const; 132 133 // Return the UNICHAR_ID of a given unichar representation within the 134 // UNICHARSET. Only the first length characters from unichar_repr are used. 135 const UNICHAR_ID unichar_to_id(const char* const unichar_repr, 136 int length) const; 137 138 // Return the minimum number of bytes that matches a legal UNICHAR_ID, 139 // while leaving a legal UNICHAR_ID afterwards. In other words, if there 140 // is both a short and a long match to the string, return the length that 141 // ensures there is a legal match after it. 142 int step(const char* str) const; 143 144 // Return the unichar representation corresponding to the given UNICHAR_ID 145 // within the UNICHARSET. 146 const char* const id_to_unichar(UNICHAR_ID id) const; 147 148 // Return a STRING that reformats the utf8 str into the str followed 149 // by its hex unicodes. 150 static STRING debug_utf8_str(const char* str); 151 152 // Return a STRING containing debug information on the unichar, including 153 // the id_to_unichar, its hex unicodes and the properties. 154 STRING debug_str(UNICHAR_ID id) const; debug_str(const char * unichar_repr)155 STRING debug_str(const char * unichar_repr) const { 156 return debug_str(unichar_to_id(unichar_repr)); 157 } 158 159 // Add a unichar representation to the set. 160 void unichar_insert(const char* const unichar_repr); 161 162 // Return true if the given unichar id exists within the set. 163 // Relies on the fact that unichar ids are contiguous in the unicharset. contains_unichar_id(UNICHAR_ID unichar_id)164 bool contains_unichar_id(UNICHAR_ID unichar_id) const { 165 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used; 166 } 167 168 // Return true if the given unichar representation exists within the set. 169 bool contains_unichar(const char* const unichar_repr) const; 170 bool contains_unichar(const char* const unichar_repr, int length) const; 171 172 // Return true if the given unichar representation corresponds to the given 173 // UNICHAR_ID within the set. 174 bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const; 175 176 // Delete CHAR_FRAGMENTs stored in properties of unichars array. delete_pointers_in_unichars()177 void delete_pointers_in_unichars() { 178 for (int i = 0; i < size_used; ++i) { 179 if (unichars[i].properties.fragment != NULL) { 180 delete unichars[i].properties.fragment; 181 unichars[i].properties.fragment = NULL; 182 } 183 } 184 } 185 186 // Clear the UNICHARSET (all the previous data is lost). clear()187 void clear() { 188 if (size_reserved > 0) { 189 for (int i = 0; i < script_table_size_used; ++i) 190 delete[] script_table[i]; 191 delete[] script_table; 192 script_table = 0; 193 script_table_size_reserved = 0; 194 script_table_size_used = 0; 195 delete_pointers_in_unichars(); 196 delete[] unichars; 197 unichars = 0; 198 size_reserved = 0; 199 size_used = 0; 200 } 201 ids.clear(); 202 } 203 204 // Return the size of the set (the number of different UNICHAR it holds). size()205 int size() const { 206 return size_used; 207 } 208 209 // Reserve enough memory space for the given number of UNICHARS 210 void reserve(int unichars_number); 211 212 // Opens the file indicated by filename and saves unicharset to that file. 213 // Returns true if the operation is successful. save_to_file(const char * const filename)214 bool save_to_file(const char * const filename) const { 215 FILE* file = fopen(filename, "w+"); 216 if (file == NULL) return false; 217 bool result = save_to_file(file); 218 fclose(file); 219 return result; 220 } 221 222 // Saves the content of the UNICHARSET to the given file. 223 // Returns true if the operation is successful. 224 bool save_to_file(FILE *file) const; 225 226 // Opens the file indicated by filename and loads the UNICHARSET 227 // from the given file. The previous data is lost. 228 // Returns true if the operation is successful. load_from_file(const char * const filename)229 bool load_from_file(const char* const filename) { 230 FILE* file = fopen(filename, "r"); 231 if (file == NULL) return false; 232 bool result = load_from_file(file); 233 fclose(file); 234 return result; 235 } 236 237 // Loads the UNICHARSET from the given file. The previous data is lost. 238 // Returns true if the operation is successful. 239 bool load_from_file(FILE *file); 240 241 // Set a whitelist and/or blacklist of characters to recognize. 242 // An empty or NULL whitelist enables everything (minus any blacklist). 243 // An empty or NULL blacklist disables nothing. 244 // The blacklist overrides the whitelist. 245 // Each list is a string of utf8 character strings. Boundaries between 246 // unicharset units are worked out automatically, and characters not in 247 // the unicharset are silently ignored. 248 void set_black_and_whitelist(const char* blacklist, const char* whitelist); 249 250 // Set the isalpha property of the given unichar to the given value. set_isalpha(UNICHAR_ID unichar_id,bool value)251 void set_isalpha(UNICHAR_ID unichar_id, bool value) { 252 unichars[unichar_id].properties.isalpha = value; 253 } 254 255 // Set the islower property of the given unichar to the given value. set_islower(UNICHAR_ID unichar_id,bool value)256 void set_islower(UNICHAR_ID unichar_id, bool value) { 257 unichars[unichar_id].properties.islower = value; 258 } 259 260 // Set the isupper property of the given unichar to the given value. set_isupper(UNICHAR_ID unichar_id,bool value)261 void set_isupper(UNICHAR_ID unichar_id, bool value) { 262 unichars[unichar_id].properties.isupper = value; 263 } 264 265 // Set the isdigit property of the given unichar to the given value. set_isdigit(UNICHAR_ID unichar_id,bool value)266 void set_isdigit(UNICHAR_ID unichar_id, bool value) { 267 unichars[unichar_id].properties.isdigit = value; 268 } 269 270 // Set the ispunctuation property of the given unichar to the given value. set_ispunctuation(UNICHAR_ID unichar_id,bool value)271 void set_ispunctuation(UNICHAR_ID unichar_id, bool value) { 272 unichars[unichar_id].properties.ispunctuation = value; 273 } 274 275 // Set the isngram property of the given unichar to the given value. set_isngram(UNICHAR_ID unichar_id,bool value)276 void set_isngram(UNICHAR_ID unichar_id, bool value) { 277 unichars[unichar_id].properties.isngram = value; 278 } 279 280 // Set the script name of the given unichar to the given value. 281 // Value is copied and thus can be a temporary; set_script(UNICHAR_ID unichar_id,const char * value)282 void set_script(UNICHAR_ID unichar_id, const char* value) { 283 unichars[unichar_id].properties.script_id = add_script(value); 284 } 285 286 // Set other_case unichar id in the properties for the given unichar id. set_other_case(UNICHAR_ID unichar_id,UNICHAR_ID other_case)287 void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) { 288 unichars[unichar_id].properties.other_case = other_case; 289 } 290 291 // Return the isalpha property of the given unichar. get_isalpha(UNICHAR_ID unichar_id)292 bool get_isalpha(UNICHAR_ID unichar_id) const { 293 return unichars[unichar_id].properties.isalpha; 294 } 295 296 // Return the islower property of the given unichar. get_islower(UNICHAR_ID unichar_id)297 bool get_islower(UNICHAR_ID unichar_id) const { 298 return unichars[unichar_id].properties.islower; 299 } 300 301 // Return the isupper property of the given unichar. get_isupper(UNICHAR_ID unichar_id)302 bool get_isupper(UNICHAR_ID unichar_id) const { 303 return unichars[unichar_id].properties.isupper; 304 } 305 306 // Return the isdigit property of the given unichar. get_isdigit(UNICHAR_ID unichar_id)307 bool get_isdigit(UNICHAR_ID unichar_id) const { 308 return unichars[unichar_id].properties.isdigit; 309 } 310 311 // Return the ispunctuation property of the given unichar. get_ispunctuation(UNICHAR_ID unichar_id)312 bool get_ispunctuation(UNICHAR_ID unichar_id) const { 313 return unichars[unichar_id].properties.ispunctuation; 314 } 315 316 // Return the isngram property of the given unichar. get_isngram(UNICHAR_ID unichar_id)317 bool get_isngram(UNICHAR_ID unichar_id) const { 318 return unichars[unichar_id].properties.isngram; 319 } 320 321 // Return the script name of the given unichar. 322 // The returned pointer will always be the same for the same script, it's 323 // managed by unicharset and thus MUST NOT be deleted get_script(UNICHAR_ID unichar_id)324 int get_script(UNICHAR_ID unichar_id) const { 325 return unichars[unichar_id].properties.script_id; 326 } 327 328 // Get other_case unichar id in the properties for the given unichar id. get_other_case(UNICHAR_ID unichar_id)329 UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const { 330 return unichars[unichar_id].properties.other_case; 331 } 332 333 // Returns UNICHAR_ID of the corresponding lower-case unichar. to_lower(UNICHAR_ID unichar_id)334 UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const { 335 if (unichars[unichar_id].properties.islower) return unichar_id; 336 return unichars[unichar_id].properties.other_case; 337 } 338 339 // Returns UNICHAR_ID of the corresponding upper-case unichar. to_upper(UNICHAR_ID unichar_id)340 UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const { 341 if (unichars[unichar_id].properties.isupper) return unichar_id; 342 return unichars[unichar_id].properties.other_case; 343 } 344 345 // Return a pointer to the CHAR_FRAGMENT class if the given 346 // unichar id represents a character fragment. get_fragment(UNICHAR_ID unichar_id)347 const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const { 348 return unichars[unichar_id].properties.fragment; 349 } 350 351 // Return the isalpha property of the given unichar representation. get_isalpha(const char * const unichar_repr)352 bool get_isalpha(const char* const unichar_repr) const { 353 return get_isalpha(unichar_to_id(unichar_repr)); 354 } 355 356 // Return the islower property of the given unichar representation. get_islower(const char * const unichar_repr)357 bool get_islower(const char* const unichar_repr) const { 358 return get_islower(unichar_to_id(unichar_repr)); 359 } 360 361 // Return the isupper property of the given unichar representation. get_isupper(const char * const unichar_repr)362 bool get_isupper(const char* const unichar_repr) const { 363 return get_isupper(unichar_to_id(unichar_repr)); 364 } 365 366 // Return the isdigit property of the given unichar representation. get_isdigit(const char * const unichar_repr)367 bool get_isdigit(const char* const unichar_repr) const { 368 return get_isdigit(unichar_to_id(unichar_repr)); 369 } 370 371 // Return the ispunctuation property of the given unichar representation. get_ispunctuation(const char * const unichar_repr)372 bool get_ispunctuation(const char* const unichar_repr) const { 373 return get_ispunctuation(unichar_to_id(unichar_repr)); 374 } 375 376 // Return the script name of the given unichar representation. 377 // The returned pointer will always be the same for the same script, it's 378 // managed by unicharset and thus MUST NOT be deleted get_script(const char * const unichar_repr)379 int get_script(const char* const unichar_repr) const { 380 return get_script(unichar_to_id(unichar_repr)); 381 } 382 383 // Return a pointer to the CHAR_FRAGMENT class struct if the given 384 // unichar representation represents a character fragment. get_fragment(const char * const unichar_repr)385 const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const { 386 if (unichar_repr == NULL || unichar_repr[0] == '\0' || 387 !ids.contains(unichar_repr)) { 388 return NULL; 389 } 390 return get_fragment(unichar_to_id(unichar_repr)); 391 } 392 393 // Return the isalpha property of the given unichar representation. 394 // Only the first length characters from unichar_repr are used. get_isalpha(const char * const unichar_repr,int length)395 bool get_isalpha(const char* const unichar_repr, 396 int length) const { 397 return get_isalpha(unichar_to_id(unichar_repr, length)); 398 } 399 400 // Return the islower property of the given unichar representation. 401 // Only the first length characters from unichar_repr are used. get_islower(const char * const unichar_repr,int length)402 bool get_islower(const char* const unichar_repr, 403 int length) const { 404 return get_islower(unichar_to_id(unichar_repr, length)); 405 } 406 407 // Return the isupper property of the given unichar representation. 408 // Only the first length characters from unichar_repr are used. get_isupper(const char * const unichar_repr,int length)409 bool get_isupper(const char* const unichar_repr, 410 int length) const { 411 return get_isupper(unichar_to_id(unichar_repr, length)); 412 } 413 414 // Return the isdigit property of the given unichar representation. 415 // Only the first length characters from unichar_repr are used. get_isdigit(const char * const unichar_repr,int length)416 bool get_isdigit(const char* const unichar_repr, 417 int length) const { 418 return get_isdigit(unichar_to_id(unichar_repr, length)); 419 } 420 421 // Return the ispunctuation property of the given unichar representation. 422 // Only the first length characters from unichar_repr are used. get_ispunctuation(const char * const unichar_repr,int length)423 bool get_ispunctuation(const char* const unichar_repr, 424 int length) const { 425 return get_ispunctuation(unichar_to_id(unichar_repr, length)); 426 } 427 428 // Return the script name of the given unichar representation. 429 // Only the first length characters from unichar_repr are used. 430 // The returned pointer will always be the same for the same script, it's 431 // managed by unicharset and thus MUST NOT be deleted get_script(const char * const unichar_repr,int length)432 int get_script(const char* const unichar_repr, 433 int length) const { 434 return get_script(unichar_to_id(unichar_repr, length)); 435 } 436 437 // Return the (current) number of scripts in the script table get_script_table_size()438 int get_script_table_size() const { 439 return script_table_size_used; 440 } 441 442 // Return the script string from its id get_script_from_script_id(int id)443 const char* get_script_from_script_id(int id) const { 444 if (id >= script_table_size_used || id < 0) 445 return null_script; 446 return script_table[id]; 447 } 448 449 // Returns the id from the name of the script, or 0 if script is not found. 450 // Note that this is an expensive operation since it involves iteratively 451 // comparing strings in the script table. To avoid dependency on STL, we 452 // won't use a hash. Instead, the calling function can use this to lookup 453 // and save the ID for relevant scripts for fast comparisons later. 454 int get_script_id_from_name(const char* script_name) const; 455 456 // Return true if the given script is the null script is_null_script(const char * script)457 bool is_null_script(const char* script) const { 458 return script == null_script; 459 } 460 461 // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0, 462 // then the returned pointer will be the same. 463 // The script parameter is copied and thus can be a temporary. 464 int add_script(const char* script); 465 466 // Return the enabled property of the given unichar. get_enabled(UNICHAR_ID unichar_id)467 bool get_enabled(UNICHAR_ID unichar_id) const { 468 return unichars[unichar_id].properties.enabled; 469 } 470 471 null_sid()472 int null_sid() const { return null_sid_; } common_sid()473 int common_sid() const { return common_sid_; } latin_sid()474 int latin_sid() const { return latin_sid_; } cyrillic_sid()475 int cyrillic_sid() const { return cyrillic_sid_; } greek_sid()476 int greek_sid() const { return greek_sid_; } han_sid()477 int han_sid() const { return han_sid_; } 478 479 private: 480 481 struct UNICHAR_PROPERTIES { 482 bool isalpha; 483 bool islower; 484 bool isupper; 485 bool isdigit; 486 bool ispunctuation; 487 bool isngram; 488 bool enabled; 489 int script_id; 490 UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar 491 492 // Contains meta information about the fragment if a unichar represents 493 // a fragment of a character, otherwise should be set to NULL. 494 // It is assumed that character fragments are added to the unicharset 495 // after the corresponding 'base' characters. 496 CHAR_FRAGMENT *fragment; 497 }; 498 499 struct UNICHAR_SLOT { 500 char representation[UNICHAR_LEN + 1]; 501 UNICHAR_PROPERTIES properties; 502 }; 503 504 UNICHAR_SLOT* unichars; 505 UNICHARMAP ids; 506 int size_used; 507 int size_reserved; 508 char** script_table; 509 int script_table_size_used; 510 int script_table_size_reserved; 511 const char* null_script; 512 513 // A few convenient script name-to-id mapping without using hash. 514 // These are initialized when unicharset file is loaded. Anything 515 // missing from this list can be looked up using get_script_id_from_name. 516 int null_sid_; 517 int common_sid_; 518 int latin_sid_; 519 int cyrillic_sid_; 520 int greek_sid_; 521 int han_sid_; 522 }; 523 524 #endif // TESSERACT_CCUTIL_UNICHARSET_H__ 525