• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 ///////////////////////////////////////////////////////////////////////
2 // File:        unicharmap.cpp
3 // Description: Unicode character/ligature to integer id class.
4 // Author:      Thomas Kielbus
5 // Created:     Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19 
20 #include <assert.h>
21 #include "unichar.h"
22 #include "host.h"
23 #include "unicharmap.h"
24 
UNICHARMAP()25 UNICHARMAP::UNICHARMAP() :
26 nodes(0) {
27 }
28 
~UNICHARMAP()29 UNICHARMAP::~UNICHARMAP() {
30   if (nodes != 0)
31     delete[] nodes;
32 }
33 
34 // Search the given unichar representation in the tree. Each character in the
35 // string is interpreted as an index in an array of nodes.
unichar_to_id(const char * const unichar_repr) const36 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
37   const char* current_char = unichar_repr;
38   UNICHARMAP_NODE* current_nodes = nodes;
39 
40   assert(*unichar_repr != '\0');
41 
42   do {
43     if (*(current_char + 1) == '\0')
44       return current_nodes[static_cast<unsigned char>(*current_char)].id;
45     current_nodes =
46         current_nodes[static_cast<unsigned char>(*current_char)].children;
47     ++current_char;
48   } while (true);
49 }
50 
51 // Search the given unichar representation in the tree, using length characters
52 // from it maximum. Each character in the string is interpreted as an index in
53 // an array of nodes.
unichar_to_id(const char * const unichar_repr,int length) const54 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
55                                      int length) const {
56   const char* current_char = unichar_repr;
57   UNICHARMAP_NODE* current_nodes = nodes;
58 
59   assert(*unichar_repr != '\0');
60   assert(length > 0 && length <= UNICHAR_LEN);
61 
62   do {
63     if (length == 1 || *(current_char + 1) == '\0')
64       return current_nodes[static_cast<unsigned char>(*current_char)].id;
65     current_nodes =
66         current_nodes[static_cast<unsigned char>(*current_char)].children;
67     ++current_char;
68     --length;
69   } while (true);
70 }
71 
72 // Search the given unichar representation in the tree, creating the possibly
73 // missing nodes. Once the right place has been found, insert the given id and
74 // update the inserted flag to keep track of the insert. Each character in the
75 // string is interpreted as an index in an array of nodes.
insert(const char * const unichar_repr,UNICHAR_ID id)76 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
77   const char* current_char = unichar_repr;
78   UNICHARMAP_NODE** current_nodes_pointer = &nodes;
79 
80   assert(*unichar_repr != '\0');
81   assert(id >= 0);
82 
83   do {
84     if (*current_nodes_pointer == 0)
85       *current_nodes_pointer = new UNICHARMAP_NODE[256];
86     if (*(current_char + 1) == '\0') {
87       (*current_nodes_pointer)
88           [static_cast<unsigned char>(*current_char)].id = id;
89       return;
90     }
91     current_nodes_pointer =
92         &((*current_nodes_pointer)
93           [static_cast<unsigned char>(*current_char)].children);
94     ++current_char;
95   } while (true);
96 }
97 
98 // Search the given unichar representation in the tree. Each character in the
99 // string is interpreted as an index in an array of nodes. Stop once the tree
100 // does not have anymore nodes or once we found the right unichar_repr.
contains(const char * const unichar_repr) const101 bool UNICHARMAP::contains(const char* const unichar_repr) const {
102   const char* current_char = unichar_repr;
103   UNICHARMAP_NODE* current_nodes = nodes;
104 
105   assert(*unichar_repr != '\0');
106 
107   while (current_nodes != 0 && *(current_char + 1) != '\0') {
108     current_nodes =
109         current_nodes[static_cast<unsigned char>(*current_char)].children;
110     ++current_char;
111   }
112   return current_nodes != 0 && *(current_char + 1) == '\0' &&
113       current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
114 }
115 
116 // Search the given unichar representation in the tree, using length characters
117 // from it maximum. Each character in the string is interpreted as an index in
118 // an array of nodes. Stop once the tree does not have anymore nodes or once we
119 // found the right unichar_repr.
contains(const char * const unichar_repr,int length) const120 bool UNICHARMAP::contains(const char* const unichar_repr,
121                           int length) const {
122   const char* current_char = unichar_repr;
123   UNICHARMAP_NODE* current_nodes = nodes;
124 
125   assert(*unichar_repr != '\0');
126   assert(length > 0 && length <= UNICHAR_LEN);
127 
128   while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
129     current_nodes =
130         current_nodes[static_cast<unsigned char>(*current_char)].children;
131     --length;
132     ++current_char;
133   }
134   return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
135       current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
136 }
137 
138 // Return the minimum number of characters that must be used from this string
139 // to obtain a match in the UNICHARMAP.
minmatch(const char * const unichar_repr) const140 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
141   const char* current_char = unichar_repr;
142   UNICHARMAP_NODE* current_nodes = nodes;
143 
144   while (current_nodes != NULL && *current_char != '\0') {
145     if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
146       return current_char + 1 - unichar_repr;
147     current_nodes =
148         current_nodes[static_cast<unsigned char>(*current_char)].children;
149     ++current_char;
150   }
151   return 0;
152 }
153 
clear()154 void UNICHARMAP::clear() {
155   if (nodes != 0)
156   {
157     delete[] nodes;
158     nodes = 0;
159   }
160 }
161 
UNICHARMAP_NODE()162 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() :
163 children(0),
164 id(-1) {
165 }
166 
167 // Recursively delete the children
~UNICHARMAP_NODE()168 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
169   if (children != 0) {
170     delete[] children;
171   }
172 }
173