1 // symbol-table.h
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 //
16 // \file
17 // Classes to provide symbol-to-integer and integer-to-symbol mappings.
18
19 #ifndef FST_LIB_SYMBOL_TABLE_H__
20 #define FST_LIB_SYMBOL_TABLE_H__
21
22 #include <ext/hash_map>
23 using __gnu_cxx::hash_map;
24 #include <fstream>
25 #include <iostream>
26 #include <string>
27 #include <vector>
28
29 #include "fst/lib/compat.h"
30
31
32
33 DECLARE_bool(fst_compat_symbols);
34
35 namespace fst {
36
37 class SymbolTableImpl {
38 friend class SymbolTableIterator;
39 public:
SymbolTableImpl(const string & name)40 SymbolTableImpl(const string &name)
41 : name_(name), available_key_(0), ref_count_(1),
42 check_sum_finalized_(false) {}
~SymbolTableImpl()43 ~SymbolTableImpl() {
44 for (size_t i = 0; i < symbols_.size(); ++i)
45 delete[] symbols_[i];
46 }
47
48 int64 AddSymbol(const string& symbol, int64 key);
49
AddSymbol(const string & symbol)50 int64 AddSymbol(const string& symbol) {
51 int64 key = Find(symbol);
52 return (key == -1) ? AddSymbol(symbol, available_key_++) : key;
53 }
54
AddTable(SymbolTableImpl * table)55 void AddTable(SymbolTableImpl* table) {
56 for (size_t i = 0; i < table->symbols_.size(); ++i) {
57 AddSymbol(table->symbols_[i]);
58 }
59 }
60
61 static SymbolTableImpl* ReadText(const string& filename);
62
63 static SymbolTableImpl* Read(istream &strm, const string& source);
64
65 bool Write(ostream &strm) const;
66
67 bool WriteText(ostream &strm) const;
68
69 //
70 // Return the string associated with the key. If the key is out of
71 // range (<0, >max), return an empty string.
Find(int64 key)72 string Find(int64 key) const {
73 hash_map<int64, string>::const_iterator it =
74 key_map_.find(key);
75 if (it == key_map_.end()) {
76 return "";
77 }
78 return it->second;
79 }
80
81 //
82 // Return the key associated with the symbol. If the symbol
83 // does not exists, return -1.
Find(const string & symbol)84 int64 Find(const string& symbol) const {
85 return Find(symbol.c_str());
86 }
87
88 //
89 // Return the key associated with the symbol. If the symbol
90 // does not exists, return -1.
Find(const char * symbol)91 int64 Find(const char* symbol) const {
92 hash_map<string, int64>::const_iterator it =
93 symbol_map_.find(symbol);
94 if (it == symbol_map_.end()) {
95 return -1;
96 }
97 return it->second;
98 }
99
Name()100 const string& Name() const { return name_; }
101
IncrRefCount()102 int IncrRefCount() const {
103 return ++ref_count_;
104 }
DecrRefCount()105 int DecrRefCount() const {
106 return --ref_count_;
107 }
108
CheckSum()109 string CheckSum() const {
110 if (!check_sum_finalized_) {
111 RecomputeCheckSum();
112 check_sum_string_ = check_sum_.Digest();
113 }
114 return check_sum_string_;
115 }
116
AvailableKey()117 int64 AvailableKey() const {
118 return available_key_;
119 }
120
121 // private support methods
122 private:
123 void RecomputeCheckSum() const;
124 static SymbolTableImpl* Read1(istream &, const string &);
125
126 string name_;
127 int64 available_key_;
128 vector<const char *> symbols_;
129 hash_map<int64, string> key_map_;
130 hash_map<string, int64> symbol_map_;
131
132 mutable int ref_count_;
133 mutable bool check_sum_finalized_;
134 mutable MD5 check_sum_;
135 mutable string check_sum_string_;
136
137 DISALLOW_EVIL_CONSTRUCTORS(SymbolTableImpl);
138 };
139
140
141 class SymbolTableIterator;
142
143 //
144 // \class SymbolTable
145 // \brief Symbol (string) to int and reverse mapping
146 //
147 // The SymbolTable implements the mappings of labels to strings and reverse.
148 // SymbolTables are used to describe the alphabet of the input and output
149 // labels for arcs in a Finite State Transducer.
150 //
151 // SymbolTables are reference counted and can therefore be shared across
152 // multiple machines. For example a language model grammar G, with a
153 // SymbolTable for the words in the language model can share this symbol
154 // table with the lexical representation L o G.
155 //
156 class SymbolTable {
157 friend class SymbolTableIterator;
158 public:
159 static const int64 kNoSymbol = -1;
160
161 // Construct symbol table with a unique name.
SymbolTable(const string & name)162 SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {}
163
164 // Create a reference counted copy.
SymbolTable(const SymbolTable & table)165 SymbolTable(const SymbolTable& table) : impl_(table.impl_) {
166 impl_->IncrRefCount();
167 }
168
169 // Derefence implentation object. When reference count hits 0, delete
170 // implementation.
~SymbolTable()171 ~SymbolTable() {
172 if (!impl_->DecrRefCount()) delete impl_;
173 }
174
175 // create a reference counted copy
Copy()176 SymbolTable* Copy() const {
177 return new SymbolTable(*this);
178 }
179
180 // Add a symbol with given key to table. A symbol table also
181 // keeps track of the last available key (highest key value in
182 // the symbol table).
183 //
184 // \param symbol string symbol to add
185 // \param key associated key for string symbol
186 // \return the key created by the symbol table. Symbols allready added to
187 // the symbol table will not get a different key.
AddSymbol(const string & symbol,int64 key)188 int64 AddSymbol(const string& symbol, int64 key) {
189 return impl_->AddSymbol(symbol, key);
190 }
191
192 // Add a symbol to the table. The associated value key is automatically
193 // assigned by the symbol table.
194 //
195 // \param symbol string to add to the table
196 // \return the value key assigned to the associated string symbol
AddSymbol(const string & symbol)197 int64 AddSymbol(const string& symbol) {
198 return impl_->AddSymbol(symbol);
199 }
200
201 // Add another symbol table to this table. All key values will be offset
202 // by the current available key (highest key value in the symbol table).
203 // Note string symbols with the same key value with still have the same
204 // key value after the symbol table has been merged, but a different
205 // value. Adding symbol tables do not result in changes in the base table.
206 //
207 // Merging N symbol tables is often useful when combining the various
208 // name spaces of transducers to a unified representation.
209 //
210 // \param table the symbol table to add to this table
AddTable(const SymbolTable & table)211 void AddTable(const SymbolTable& table) {
212 return impl_->AddTable(table.impl_);
213 }
214
215 // return the name of the symbol table
Name()216 const string& Name() const {
217 return impl_->Name();
218 }
219
220 // return the MD5 check-sum for this table. All new symbols added to
221 // the table will result in an updated checksum.
CheckSum()222 string CheckSum() const {
223 return impl_->CheckSum();
224 }
225
226 // read an ascii representation of the symbol table
ReadText(const string & filename)227 static SymbolTable* ReadText(const string& filename) {
228 SymbolTableImpl* impl = SymbolTableImpl::ReadText(filename);
229 if (!impl)
230 return 0;
231 else
232 return new SymbolTable(impl);
233 }
234
235 // read a binary dump of the symbol table
Read(istream & strm,const string & source)236 static SymbolTable* Read(istream &strm, const string& source) {
237 SymbolTableImpl* impl = SymbolTableImpl::Read(strm, source);
238 if (!impl)
239 return 0;
240 else
241 return new SymbolTable(impl);
242 }
243
244 // read a binary dump of the symbol table
Read(const string & filename)245 static SymbolTable* Read(const string& filename) {
246 ifstream strm(filename.c_str());
247 if (!strm) {
248 LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename;
249 return 0;
250 }
251 return Read(strm, filename);
252 }
253
Write(ostream & strm)254 bool Write(ostream &strm) const {
255 return impl_->Write(strm);
256 }
257
Write(const string & filename)258 bool Write(const string& filename) const {
259 ofstream strm(filename.c_str());
260 if (!strm) {
261 LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename;
262 return false;
263 }
264 return Write(strm);
265 }
266
267 // Dump an ascii text representation of the symbol table
WriteText(ostream & strm)268 bool WriteText(ostream &strm) const {
269 return impl_->WriteText(strm);
270 }
271
272 // Dump an ascii text representation of the symbol table
WriteText(const string & filename)273 bool WriteText(const string& filename) const {
274 ofstream strm(filename.c_str());
275 if (!strm) {
276 LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename;
277 return false;
278 }
279 return WriteText(strm);
280 }
281
282 // Return the string associated with the key. If the key is out of
283 // range (<0, >max), log error and return an empty string.
Find(int64 key)284 string Find(int64 key) const {
285 return impl_->Find(key);
286 }
287
288 // Return the key associated with the symbol. If the symbol
289 // does not exists, log error and return -1
Find(const string & symbol)290 int64 Find(const string& symbol) const {
291 return impl_->Find(symbol);
292 }
293
294 // Return the key associated with the symbol. If the symbol
295 // does not exists, log error and return -1
Find(const char * symbol)296 int64 Find(const char* symbol) const {
297 return impl_->Find(symbol);
298 }
299
300 // return the current available key (i.e highest key number) in
301 // the symbol table
AvailableKey(void)302 int64 AvailableKey(void) const {
303 return impl_->AvailableKey();
304 }
305
306 protected:
SymbolTable(SymbolTableImpl * impl)307 explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {}
308
Impl()309 const SymbolTableImpl* Impl() const {
310 return impl_;
311 }
312
313 private:
314 SymbolTableImpl* impl_;
315
316
317 void operator=(const SymbolTable &table); // disallow
318 };
319
320
321 //
322 // \class SymbolTableIterator
323 // \brief Iterator class for symbols in a symbol table
324 class SymbolTableIterator {
325 public:
326 // Constructor creates a refcounted copy of underlying implementation
SymbolTableIterator(const SymbolTable & symbol_table)327 SymbolTableIterator(const SymbolTable& symbol_table) {
328 impl_ = symbol_table.Impl();
329 impl_->IncrRefCount();
330 pos_ = 0;
331 size_ = impl_->symbols_.size();
332 }
333
334 // decrement implementation refcount, and delete if 0
~SymbolTableIterator()335 ~SymbolTableIterator() {
336 if (!impl_->DecrRefCount()) delete impl_;
337 }
338
339 // is iterator done
Done(void)340 bool Done(void) {
341 return (pos_ == size_);
342 }
343
344 // return the Value() of the current symbol (in64 key)
Value(void)345 int64 Value(void) {
346 return impl_->Find(impl_->symbols_[pos_]);
347 }
348
349 // return the string of the current symbol
Symbol(void)350 const char* Symbol(void) {
351 return impl_->symbols_[pos_];
352 }
353
354 // advance iterator forward
Next(void)355 void Next(void) {
356 if (Done()) return;
357 ++pos_;
358 }
359
360 // reset iterator
Reset(void)361 void Reset(void) {
362 pos_ = 0;
363 }
364
365 private:
366 const SymbolTableImpl* impl_;
367 size_t pos_;
368 size_t size_;
369 };
370
371
372 // Tests compatibilty between two sets of symbol tables
CompatSymbols(const SymbolTable * syms1,const SymbolTable * syms2)373 inline bool CompatSymbols(const SymbolTable *syms1,
374 const SymbolTable *syms2) {
375 if (!FLAGS_fst_compat_symbols)
376 return true;
377 else if (!syms1 && !syms2)
378 return true;
379 else if (syms1 && !syms2 || !syms1 && syms2)
380 return false;
381 else
382 return syms1->CheckSum() == syms2->CheckSum();
383 }
384
385 } // namespace fst
386
387 #endif // FST_LIB_SYMBOL_TABLE_H__
388