• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // symbol-table.h
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 //
16 // \file
17 // Classes to provide symbol-to-integer and integer-to-symbol mappings.
18 
19 #ifndef FST_LIB_SYMBOL_TABLE_H__
20 #define FST_LIB_SYMBOL_TABLE_H__
21 
22 #include <ext/hash_map>
23 using __gnu_cxx::hash_map;
24 #include <fstream>
25 #include <iostream>
26 #include <string>
27 #include <vector>
28 
29 #include "fst/lib/compat.h"
30 
31 
32 
33 DECLARE_bool(fst_compat_symbols);
34 
35 namespace fst {
36 
37 class SymbolTableImpl {
38   friend class SymbolTableIterator;
39  public:
SymbolTableImpl(const string & name)40   SymbolTableImpl(const string &name)
41       : name_(name), available_key_(0), ref_count_(1),
42         check_sum_finalized_(false) {}
~SymbolTableImpl()43   ~SymbolTableImpl() {
44     for (size_t i = 0; i < symbols_.size(); ++i)
45       delete[] symbols_[i];
46   }
47 
48   int64 AddSymbol(const string& symbol, int64 key);
49 
AddSymbol(const string & symbol)50   int64 AddSymbol(const string& symbol) {
51     int64 key = Find(symbol);
52     return (key == -1) ? AddSymbol(symbol, available_key_++) : key;
53   }
54 
AddTable(SymbolTableImpl * table)55   void AddTable(SymbolTableImpl* table) {
56     for (size_t i = 0; i < table->symbols_.size(); ++i) {
57       AddSymbol(table->symbols_[i]);
58     }
59   }
60 
61   static SymbolTableImpl* ReadText(const string& filename);
62 
63   static SymbolTableImpl* Read(istream &strm, const string& source);
64 
65   bool Write(ostream &strm) const;
66 
67   bool WriteText(ostream &strm) const;
68 
69   //
70   // Return the string associated with the key. If the key is out of
71   // range (<0, >max), return an empty string.
Find(int64 key)72   string Find(int64 key) const {
73     hash_map<int64, string>::const_iterator it =
74       key_map_.find(key);
75     if (it == key_map_.end()) {
76       return "";
77     }
78     return it->second;
79   }
80 
81   //
82   // Return the key associated with the symbol. If the symbol
83   // does not exists, return -1.
Find(const string & symbol)84   int64 Find(const string& symbol) const {
85     return Find(symbol.c_str());
86   }
87 
88   //
89   // Return the key associated with the symbol. If the symbol
90   // does not exists, return -1.
Find(const char * symbol)91   int64 Find(const char* symbol) const {
92     hash_map<string, int64>::const_iterator it =
93       symbol_map_.find(symbol);
94     if (it == symbol_map_.end()) {
95       return -1;
96     }
97     return it->second;
98   }
99 
Name()100   const string& Name() const { return name_; }
101 
IncrRefCount()102   int IncrRefCount() const {
103     return ++ref_count_;
104   }
DecrRefCount()105   int DecrRefCount() const {
106     return --ref_count_;
107   }
108 
CheckSum()109   string CheckSum() const {
110     if (!check_sum_finalized_) {
111       RecomputeCheckSum();
112       check_sum_string_ = check_sum_.Digest();
113     }
114     return check_sum_string_;
115   }
116 
AvailableKey()117   int64 AvailableKey() const {
118     return available_key_;
119   }
120 
121   // private support methods
122  private:
123   void RecomputeCheckSum() const;
124   static SymbolTableImpl* Read1(istream &, const string &);
125 
126   string name_;
127   int64 available_key_;
128   vector<const char *> symbols_;
129   hash_map<int64, string> key_map_;
130   hash_map<string, int64> symbol_map_;
131 
132   mutable int ref_count_;
133   mutable bool check_sum_finalized_;
134   mutable MD5 check_sum_;
135   mutable string check_sum_string_;
136 
137   DISALLOW_EVIL_CONSTRUCTORS(SymbolTableImpl);
138 };
139 
140 
141 class SymbolTableIterator;
142 
143 //
144 // \class SymbolTable
145 // \brief Symbol (string) to int and reverse mapping
146 //
147 // The SymbolTable implements the mappings of labels to strings and reverse.
148 // SymbolTables are used to describe the alphabet of the input and output
149 // labels for arcs in a Finite State Transducer.
150 //
151 // SymbolTables are reference counted and can therefore be shared across
152 // multiple machines. For example a language model grammar G, with a
153 // SymbolTable for the words in the language model can share this symbol
154 // table with the lexical representation L o G.
155 //
156 class SymbolTable {
157   friend class SymbolTableIterator;
158  public:
159   static const int64 kNoSymbol = -1;
160 
161   // Construct symbol table with a unique name.
SymbolTable(const string & name)162   SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {}
163 
164   // Create a reference counted copy.
SymbolTable(const SymbolTable & table)165   SymbolTable(const SymbolTable& table) : impl_(table.impl_) {
166     impl_->IncrRefCount();
167   }
168 
169   // Derefence implentation object. When reference count hits 0, delete
170   // implementation.
~SymbolTable()171   ~SymbolTable() {
172     if (!impl_->DecrRefCount()) delete impl_;
173   }
174 
175   // create a reference counted copy
Copy()176   SymbolTable* Copy() const {
177     return new SymbolTable(*this);
178   }
179 
180   // Add a symbol with given key to table. A symbol table also
181   // keeps track of the last available key (highest key value in
182   // the symbol table).
183   //
184   // \param symbol string symbol to add
185   // \param key associated key for string symbol
186   // \return the key created by the symbol table. Symbols allready added to
187   //         the symbol table will not get a different key.
AddSymbol(const string & symbol,int64 key)188   int64 AddSymbol(const string& symbol, int64 key) {
189     return impl_->AddSymbol(symbol, key);
190   }
191 
192   // Add a symbol to the table. The associated value key is automatically
193   // assigned by the symbol table.
194   //
195   // \param symbol string to add to the table
196   // \return the value key assigned to the associated string symbol
AddSymbol(const string & symbol)197   int64 AddSymbol(const string& symbol) {
198     return impl_->AddSymbol(symbol);
199   }
200 
201   // Add another symbol table to this table. All key values will be offset
202   // by the current available key (highest key value in the symbol table).
203   // Note string symbols with the same key value with still have the same
204   // key value after the symbol table has been merged, but a different
205   // value. Adding symbol tables do not result in changes in the base table.
206   //
207   // Merging N symbol tables is often useful when combining the various
208   // name spaces of transducers to a unified representation.
209   //
210   // \param table the symbol table to add to this table
AddTable(const SymbolTable & table)211   void AddTable(const SymbolTable& table) {
212     return impl_->AddTable(table.impl_);
213   }
214 
215   // return the name of the symbol table
Name()216   const string& Name() const {
217     return impl_->Name();
218   }
219 
220   // return the MD5 check-sum for this table. All new symbols added to
221   // the table will result in an updated checksum.
CheckSum()222   string CheckSum() const {
223     return impl_->CheckSum();
224   }
225 
226   // read an ascii representation of the symbol table
ReadText(const string & filename)227   static SymbolTable* ReadText(const string& filename) {
228     SymbolTableImpl* impl = SymbolTableImpl::ReadText(filename);
229     if (!impl)
230       return 0;
231     else
232       return new SymbolTable(impl);
233   }
234 
235   // read a binary dump of the symbol table
Read(istream & strm,const string & source)236   static SymbolTable* Read(istream &strm, const string& source) {
237     SymbolTableImpl* impl = SymbolTableImpl::Read(strm, source);
238     if (!impl)
239       return 0;
240     else
241       return new SymbolTable(impl);
242   }
243 
244   // read a binary dump of the symbol table
Read(const string & filename)245   static SymbolTable* Read(const string& filename) {
246     ifstream strm(filename.c_str());
247     if (!strm) {
248       LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename;
249       return 0;
250     }
251     return Read(strm, filename);
252   }
253 
Write(ostream & strm)254   bool Write(ostream  &strm) const {
255     return impl_->Write(strm);
256   }
257 
Write(const string & filename)258   bool Write(const string& filename) const {
259     ofstream strm(filename.c_str());
260     if (!strm) {
261       LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename;
262       return false;
263     }
264     return Write(strm);
265   }
266 
267   // Dump an ascii text representation of the symbol table
WriteText(ostream & strm)268   bool WriteText(ostream &strm) const {
269     return impl_->WriteText(strm);
270   }
271 
272   // Dump an ascii text representation of the symbol table
WriteText(const string & filename)273   bool WriteText(const string& filename) const {
274     ofstream strm(filename.c_str());
275     if (!strm) {
276       LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename;
277       return false;
278     }
279     return WriteText(strm);
280   }
281 
282   // Return the string associated with the key. If the key is out of
283   // range (<0, >max), log error and return an empty string.
Find(int64 key)284   string Find(int64 key) const {
285     return impl_->Find(key);
286   }
287 
288   // Return the key associated with the symbol. If the symbol
289   // does not exists, log error and  return -1
Find(const string & symbol)290   int64 Find(const string& symbol) const {
291     return impl_->Find(symbol);
292   }
293 
294   // Return the key associated with the symbol. If the symbol
295   // does not exists, log error and  return -1
Find(const char * symbol)296   int64 Find(const char* symbol) const {
297     return impl_->Find(symbol);
298   }
299 
300   // return the current available key (i.e highest key number) in
301   // the symbol table
AvailableKey(void)302   int64 AvailableKey(void) const {
303     return impl_->AvailableKey();
304   }
305 
306  protected:
SymbolTable(SymbolTableImpl * impl)307   explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {}
308 
Impl()309   const SymbolTableImpl* Impl() const {
310     return impl_;
311   }
312 
313  private:
314   SymbolTableImpl* impl_;
315 
316 
317   void operator=(const SymbolTable &table);  // disallow
318 };
319 
320 
321 //
322 // \class SymbolTableIterator
323 // \brief Iterator class for symbols in a symbol table
324 class SymbolTableIterator {
325  public:
326   // Constructor creates a refcounted copy of underlying implementation
SymbolTableIterator(const SymbolTable & symbol_table)327   SymbolTableIterator(const SymbolTable& symbol_table) {
328     impl_ = symbol_table.Impl();
329     impl_->IncrRefCount();
330     pos_ = 0;
331     size_ = impl_->symbols_.size();
332   }
333 
334   // decrement implementation refcount, and delete if 0
~SymbolTableIterator()335   ~SymbolTableIterator() {
336     if (!impl_->DecrRefCount()) delete impl_;
337   }
338 
339   // is iterator done
Done(void)340   bool Done(void) {
341     return (pos_ == size_);
342   }
343 
344   // return the Value() of the current symbol (in64 key)
Value(void)345   int64 Value(void) {
346     return impl_->Find(impl_->symbols_[pos_]);
347   }
348 
349   // return the string of the current symbol
Symbol(void)350   const char* Symbol(void) {
351     return impl_->symbols_[pos_];
352   }
353 
354   // advance iterator forward
Next(void)355   void Next(void) {
356     if (Done()) return;
357     ++pos_;
358   }
359 
360   // reset iterator
Reset(void)361   void Reset(void) {
362     pos_ = 0;
363   }
364 
365  private:
366   const SymbolTableImpl* impl_;
367   size_t pos_;
368   size_t size_;
369 };
370 
371 
372 // Tests compatibilty between two sets of symbol tables
CompatSymbols(const SymbolTable * syms1,const SymbolTable * syms2)373 inline bool CompatSymbols(const SymbolTable *syms1,
374                           const SymbolTable *syms2) {
375   if (!FLAGS_fst_compat_symbols)
376     return true;
377   else if (!syms1 && !syms2)
378     return true;
379   else if (syms1 && !syms2 || !syms1 && syms2)
380     return false;
381   else
382     return syms1->CheckSum() == syms2->CheckSum();
383 }
384 
385 }  // namespace fst
386 
387 #endif  // FST_LIB_SYMBOL_TABLE_H__
388