• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // synchronize.h
2 
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Copyright 2005-2010 Google, Inc.
16 // Author: allauzen@google.com (Cyril Allauzen)
17 //
18 // \file
19 // Synchronize an FST with bounded delay.
20 
21 #ifndef FST_LIB_SYNCHRONIZE_H__
22 #define FST_LIB_SYNCHRONIZE_H__
23 
24 #include <algorithm>
25 #include <tr1/unordered_map>
26 using std::tr1::unordered_map;
27 using std::tr1::unordered_multimap;
28 #include <tr1/unordered_set>
29 using std::tr1::unordered_set;
30 using std::tr1::unordered_multiset;
31 #include <string>
32 #include <utility>
33 using std::pair; using std::make_pair;
34 #include <vector>
35 using std::vector;
36 
37 #include <fst/cache.h>
38 #include <fst/test-properties.h>
39 
40 
41 namespace fst {
42 
43 typedef CacheOptions SynchronizeFstOptions;
44 
45 
46 // Implementation class for SynchronizeFst
47 template <class A>
48 class SynchronizeFstImpl
49     : public CacheImpl<A> {
50  public:
51   using FstImpl<A>::SetType;
52   using FstImpl<A>::SetProperties;
53   using FstImpl<A>::SetInputSymbols;
54   using FstImpl<A>::SetOutputSymbols;
55 
56   using CacheBaseImpl< CacheState<A> >::PushArc;
57   using CacheBaseImpl< CacheState<A> >::HasArcs;
58   using CacheBaseImpl< CacheState<A> >::HasFinal;
59   using CacheBaseImpl< CacheState<A> >::HasStart;
60   using CacheBaseImpl< CacheState<A> >::SetArcs;
61   using CacheBaseImpl< CacheState<A> >::SetFinal;
62   using CacheBaseImpl< CacheState<A> >::SetStart;
63 
64   typedef A Arc;
65   typedef typename A::Label Label;
66   typedef typename A::Weight Weight;
67   typedef typename A::StateId StateId;
68 
69   typedef basic_string<Label> String;
70 
71   struct Element {
ElementElement72     Element() {}
73 
ElementElement74     Element(StateId s, const String *i, const String *o)
75         : state(s), istring(i), ostring(o) {}
76 
77     StateId state;     // Input state Id
78     const String *istring;     // Residual input labels
79     const String *ostring;     // Residual output labels
80     // Residual strings are represented by const pointers to
81     // basic_string<Label> and are stored in a hash_set. The pointed
82     // memory is owned by the hash_set string_set_.
83   };
84 
SynchronizeFstImpl(const Fst<A> & fst,const SynchronizeFstOptions & opts)85   SynchronizeFstImpl(const Fst<A> &fst, const SynchronizeFstOptions &opts)
86       : CacheImpl<A>(opts), fst_(fst.Copy()) {
87     SetType("synchronize");
88     uint64 props = fst.Properties(kFstProperties, false);
89     SetProperties(SynchronizeProperties(props), kCopyProperties);
90 
91     SetInputSymbols(fst.InputSymbols());
92     SetOutputSymbols(fst.OutputSymbols());
93   }
94 
SynchronizeFstImpl(const SynchronizeFstImpl & impl)95   SynchronizeFstImpl(const SynchronizeFstImpl &impl)
96       : CacheImpl<A>(impl),
97         fst_(impl.fst_->Copy(true)) {
98     SetType("synchronize");
99     SetProperties(impl.Properties(), kCopyProperties);
100     SetInputSymbols(impl.InputSymbols());
101     SetOutputSymbols(impl.OutputSymbols());
102   }
103 
~SynchronizeFstImpl()104   ~SynchronizeFstImpl() {
105     delete fst_;
106     // Extract pointers from the hash set
107     vector<const String*> strings;
108     typename StringSet::iterator it = string_set_.begin();
109     for (; it != string_set_.end(); ++it)
110       strings.push_back(*it);
111     // Free the extracted pointers
112     for (size_t i = 0; i < strings.size(); ++i)
113       delete strings[i];
114   }
115 
Start()116   StateId Start() {
117     if (!HasStart()) {
118       StateId s = fst_->Start();
119       if (s == kNoStateId)
120         return kNoStateId;
121       const String *empty = FindString(new String());
122       StateId start = FindState(Element(fst_->Start(), empty, empty));
123       SetStart(start);
124     }
125     return CacheImpl<A>::Start();
126   }
127 
Final(StateId s)128   Weight Final(StateId s) {
129     if (!HasFinal(s)) {
130       const Element &e = elements_[s];
131       Weight w = e.state == kNoStateId ? Weight::One() : fst_->Final(e.state);
132       if ((w != Weight::Zero()) && (e.istring)->empty() && (e.ostring)->empty())
133         SetFinal(s, w);
134       else
135         SetFinal(s, Weight::Zero());
136     }
137     return CacheImpl<A>::Final(s);
138   }
139 
NumArcs(StateId s)140   size_t NumArcs(StateId s) {
141     if (!HasArcs(s))
142       Expand(s);
143     return CacheImpl<A>::NumArcs(s);
144   }
145 
NumInputEpsilons(StateId s)146   size_t NumInputEpsilons(StateId s) {
147     if (!HasArcs(s))
148       Expand(s);
149     return CacheImpl<A>::NumInputEpsilons(s);
150   }
151 
NumOutputEpsilons(StateId s)152   size_t NumOutputEpsilons(StateId s) {
153     if (!HasArcs(s))
154       Expand(s);
155     return CacheImpl<A>::NumOutputEpsilons(s);
156   }
157 
Properties()158   uint64 Properties() const { return Properties(kFstProperties); }
159 
160   // Set error if found; return FST impl properties.
Properties(uint64 mask)161   uint64 Properties(uint64 mask) const {
162     if ((mask & kError) && fst_->Properties(kError, false))
163       SetProperties(kError, kError);
164     return FstImpl<Arc>::Properties(mask);
165   }
166 
InitArcIterator(StateId s,ArcIteratorData<A> * data)167   void InitArcIterator(StateId s, ArcIteratorData<A> *data) {
168     if (!HasArcs(s))
169       Expand(s);
170     CacheImpl<A>::InitArcIterator(s, data);
171   }
172 
173   // Returns the first character of the string obtained by
174   // concatenating s and l.
175   Label Car(const String *s, Label l = 0) const {
176     if (!s->empty())
177       return (*s)[0];
178     else
179       return l;
180   }
181 
182   // Computes the residual string obtained by removing the first
183   // character in the concatenation of s and l.
184   const String *Cdr(const String *s, Label l = 0) {
185     String *r = new String();
186     for (int i = 1; i < s->size(); ++i)
187       r->push_back((*s)[i]);
188     if (l && !(s->empty())) r->push_back(l);
189     return FindString(r);
190   }
191 
192   // Computes the concatenation of s and l.
193   const String *Concat(const String *s, Label l = 0) {
194     String *r = new String();
195     for (int i = 0; i < s->size(); ++i)
196       r->push_back((*s)[i]);
197     if (l) r->push_back(l);
198     return FindString(r);
199   }
200 
201   // Tests if the concatenation of s and l is empty
202   bool Empty(const String *s, Label l = 0) const {
203     if (s->empty())
204       return l == 0;
205     else
206       return false;
207   }
208 
209   // Finds the string pointed by s in the hash set. Transfers the
210   // pointer ownership to the hash set.
FindString(const String * s)211   const String *FindString(const String *s) {
212     typename StringSet::iterator it = string_set_.find(s);
213     if (it != string_set_.end()) {
214       delete s;
215       return (*it);
216     } else {
217       string_set_.insert(s);
218       return s;
219     }
220   }
221 
222   // Finds state corresponding to an element. Creates new state
223   // if element not found.
FindState(const Element & e)224   StateId FindState(const Element &e) {
225     typename ElementMap::iterator eit = element_map_.find(e);
226     if (eit != element_map_.end()) {
227       return (*eit).second;
228     } else {
229       StateId s = elements_.size();
230       elements_.push_back(e);
231       element_map_.insert(pair<const Element, StateId>(e, s));
232       return s;
233     }
234   }
235 
236 
237   // Computes the outgoing transitions from a state, creating new destination
238   // states as needed.
Expand(StateId s)239   void Expand(StateId s) {
240     Element e = elements_[s];
241 
242     if (e.state != kNoStateId)
243       for (ArcIterator< Fst<A> > ait(*fst_, e.state);
244            !ait.Done();
245            ait.Next()) {
246         const A &arc = ait.Value();
247         if (!Empty(e.istring, arc.ilabel)  && !Empty(e.ostring, arc.olabel)) {
248           const String *istring = Cdr(e.istring, arc.ilabel);
249           const String *ostring = Cdr(e.ostring, arc.olabel);
250           StateId d = FindState(Element(arc.nextstate, istring, ostring));
251           PushArc(s, Arc(Car(e.istring, arc.ilabel),
252                         Car(e.ostring, arc.olabel), arc.weight, d));
253         } else {
254           const String *istring = Concat(e.istring, arc.ilabel);
255           const String *ostring = Concat(e.ostring, arc.olabel);
256           StateId d = FindState(Element(arc.nextstate, istring, ostring));
257           PushArc(s, Arc(0 , 0, arc.weight, d));
258         }
259       }
260 
261     Weight w = e.state == kNoStateId ? Weight::One() : fst_->Final(e.state);
262     if ((w != Weight::Zero()) &&
263         ((e.istring)->size() + (e.ostring)->size() > 0)) {
264       const String *istring = Cdr(e.istring);
265       const String *ostring = Cdr(e.ostring);
266       StateId d = FindState(Element(kNoStateId, istring, ostring));
267       PushArc(s, Arc(Car(e.istring), Car(e.ostring), w, d));
268     }
269     SetArcs(s);
270   }
271 
272  private:
273   // Equality function for Elements, assume strings have been hashed.
274   class ElementEqual {
275    public:
operator()276     bool operator()(const Element &x, const Element &y) const {
277       return x.state == y.state &&
278               x.istring == y.istring &&
279               x.ostring == y.ostring;
280     }
281   };
282 
283   // Hash function for Elements to Fst states.
284   class ElementKey {
285    public:
operator()286     size_t operator()(const Element &x) const {
287       size_t key = x.state;
288       key = (key << 1) ^ (x.istring)->size();
289       for (size_t i = 0; i < (x.istring)->size(); ++i)
290         key = (key << 1) ^ (*x.istring)[i];
291       key = (key << 1) ^ (x.ostring)->size();
292       for (size_t i = 0; i < (x.ostring)->size(); ++i)
293         key = (key << 1) ^ (*x.ostring)[i];
294       return key;
295     }
296   };
297 
298   // Equality function for strings
299   class StringEqual {
300    public:
operator()301     bool operator()(const String * const &x, const String * const &y) const {
302       if (x->size() != y->size()) return false;
303       for (size_t i = 0; i < x->size(); ++i)
304         if ((*x)[i] != (*y)[i]) return false;
305       return true;
306     }
307   };
308 
309   // Hash function for set of strings
310   class StringKey{
311    public:
operator()312     size_t operator()(const String * const & x) const {
313       size_t key = x->size();
314       for (size_t i = 0; i < x->size(); ++i)
315         key = (key << 1) ^ (*x)[i];
316       return key;
317     }
318   };
319 
320 
321   typedef unordered_map<Element, StateId, ElementKey, ElementEqual> ElementMap;
322   typedef unordered_set<const String*, StringKey, StringEqual> StringSet;
323 
324   const Fst<A> *fst_;
325   vector<Element> elements_;  // mapping Fst state to Elements
326   ElementMap element_map_;    // mapping Elements to Fst state
327   StringSet string_set_;
328 
329   void operator=(const SynchronizeFstImpl<A> &);  // disallow
330 };
331 
332 
333 // Synchronizes a transducer. This version is a delayed Fst.  The
334 // result will be an equivalent FST that has the property that during
335 // the traversal of a path, the delay is either zero or strictly
336 // increasing, where the delay is the difference between the number of
337 // non-epsilon output labels and input labels along the path.
338 //
339 // For the algorithm to terminate, the input transducer must have
340 // bounded delay, i.e., the delay of every cycle must be zero.
341 //
342 // Complexity:
343 // - A has bounded delay: exponential
344 // - A does not have bounded delay: does not terminate
345 //
346 // References:
347 // - Mehryar Mohri. Edit-Distance of Weighted Automata: General
348 //   Definitions and Algorithms, International Journal of Computer
349 //   Science, 14(6): 957-982 (2003).
350 //
351 // This class attaches interface to implementation and handles
352 // reference counting, delegating most methods to ImplToFst.
353 template <class A>
354 class SynchronizeFst : public ImplToFst< SynchronizeFstImpl<A> > {
355  public:
356   friend class ArcIterator< SynchronizeFst<A> >;
357   friend class StateIterator< SynchronizeFst<A> >;
358 
359   typedef A Arc;
360   typedef typename A::Weight Weight;
361   typedef typename A::StateId StateId;
362   typedef CacheState<A> State;
363   typedef SynchronizeFstImpl<A> Impl;
364 
SynchronizeFst(const Fst<A> & fst)365   SynchronizeFst(const Fst<A> &fst)
366       : ImplToFst<Impl>(new Impl(fst, SynchronizeFstOptions())) {}
367 
SynchronizeFst(const Fst<A> & fst,const SynchronizeFstOptions & opts)368   SynchronizeFst(const Fst<A> &fst,  const SynchronizeFstOptions &opts)
369       : ImplToFst<Impl>(new Impl(fst, opts)) {}
370 
371   // See Fst<>::Copy() for doc.
372   SynchronizeFst(const SynchronizeFst<A> &fst, bool safe = false)
373       : ImplToFst<Impl>(fst, safe) {}
374 
375   // Get a copy of this SynchronizeFst. See Fst<>::Copy() for further doc.
376   virtual SynchronizeFst<A> *Copy(bool safe = false) const {
377     return new SynchronizeFst<A>(*this, safe);
378   }
379 
380   virtual inline void InitStateIterator(StateIteratorData<A> *data) const;
381 
InitArcIterator(StateId s,ArcIteratorData<A> * data)382   virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const {
383     GetImpl()->InitArcIterator(s, data);
384   }
385 
386  private:
387   // Makes visible to friends.
GetImpl()388   Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); }
389 
390   void operator=(const SynchronizeFst<A> &fst);  // Disallow
391 };
392 
393 
394 // Specialization for SynchronizeFst.
395 template<class A>
396 class StateIterator< SynchronizeFst<A> >
397     : public CacheStateIterator< SynchronizeFst<A> > {
398  public:
StateIterator(const SynchronizeFst<A> & fst)399   explicit StateIterator(const SynchronizeFst<A> &fst)
400       : CacheStateIterator< SynchronizeFst<A> >(fst, fst.GetImpl()) {}
401 };
402 
403 
404 // Specialization for SynchronizeFst.
405 template <class A>
406 class ArcIterator< SynchronizeFst<A> >
407     : public CacheArcIterator< SynchronizeFst<A> > {
408  public:
409   typedef typename A::StateId StateId;
410 
ArcIterator(const SynchronizeFst<A> & fst,StateId s)411   ArcIterator(const SynchronizeFst<A> &fst, StateId s)
412       : CacheArcIterator< SynchronizeFst<A> >(fst.GetImpl(), s) {
413     if (!fst.GetImpl()->HasArcs(s))
414       fst.GetImpl()->Expand(s);
415   }
416 
417  private:
418   DISALLOW_COPY_AND_ASSIGN(ArcIterator);
419 };
420 
421 
422 template <class A> inline
InitStateIterator(StateIteratorData<A> * data)423 void SynchronizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const
424 {
425   data->base = new StateIterator< SynchronizeFst<A> >(*this);
426 }
427 
428 
429 
430 // Synchronizes a transducer. This version writes the synchronized
431 // result to a MutableFst.  The result will be an equivalent FST that
432 // has the property that during the traversal of a path, the delay is
433 // either zero or strictly increasing, where the delay is the
434 // difference between the number of non-epsilon output labels and
435 // input labels along the path.
436 //
437 // For the algorithm to terminate, the input transducer must have
438 // bounded delay, i.e., the delay of every cycle must be zero.
439 //
440 // Complexity:
441 // - A has bounded delay: exponential
442 // - A does not have bounded delay: does not terminate
443 //
444 // References:
445 // - Mehryar Mohri. Edit-Distance of Weighted Automata: General
446 //   Definitions and Algorithms, International Journal of Computer
447 //   Science, 14(6): 957-982 (2003).
448 template<class Arc>
Synchronize(const Fst<Arc> & ifst,MutableFst<Arc> * ofst)449 void Synchronize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst) {
450   SynchronizeFstOptions opts;
451   opts.gc_limit = 0;  // Cache only the last state for fastest copy.
452   *ofst = SynchronizeFst<Arc>(ifst, opts);
453 }
454 
455 }  // namespace fst
456 
457 #endif // FST_LIB_SYNCHRONIZE_H__
458