1 //===- DFAEmitter.cpp - Finite state automaton emitter --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class can produce a generic deterministic finite state automaton (DFA),
10 // given a set of possible states and transitions.
11 //
12 // The input transitions can be nondeterministic - this class will produce the
13 // deterministic equivalent state machine.
14 //
15 // The generated code can run the DFA and produce an accepted / not accepted
16 // state and also produce, given a sequence of transitions that results in an
17 // accepted state, the sequence of intermediate states. This is useful if the
18 // initial automaton was nondeterministic - it allows mapping back from the DFA
19 // to the NFA.
20 //
21 //===----------------------------------------------------------------------===//
22 #define DEBUG_TYPE "dfa-emitter"
23
#include "DFAEmitter.h"
#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
#include "TableGenBackends.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <deque>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
41
42 using namespace llvm;
43
44 //===----------------------------------------------------------------------===//
45 // DfaEmitter implementation. This is independent of the GenAutomaton backend.
46 //===----------------------------------------------------------------------===//
47
addTransition(state_type From,state_type To,action_type A)48 void DfaEmitter::addTransition(state_type From, state_type To, action_type A) {
49 Actions.insert(A);
50 NfaStates.insert(From);
51 NfaStates.insert(To);
52 NfaTransitions[{From, A}].push_back(To);
53 ++NumNfaTransitions;
54 }
55
visitDfaState(const DfaState & DS)56 void DfaEmitter::visitDfaState(const DfaState &DS) {
57 // For every possible action...
58 auto FromId = DfaStates.idFor(DS);
59 for (action_type A : Actions) {
60 DfaState NewStates;
61 DfaTransitionInfo TI;
62 // For every represented state, word pair in the original NFA...
63 for (state_type FromState : DS) {
64 // If this action is possible from this state add the transitioned-to
65 // states to NewStates.
66 auto I = NfaTransitions.find({FromState, A});
67 if (I == NfaTransitions.end())
68 continue;
69 for (state_type &ToState : I->second) {
70 NewStates.push_back(ToState);
71 TI.emplace_back(FromState, ToState);
72 }
73 }
74 if (NewStates.empty())
75 continue;
76 // Sort and unique.
77 sort(NewStates);
78 NewStates.erase(std::unique(NewStates.begin(), NewStates.end()),
79 NewStates.end());
80 sort(TI);
81 TI.erase(std::unique(TI.begin(), TI.end()), TI.end());
82 unsigned ToId = DfaStates.insert(NewStates);
83 DfaTransitions.emplace(std::make_pair(FromId, A), std::make_pair(ToId, TI));
84 }
85 }
86
constructDfa()87 void DfaEmitter::constructDfa() {
88 DfaState Initial(1, /*NFA initial state=*/0);
89 DfaStates.insert(Initial);
90
91 // Note that UniqueVector starts indices at 1, not zero.
92 unsigned DfaStateId = 1;
93 while (DfaStateId <= DfaStates.size()) {
94 DfaState S = DfaStates[DfaStateId];
95 visitDfaState(S);
96 DfaStateId++;
97 }
98 }
99
emit(StringRef Name,raw_ostream & OS)100 void DfaEmitter::emit(StringRef Name, raw_ostream &OS) {
101 constructDfa();
102
103 OS << "// Input NFA has " << NfaStates.size() << " states with "
104 << NumNfaTransitions << " transitions.\n";
105 OS << "// Generated DFA has " << DfaStates.size() << " states with "
106 << DfaTransitions.size() << " transitions.\n\n";
107
108 // Implementation note: We don't bake a simple std::pair<> here as it requires
109 // significantly more effort to parse. A simple test with a large array of
110 // struct-pairs (N=100000) took clang-10 6s to parse. The same array of
111 // std::pair<uint64_t, uint64_t> took 242s. Instead we allow the user to
112 // define the pair type.
113 //
114 // FIXME: It may make sense to emit these as ULEB sequences instead of
115 // pairs of uint64_t.
116 OS << "// A zero-terminated sequence of NFA state transitions. Every DFA\n";
117 OS << "// transition implies a set of NFA transitions. These are referred\n";
118 OS << "// to by index in " << Name << "Transitions[].\n";
119
120 SequenceToOffsetTable<DfaTransitionInfo> Table;
121 std::map<DfaTransitionInfo, unsigned> EmittedIndices;
122 for (auto &T : DfaTransitions)
123 Table.add(T.second.second);
124 Table.layout();
125 OS << "const std::array<NfaStatePair, " << Table.size() << "> " << Name
126 << "TransitionInfo = {{\n";
127 Table.emit(
128 OS,
129 [](raw_ostream &OS, std::pair<uint64_t, uint64_t> P) {
130 OS << "{" << P.first << ", " << P.second << "}";
131 },
132 "{0ULL, 0ULL}");
133
134 OS << "}};\n\n";
135
136 OS << "// A transition in the generated " << Name << " DFA.\n";
137 OS << "struct " << Name << "Transition {\n";
138 OS << " unsigned FromDfaState; // The transitioned-from DFA state.\n";
139 OS << " ";
140 printActionType(OS);
141 OS << " Action; // The input symbol that causes this transition.\n";
142 OS << " unsigned ToDfaState; // The transitioned-to DFA state.\n";
143 OS << " unsigned InfoIdx; // Start index into " << Name
144 << "TransitionInfo.\n";
145 OS << "};\n\n";
146
147 OS << "// A table of DFA transitions, ordered by {FromDfaState, Action}.\n";
148 OS << "// The initial state is 1, not zero.\n";
149 OS << "const std::array<" << Name << "Transition, "
150 << DfaTransitions.size() << "> " << Name << "Transitions = {{\n";
151 for (auto &KV : DfaTransitions) {
152 dfa_state_type From = KV.first.first;
153 dfa_state_type To = KV.second.first;
154 action_type A = KV.first.second;
155 unsigned InfoIdx = Table.get(KV.second.second);
156 OS << " {" << From << ", ";
157 printActionValue(A, OS);
158 OS << ", " << To << ", " << InfoIdx << "},\n";
159 }
160 OS << "\n}};\n\n";
161 }
162
printActionType(raw_ostream & OS)163 void DfaEmitter::printActionType(raw_ostream &OS) { OS << "uint64_t"; }
164
printActionValue(action_type A,raw_ostream & OS)165 void DfaEmitter::printActionValue(action_type A, raw_ostream &OS) { OS << A; }
166
167 //===----------------------------------------------------------------------===//
168 // AutomatonEmitter implementation
169 //===----------------------------------------------------------------------===//
170
171 namespace {
172 // FIXME: This entire discriminated union could be removed with c++17:
173 // using Action = std::variant<Record *, unsigned, std::string>;
174 struct Action {
175 Record *R = nullptr;
176 unsigned I = 0;
177 std::string S;
178
179 Action() = default;
Action__anon24fc4d370211::Action180 Action(Record *R, unsigned I, std::string S) : R(R), I(I), S(S) {}
181
print__anon24fc4d370211::Action182 void print(raw_ostream &OS) const {
183 if (R)
184 OS << R->getName();
185 else if (!S.empty())
186 OS << '"' << S << '"';
187 else
188 OS << I;
189 }
operator <__anon24fc4d370211::Action190 bool operator<(const Action &Other) const {
191 return std::make_tuple(R, I, S) <
192 std::make_tuple(Other.R, Other.I, Other.S);
193 }
194 };
195
196 using ActionTuple = std::vector<Action>;
197 class Automaton;
198
199 class Transition {
200 uint64_t NewState;
201 // The tuple of actions that causes this transition.
202 ActionTuple Actions;
203 // The types of the actions; this is the same across all transitions.
204 SmallVector<std::string, 4> Types;
205
206 public:
207 Transition(Record *R, Automaton *Parent);
getActions()208 const ActionTuple &getActions() { return Actions; }
getTypes()209 SmallVector<std::string, 4> getTypes() { return Types; }
210
211 bool canTransitionFrom(uint64_t State);
212 uint64_t transitionFrom(uint64_t State);
213 };
214
215 class Automaton {
216 RecordKeeper &Records;
217 Record *R;
218 std::vector<Transition> Transitions;
219 /// All possible action tuples, uniqued.
220 UniqueVector<ActionTuple> Actions;
221 /// The fields within each Transition object to find the action symbols.
222 std::vector<StringRef> ActionSymbolFields;
223
224 public:
225 Automaton(RecordKeeper &Records, Record *R);
226 void emit(raw_ostream &OS);
227
getActionSymbolFields()228 ArrayRef<StringRef> getActionSymbolFields() { return ActionSymbolFields; }
229 /// If the type of action A has been overridden (there exists a field
230 /// "TypeOf_A") return that, otherwise return the empty string.
231 StringRef getActionSymbolType(StringRef A);
232 };
233
234 class AutomatonEmitter {
235 RecordKeeper &Records;
236
237 public:
AutomatonEmitter(RecordKeeper & R)238 AutomatonEmitter(RecordKeeper &R) : Records(R) {}
239 void run(raw_ostream &OS);
240 };
241
242 /// A DfaEmitter implementation that can print our variant action type.
243 class CustomDfaEmitter : public DfaEmitter {
244 const UniqueVector<ActionTuple> &Actions;
245 std::string TypeName;
246
247 public:
CustomDfaEmitter(const UniqueVector<ActionTuple> & Actions,StringRef TypeName)248 CustomDfaEmitter(const UniqueVector<ActionTuple> &Actions, StringRef TypeName)
249 : Actions(Actions), TypeName(TypeName) {}
250
251 void printActionType(raw_ostream &OS) override;
252 void printActionValue(action_type A, raw_ostream &OS) override;
253 };
254 } // namespace
255
run(raw_ostream & OS)256 void AutomatonEmitter::run(raw_ostream &OS) {
257 for (Record *R : Records.getAllDerivedDefinitions("GenericAutomaton")) {
258 Automaton A(Records, R);
259 OS << "#ifdef GET_" << R->getName() << "_DECL\n";
260 A.emit(OS);
261 OS << "#endif // GET_" << R->getName() << "_DECL\n";
262 }
263 }
264
Automaton(RecordKeeper & Records,Record * R)265 Automaton::Automaton(RecordKeeper &Records, Record *R)
266 : Records(Records), R(R) {
267 LLVM_DEBUG(dbgs() << "Emitting automaton for " << R->getName() << "\n");
268 ActionSymbolFields = R->getValueAsListOfStrings("SymbolFields");
269 }
270
emit(raw_ostream & OS)271 void Automaton::emit(raw_ostream &OS) {
272 StringRef TransitionClass = R->getValueAsString("TransitionClass");
273 for (Record *T : Records.getAllDerivedDefinitions(TransitionClass)) {
274 assert(T->isSubClassOf("Transition"));
275 Transitions.emplace_back(T, this);
276 Actions.insert(Transitions.back().getActions());
277 }
278
279 LLVM_DEBUG(dbgs() << " Action alphabet cardinality: " << Actions.size()
280 << "\n");
281 LLVM_DEBUG(dbgs() << " Each state has " << Transitions.size()
282 << " potential transitions.\n");
283
284 StringRef Name = R->getName();
285
286 CustomDfaEmitter Emitter(Actions, std::string(Name) + "Action");
287 // Starting from the initial state, build up a list of possible states and
288 // transitions.
289 std::deque<uint64_t> Worklist(1, 0);
290 std::set<uint64_t> SeenStates;
291 unsigned NumTransitions = 0;
292 SeenStates.insert(Worklist.front());
293 while (!Worklist.empty()) {
294 uint64_t State = Worklist.front();
295 Worklist.pop_front();
296 for (Transition &T : Transitions) {
297 if (!T.canTransitionFrom(State))
298 continue;
299 uint64_t NewState = T.transitionFrom(State);
300 if (SeenStates.emplace(NewState).second)
301 Worklist.emplace_back(NewState);
302 ++NumTransitions;
303 Emitter.addTransition(State, NewState, Actions.idFor(T.getActions()));
304 }
305 }
306 LLVM_DEBUG(dbgs() << " NFA automaton has " << SeenStates.size()
307 << " states with " << NumTransitions << " transitions.\n");
308
309 const auto &ActionTypes = Transitions.back().getTypes();
310 OS << "// The type of an action in the " << Name << " automaton.\n";
311 if (ActionTypes.size() == 1) {
312 OS << "using " << Name << "Action = " << ActionTypes[0] << ";\n";
313 } else {
314 OS << "using " << Name << "Action = std::tuple<" << join(ActionTypes, ", ")
315 << ">;\n";
316 }
317 OS << "\n";
318
319 Emitter.emit(Name, OS);
320 }
321
getActionSymbolType(StringRef A)322 StringRef Automaton::getActionSymbolType(StringRef A) {
323 Twine Ty = "TypeOf_" + A;
324 if (!R->getValue(Ty.str()))
325 return "";
326 return R->getValueAsString(Ty.str());
327 }
328
Transition(Record * R,Automaton * Parent)329 Transition::Transition(Record *R, Automaton *Parent) {
330 BitsInit *NewStateInit = R->getValueAsBitsInit("NewState");
331 NewState = 0;
332 assert(NewStateInit->getNumBits() <= sizeof(uint64_t) * 8 &&
333 "State cannot be represented in 64 bits!");
334 for (unsigned I = 0; I < NewStateInit->getNumBits(); ++I) {
335 if (auto *Bit = dyn_cast<BitInit>(NewStateInit->getBit(I))) {
336 if (Bit->getValue())
337 NewState |= 1ULL << I;
338 }
339 }
340
341 for (StringRef A : Parent->getActionSymbolFields()) {
342 RecordVal *SymbolV = R->getValue(A);
343 if (auto *Ty = dyn_cast<RecordRecTy>(SymbolV->getType())) {
344 Actions.emplace_back(R->getValueAsDef(A), 0, "");
345 Types.emplace_back(Ty->getAsString());
346 } else if (isa<IntRecTy>(SymbolV->getType())) {
347 Actions.emplace_back(nullptr, R->getValueAsInt(A), "");
348 Types.emplace_back("unsigned");
349 } else if (isa<StringRecTy>(SymbolV->getType())) {
350 Actions.emplace_back(nullptr, 0, std::string(R->getValueAsString(A)));
351 Types.emplace_back("std::string");
352 } else {
353 report_fatal_error("Unhandled symbol type!");
354 }
355
356 StringRef TypeOverride = Parent->getActionSymbolType(A);
357 if (!TypeOverride.empty())
358 Types.back() = std::string(TypeOverride);
359 }
360 }
361
canTransitionFrom(uint64_t State)362 bool Transition::canTransitionFrom(uint64_t State) {
363 if ((State & NewState) == 0)
364 // The bits we want to set are not set;
365 return true;
366 return false;
367 }
368
transitionFrom(uint64_t State)369 uint64_t Transition::transitionFrom(uint64_t State) {
370 return State | NewState;
371 }
372
printActionType(raw_ostream & OS)373 void CustomDfaEmitter::printActionType(raw_ostream &OS) { OS << TypeName; }
374
printActionValue(action_type A,raw_ostream & OS)375 void CustomDfaEmitter::printActionValue(action_type A, raw_ostream &OS) {
376 const ActionTuple &AT = Actions[A];
377 if (AT.size() > 1)
378 OS << "std::make_tuple(";
379 bool First = true;
380 for (const auto &SingleAction : AT) {
381 if (!First)
382 OS << ", ";
383 First = false;
384 SingleAction.print(OS);
385 }
386 if (AT.size() > 1)
387 OS << ")";
388 }
389
390 namespace llvm {
391
EmitAutomata(RecordKeeper & RK,raw_ostream & OS)392 void EmitAutomata(RecordKeeper &RK, raw_ostream &OS) {
393 AutomatonEmitter(RK).run(OS);
394 }
395
396 } // namespace llvm
397