1 /* 2 * Copyright 2017 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 #ifndef NFAtoDFA_DEFINED 8 #define NFAtoDFA_DEFINED 9 10 #include "src/sksl/lex/DFA.h" 11 #include "src/sksl/lex/DFAState.h" 12 #include "src/sksl/lex/NFA.h" 13 #include "src/sksl/lex/NFAState.h" 14 15 #include <algorithm> 16 #include <climits> 17 #include <memory> 18 #include <unordered_map> 19 #include <set> 20 #include <vector> 21 22 /** 23 * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and 24 * DFAs differ only in that an NFA allows multiple states at the same time, we can find each 25 * possible combination of simultaneous NFA states and give this combination a label. These labelled 26 * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time. 27 * 28 * As an NFA can end up in multiple accept states at the same time (for instance, the token "while" 29 * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex 30 * (in terms of the order in which they were added to the NFA). 31 */ 32 class NFAtoDFA { 33 public: 34 static constexpr char START_CHAR = 9; 35 static constexpr char END_CHAR = 126; 36 NFAtoDFA(NFA * nfa)37 NFAtoDFA(NFA* nfa) 38 : fNFA(*nfa) {} 39 40 /** 41 * Returns a DFA created from the NFA. 42 */ convert()43 DFA convert() { 44 // create state 0, the "reject" state 45 getState(DFAState::Label({})); 46 // create a state representing being in all of the NFA's start states at once 47 std::vector<int> startStates = fNFA.fStartStates; 48 std::sort(startStates.begin(), startStates.end()); 49 // this becomes state 1, our start state 50 DFAState* start = getState(DFAState::Label(startStates)); 51 this->scanState(start); 52 53 this->computeMappings(); 54 55 int stateCount = 0; 56 for (const auto& row : fTransitions) { 57 stateCount = std::max(stateCount, (int) row.size()); 58 } 59 return DFA(fCharMappings, fTransitions, fAccepts); 60 } 61 62 private: 63 /** 64 * Returns an existing state with the given label, or creates a new one and returns it. 65 */ getState(DFAState::Label label)66 DFAState* getState(DFAState::Label label) { 67 auto found = fStates.find(label); 68 if (found == fStates.end()) { 69 int id = fStates.size(); 70 fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label)); 71 return fStates[label].get(); 72 } 73 return found->second.get(); 74 } 75 add(int nfaState,std::vector<int> * states)76 void add(int nfaState, std::vector<int>* states) { 77 NFAState state = fNFA.fStates[nfaState]; 78 if (state.fKind == NFAState::kRemapped_Kind) { 79 for (int next : state.fData) { 80 this->add(next, states); 81 } 82 } else { 83 for (int state : *states) { 84 if (nfaState == state) { 85 return; 86 } 87 } 88 states->push_back(nfaState); 89 } 90 } 91 addTransition(char c,int start,int next)92 void addTransition(char c, int start, int next) { 93 while (fTransitions.size() <= (size_t) c) { 94 fTransitions.push_back(std::vector<int>()); 95 } 96 std::vector<int>& row = fTransitions[c]; 97 while (row.size() <= (size_t) start) { 98 row.push_back(INVALID); 99 } 100 row[start] = next; 101 } 102 scanState(DFAState * state)103 void scanState(DFAState* state) { 104 state->fIsScanned = true; 105 for (char c = START_CHAR; c <= END_CHAR; ++c) { 106 std::vector<int> next; 107 int bestAccept = INT_MAX; 108 for (int idx : state->fLabel.fStates) { 109 const NFAState& nfaState = fNFA.fStates[idx]; 110 if (nfaState.accept(c)) { 111 for (int nextState : nfaState.fNext) { 112 if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) { 113 bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]); 114 } 115 this->add(nextState, &next); 116 } 117 } 118 } 119 std::sort(next.begin(), next.end()); 120 DFAState* nextState = this->getState(DFAState::Label(next)); 121 this->addTransition(c, state->fId, nextState->fId); 122 if (bestAccept != INT_MAX) { 123 while (fAccepts.size() <= (size_t) nextState->fId) { 124 fAccepts.push_back(INVALID); 125 } 126 fAccepts[nextState->fId] = bestAccept; 127 } 128 if (!nextState->fIsScanned) { 129 this->scanState(nextState); 130 } 131 } 132 } 133 134 // collapse rows with the same transitions to a single row. This is common, as each row 135 // represents a character and often there are many characters for which all transitions are 136 // identical (e.g. [0-9] are treated the same way by all lexer rules) computeMappings()137 void computeMappings() { 138 // mappings[<input row>] = <output row> 139 std::vector<std::vector<int>*> uniques; 140 // this could be done more efficiently, but O(n^2) is plenty fast for our purposes 141 for (size_t i = 0; i < fTransitions.size(); ++i) { 142 int found = -1; 143 for (size_t j = 0; j < uniques.size(); ++j) { 144 if (*uniques[j] == fTransitions[i]) { 145 found = j; 146 break; 147 } 148 } 149 if (found == -1) { 150 found = (int) uniques.size(); 151 uniques.push_back(&fTransitions[i]); 152 } 153 fCharMappings.push_back(found); 154 } 155 std::vector<std::vector<int>> newTransitions; 156 for (std::vector<int>* row : uniques) { 157 newTransitions.push_back(*row); 158 } 159 fTransitions = newTransitions; 160 } 161 162 const NFA& fNFA; 163 std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates; 164 std::vector<std::vector<int>> fTransitions; 165 std::vector<int> fCharMappings; 166 std::vector<int> fAccepts; 167 }; 168 #endif // NFAtoDFA_DEFINED 169