1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Compiled regular expression representation.
6 // Tested by compile_test.cc
7
8 #include "util/util.h"
9 #include "util/sparse_set.h"
10 #include "re2/prog.h"
11 #include "re2/stringpiece.h"
12
13 namespace re2 {
14
15 // Constructors per Inst opcode
16
InitAlt(uint32 out,uint32 out1)17 void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
18 DCHECK_EQ(out_opcode_, 0);
19 set_out_opcode(out, kInstAlt);
20 out1_ = out1;
21 }
22
InitByteRange(int lo,int hi,int foldcase,uint32 out)23 void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
24 DCHECK_EQ(out_opcode_, 0);
25 set_out_opcode(out, kInstByteRange);
26 lo_ = lo & 0xFF;
27 hi_ = hi & 0xFF;
28 foldcase_ = foldcase;
29 }
30
InitCapture(int cap,uint32 out)31 void Prog::Inst::InitCapture(int cap, uint32 out) {
32 DCHECK_EQ(out_opcode_, 0);
33 set_out_opcode(out, kInstCapture);
34 cap_ = cap;
35 }
36
InitEmptyWidth(EmptyOp empty,uint32 out)37 void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
38 DCHECK_EQ(out_opcode_, 0);
39 set_out_opcode(out, kInstEmptyWidth);
40 empty_ = empty;
41 }
42
InitMatch(int32 id)43 void Prog::Inst::InitMatch(int32 id) {
44 DCHECK_EQ(out_opcode_, 0);
45 set_opcode(kInstMatch);
46 match_id_ = id;
47 }
48
InitNop(uint32 out)49 void Prog::Inst::InitNop(uint32 out) {
50 DCHECK_EQ(out_opcode_, 0);
51 set_opcode(kInstNop);
52 }
53
InitFail()54 void Prog::Inst::InitFail() {
55 DCHECK_EQ(out_opcode_, 0);
56 set_opcode(kInstFail);
57 }
58
Dump()59 string Prog::Inst::Dump() {
60 switch (opcode()) {
61 default:
62 return StringPrintf("opcode %d", static_cast<int>(opcode()));
63
64 case kInstAlt:
65 return StringPrintf("alt -> %d | %d", out(), out1_);
66
67 case kInstAltMatch:
68 return StringPrintf("altmatch -> %d | %d", out(), out1_);
69
70 case kInstByteRange:
71 return StringPrintf("byte%s [%02x-%02x] -> %d",
72 foldcase_ ? "/i" : "",
73 lo_, hi_, out());
74
75 case kInstCapture:
76 return StringPrintf("capture %d -> %d", cap_, out());
77
78 case kInstEmptyWidth:
79 return StringPrintf("emptywidth %#x -> %d",
80 static_cast<int>(empty_), out());
81
82 case kInstMatch:
83 return StringPrintf("match! %d", match_id());
84
85 case kInstNop:
86 return StringPrintf("nop -> %d", out());
87
88 case kInstFail:
89 return StringPrintf("fail");
90 }
91 }
92
Prog()93 Prog::Prog()
94 : anchor_start_(false),
95 anchor_end_(false),
96 reversed_(false),
97 did_onepass_(false),
98 start_(0),
99 start_unanchored_(0),
100 size_(0),
101 byte_inst_count_(0),
102 bytemap_range_(0),
103 flags_(0),
104 onepass_statesize_(0),
105 inst_(NULL),
106 dfa_first_(NULL),
107 dfa_longest_(NULL),
108 dfa_mem_(0),
109 delete_dfa_(NULL),
110 unbytemap_(NULL),
111 onepass_nodes_(NULL),
112 onepass_start_(NULL) {
113 }
114
~Prog()115 Prog::~Prog() {
116 if (delete_dfa_) {
117 if (dfa_first_)
118 delete_dfa_(dfa_first_);
119 if (dfa_longest_)
120 delete_dfa_(dfa_longest_);
121 }
122 delete[] onepass_nodes_;
123 delete[] inst_;
124 delete[] unbytemap_;
125 }
126
127 typedef SparseSet Workq;
128
AddToQueue(Workq * q,int id)129 static inline void AddToQueue(Workq* q, int id) {
130 if (id != 0)
131 q->insert(id);
132 }
133
ProgToString(Prog * prog,Workq * q)134 static string ProgToString(Prog* prog, Workq* q) {
135 string s;
136
137 for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
138 int id = *i;
139 Prog::Inst* ip = prog->inst(id);
140 StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
141 AddToQueue(q, ip->out());
142 if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
143 AddToQueue(q, ip->out1());
144 }
145 return s;
146 }
147
Dump()148 string Prog::Dump() {
149 string map;
150 if (false) { // Debugging
151 int lo = 0;
152 StringAppendF(&map, "byte map:\n");
153 for (int i = 0; i < bytemap_range_; i++) {
154 StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
155 lo = unbytemap_[i] + 1;
156 }
157 StringAppendF(&map, "\n");
158 }
159
160 Workq q(size_);
161 AddToQueue(&q, start_);
162 return map + ProgToString(this, &q);
163 }
164
DumpUnanchored()165 string Prog::DumpUnanchored() {
166 Workq q(size_);
167 AddToQueue(&q, start_unanchored_);
168 return ProgToString(this, &q);
169 }
170
171 static bool IsMatch(Prog*, Prog::Inst*);
172
173 // Peep-hole optimizer.
Optimize()174 void Prog::Optimize() {
175 Workq q(size_);
176
177 // Eliminate nops. Most are taken out during compilation
178 // but a few are hard to avoid.
179 q.clear();
180 AddToQueue(&q, start_);
181 for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
182 int id = *i;
183
184 Inst* ip = inst(id);
185 int j = ip->out();
186 Inst* jp;
187 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
188 j = jp->out();
189 }
190 ip->set_out(j);
191 AddToQueue(&q, ip->out());
192
193 if (ip->opcode() == kInstAlt) {
194 j = ip->out1();
195 while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
196 j = jp->out();
197 }
198 ip->out1_ = j;
199 AddToQueue(&q, ip->out1());
200 }
201 }
202
203 // Insert kInstAltMatch instructions
204 // Look for
205 // ip: Alt -> j | k
206 // j: ByteRange [00-FF] -> ip
207 // k: Match
208 // or the reverse (the above is the greedy one).
209 // Rewrite Alt to AltMatch.
210 q.clear();
211 AddToQueue(&q, start_);
212 for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
213 int id = *i;
214 Inst* ip = inst(id);
215 AddToQueue(&q, ip->out());
216 if (ip->opcode() == kInstAlt)
217 AddToQueue(&q, ip->out1());
218
219 if (ip->opcode() == kInstAlt) {
220 Inst* j = inst(ip->out());
221 Inst* k = inst(ip->out1());
222 if (j->opcode() == kInstByteRange && j->out() == id &&
223 j->lo() == 0x00 && j->hi() == 0xFF &&
224 IsMatch(this, k)) {
225 ip->set_opcode(kInstAltMatch);
226 continue;
227 }
228 if (IsMatch(this, j) &&
229 k->opcode() == kInstByteRange && k->out() == id &&
230 k->lo() == 0x00 && k->hi() == 0xFF) {
231 ip->set_opcode(kInstAltMatch);
232 }
233 }
234 }
235 }
236
237 // Is ip a guaranteed match at end of text, perhaps after some capturing?
IsMatch(Prog * prog,Prog::Inst * ip)238 static bool IsMatch(Prog* prog, Prog::Inst* ip) {
239 for (;;) {
240 switch (ip->opcode()) {
241 default:
242 LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
243 return false;
244
245 case kInstAlt:
246 case kInstAltMatch:
247 case kInstByteRange:
248 case kInstFail:
249 case kInstEmptyWidth:
250 return false;
251
252 case kInstCapture:
253 case kInstNop:
254 ip = prog->inst(ip->out());
255 break;
256
257 case kInstMatch:
258 return true;
259 }
260 }
261 }
262
EmptyFlags(const StringPiece & text,const char * p)263 uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
264 int flags = 0;
265
266 // ^ and \A
267 if (p == text.begin())
268 flags |= kEmptyBeginText | kEmptyBeginLine;
269 else if (p[-1] == '\n')
270 flags |= kEmptyBeginLine;
271
272 // $ and \z
273 if (p == text.end())
274 flags |= kEmptyEndText | kEmptyEndLine;
275 else if (p < text.end() && p[0] == '\n')
276 flags |= kEmptyEndLine;
277
278 // \b and \B
279 if (p == text.begin() && p == text.end()) {
280 // no word boundary here
281 } else if (p == text.begin()) {
282 if (IsWordChar(p[0]))
283 flags |= kEmptyWordBoundary;
284 } else if (p == text.end()) {
285 if (IsWordChar(p[-1]))
286 flags |= kEmptyWordBoundary;
287 } else {
288 if (IsWordChar(p[-1]) != IsWordChar(p[0]))
289 flags |= kEmptyWordBoundary;
290 }
291 if (!(flags & kEmptyWordBoundary))
292 flags |= kEmptyNonWordBoundary;
293
294 return flags;
295 }
296
MarkByteRange(int lo,int hi)297 void Prog::MarkByteRange(int lo, int hi) {
298 CHECK_GE(lo, 0);
299 CHECK_GE(hi, 0);
300 CHECK_LE(lo, 255);
301 CHECK_LE(hi, 255);
302 if (lo > 0)
303 byterange_.Set(lo - 1);
304 byterange_.Set(hi);
305 }
306
ComputeByteMap()307 void Prog::ComputeByteMap() {
308 // Fill in bytemap with byte classes for prog_.
309 // Ranges of bytes that are treated as indistinguishable
310 // by the regexp program are mapped to a single byte class.
311 // The vector prog_->byterange() marks the end of each
312 // such range.
313 const Bitmap<256>& v = byterange();
314
315 COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
316 uint8 n = 0;
317 uint32 bits = 0;
318 for (int i = 0; i < 256; i++) {
319 if ((i&31) == 0)
320 bits = v.Word(i >> 5);
321 bytemap_[i] = n;
322 n += bits & 1;
323 bits >>= 1;
324 }
325 bytemap_range_ = bytemap_[255] + 1;
326 unbytemap_ = new uint8[bytemap_range_];
327 for (int i = 0; i < 256; i++)
328 unbytemap_[bytemap_[i]] = i;
329
330 if (0) { // For debugging: use trivial byte map.
331 for (int i = 0; i < 256; i++) {
332 bytemap_[i] = i;
333 unbytemap_[i] = i;
334 }
335 bytemap_range_ = 256;
336 LOG(INFO) << "Using trivial bytemap.";
337 }
338 }
339
340 } // namespace re2
341
342