1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Format a regular expression structure as a string.
6 // Tested by parse_test.cc
7
8 #include <string.h>
9 #include <string>
10
11 #include "util/util.h"
12 #include "util/logging.h"
13 #include "util/strutil.h"
14 #include "util/utf.h"
15 #include "re2/regexp.h"
16 #include "re2/walker-inl.h"
17
18 namespace re2 {
19
20 enum {
21 PrecAtom,
22 PrecUnary,
23 PrecConcat,
24 PrecAlternate,
25 PrecEmpty,
26 PrecParen,
27 PrecToplevel,
28 };
29
30 // Helper function. See description below.
31 static void AppendCCRange(std::string* t, Rune lo, Rune hi);
32
33 // Walker to generate string in s_.
34 // The arg pointers are actually integers giving the
35 // context precedence.
36 // The child_args are always NULL.
37 class ToStringWalker : public Regexp::Walker<int> {
38 public:
ToStringWalker(std::string * t)39 explicit ToStringWalker(std::string* t) : t_(t) {}
40
41 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
42 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
43 int* child_args, int nchild_args);
ShortVisit(Regexp * re,int parent_arg)44 virtual int ShortVisit(Regexp* re, int parent_arg) {
45 return 0;
46 }
47
48 private:
49 std::string* t_; // The string the walker appends to.
50
51 ToStringWalker(const ToStringWalker&) = delete;
52 ToStringWalker& operator=(const ToStringWalker&) = delete;
53 };
54
ToString()55 std::string Regexp::ToString() {
56 std::string t;
57 ToStringWalker w(&t);
58 w.WalkExponential(this, PrecToplevel, 100000);
59 if (w.stopped_early())
60 t += " [truncated]";
61 return t;
62 }
63
64 #define ToString DontCallToString // Avoid accidental recursion.
65
66 // Visits re before children are processed.
67 // Appends ( if needed and passes new precedence to children.
PreVisit(Regexp * re,int parent_arg,bool * stop)68 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
69 int prec = parent_arg;
70 int nprec = PrecAtom;
71
72 switch (re->op()) {
73 case kRegexpNoMatch:
74 case kRegexpEmptyMatch:
75 case kRegexpLiteral:
76 case kRegexpAnyChar:
77 case kRegexpAnyByte:
78 case kRegexpBeginLine:
79 case kRegexpEndLine:
80 case kRegexpBeginText:
81 case kRegexpEndText:
82 case kRegexpWordBoundary:
83 case kRegexpNoWordBoundary:
84 case kRegexpCharClass:
85 case kRegexpHaveMatch:
86 nprec = PrecAtom;
87 break;
88
89 case kRegexpConcat:
90 case kRegexpLiteralString:
91 if (prec < PrecConcat)
92 t_->append("(?:");
93 nprec = PrecConcat;
94 break;
95
96 case kRegexpAlternate:
97 if (prec < PrecAlternate)
98 t_->append("(?:");
99 nprec = PrecAlternate;
100 break;
101
102 case kRegexpCapture:
103 t_->append("(");
104 if (re->cap() == 0)
105 LOG(DFATAL) << "kRegexpCapture cap() == 0";
106 if (re->name()) {
107 t_->append("?P<");
108 t_->append(*re->name());
109 t_->append(">");
110 }
111 nprec = PrecParen;
112 break;
113
114 case kRegexpStar:
115 case kRegexpPlus:
116 case kRegexpQuest:
117 case kRegexpRepeat:
118 if (prec < PrecUnary)
119 t_->append("(?:");
120 // The subprecedence here is PrecAtom instead of PrecUnary
121 // because PCRE treats two unary ops in a row as a parse error.
122 nprec = PrecAtom;
123 break;
124 }
125
126 return nprec;
127 }
128
AppendLiteral(std::string * t,Rune r,bool foldcase)129 static void AppendLiteral(std::string *t, Rune r, bool foldcase) {
130 if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
131 t->append(1, '\\');
132 t->append(1, static_cast<char>(r));
133 } else if (foldcase && 'a' <= r && r <= 'z') {
134 r -= 'a' - 'A';
135 t->append(1, '[');
136 t->append(1, static_cast<char>(r));
137 t->append(1, static_cast<char>(r) + 'a' - 'A');
138 t->append(1, ']');
139 } else {
140 AppendCCRange(t, r, r);
141 }
142 }
143
144 // Visits re after children are processed.
145 // For childless regexps, all the work is done here.
146 // For regexps with children, append any unary suffixes or ).
PostVisit(Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)147 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
148 int* child_args, int nchild_args) {
149 int prec = parent_arg;
150 switch (re->op()) {
151 case kRegexpNoMatch:
152 // There's no simple symbol for "no match", but
153 // [^0-Runemax] excludes everything.
154 t_->append("[^\\x00-\\x{10ffff}]");
155 break;
156
157 case kRegexpEmptyMatch:
158 // Append (?:) to make empty string visible,
159 // unless this is already being parenthesized.
160 if (prec < PrecEmpty)
161 t_->append("(?:)");
162 break;
163
164 case kRegexpLiteral:
165 AppendLiteral(t_, re->rune(),
166 (re->parse_flags() & Regexp::FoldCase) != 0);
167 break;
168
169 case kRegexpLiteralString:
170 for (int i = 0; i < re->nrunes(); i++)
171 AppendLiteral(t_, re->runes()[i],
172 (re->parse_flags() & Regexp::FoldCase) != 0);
173 if (prec < PrecConcat)
174 t_->append(")");
175 break;
176
177 case kRegexpConcat:
178 if (prec < PrecConcat)
179 t_->append(")");
180 break;
181
182 case kRegexpAlternate:
183 // Clumsy but workable: the children all appended |
184 // at the end of their strings, so just remove the last one.
185 if ((*t_)[t_->size()-1] == '|')
186 t_->erase(t_->size()-1);
187 else
188 LOG(DFATAL) << "Bad final char: " << t_;
189 if (prec < PrecAlternate)
190 t_->append(")");
191 break;
192
193 case kRegexpStar:
194 t_->append("*");
195 if (re->parse_flags() & Regexp::NonGreedy)
196 t_->append("?");
197 if (prec < PrecUnary)
198 t_->append(")");
199 break;
200
201 case kRegexpPlus:
202 t_->append("+");
203 if (re->parse_flags() & Regexp::NonGreedy)
204 t_->append("?");
205 if (prec < PrecUnary)
206 t_->append(")");
207 break;
208
209 case kRegexpQuest:
210 t_->append("?");
211 if (re->parse_flags() & Regexp::NonGreedy)
212 t_->append("?");
213 if (prec < PrecUnary)
214 t_->append(")");
215 break;
216
217 case kRegexpRepeat:
218 if (re->max() == -1)
219 t_->append(StringPrintf("{%d,}", re->min()));
220 else if (re->min() == re->max())
221 t_->append(StringPrintf("{%d}", re->min()));
222 else
223 t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
224 if (re->parse_flags() & Regexp::NonGreedy)
225 t_->append("?");
226 if (prec < PrecUnary)
227 t_->append(")");
228 break;
229
230 case kRegexpAnyChar:
231 t_->append(".");
232 break;
233
234 case kRegexpAnyByte:
235 t_->append("\\C");
236 break;
237
238 case kRegexpBeginLine:
239 t_->append("^");
240 break;
241
242 case kRegexpEndLine:
243 t_->append("$");
244 break;
245
246 case kRegexpBeginText:
247 t_->append("(?-m:^)");
248 break;
249
250 case kRegexpEndText:
251 if (re->parse_flags() & Regexp::WasDollar)
252 t_->append("(?-m:$)");
253 else
254 t_->append("\\z");
255 break;
256
257 case kRegexpWordBoundary:
258 t_->append("\\b");
259 break;
260
261 case kRegexpNoWordBoundary:
262 t_->append("\\B");
263 break;
264
265 case kRegexpCharClass: {
266 if (re->cc()->size() == 0) {
267 t_->append("[^\\x00-\\x{10ffff}]");
268 break;
269 }
270 t_->append("[");
271 // Heuristic: show class as negated if it contains the
272 // non-character 0xFFFE and yet somehow isn't full.
273 CharClass* cc = re->cc();
274 if (cc->Contains(0xFFFE) && !cc->full()) {
275 cc = cc->Negate();
276 t_->append("^");
277 }
278 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
279 AppendCCRange(t_, i->lo, i->hi);
280 if (cc != re->cc())
281 cc->Delete();
282 t_->append("]");
283 break;
284 }
285
286 case kRegexpCapture:
287 t_->append(")");
288 break;
289
290 case kRegexpHaveMatch:
291 // There's no syntax accepted by the parser to generate
292 // this node (it is generated by RE2::Set) so make something
293 // up that is readable but won't compile.
294 t_->append("(?HaveMatch:%d)", re->match_id());
295 break;
296 }
297
298 // If the parent is an alternation, append the | for it.
299 if (prec == PrecAlternate)
300 t_->append("|");
301
302 return 0;
303 }
304
305 // Appends a rune for use in a character class to the string t.
AppendCCChar(std::string * t,Rune r)306 static void AppendCCChar(std::string* t, Rune r) {
307 if (0x20 <= r && r <= 0x7E) {
308 if (strchr("[]^-\\", r))
309 t->append("\\");
310 t->append(1, static_cast<char>(r));
311 return;
312 }
313 switch (r) {
314 default:
315 break;
316
317 case '\r':
318 t->append("\\r");
319 return;
320
321 case '\t':
322 t->append("\\t");
323 return;
324
325 case '\n':
326 t->append("\\n");
327 return;
328
329 case '\f':
330 t->append("\\f");
331 return;
332 }
333
334 if (r < 0x100) {
335 *t += StringPrintf("\\x%02x", static_cast<int>(r));
336 return;
337 }
338 *t += StringPrintf("\\x{%x}", static_cast<int>(r));
339 }
340
AppendCCRange(std::string * t,Rune lo,Rune hi)341 static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
342 if (lo > hi)
343 return;
344 AppendCCChar(t, lo);
345 if (lo < hi) {
346 t->append("-");
347 AppendCCChar(t, hi);
348 }
349 }
350
351 } // namespace re2
352