• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
//===-- Regex.cpp - Regular Expression matcher implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements a POSIX regular expression matcher.
//
//===----------------------------------------------------------------------===//
13 
14 #include "llvm/Support/Regex.h"
15 #include "llvm/ADT/SmallVector.h"
16 #include "llvm/ADT/StringRef.h"
17 #include "llvm/ADT/Twine.h"
18 #include <string>
19 
// Important: this must be included last because it defines "_REGEX_H_". At
// least on Darwin, if it is included before any header that (transitively)
// includes xlocale.h, this will cause trouble because of missing
// regex-related types.
23 #include "regex_impl.h"
24 
25 using namespace llvm;
26 
Regex()27 Regex::Regex() : preg(nullptr), error(REG_BADPAT) {}
28 
Regex(StringRef regex,unsigned Flags)29 Regex::Regex(StringRef regex, unsigned Flags) {
30   unsigned flags = 0;
31   preg = new llvm_regex();
32   preg->re_endp = regex.end();
33   if (Flags & IgnoreCase)
34     flags |= REG_ICASE;
35   if (Flags & Newline)
36     flags |= REG_NEWLINE;
37   if (!(Flags & BasicRegex))
38     flags |= REG_EXTENDED;
39   error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
40 }
41 
Regex(Regex && regex)42 Regex::Regex(Regex &&regex) {
43   preg = regex.preg;
44   error = regex.error;
45   regex.preg = nullptr;
46   regex.error = REG_BADPAT;
47 }
48 
~Regex()49 Regex::~Regex() {
50   if (preg) {
51     llvm_regfree(preg);
52     delete preg;
53   }
54 }
55 
isValid(std::string & Error) const56 bool Regex::isValid(std::string &Error) const {
57   if (!error)
58     return true;
59 
60   size_t len = llvm_regerror(error, preg, nullptr, 0);
61 
62   Error.resize(len - 1);
63   llvm_regerror(error, preg, &Error[0], len);
64   return false;
65 }
66 
67 /// getNumMatches - In a valid regex, return the number of parenthesized
68 /// matches it contains.
getNumMatches() const69 unsigned Regex::getNumMatches() const {
70   return preg->re_nsub;
71 }
72 
match(StringRef String,SmallVectorImpl<StringRef> * Matches)73 bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
74   if (error)
75     return false;
76 
77   unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
78 
79   // pmatch needs to have at least one element.
80   SmallVector<llvm_regmatch_t, 8> pm;
81   pm.resize(nmatch > 0 ? nmatch : 1);
82   pm[0].rm_so = 0;
83   pm[0].rm_eo = String.size();
84 
85   int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
86 
87   if (rc == REG_NOMATCH)
88     return false;
89   if (rc != 0) {
90     // regexec can fail due to invalid pattern or running out of memory.
91     error = rc;
92     return false;
93   }
94 
95   // There was a match.
96 
97   if (Matches) { // match position requested
98     Matches->clear();
99 
100     for (unsigned i = 0; i != nmatch; ++i) {
101       if (pm[i].rm_so == -1) {
102         // this group didn't match
103         Matches->push_back(StringRef());
104         continue;
105       }
106       assert(pm[i].rm_eo >= pm[i].rm_so);
107       Matches->push_back(StringRef(String.data()+pm[i].rm_so,
108                                    pm[i].rm_eo-pm[i].rm_so));
109     }
110   }
111 
112   return true;
113 }
114 
sub(StringRef Repl,StringRef String,std::string * Error)115 std::string Regex::sub(StringRef Repl, StringRef String,
116                        std::string *Error) {
117   SmallVector<StringRef, 8> Matches;
118 
119   // Reset error, if given.
120   if (Error && !Error->empty()) *Error = "";
121 
122   // Return the input if there was no match.
123   if (!match(String, &Matches))
124     return String;
125 
126   // Otherwise splice in the replacement string, starting with the prefix before
127   // the match.
128   std::string Res(String.begin(), Matches[0].begin());
129 
130   // Then the replacement string, honoring possible substitutions.
131   while (!Repl.empty()) {
132     // Skip to the next escape.
133     std::pair<StringRef, StringRef> Split = Repl.split('\\');
134 
135     // Add the skipped substring.
136     Res += Split.first;
137 
138     // Check for terminimation and trailing backslash.
139     if (Split.second.empty()) {
140       if (Repl.size() != Split.first.size() &&
141           Error && Error->empty())
142         *Error = "replacement string contained trailing backslash";
143       break;
144     }
145 
146     // Otherwise update the replacement string and interpret escapes.
147     Repl = Split.second;
148 
149     // FIXME: We should have a StringExtras function for mapping C99 escapes.
150     switch (Repl[0]) {
151       // Treat all unrecognized characters as self-quoting.
152     default:
153       Res += Repl[0];
154       Repl = Repl.substr(1);
155       break;
156 
157       // Single character escapes.
158     case 't':
159       Res += '\t';
160       Repl = Repl.substr(1);
161       break;
162     case 'n':
163       Res += '\n';
164       Repl = Repl.substr(1);
165       break;
166 
167       // Decimal escapes are backreferences.
168     case '0': case '1': case '2': case '3': case '4':
169     case '5': case '6': case '7': case '8': case '9': {
170       // Extract the backreference number.
171       StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
172       Repl = Repl.substr(Ref.size());
173 
174       unsigned RefValue;
175       if (!Ref.getAsInteger(10, RefValue) &&
176           RefValue < Matches.size())
177         Res += Matches[RefValue];
178       else if (Error && Error->empty())
179         *Error = ("invalid backreference string '" + Twine(Ref) + "'").str();
180       break;
181     }
182     }
183   }
184 
185   // And finally the suffix.
186   Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
187 
188   return Res;
189 }
190 
// These are the special characters matched in functions like "p_ere_exp".
// Regex::isLiteralERE and Regex::escape both use this list.
static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
193 
isLiteralERE(StringRef Str)194 bool Regex::isLiteralERE(StringRef Str) {
195   // Check for regex metacharacters.  This list was derived from our regex
196   // implementation in regcomp.c and double checked against the POSIX extended
197   // regular expression specification.
198   return Str.find_first_of(RegexMetachars) == StringRef::npos;
199 }
200 
escape(StringRef String)201 std::string Regex::escape(StringRef String) {
202   std::string RegexStr;
203   for (unsigned i = 0, e = String.size(); i != e; ++i) {
204     if (strchr(RegexMetachars, String[i]))
205       RegexStr += '\\';
206     RegexStr += String[i];
207   }
208 
209   return RegexStr;
210 }
211