• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * @file op_regex.cpp
3  * This file contains implementation for a lightweight wrapper around
4  * libc regex, providing regular expression match and replace facility.
5  *
6  * @remark Copyright 2003 OProfile authors
7  * @remark Read the file COPYING
8  * @remark Idea comes from TextFilt project <http://textfilt.sourceforge.net>
9  *
10  * @author Philippe Elie
11  */
12 
13 #include <cerrno>
14 
15 #include <iostream>
16 #include <fstream>
17 
18 #include "string_manip.h"
19 
20 #include "op_regex.h"
21 
22 using namespace std;
23 
24 namespace {
25 
/**
 * Translate a regcomp()/regexec() error code into a readable message.
 *
 * @param err     error code returned by the regex library
 * @param regexp  the regex object the error relates to
 * @return the message produced by regerror(), without its trailing NUL
 *
 * The scratch buffer is owned by a std::string so it is released on every
 * path; the previous implementation leaked a new[]-allocated buffer.
 */
string op_regerror(int err, regex_t const & regexp)
{
	// First call only asks how many bytes (terminating NUL included)
	// the message needs.
	size_t const needed_size = regerror(err, &regexp, 0, 0);
	if (needed_size == 0)
		return string();

	string buffer(needed_size, '\0');
	regerror(err, &regexp, &buffer[0], needed_size);

	// Build the result from the C string so the terminating NUL written
	// by regerror() is not part of the returned text.
	return string(buffer.c_str());
}
34 
35 
op_regcomp(regex_t & regexp,string const & pattern)36 void op_regcomp(regex_t & regexp, string const & pattern)
37 {
38 	int err = regcomp(&regexp, pattern.c_str(), REG_EXTENDED);
39 	if (err) {
40 		throw bad_regex("regcomp error: " + op_regerror(err, regexp)
41 				+ " for pattern : " + pattern);
42 	}
43 }
44 
45 
/**
 * Run a compiled regular expression against a string.
 *
 * @param regex   compiled regular expression
 * @param str     text to search
 * @param match   array receiving the match offsets
 * @param nmatch  number of entries available in @p match
 * @return true only when regexec() reports a successful match
 *
 * regexec() returns 0 on a match and a non-zero code otherwise. Only 0 is
 * treated as a match so that a failure other than REG_NOMATCH is not
 * reported as success while @p match holds no valid offsets.
 */
bool op_regexec(regex_t const & regex, string const & str, regmatch_t * match,
	       size_t nmatch)
{
	return regexec(&regex, str.c_str(), nmatch, match, 0) == 0;
}
51 
52 
/// Release what regcomp() stored in @p regexp by handing it to regfree().
void op_regfree(regex_t & regexp)
{
	regex_t * const compiled = &regexp;
	regfree(compiled);
}
57 
58 
// Return the index number associated with the char x seen in a "\x".
// The allowed ranges for x are [0-9], mapped to 0..9, and [a-z], mapped to
// 10..35. Return size_t(-1) if x is in neither range.
//
// The digit test is an explicit range comparison rather than isdigit():
// passing a negative char value to isdigit() is undefined behavior, and the
// comparison needs no <cctype> include.
size_t subexpr_index(char ch)
{
	if (ch >= '0' && ch <= '9')
		return ch - '0';
	if (ch >= 'a' && ch <= 'z')
		return ch - 'a' + 10;
	return size_t(-1);
}
70 
71 }  // anonymous namespace
72 
73 
bad_regex(string const & pattern)74 bad_regex::bad_regex(string const & pattern)
75 	: op_exception(pattern)
76 {
77 }
78 
79 
regular_expression_replace(size_t limit_,size_t limit_defs)80 regular_expression_replace::regular_expression_replace(size_t limit_,
81 						       size_t limit_defs)
82 	:
83 	limit(limit_),
84 	limit_defs_expansion(limit_defs)
85 {
86 }
87 
88 
~regular_expression_replace()89 regular_expression_replace::~regular_expression_replace()
90 {
91 	for (size_t i = 0 ; i < regex_replace.size() ; ++i)
92 		op_regfree(regex_replace[i].regexp);
93 }
94 
95 
add_definition(string const & name,string const & definition)96 void regular_expression_replace::add_definition(string const & name,
97 						string const & definition)
98 {
99 	defs[name] = expand_string(definition);
100 }
101 
102 
add_pattern(string const & pattern,string const & replace)103 void regular_expression_replace::add_pattern(string const & pattern,
104 					     string const & replace)
105 {
106 	string expanded_pattern = expand_string(pattern);
107 
108 	regex_t regexp;
109 	op_regcomp(regexp, expanded_pattern);
110 	replace_t regex = { regexp, replace };
111 	regex_replace.push_back(regex);
112 }
113 
114 
expand_string(string const & input)115 string regular_expression_replace::expand_string(string const & input)
116 {
117 	string last, expanded(input);
118 	size_t i = 0;
119 	for (i = 0 ; i < limit_defs_expansion ; ++i) {
120 		last = expanded;
121 		expanded = substitute_definition(last);
122 		if (expanded == last)
123 			break;
124 	}
125 
126 	if (i == limit_defs_expansion)
127 		throw bad_regex("too many substitution for: + input");
128 
129 	return last;
130 }
131 
132 
substitute_definition(string const & pattern)133 string regular_expression_replace::substitute_definition(string const & pattern)
134 {
135 	string result;
136 	bool previous_is_escape = false;
137 
138 	for (size_t i = 0 ; i < pattern.length() ; ++i) {
139 		if (pattern[i] == '$' && !previous_is_escape) {
140 			size_t pos = pattern.find('{', i);
141 			if (pos != i + 1) {
142 				throw bad_regex("invalid $ in pattern: " + pattern);
143 			}
144 			size_t end = pattern.find('}', i);
145 			if (end == string::npos) {
146 				throw bad_regex("no matching '}' in pattern: " + pattern);
147 			}
148 			string def_name = pattern.substr(pos+1, (end-pos) - 1);
149 			if (defs.find(def_name) == defs.end()) {
150 				throw bad_regex("definition not found and used in pattern: ("
151 						+ def_name + ") " + pattern);
152 			}
153 			result += defs[def_name];
154 			i = end;
155 		} else {
156 			if (pattern[i] == '\\' && !previous_is_escape)
157 				previous_is_escape = true;
158 			else
159 				previous_is_escape = false;
160 			result += pattern[i];
161 		}
162 	}
163 
164 	return result;
165 }
166 
167 
168 // FIXME limit output string size ? (cause we can have exponential growing
169 // of output string through a rule "a" = "aa")
execute(string & str) const170 bool regular_expression_replace::execute(string & str) const
171 {
172 	bool changed = true;
173 	for (size_t nr_iter = 0; changed && nr_iter < limit ; ++nr_iter) {
174 		changed = false;
175 		for (size_t i = 0 ; i < regex_replace.size() ; ++i) {
176 			if (do_execute(str, regex_replace[i]))
177 				changed = true;
178 		}
179 	}
180 
181 	// this don't return if the input string has been changed but if
182 	// we reach the limit number of iteration.
183 	return changed == false;
184 }
185 
186 
do_execute(string & str,replace_t const & regexp) const187 bool regular_expression_replace::do_execute(string & str,
188                                             replace_t const & regexp) const
189 {
190 	bool changed = false;
191 
192 	regmatch_t match[max_match];
193 	for (size_t iter = 0;
194 	     op_regexec(regexp.regexp, str, match, max_match) && iter < limit;
195 	     iter++) {
196 		changed = true;
197 		do_replace(str, regexp.replace, match);
198 	}
199 
200 	return changed;
201 }
202 
203 
204 regmatch_t const &
get_match(regmatch_t const * match,char idx) const205 regular_expression_replace::get_match(regmatch_t const * match, char idx) const
206 {
207 	size_t sub_expr = subexpr_index(idx);
208 	if (sub_expr == size_t(-1))
209 		throw bad_regex("expect group index: " + idx);
210 	if (sub_expr >= max_match)
211 		throw bad_regex("illegal group index :" + idx);
212 	return match[sub_expr];
213 }
214 
do_replace(string & str,string const & replace,regmatch_t const * match) const215 void regular_expression_replace::do_replace
216 (string & str, string const & replace, regmatch_t const * match) const
217 {
218 	string inserted;
219 	for (size_t i = 0 ; i < replace.length() ; ++i) {
220 		if (replace[i] == '\\') {
221 			if (i == replace.length() - 1) {
222 				throw bad_regex("illegal \\ trailer: " +
223 				                replace);
224 			}
225 			++i;
226 			if (replace[i] == '\\') {
227 				inserted += '\\';
228 			}  else {
229 				regmatch_t const & matched = get_match(match,
230 					replace[i]);
231 				if (matched.rm_so == -1 &&
232 				    matched.rm_eo == -1) {
233 					// empty match: nothing todo
234 				} else if (matched.rm_so == -1 ||
235 					   matched.rm_eo == -1) {
236 					throw bad_regex("illegal match: " +
237 						replace);
238 				} else {
239 					inserted += str.substr(matched.rm_so,
240 					    matched.rm_eo - matched.rm_so);
241 				}
242 			}
243 		} else {
244 			inserted += replace[i];
245 		}
246 	}
247 
248 	size_t first = match[0].rm_so;
249 	size_t count = match[0].rm_eo - match[0].rm_so;
250 
251 	str.replace(first, count, inserted);
252 }
253 
254 
setup_regex(regular_expression_replace & regex,string const & filename)255 void setup_regex(regular_expression_replace & regex,
256                  string const & filename)
257 {
258 	ifstream in(filename.c_str());
259 	if (!in) {
260 		throw op_runtime_error("Can't open file " + filename +
261 				" for reading", errno);
262 	}
263 
264 	regular_expression_replace var_name_rule;
265 	var_name_rule.add_pattern("^\\$([_a-zA-Z][_a-zA-Z0-9]*)[ ]*=.*", "\\1");
266 	regular_expression_replace var_value_rule;
267 	var_value_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1");
268 
269 	regular_expression_replace left_rule;
270 	left_rule.add_pattern("[ ]*\"(.*)\"[ ]*=.*", "\\1");
271 	regular_expression_replace right_rule;
272 	right_rule.add_pattern(".*=[ ]*\"(.*)\"", "\\1");
273 
274 	string line;
275 	while (getline(in, line)) {
276 		line = trim(line);
277 		if (line.empty() || line[0] == '#')
278 			continue;
279 
280 		string temp = line;
281 		var_name_rule.execute(temp);
282 		if (temp == line) {
283 			string left = line;
284 			left_rule.execute(left);
285 			if (left == line) {
286 				throw bad_regex("invalid input file: \"" + line + '"');
287 			}
288 
289 			string right = line;
290 			right_rule.execute(right);
291 			if (right == line) {
292 				throw bad_regex("invalid input file: \"" + line + '"');
293 			}
294 
295 			regex.add_pattern(left, right);
296 		} else {
297 			// temp != line ==> var_name_rule succeed to substitute
298 			// into temp the var_name present in line
299 			string var_name = temp;
300 			string var_value = line;
301 			var_value_rule.execute(var_value);
302 			if (var_value == line) {
303 				throw bad_regex("invalid input file: \"" + line + '"');
304 			}
305 
306 			regex.add_definition(var_name, var_value);
307 		}
308 	}
309 }
310