• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2010, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // Author: Sanjay Ghemawat
31 
32 #ifdef HAVE_CONFIG_H
33 #include "config.h"
34 #endif
35 
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <ctype.h>
39 #include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
40 #include <assert.h>
41 #include <errno.h>
42 #include <string>
43 #include <algorithm>
44 
45 #include "pcrecpp_internal.h"
46 #include "pcre.h"
47 #include "pcrecpp.h"
48 #include "pcre_stringpiece.h"
49 
50 
51 namespace pcrecpp {
52 
// Upper bound on the number of capture arguments one match call accepts.
static const int kMaxArgs = 16;
// Length of the offset vector handed to PCRE: three ints per slot (two result
// offsets plus one int of PCRE workspace), with one extra slot for the
// overall match.
static const int kVecSize = 3 * (kMaxArgs + 1);
56 
57 // Special object that stands-in for no argument
58 Arg RE::no_arg((void*)NULL);
59 
60 // This is for ABI compatibility with old versions of pcre (pre-7.6),
61 // which defined a global no_arg variable instead of putting it in the
62 // RE class.  This works on GCC >= 3, at least.  It definitely works
63 // for ELF, but may not for other object formats (Mach-O, for
64 // instance, does not support aliases.)  We could probably have a more
65 // inclusive test if we ever needed it.  (Note that not only the
66 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
67 // gnu-specific.)
68 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
69 # define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
70 # define ULP_AS_STRING_INTERNAL(x)   #x
71 # define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
72 extern Arg no_arg
73   __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
74 #endif
75 
76 // If a regular expression has no error, its error_ field points here
77 static const string empty_string;
78 
79 // If the user doesn't ask for any options, we just use this one
80 static RE_Options default_options;
81 
Init(const string & pat,const RE_Options * options)82 void RE::Init(const string& pat, const RE_Options* options) {
83   pattern_ = pat;
84   if (options == NULL) {
85     options_ = default_options;
86   } else {
87     options_ = *options;
88   }
89   error_ = &empty_string;
90   re_full_ = NULL;
91   re_partial_ = NULL;
92 
93   re_partial_ = Compile(UNANCHORED);
94   if (re_partial_ != NULL) {
95     re_full_ = Compile(ANCHOR_BOTH);
96   }
97 }
98 
Cleanup()99 void RE::Cleanup() {
100   if (re_full_ != NULL)         (*pcre_free)(re_full_);
101   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
102   if (error_ != &empty_string)  delete error_;
103 }
104 
105 
~RE()106 RE::~RE() {
107   Cleanup();
108 }
109 
110 
Compile(Anchor anchor)111 pcre* RE::Compile(Anchor anchor) {
112   // First, convert RE_Options into pcre options
113   int pcre_options = 0;
114   pcre_options = options_.all_options();
115 
116   // Special treatment for anchoring.  This is needed because at
117   // runtime pcre only provides an option for anchoring at the
118   // beginning of a string (unless you use offset).
119   //
120   // There are three types of anchoring we want:
121   //    UNANCHORED      Compile the original pattern, and use
122   //                    a pcre unanchored match.
123   //    ANCHOR_START    Compile the original pattern, and use
124   //                    a pcre anchored match.
125   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
126   //                    and use a pcre anchored match.
127 
128   const char* compile_error;
129   int eoffset;
130   pcre* re;
131   if (anchor != ANCHOR_BOTH) {
132     re = pcre_compile(pattern_.c_str(), pcre_options,
133                       &compile_error, &eoffset, NULL);
134   } else {
135     // Tack a '\z' at the end of RE.  Parenthesize it first so that
136     // the '\z' applies to all top-level alternatives in the regexp.
137     string wrapped = "(?:";  // A non-counting grouping operator
138     wrapped += pattern_;
139     wrapped += ")\\z";
140     re = pcre_compile(wrapped.c_str(), pcre_options,
141                       &compile_error, &eoffset, NULL);
142   }
143   if (re == NULL) {
144     if (error_ == &empty_string) error_ = new string(compile_error);
145   }
146   return re;
147 }
148 
149 /***** Matching interfaces *****/
150 
FullMatch(const StringPiece & text,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const151 bool RE::FullMatch(const StringPiece& text,
152                    const Arg& ptr1,
153                    const Arg& ptr2,
154                    const Arg& ptr3,
155                    const Arg& ptr4,
156                    const Arg& ptr5,
157                    const Arg& ptr6,
158                    const Arg& ptr7,
159                    const Arg& ptr8,
160                    const Arg& ptr9,
161                    const Arg& ptr10,
162                    const Arg& ptr11,
163                    const Arg& ptr12,
164                    const Arg& ptr13,
165                    const Arg& ptr14,
166                    const Arg& ptr15,
167                    const Arg& ptr16) const {
168   const Arg* args[kMaxArgs];
169   int n = 0;
170   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
171   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
172   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
173   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
174   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
175   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
176   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
177   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
178   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
179   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
180   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
181   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
182   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
183   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
184   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
185   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
186  done:
187 
188   int consumed;
189   int vec[kVecSize];
190   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
191 }
192 
PartialMatch(const StringPiece & text,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const193 bool RE::PartialMatch(const StringPiece& text,
194                       const Arg& ptr1,
195                       const Arg& ptr2,
196                       const Arg& ptr3,
197                       const Arg& ptr4,
198                       const Arg& ptr5,
199                       const Arg& ptr6,
200                       const Arg& ptr7,
201                       const Arg& ptr8,
202                       const Arg& ptr9,
203                       const Arg& ptr10,
204                       const Arg& ptr11,
205                       const Arg& ptr12,
206                       const Arg& ptr13,
207                       const Arg& ptr14,
208                       const Arg& ptr15,
209                       const Arg& ptr16) const {
210   const Arg* args[kMaxArgs];
211   int n = 0;
212   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
213   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
214   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
215   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
216   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
217   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
218   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
219   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
220   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
221   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
222   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
223   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
224   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
225   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
226   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
227   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
228  done:
229 
230   int consumed;
231   int vec[kVecSize];
232   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
233 }
234 
Consume(StringPiece * input,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const235 bool RE::Consume(StringPiece* input,
236                  const Arg& ptr1,
237                  const Arg& ptr2,
238                  const Arg& ptr3,
239                  const Arg& ptr4,
240                  const Arg& ptr5,
241                  const Arg& ptr6,
242                  const Arg& ptr7,
243                  const Arg& ptr8,
244                  const Arg& ptr9,
245                  const Arg& ptr10,
246                  const Arg& ptr11,
247                  const Arg& ptr12,
248                  const Arg& ptr13,
249                  const Arg& ptr14,
250                  const Arg& ptr15,
251                  const Arg& ptr16) const {
252   const Arg* args[kMaxArgs];
253   int n = 0;
254   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
255   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
256   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
257   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
258   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
259   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
260   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
261   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
262   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
263   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
264   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
265   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
266   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
267   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
268   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
269   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
270  done:
271 
272   int consumed;
273   int vec[kVecSize];
274   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
275                   args, n, vec, kVecSize)) {
276     input->remove_prefix(consumed);
277     return true;
278   } else {
279     return false;
280   }
281 }
282 
FindAndConsume(StringPiece * input,const Arg & ptr1,const Arg & ptr2,const Arg & ptr3,const Arg & ptr4,const Arg & ptr5,const Arg & ptr6,const Arg & ptr7,const Arg & ptr8,const Arg & ptr9,const Arg & ptr10,const Arg & ptr11,const Arg & ptr12,const Arg & ptr13,const Arg & ptr14,const Arg & ptr15,const Arg & ptr16) const283 bool RE::FindAndConsume(StringPiece* input,
284                         const Arg& ptr1,
285                         const Arg& ptr2,
286                         const Arg& ptr3,
287                         const Arg& ptr4,
288                         const Arg& ptr5,
289                         const Arg& ptr6,
290                         const Arg& ptr7,
291                         const Arg& ptr8,
292                         const Arg& ptr9,
293                         const Arg& ptr10,
294                         const Arg& ptr11,
295                         const Arg& ptr12,
296                         const Arg& ptr13,
297                         const Arg& ptr14,
298                         const Arg& ptr15,
299                         const Arg& ptr16) const {
300   const Arg* args[kMaxArgs];
301   int n = 0;
302   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
303   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
304   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
305   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
306   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
307   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
308   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
309   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
310   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
311   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
312   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
313   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
314   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
315   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
316   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
317   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
318  done:
319 
320   int consumed;
321   int vec[kVecSize];
322   if (DoMatchImpl(*input, UNANCHORED, &consumed,
323                   args, n, vec, kVecSize)) {
324     input->remove_prefix(consumed);
325     return true;
326   } else {
327     return false;
328   }
329 }
330 
Replace(const StringPiece & rewrite,string * str) const331 bool RE::Replace(const StringPiece& rewrite,
332                  string *str) const {
333   int vec[kVecSize];
334   int matches = TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
335   if (matches == 0)
336     return false;
337 
338   string s;
339   if (!Rewrite(&s, rewrite, *str, vec, matches))
340     return false;
341 
342   assert(vec[0] >= 0);
343   assert(vec[1] >= 0);
344   str->replace(vec[0], vec[1] - vec[0], s);
345   return true;
346 }
347 
348 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
349 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
350 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
351 
NewlineMode(int pcre_options)352 static int NewlineMode(int pcre_options) {
353   // TODO: if we can make it threadsafe, cache this var
354   int newline_mode = 0;
355   /* if (newline_mode) return newline_mode; */  // do this once it's cached
356   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
357                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
358     newline_mode = (pcre_options &
359                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
360                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
361   } else {
362     int newline;
363     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
364     if (newline == 10)
365       newline_mode = PCRE_NEWLINE_LF;
366     else if (newline == 13)
367       newline_mode = PCRE_NEWLINE_CR;
368     else if (newline == 3338)
369       newline_mode = PCRE_NEWLINE_CRLF;
370     else if (newline == -1)
371       newline_mode = PCRE_NEWLINE_ANY;
372     else if (newline == -2)
373       newline_mode = PCRE_NEWLINE_ANYCRLF;
374     else
375       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
376   }
377   return newline_mode;
378 }
379 
GlobalReplace(const StringPiece & rewrite,string * str) const380 int RE::GlobalReplace(const StringPiece& rewrite,
381                       string *str) const {
382   int count = 0;
383   int vec[kVecSize];
384   string out;
385   int start = 0;
386   int lastend = -1;
387   bool last_match_was_empty_string = false;
388 
389   while (start <= static_cast<int>(str->length())) {
390     // If the previous match was for the empty string, we shouldn't
391     // just match again: we'll match in the same way and get an
392     // infinite loop.  Instead, we do the match in a special way:
393     // anchored -- to force another try at the same position --
394     // and with a flag saying that this time, ignore empty matches.
395     // If this special match returns, that means there's a non-empty
396     // match at this position as well, and we can continue.  If not,
397     // we do what perl does, and just advance by one.
398     // Notice that perl prints '@@@' for this;
399     //    perl -le '$_ = "aa"; s/b*|aa/@/g; print'
400     int matches;
401     if (last_match_was_empty_string) {
402       matches = TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize);
403       if (matches <= 0) {
404         int matchend = start + 1;     // advance one character.
405         // If the current char is CR and we're in CRLF mode, skip LF too.
406         // Note it's better to call pcre_fullinfo() than to examine
407         // all_options(), since options_ could have changed bewteen
408         // compile-time and now, but this is simpler and safe enough.
409         // Modified by PH to add ANY and ANYCRLF.
410         if (matchend < static_cast<int>(str->length()) &&
411             (*str)[start] == '\r' && (*str)[matchend] == '\n' &&
412             (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
413              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
414              NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)) {
415           matchend++;
416         }
417         // We also need to advance more than one char if we're in utf8 mode.
418 #ifdef SUPPORT_UTF8
419         if (options_.utf8()) {
420           while (matchend < static_cast<int>(str->length()) &&
421                  ((*str)[matchend] & 0xc0) == 0x80)
422             matchend++;
423         }
424 #endif
425         if (start < static_cast<int>(str->length()))
426           out.append(*str, start, matchend - start);
427         start = matchend;
428         last_match_was_empty_string = false;
429         continue;
430       }
431     } else {
432       matches = TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
433       if (matches <= 0)
434         break;
435     }
436     int matchstart = vec[0], matchend = vec[1];
437     assert(matchstart >= start);
438     assert(matchend >= matchstart);
439     out.append(*str, start, matchstart - start);
440     Rewrite(&out, rewrite, *str, vec, matches);
441     start = matchend;
442     lastend = matchend;
443     count++;
444     last_match_was_empty_string = (matchstart == matchend);
445   }
446 
447   if (count == 0)
448     return 0;
449 
450   if (start < static_cast<int>(str->length()))
451     out.append(*str, start, str->length() - start);
452   swap(out, *str);
453   return count;
454 }
455 
Extract(const StringPiece & rewrite,const StringPiece & text,string * out) const456 bool RE::Extract(const StringPiece& rewrite,
457                  const StringPiece& text,
458                  string *out) const {
459   int vec[kVecSize];
460   int matches = TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
461   if (matches == 0)
462     return false;
463   out->erase();
464   return Rewrite(out, rewrite, text, vec, matches);
465 }
466 
QuoteMeta(const StringPiece & unquoted)467 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
468   string result;
469 
470   // Escape any ascii character not in [A-Za-z_0-9].
471   //
472   // Note that it's legal to escape a character even if it has no
473   // special meaning in a regular expression -- so this function does
474   // that.  (This also makes it identical to the perl function of the
475   // same name; see `perldoc -f quotemeta`.)  The one exception is
476   // escaping NUL: rather than doing backslash + NUL, like perl does,
477   // we do '\0', because pcre itself doesn't take embedded NUL chars.
478   for (int ii = 0; ii < unquoted.size(); ++ii) {
479     // Note that using 'isalnum' here raises the benchmark time from
480     // 32ns to 58ns:
481     if (unquoted[ii] == '\0') {
482       result += "\\0";
483     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
484                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
485                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
486                unquoted[ii] != '_' &&
487                // If this is the part of a UTF8 or Latin1 character, we need
488                // to copy this byte without escaping.  Experimentally this is
489                // what works correctly with the regexp library.
490                !(unquoted[ii] & 128)) {
491       result += '\\';
492       result += unquoted[ii];
493     } else {
494       result += unquoted[ii];
495     }
496   }
497 
498   return result;
499 }
500 
501 /***** Actual matching and rewriting code *****/
502 
TryMatch(const StringPiece & text,int startpos,Anchor anchor,bool empty_ok,int * vec,int vecsize) const503 int RE::TryMatch(const StringPiece& text,
504                  int startpos,
505                  Anchor anchor,
506                  bool empty_ok,
507                  int *vec,
508                  int vecsize) const {
509   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
510   if (re == NULL) {
511     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
512     return 0;
513   }
514 
515   pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
516   if (options_.match_limit() > 0) {
517     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
518     extra.match_limit = options_.match_limit();
519   }
520   if (options_.match_limit_recursion() > 0) {
521     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
522     extra.match_limit_recursion = options_.match_limit_recursion();
523   }
524 
525   int options = 0;
526   if (anchor != UNANCHORED)
527     options |= PCRE_ANCHORED;
528   if (!empty_ok)
529     options |= PCRE_NOTEMPTY;
530 
531   int rc = pcre_exec(re,              // The regular expression object
532                      &extra,
533                      (text.data() == NULL) ? "" : text.data(),
534                      text.size(),
535                      startpos,
536                      options,
537                      vec,
538                      vecsize);
539 
540   // Handle errors
541   if (rc == PCRE_ERROR_NOMATCH) {
542     return 0;
543   } else if (rc < 0) {
544     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
545     //        re, pattern_.c_str());
546     return 0;
547   } else if (rc == 0) {
548     // pcre_exec() returns 0 as a special case when the number of
549     // capturing subpatterns exceeds the size of the vector.
550     // When this happens, there is a match and the output vector
551     // is filled, but we miss out on the positions of the extra subpatterns.
552     rc = vecsize / 2;
553   }
554 
555   return rc;
556 }
557 
DoMatchImpl(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const * args,int n,int * vec,int vecsize) const558 bool RE::DoMatchImpl(const StringPiece& text,
559                      Anchor anchor,
560                      int* consumed,
561                      const Arg* const* args,
562                      int n,
563                      int* vec,
564                      int vecsize) const {
565   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
566   int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
567   assert(matches >= 0);  // TryMatch never returns negatives
568   if (matches == 0)
569     return false;
570 
571   *consumed = vec[1];
572 
573   if (n == 0 || args == NULL) {
574     // We are not interested in results
575     return true;
576   }
577 
578   if (NumberOfCapturingGroups() < n) {
579     // RE has fewer capturing groups than number of arg pointers passed in
580     return false;
581   }
582 
583   // If we got here, we must have matched the whole pattern.
584   // We do not need (can not do) any more checks on the value of 'matches' here
585   // -- see the comment for TryMatch.
586   for (int i = 0; i < n; i++) {
587     const int start = vec[2*(i+1)];
588     const int limit = vec[2*(i+1)+1];
589     if (!args[i]->Parse(text.data() + start, limit-start)) {
590       // TODO: Should we indicate what the error was?
591       return false;
592     }
593   }
594 
595   return true;
596 }
597 
DoMatch(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const args[],int n) const598 bool RE::DoMatch(const StringPiece& text,
599                  Anchor anchor,
600                  int* consumed,
601                  const Arg* const args[],
602                  int n) const {
603   assert(n >= 0);
604   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
605                                        // (as for kVecSize)
606   int space[21];   // use stack allocation for small vecsize (common case)
607   int* vec = vecsize <= 21 ? space : new int[vecsize];
608   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, (int)vecsize);
609   if (vec != space) delete [] vec;
610   return retval;
611 }
612 
Rewrite(string * out,const StringPiece & rewrite,const StringPiece & text,int * vec,int veclen) const613 bool RE::Rewrite(string *out, const StringPiece &rewrite,
614                  const StringPiece &text, int *vec, int veclen) const {
615   for (const char *s = rewrite.data(), *end = s + rewrite.size();
616        s < end; s++) {
617     int c = *s;
618     if (c == '\\') {
619       c = *++s;
620       if (isdigit(c)) {
621         int n = (c - '0');
622         if (n >= veclen) {
623           //fprintf(stderr, requested group %d in regexp %.*s\n",
624           //        n, rewrite.size(), rewrite.data());
625           return false;
626         }
627         int start = vec[2 * n];
628         if (start >= 0)
629           out->append(text.data() + start, vec[2 * n + 1] - start);
630       } else if (c == '\\') {
631         *out += '\\';
632       } else {
633         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
634         //        rewrite.size(), rewrite.data());
635         return false;
636       }
637     } else {
638       *out += c;
639     }
640   }
641   return true;
642 }
643 
644 // Return the number of capturing subpatterns, or -1 if the
645 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const646 int RE::NumberOfCapturingGroups() const {
647   if (re_partial_ == NULL) return -1;
648 
649   int result;
650   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
651                                   NULL,         // We did not study the pattern
652                                   PCRE_INFO_CAPTURECOUNT,
653                                   &result);
654   assert(pcre_retval == 0);
655   return result;
656 }
657 
658 /***** Parsers for various types *****/
659 
parse_null(const char * str,int n,void * dest)660 bool Arg::parse_null(const char* str, int n, void* dest) {
661   // We fail if somebody asked us to store into a non-NULL void* pointer
662   return (dest == NULL);
663 }
664 
parse_string(const char * str,int n,void * dest)665 bool Arg::parse_string(const char* str, int n, void* dest) {
666   if (dest == NULL) return true;
667   reinterpret_cast<string*>(dest)->assign(str, n);
668   return true;
669 }
670 
parse_stringpiece(const char * str,int n,void * dest)671 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
672   if (dest == NULL) return true;
673   reinterpret_cast<StringPiece*>(dest)->set(str, n);
674   return true;
675 }
676 
parse_char(const char * str,int n,void * dest)677 bool Arg::parse_char(const char* str, int n, void* dest) {
678   if (n != 1) return false;
679   if (dest == NULL) return true;
680   *(reinterpret_cast<char*>(dest)) = str[0];
681   return true;
682 }
683 
parse_uchar(const char * str,int n,void * dest)684 bool Arg::parse_uchar(const char* str, int n, void* dest) {
685   if (n != 1) return false;
686   if (dest == NULL) return true;
687   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
688   return true;
689 }
690 
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// Copies "str" into "buf" and null-terminates if necessary.
// Returns one of:
//      a. "str" if no termination is needed
//      b. "buf" if the string was copied and null-terminated
//      c. "" if the input was invalid and has no hope of being parsed
// Note that the character just past the input, str[n], is inspected, so it
// must be readable.
static const char* TerminateNumber(char* buf, const char* str, int n) {
  // The <ctype.h> classifiers are only defined for unsigned char values
  // (and EOF), so bytes are converted before being classified.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  // See if the character right after the input text may potentially
  // look like a digit; if so, strtoxxx() would run past the input.
  const char next = str[n];
  if (isdigit(static_cast<unsigned char>(next)) ||
      ((next >= 'a') && (next <= 'f')) ||
      ((next >= 'A') && (next <= 'F'))) {
    if (n > kMaxNumberLength) return ""; // Input too big to be a valid number
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  } else {
    // We can parse right out of the supplied string, so return it.
    return str;
  }
}
722 
parse_long_radix(const char * str,int n,void * dest,int radix)723 bool Arg::parse_long_radix(const char* str,
724                            int n,
725                            void* dest,
726                            int radix) {
727   if (n == 0) return false;
728   char buf[kMaxNumberLength+1];
729   str = TerminateNumber(buf, str, n);
730   char* end;
731   errno = 0;
732   long r = strtol(str, &end, radix);
733   if (end != str + n) return false;   // Leftover junk
734   if (errno) return false;
735   if (dest == NULL) return true;
736   *(reinterpret_cast<long*>(dest)) = r;
737   return true;
738 }
739 
parse_ulong_radix(const char * str,int n,void * dest,int radix)740 bool Arg::parse_ulong_radix(const char* str,
741                             int n,
742                             void* dest,
743                             int radix) {
744   if (n == 0) return false;
745   char buf[kMaxNumberLength+1];
746   str = TerminateNumber(buf, str, n);
747   if (str[0] == '-') return false;    // strtoul() on a negative number?!
748   char* end;
749   errno = 0;
750   unsigned long r = strtoul(str, &end, radix);
751   if (end != str + n) return false;   // Leftover junk
752   if (errno) return false;
753   if (dest == NULL) return true;
754   *(reinterpret_cast<unsigned long*>(dest)) = r;
755   return true;
756 }
757 
parse_short_radix(const char * str,int n,void * dest,int radix)758 bool Arg::parse_short_radix(const char* str,
759                             int n,
760                             void* dest,
761                             int radix) {
762   long r;
763   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
764   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
765   if (dest == NULL) return true;
766   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
767   return true;
768 }
769 
parse_ushort_radix(const char * str,int n,void * dest,int radix)770 bool Arg::parse_ushort_radix(const char* str,
771                              int n,
772                              void* dest,
773                              int radix) {
774   unsigned long r;
775   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
776   if (r > USHRT_MAX) return false;                      // Out of range
777   if (dest == NULL) return true;
778   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
779   return true;
780 }
781 
parse_int_radix(const char * str,int n,void * dest,int radix)782 bool Arg::parse_int_radix(const char* str,
783                           int n,
784                           void* dest,
785                           int radix) {
786   long r;
787   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
788   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
789   if (dest == NULL) return true;
790   *(reinterpret_cast<int*>(dest)) = r;
791   return true;
792 }
793 
parse_uint_radix(const char * str,int n,void * dest,int radix)794 bool Arg::parse_uint_radix(const char* str,
795                            int n,
796                            void* dest,
797                            int radix) {
798   unsigned long r;
799   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
800   if (r > UINT_MAX) return false;                       // Out of range
801   if (dest == NULL) return true;
802   *(reinterpret_cast<unsigned int*>(dest)) = r;
803   return true;
804 }
805 
parse_longlong_radix(const char * str,int n,void * dest,int radix)806 bool Arg::parse_longlong_radix(const char* str,
807                                int n,
808                                void* dest,
809                                int radix) {
810 #ifndef HAVE_LONG_LONG
811   return false;
812 #else
813   if (n == 0) return false;
814   char buf[kMaxNumberLength+1];
815   str = TerminateNumber(buf, str, n);
816   char* end;
817   errno = 0;
818 #if defined HAVE_STRTOQ
819   long long r = strtoq(str, &end, radix);
820 #elif defined HAVE_STRTOLL
821   long long r = strtoll(str, &end, radix);
822 #elif defined HAVE__STRTOI64
823   long long r = _strtoi64(str, &end, radix);
824 #elif defined HAVE_STRTOIMAX
825   long long r = strtoimax(str, &end, radix);
826 #else
827 #error parse_longlong_radix: cannot convert input to a long-long
828 #endif
829   if (end != str + n) return false;   // Leftover junk
830   if (errno) return false;
831   if (dest == NULL) return true;
832   *(reinterpret_cast<long long*>(dest)) = r;
833   return true;
834 #endif   /* HAVE_LONG_LONG */
835 }
836 
parse_ulonglong_radix(const char * str,int n,void * dest,int radix)837 bool Arg::parse_ulonglong_radix(const char* str,
838                                 int n,
839                                 void* dest,
840                                 int radix) {
841 #ifndef HAVE_UNSIGNED_LONG_LONG
842   return false;
843 #else
844   if (n == 0) return false;
845   char buf[kMaxNumberLength+1];
846   str = TerminateNumber(buf, str, n);
847   if (str[0] == '-') return false;    // strtoull() on a negative number?!
848   char* end;
849   errno = 0;
850 #if defined HAVE_STRTOQ
851   unsigned long long r = strtouq(str, &end, radix);
852 #elif defined HAVE_STRTOLL
853   unsigned long long r = strtoull(str, &end, radix);
854 #elif defined HAVE__STRTOI64
855   unsigned long long r = _strtoui64(str, &end, radix);
856 #elif defined HAVE_STRTOIMAX
857   unsigned long long r = strtoumax(str, &end, radix);
858 #else
859 #error parse_ulonglong_radix: cannot convert input to a long-long
860 #endif
861   if (end != str + n) return false;   // Leftover junk
862   if (errno) return false;
863   if (dest == NULL) return true;
864   *(reinterpret_cast<unsigned long long*>(dest)) = r;
865   return true;
866 #endif   /* HAVE_UNSIGNED_LONG_LONG */
867 }
868 
parse_double(const char * str,int n,void * dest)869 bool Arg::parse_double(const char* str, int n, void* dest) {
870   if (n == 0) return false;
871   static const int kMaxLength = 200;
872   char buf[kMaxLength];
873   if (n >= kMaxLength) return false;
874   memcpy(buf, str, n);
875   buf[n] = '\0';
876   errno = 0;
877   char* end;
878   double r = strtod(buf, &end);
879   if (end != buf + n) return false;   // Leftover junk
880   if (errno) return false;
881   if (dest == NULL) return true;
882   *(reinterpret_cast<double*>(dest)) = r;
883   return true;
884 }
885 
parse_float(const char * str,int n,void * dest)886 bool Arg::parse_float(const char* str, int n, void* dest) {
887   double r;
888   if (!parse_double(str, n, &r)) return false;
889   if (dest == NULL) return true;
890   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
891   return true;
892 }
893 
894 
895 #define DEFINE_INTEGER_PARSERS(name)                                    \
896   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
897     return parse_##name##_radix(str, n, dest, 10);                      \
898   }                                                                     \
899   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
900     return parse_##name##_radix(str, n, dest, 16);                      \
901   }                                                                     \
902   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
903     return parse_##name##_radix(str, n, dest, 8);                       \
904   }                                                                     \
905   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
906     return parse_##name##_radix(str, n, dest, 0);                       \
907   }
908 
909 DEFINE_INTEGER_PARSERS(short)      /*                                   */
910 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
911 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
912 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
913 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
914 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
915 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
916 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
917 
918 #undef DEFINE_INTEGER_PARSERS
919 
920 }   // namespace pcrecpp
921