• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2021 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 
42 /* This module is a wrapper that provides a POSIX API to the underlying PCRE2
43 functions. The operative functions are called pcre2_regcomp(), etc., with
44 wrappers that use the plain POSIX names. In addition, pcre2posix.h defines the
45 POSIX names as macros for the pcre2_xxx functions, so any program that includes
46 it and uses the POSIX names will call the base functions directly. This makes
47 it easier for an application to be sure it gets the PCRE2 versions in the
48 presence of other POSIX regex libraries. */
49 
50 
51 #ifdef HAVE_CONFIG_H
52 #include "config.h"
53 #endif
54 
55 
56 /* Ensure that the PCRE2POSIX_EXP_xxx macros are set appropriately for
57 compiling these functions. This must come before including pcre2posix.h, where
58 they are set for an application (using these functions) if they have not
59 previously been set. */
60 
61 #if defined(_WIN32) && !defined(PCRE2_STATIC)
62 #  define PCRE2POSIX_EXP_DECL extern __declspec(dllexport)
63 #  define PCRE2POSIX_EXP_DEFN __declspec(dllexport)
64 #endif
65 
66 /* Older versions of MSVC lack snprintf(). This define allows for
67 warning/error-free compilation and testing with MSVC compilers back to at least
68 MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
69 
70 #if defined(_MSC_VER) && (_MSC_VER < 1900)
71 #define snprintf _snprintf
72 #endif
73 
74 
75 /* Compile-time error numbers start at this value. It should probably never be
76 changed. This #define is a copy of the one in pcre2_internal.h. */
77 
78 #define COMPILE_ERROR_BASE 100
79 
80 
81 /* Standard C headers */
82 
83 #include <ctype.h>
84 #include <limits.h>
85 #include <stddef.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <string.h>
89 
90 /* PCRE2 headers */
91 
92 #include "pcre2.h"
93 #include "pcre2posix.h"
94 
95 /* When compiling with the MSVC compiler, it is sometimes necessary to include
96 a "calling convention" before exported function names. (This is secondhand
97 information; I know nothing about MSVC myself). For example, something like
98 
99   void __cdecl function(....)
100 
101 might be needed. In order to make this easy, all the exported functions have
102 PCRE2_CALL_CONVENTION just before their names. It is rarely needed; if not
103 set, we ensure here that it has no effect. */
104 
105 #ifndef PCRE2_CALL_CONVENTION
106 #define PCRE2_CALL_CONVENTION
107 #endif
108 
109 /* Table to translate PCRE2 compile time error codes into POSIX error codes.
110 Only a few PCRE2 errors with a value greater than 23 turn into special POSIX
111 codes: most go to REG_BADPAT. The second table lists, in pairs, those that
112 don't. */
113 
114 static const int eint1[] = {
115   0,           /* No error */
116   REG_EESCAPE, /* \ at end of pattern */
117   REG_EESCAPE, /* \c at end of pattern */
118   REG_EESCAPE, /* unrecognized character follows \ */
119   REG_BADBR,   /* numbers out of order in {} quantifier */
120   /* 5 */
121   REG_BADBR,   /* number too big in {} quantifier */
122   REG_EBRACK,  /* missing terminating ] for character class */
123   REG_ECTYPE,  /* invalid escape sequence in character class */
124   REG_ERANGE,  /* range out of order in character class */
125   REG_BADRPT,  /* nothing to repeat */
126   /* 10 */
127   REG_ASSERT,  /* internal error: unexpected repeat */
128   REG_BADPAT,  /* unrecognized character after (? or (?- */
129   REG_BADPAT,  /* POSIX named classes are supported only within a class */
130   REG_BADPAT,  /* POSIX collating elements are not supported */
131   REG_EPAREN,  /* missing ) */
132   /* 15 */
133   REG_ESUBREG, /* reference to non-existent subpattern */
134   REG_INVARG,  /* pattern passed as NULL */
135   REG_INVARG,  /* unknown compile-time option bit(s) */
136   REG_EPAREN,  /* missing ) after (?# comment */
137   REG_ESIZE,   /* parentheses nested too deeply */
138   /* 20 */
139   REG_ESIZE,   /* regular expression too large */
140   REG_ESPACE,  /* failed to get memory */
141   REG_EPAREN,  /* unmatched closing parenthesis */
142   REG_ASSERT   /* internal error: code overflow */
143   };
144 
145 static const int eint2[] = {
146   30, REG_ECTYPE,  /* unknown POSIX class name */
147   32, REG_INVARG,  /* this version of PCRE2 does not have Unicode support */
148   37, REG_EESCAPE, /* PCRE2 does not support \L, \l, \N{name}, \U, or \u */
149   56, REG_INVARG,  /* internal error: unknown newline setting */
150   92, REG_INVARG,  /* invalid option bits with PCRE2_LITERAL */
151   99, REG_EESCAPE  /* \K in lookaround */
152 };
153 
154 /* Table of texts corresponding to POSIX error codes */
155 
/* Message texts, indexed by POSIX error code. Slot 0 is a placeholder because
no error code has the value zero; the comment in front of each string names
the REG_xxx code that selects it. */

static const char *const pstring[] = {
  /* (unused 0) */  "",
  /* REG_ASSERT   */  "internal error",
  /* REG_BADBR    */  "invalid repeat counts in {}",
  /* REG_BADPAT   */  "pattern error",
  /* REG_BADRPT   */  "? * + invalid",
  /* REG_EBRACE   */  "unbalanced {}",
  /* REG_EBRACK   */  "unbalanced []",
  /* REG_ECOLLATE */  "collation error - not relevant",
  /* REG_ECTYPE   */  "bad class",
  /* REG_EESCAPE  */  "bad escape sequence",
  /* REG_EMPTY    */  "empty expression",
  /* REG_EPAREN   */  "unbalanced ()",
  /* REG_ERANGE   */  "bad range inside []",
  /* REG_ESIZE    */  "expression too big",
  /* REG_ESPACE   */  "failed to get memory",
  /* REG_ESUBREG  */  "bad back reference",
  /* REG_INVARG   */  "bad argument",
  /* REG_NOMATCH  */  "match failed"
};
176 
177 
178 
179 #if 0  /* REMOVE THIS CODE */
180 
The code below was created for 10.33 (see ChangeLog 10.33 #4) when the
POSIX functions were given pcre2_... names instead of the traditional POSIX
names. However, it has proved to be more troublesome than useful. There have
been at least two cases where a program links with two others, one of which
uses the POSIX library and the other uses the PCRE2 POSIX functions, thus
causing two instances of the POSIX functions to exist, leading to trouble. For
10.37 this code is commented out. In due course it can be removed if there are
no issues. The only small worry is the comment below about languages that do
not include pcre2posix.h. If there are any such cases, they will have to use
the PCRE2 names.
191 
192 
193 /*************************************************
194 *      Wrappers with traditional POSIX names     *
195 *************************************************/
196 
/* Keep defining them to preserve the ABI for applications linked to the pcre2
POSIX library before these names were changed into macros in pcre2posix.h.
This also ensures that the POSIX names are callable from languages that do not
include pcre2posix.h. It is vital to #undef the macro definitions from
pcre2posix.h! */
202 
203 #undef regerror
204 PCRE2POSIX_EXP_DECL size_t regerror(int, const regex_t *, char *, size_t);
205 PCRE2POSIX_EXP_DEFN size_t PCRE2_CALL_CONVENTION
206 regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
207 {
208 return pcre2_regerror(errcode, preg, errbuf, errbuf_size);
209 }
210 
211 #undef regfree
212 PCRE2POSIX_EXP_DECL void regfree(regex_t *);
213 PCRE2POSIX_EXP_DEFN void PCRE2_CALL_CONVENTION
214 regfree(regex_t *preg)
215 {
216 pcre2_regfree(preg);
217 }
218 
219 #undef regcomp
220 PCRE2POSIX_EXP_DECL int regcomp(regex_t *, const char *, int);
221 PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION
222 regcomp(regex_t *preg, const char *pattern, int cflags)
223 {
224 return pcre2_regcomp(preg, pattern, cflags);
225 }
226 
227 #undef regexec
228 PCRE2POSIX_EXP_DECL int regexec(const regex_t *, const char *, size_t,
229   regmatch_t *, int);
230 PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION
231 regexec(const regex_t *preg, const char *string, size_t nmatch,
232   regmatch_t pmatch[], int eflags)
233 {
234 return pcre2_regexec(preg, string, nmatch, pmatch, eflags);
235 }
236 #endif
237 
238 
239 /*************************************************
240 *          Translate error code to string        *
241 *************************************************/
242 
243 PCRE2POSIX_EXP_DEFN size_t PCRE2_CALL_CONVENTION
pcre2_regerror(int errcode,const regex_t * preg,char * errbuf,size_t errbuf_size)244 pcre2_regerror(int errcode, const regex_t *preg, char *errbuf,
245   size_t errbuf_size)
246 {
247 int used;
248 const char *message;
249 
250 message = (errcode <= 0 || errcode >= (int)(sizeof(pstring)/sizeof(char *)))?
251   "unknown error code" : pstring[errcode];
252 
253 if (preg != NULL && (int)preg->re_erroffset != -1)
254   {
255   used = snprintf(errbuf, errbuf_size, "%s at offset %-6d", message,
256     (int)preg->re_erroffset);
257   }
258 else
259   {
260   used = snprintf(errbuf, errbuf_size, "%s", message);
261   }
262 
263 return used + 1;
264 }
265 
266 
267 
268 /*************************************************
269 *           Free store held by a regex           *
270 *************************************************/
271 
272 PCRE2POSIX_EXP_DEFN void PCRE2_CALL_CONVENTION
pcre2_regfree(regex_t * preg)273 pcre2_regfree(regex_t *preg)
274 {
275 pcre2_match_data_free(preg->re_match_data);
276 pcre2_code_free(preg->re_pcre2_code);
277 }
278 
279 
280 
281 /*************************************************
282 *            Compile a regular expression        *
283 *************************************************/
284 
285 /*
286 Arguments:
287   preg        points to a structure for recording the compiled expression
288   pattern     the pattern to compile
289   cflags      compilation flags
290 
291 Returns:      0 on success
292               various non-zero codes on failure
293 */
294 
295 PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_regcomp(regex_t * preg,const char * pattern,int cflags)296 pcre2_regcomp(regex_t *preg, const char *pattern, int cflags)
297 {
298 PCRE2_SIZE erroffset;
299 PCRE2_SIZE patlen;
300 int errorcode;
301 int options = 0;
302 int re_nsub = 0;
303 
304 patlen = ((cflags & REG_PEND) != 0)? (PCRE2_SIZE)(preg->re_endp - pattern) :
305   PCRE2_ZERO_TERMINATED;
306 
307 if ((cflags & REG_ICASE) != 0)    options |= PCRE2_CASELESS;
308 if ((cflags & REG_NEWLINE) != 0)  options |= PCRE2_MULTILINE;
309 if ((cflags & REG_DOTALL) != 0)   options |= PCRE2_DOTALL;
310 if ((cflags & REG_NOSPEC) != 0)   options |= PCRE2_LITERAL;
311 if ((cflags & REG_UTF) != 0)      options |= PCRE2_UTF;
312 if ((cflags & REG_UCP) != 0)      options |= PCRE2_UCP;
313 if ((cflags & REG_UNGREEDY) != 0) options |= PCRE2_UNGREEDY;
314 
315 preg->re_cflags = cflags;
316 preg->re_pcre2_code = pcre2_compile((PCRE2_SPTR)pattern, patlen, options,
317   &errorcode, &erroffset, NULL);
318 preg->re_erroffset = erroffset;
319 
320 if (preg->re_pcre2_code == NULL)
321   {
322   unsigned int i;
323 
324   /* A negative value is a UTF error; otherwise all error codes are greater
325   than COMPILE_ERROR_BASE, but check, just in case. */
326 
327   if (errorcode < COMPILE_ERROR_BASE) return REG_BADPAT;
328   errorcode -= COMPILE_ERROR_BASE;
329 
330   if (errorcode < (int)(sizeof(eint1)/sizeof(const int)))
331     return eint1[errorcode];
332   for (i = 0; i < sizeof(eint2)/sizeof(const int); i += 2)
333     if (errorcode == eint2[i]) return eint2[i+1];
334   return REG_BADPAT;
335   }
336 
337 (void)pcre2_pattern_info((const pcre2_code *)preg->re_pcre2_code,
338   PCRE2_INFO_CAPTURECOUNT, &re_nsub);
339 preg->re_nsub = (size_t)re_nsub;
340 preg->re_match_data = pcre2_match_data_create(re_nsub + 1, NULL);
341 preg->re_erroffset = (size_t)(-1);  /* No meaning after successful compile */
342 
343 if (preg->re_match_data == NULL)
344   {
345   pcre2_code_free(preg->re_pcre2_code);
346   return REG_ESPACE;
347   }
348 
349 return 0;
350 }
351 
352 
353 
354 /*************************************************
355 *              Match a regular expression        *
356 *************************************************/
357 
358 /* A suitable match_data block, large enough to hold all possible captures, was
359 obtained when the pattern was compiled, to save having to allocate and free it
360 for each match. If REG_NOSUB was specified at compile time, the nmatch and
361 pmatch arguments are ignored, and the only result is yes/no/error. */
362 
363 PCRE2POSIX_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_regexec(const regex_t * preg,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)364 pcre2_regexec(const regex_t *preg, const char *string, size_t nmatch,
365   regmatch_t pmatch[], int eflags)
366 {
367 int rc, so, eo;
368 int options = 0;
369 pcre2_match_data *md = (pcre2_match_data *)preg->re_match_data;
370 
371 if (string == NULL) return REG_INVARG;
372 
373 if ((eflags & REG_NOTBOL) != 0) options |= PCRE2_NOTBOL;
374 if ((eflags & REG_NOTEOL) != 0) options |= PCRE2_NOTEOL;
375 if ((eflags & REG_NOTEMPTY) != 0) options |= PCRE2_NOTEMPTY;
376 
377 /* When REG_NOSUB was specified, or if no vector has been passed in which to
378 put captured strings, ensure that nmatch is zero. This will stop any attempt to
379 write to pmatch. */
380 
381 if ((preg->re_cflags & REG_NOSUB) != 0 || pmatch == NULL) nmatch = 0;
382 
383 /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings.
384 The man page from OS X says "REG_STARTEND affects only the location of the
385 string, not how it is matched". That is why the "so" value is used to bump the
386 start location rather than being passed as a PCRE2 "starting offset". */
387 
388 if ((eflags & REG_STARTEND) != 0)
389   {
390   if (pmatch == NULL) return REG_INVARG;
391   so = pmatch[0].rm_so;
392   eo = pmatch[0].rm_eo;
393   }
394 else
395   {
396   so = 0;
397   eo = (int)strlen(string);
398   }
399 
400 rc = pcre2_match((const pcre2_code *)preg->re_pcre2_code,
401   (PCRE2_SPTR)string + so, (eo - so), 0, options, md, NULL);
402 
403 /* Successful match */
404 
405 if (rc >= 0)
406   {
407   size_t i;
408   PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(md);
409   if ((size_t)rc > nmatch) rc = (int)nmatch;
410   for (i = 0; i < (size_t)rc; i++)
411     {
412     pmatch[i].rm_so = (ovector[i*2] == PCRE2_UNSET)? -1 :
413       (int)(ovector[i*2] + so);
414     pmatch[i].rm_eo = (ovector[i*2+1] == PCRE2_UNSET)? -1 :
415       (int)(ovector[i*2+1] + so);
416     }
417   for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
418   return 0;
419   }
420 
421 /* Unsuccessful match */
422 
423 if (rc <= PCRE2_ERROR_UTF8_ERR1 && rc >= PCRE2_ERROR_UTF8_ERR21)
424   return REG_INVARG;
425 
426 switch(rc)
427   {
428   default: return REG_ASSERT;
429   case PCRE2_ERROR_BADMODE: return REG_INVARG;
430   case PCRE2_ERROR_BADMAGIC: return REG_INVARG;
431   case PCRE2_ERROR_BADOPTION: return REG_INVARG;
432   case PCRE2_ERROR_BADUTFOFFSET: return REG_INVARG;
433   case PCRE2_ERROR_MATCHLIMIT: return REG_ESPACE;
434   case PCRE2_ERROR_NOMATCH: return REG_NOMATCH;
435   case PCRE2_ERROR_NOMEMORY: return REG_ESPACE;
436   case PCRE2_ERROR_NULL: return REG_INVARG;
437   }
438 }
439 
440 /* End of pcre2posix.c */
441