• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GRegex -- regular expression API wrapper around PCRE.
2  *
3  * Copyright (C) 1999, 2000 Scott Wimer
4  * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with this library; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "config.h"
22 
23 #include <string.h>
24 
25 #define PCRE2_CODE_UNIT_WIDTH 8
26 #include <pcre2.h>
27 
28 #include "gtypes.h"
29 #include "gregex.h"
30 #include "glibintl.h"
31 #include "glist.h"
32 #include "gmessages.h"
33 #include "gstrfuncs.h"
34 #include "gatomic.h"
35 #include "gthread.h"
36 
37 /**
38  * SECTION:gregex
39  * @title: Perl-compatible regular expressions
40  * @short_description: matches strings against regular expressions
41  * @see_also: [Regular expression syntax][glib-regex-syntax]
42  *
43  * The g_regex_*() functions implement regular
44  * expression pattern matching using syntax and semantics similar to
45  * Perl regular expressions.
46  *
47  * Some functions accept a @start_position argument; setting it differs
48  * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL
49  * in the case of a pattern that begins with any kind of lookbehind assertion.
50  * For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
51  * in the middle of words. ("\B" matches only if the current position in the
52  * subject is not a word boundary.) When applied to the string "Mississipi"
53  * from the fourth byte, namely "issipi", it does not match, because "\B" is
54  * always false at the start of the subject, which is deemed to be a word
55  * boundary. However, if the entire string is passed, but with
56  * @start_position set to 4, it finds the second occurrence of "iss" because
57  * it is able to look behind the starting point to discover that it is
58  * preceded by a letter.
59  *
60  * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
61  * to these functions must be encoded in UTF-8. The lengths and the positions
62  * inside the strings are in bytes and not in characters, so, for instance,
63  * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a
64  * single character. If you set #G_REGEX_RAW the strings can be non-valid
65  * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
66  * bytes and two characters long.
67  *
68  * When matching a pattern, "\n" matches only against a "\n" character in
69  * the string, and "\r" matches only a "\r" character. To match any newline
70  * sequence use "\R". This particular group matches either the two-character
71  * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,
72  * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
73  * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line
74  * separator, U+2028), or PS (paragraph separator, U+2029).
75  *
76  * The behaviour of the dot, circumflex, and dollar metacharacters is
77  * affected by newline characters; the default is to recognize any newline
78  * character (the same characters recognized by "\R"). This can be changed
79  * with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF
80  * compile options, and with #G_REGEX_MATCH_NEWLINE_ANY,
81  * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and
82  * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also
83  * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an
84  * unescaped "#" outside a character class is encountered. This indicates
85  * a comment that lasts until after the next newline.
86  *
87  * When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern
88  * matching is changed to be compatible with the way that regular expressions
89  * work in JavaScript. More precisely, a lonely ']' character in the pattern
90  * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and
91  * you must use the '\u' escape sequence with 4 hex digits to specify a unicode
92  * codepoint instead of '\x' or '\x{....}'. If '\x' or '\u' are not followed by
93  * the specified number of hex digits, they match 'x' and 'u' literally; also
94  * '\U' always matches 'U' instead of being an error in the pattern. Finally,
95  * pattern matching is modified so that back references to an unset subpattern
96  * group produce a match with the empty string instead of an error. See
97  * pcreapi(3) for more information.
98  *
99  * Creating and manipulating the same #GRegex structure from different
100  * threads is not a problem as #GRegex does not modify its internal
101  * state between creation and destruction, on the other hand #GMatchInfo
102  * is not threadsafe.
103  *
104  * The regular expressions low-level functionalities are obtained through
105  * the excellent
106  * [PCRE](http://www.pcre.org/)
107  * library written by Philip Hazel.
108  */
109 
/* Marker bit carried by a flag set once it has been converted from the
 * pcre1-based values to the pcre2 ones. */
#define G_REGEX_FLAGS_CONVERTED 0x04000000u

/* Every bit that may appear in a (converted) GRegexCompileFlags value. */
#define G_REGEX_COMPILE_MASK (PCRE2_CASELESS          \
                              | PCRE2_MULTILINE       \
                              | PCRE2_DOTALL          \
                              | PCRE2_EXTENDED        \
                              | PCRE2_ANCHORED        \
                              | PCRE2_DOLLAR_ENDONLY  \
                              | PCRE2_UNGREEDY        \
                              | PCRE2_UTF             \
                              | PCRE2_NO_AUTO_CAPTURE \
                              | PCRE2_FIRSTLINE       \
                              | PCRE2_DUPNAMES        \
                              | PCRE2_NEWLINE_CR      \
                              | PCRE2_NEWLINE_LF      \
                              | PCRE2_NEWLINE_CRLF    \
                              | PCRE2_NEWLINE_ANYCRLF \
                              | PCRE2_BSR_ANYCRLF     \
                              | G_REGEX_FLAGS_CONVERTED)

/* Compile flag bits that are NOT passed through to PCRE. */
#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF | G_REGEX_FLAGS_CONVERTED)

/* Compile flag bits that ARE passed through to PCRE: everything in
 * G_REGEX_COMPILE_MASK except the non-PCRE bits above. */
#define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK)
135 
/* Mask of all the possible values for GRegexMatchFlags (after conversion
 * to pcre2 values). Each flag is listed exactly once. */
#define G_REGEX_MATCH_MASK (PCRE2_ANCHORED |         \
                            PCRE2_NOTBOL |           \
                            PCRE2_NOTEOL |           \
                            PCRE2_NOTEMPTY |         \
                            PCRE2_NOTEMPTY_ATSTART | \
                            PCRE2_PARTIAL_SOFT |     \
                            PCRE2_PARTIAL_HARD |     \
                            PCRE2_NEWLINE_CR |       \
                            PCRE2_NEWLINE_LF |       \
                            PCRE2_NEWLINE_CRLF |     \
                            PCRE2_NEWLINE_ANY |      \
                            PCRE2_NEWLINE_ANYCRLF |  \
                            PCRE2_BSR_ANYCRLF |      \
                            PCRE2_BSR_UNICODE |      \
                            G_REGEX_FLAGS_CONVERTED)
153 
/* Step one character forwards / backwards from s. When the regex has
 * G_REGEX_RAW in its compile options a character is a single byte, so the
 * pointer moves by +/- 1; otherwise the g_utf8_ functions are used. */
#define NEXT_CHAR(re, s) \
  (((re)->compile_opts & G_REGEX_RAW) ? ((s) + 1) : g_utf8_next_char (s))
#define PREV_CHAR(re, s) \
  (((re)->compile_opts & G_REGEX_RAW) ? ((s) - 1) : g_utf8_prev_char (s))
162 
163 struct _GMatchInfo
164 {
165   gint ref_count;               /* the ref count (atomic) */
166   GRegex *regex;                /* the regex */
167   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
168   gint matches;                 /* number of matching sub patterns */
169   gint pos;                     /* position in the string where last match left off */
170   gint  n_offsets;              /* number of offsets */
171   gint *offsets;                /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
172   gint *workspace;              /* workspace for pcre_dfa_exec() */
173   gint n_workspace;             /* number of workspace elements */
174   const gchar *string;          /* string passed to the match function */
175   gssize string_len;            /* length of string, in bytes */
176   pcre2_match_data *match_data;
177 };
178 
179 struct _GRegex
180 {
181   gint ref_count;               /* the ref count for the immutable part (atomic) */
182   gchar *pattern;               /* the pattern */
183   pcre2_code *pcre_re;          /* compiled form of the pattern */
184   GRegexCompileFlags compile_opts;      /* options used at compile time on the pattern */
185   GRegexMatchFlags match_opts;  /* options used at match time on the regex */
186 };
187 
/* Non-zero when ret is an error code: any value below PCRE2_ERROR_NOMATCH
 * other than PCRE2_ERROR_PARTIAL. Zero otherwise. */
#define IS_PCRE_ERROR(ret) \
  ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
190 
191 typedef struct _InterpolationData InterpolationData;
192 static gboolean  interpolation_list_needs_match (GList *list);
193 static gboolean  interpolate_replacement        (const GMatchInfo *match_info,
194                                                  GString *result,
195                                                  gpointer data);
196 static GList    *split_replacement              (const gchar *replacement,
197                                                  GError **error);
198 static void      free_interpolation_data        (InterpolationData *data);
199 
200 static gint
map_to_pcre2_compile_flags(gint pcre1_flags)201 map_to_pcre2_compile_flags (gint pcre1_flags)
202 {
203   /* Maps compile flags from pcre1 to pcre2 values
204    */
205   gint pcre2_flags = G_REGEX_FLAGS_CONVERTED;
206 
207   if (pcre1_flags & G_REGEX_FLAGS_CONVERTED)
208     return pcre1_flags;
209 
210   if (pcre1_flags & G_REGEX_CASELESS)
211     pcre2_flags |= PCRE2_CASELESS;
212   if (pcre1_flags & G_REGEX_MULTILINE)
213     pcre2_flags |= PCRE2_MULTILINE;
214   if (pcre1_flags & G_REGEX_DOTALL)
215     pcre2_flags |= PCRE2_DOTALL;
216   if (pcre1_flags & G_REGEX_EXTENDED)
217     pcre2_flags |= PCRE2_EXTENDED;
218   if (pcre1_flags & G_REGEX_ANCHORED)
219     pcre2_flags |= PCRE2_ANCHORED;
220   if (pcre1_flags & G_REGEX_DOLLAR_ENDONLY)
221     pcre2_flags |= PCRE2_DOLLAR_ENDONLY;
222   if (pcre1_flags & G_REGEX_UNGREEDY)
223     pcre2_flags |= PCRE2_UNGREEDY;
224   if (pcre1_flags & G_REGEX_RAW)
225     pcre2_flags |= PCRE2_UTF;
226   if (pcre1_flags & G_REGEX_NO_AUTO_CAPTURE)
227     pcre2_flags |= PCRE2_NO_AUTO_CAPTURE;
228   if (pcre1_flags & G_REGEX_FIRSTLINE)
229     pcre2_flags |= PCRE2_FIRSTLINE;
230   if (pcre1_flags & G_REGEX_DUPNAMES)
231     pcre2_flags |= PCRE2_DUPNAMES;
232   if (pcre1_flags & G_REGEX_NEWLINE_CR)
233     pcre2_flags |= PCRE2_NEWLINE_CR;
234   if (pcre1_flags & G_REGEX_NEWLINE_LF)
235     pcre2_flags |= PCRE2_NEWLINE_LF;
236   if ((pcre1_flags & G_REGEX_NEWLINE_CRLF) == G_REGEX_NEWLINE_CRLF)
237     pcre2_flags |= PCRE2_NEWLINE_CRLF;
238   if ((pcre1_flags & G_REGEX_NEWLINE_ANYCRLF) == G_REGEX_NEWLINE_ANYCRLF)
239     pcre2_flags |= PCRE2_NEWLINE_ANYCRLF;
240   if (pcre1_flags & G_REGEX_BSR_ANYCRLF)
241     pcre2_flags |= PCRE2_BSR_ANYCRLF;
242 
243   /* these are not available in pcre2 */
244   if (pcre1_flags & G_REGEX_OPTIMIZE)
245     pcre2_flags |= 0;
246   if (pcre1_flags & G_REGEX_JAVASCRIPT_COMPAT)
247     pcre2_flags |= 0;
248 
249   return pcre2_flags;
250 }
251 
252 static gint
map_to_pcre2_match_flags(gint pcre1_flags)253 map_to_pcre2_match_flags (gint pcre1_flags)
254 {
255   /* Maps match flags from pcre1 to pcre2 values
256    */
257   gint pcre2_flags = G_REGEX_FLAGS_CONVERTED;
258 
259   if (pcre1_flags & G_REGEX_FLAGS_CONVERTED)
260     return pcre1_flags;
261 
262   if (pcre1_flags & G_REGEX_MATCH_ANCHORED)
263     pcre2_flags |= PCRE2_ANCHORED;
264   if (pcre1_flags & G_REGEX_MATCH_NOTBOL)
265     pcre2_flags |= PCRE2_NOTBOL;
266   if (pcre1_flags & G_REGEX_MATCH_NOTEOL)
267     pcre2_flags |= PCRE2_NOTEOL;
268   if (pcre1_flags & G_REGEX_MATCH_NOTEMPTY)
269     pcre2_flags |= PCRE2_NOTEMPTY;
270   if (pcre1_flags & G_REGEX_MATCH_PARTIAL)
271     pcre2_flags |= PCRE2_PARTIAL_SOFT;
272   if (pcre1_flags & G_REGEX_MATCH_NEWLINE_CR)
273     pcre2_flags |= PCRE2_NEWLINE_CR;
274   if (pcre1_flags & G_REGEX_MATCH_NEWLINE_LF)
275     pcre2_flags |= PCRE2_NEWLINE_LF;
276   if ((pcre1_flags & G_REGEX_MATCH_NEWLINE_CRLF) == G_REGEX_MATCH_NEWLINE_CRLF)
277     pcre2_flags |= PCRE2_NEWLINE_CRLF;
278   if (pcre1_flags & G_REGEX_MATCH_NEWLINE_ANY)
279     pcre2_flags |= PCRE2_NEWLINE_ANY;
280   if ((pcre1_flags & G_REGEX_MATCH_NEWLINE_ANYCRLF) == G_REGEX_MATCH_NEWLINE_ANYCRLF)
281     pcre2_flags |= PCRE2_NEWLINE_ANYCRLF;
282   if (pcre1_flags & G_REGEX_MATCH_BSR_ANYCRLF)
283     pcre2_flags |= PCRE2_BSR_ANYCRLF;
284   if (pcre1_flags & G_REGEX_MATCH_BSR_ANY)
285     pcre2_flags |= PCRE2_BSR_UNICODE;
286   if (pcre1_flags & G_REGEX_MATCH_PARTIAL_SOFT)
287     pcre2_flags |= PCRE2_PARTIAL_SOFT;
288   if (pcre1_flags & G_REGEX_MATCH_PARTIAL_HARD)
289     pcre2_flags |= PCRE2_PARTIAL_HARD;
290   if (pcre1_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART)
291     pcre2_flags |= PCRE2_NOTEMPTY_ATSTART;
292   if (pcre1_flags & G_REGEX_RAW)
293     pcre2_flags |= PCRE2_UTF;
294 
295   return pcre2_flags;
296 }
297 
298 static gint
map_to_pcre1_compile_flags(gint pcre2_flags)299 map_to_pcre1_compile_flags (gint pcre2_flags)
300 {
301   /* Maps compile flags from pcre2 to pcre1 values
302    */
303   gint pcre1_flags = 0;
304 
305   if (!(pcre2_flags & G_REGEX_FLAGS_CONVERTED))
306     return pcre2_flags;
307 
308   if (pcre2_flags & PCRE2_CASELESS)
309     pcre1_flags |= G_REGEX_CASELESS;
310   if (pcre2_flags & PCRE2_MULTILINE)
311     pcre1_flags |= G_REGEX_MULTILINE;
312   if (pcre2_flags & PCRE2_DOTALL)
313     pcre1_flags |= G_REGEX_DOTALL;
314   if (pcre2_flags & PCRE2_EXTENDED)
315     pcre1_flags |= G_REGEX_EXTENDED;
316   if (pcre2_flags & PCRE2_ANCHORED)
317     pcre1_flags |= G_REGEX_ANCHORED;
318   if (pcre2_flags & PCRE2_DOLLAR_ENDONLY)
319     pcre1_flags |= G_REGEX_DOLLAR_ENDONLY;
320   if (pcre2_flags & PCRE2_UNGREEDY)
321     pcre1_flags |= G_REGEX_UNGREEDY;
322   if (pcre2_flags & PCRE2_UTF)
323     pcre1_flags |= G_REGEX_RAW;
324   if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE)
325     pcre1_flags |= G_REGEX_NO_AUTO_CAPTURE;
326   if (pcre2_flags & PCRE2_FIRSTLINE)
327     pcre1_flags |= G_REGEX_FIRSTLINE;
328   if (pcre2_flags & PCRE2_DUPNAMES)
329     pcre1_flags |= G_REGEX_DUPNAMES;
330   if (pcre2_flags & PCRE2_NEWLINE_CR)
331     pcre1_flags |= G_REGEX_NEWLINE_CR;
332   if (pcre2_flags & PCRE2_NEWLINE_LF)
333     pcre1_flags |= G_REGEX_NEWLINE_LF;
334   if ((pcre2_flags & PCRE2_NEWLINE_CRLF) == PCRE2_NEWLINE_CRLF)
335     pcre1_flags |= G_REGEX_NEWLINE_CRLF;
336   if ((pcre2_flags & PCRE2_NEWLINE_ANYCRLF) == PCRE2_NEWLINE_ANYCRLF)
337     pcre1_flags |= G_REGEX_NEWLINE_ANYCRLF;
338   if (pcre2_flags & PCRE2_BSR_ANYCRLF)
339     pcre1_flags |= G_REGEX_BSR_ANYCRLF;
340 
341   return pcre1_flags;
342 }
343 
344 static gint
map_to_pcre1_match_flags(gint pcre2_flags)345 map_to_pcre1_match_flags (gint pcre2_flags)
346 {
347   /* Maps match flags from pcre2 to pcre1 values
348    */
349   gint pcre1_flags = 0;
350 
351   if (!(pcre2_flags & G_REGEX_FLAGS_CONVERTED))
352     return pcre2_flags;
353 
354   if (pcre2_flags & PCRE2_ANCHORED)
355     pcre1_flags |= G_REGEX_MATCH_ANCHORED;
356   if (pcre2_flags & PCRE2_NOTBOL)
357     pcre1_flags |= G_REGEX_MATCH_NOTBOL;
358   if (pcre2_flags & PCRE2_NOTEOL)
359     pcre1_flags |= G_REGEX_MATCH_NOTEOL;
360   if (pcre2_flags & PCRE2_NOTEMPTY)
361     pcre1_flags |= G_REGEX_MATCH_NOTEMPTY;
362   if (pcre2_flags & PCRE2_PARTIAL_SOFT)
363     pcre1_flags |= G_REGEX_MATCH_PARTIAL;
364   if (pcre2_flags & PCRE2_NEWLINE_CR)
365     pcre1_flags |= G_REGEX_MATCH_NEWLINE_CR;
366   if (pcre2_flags & PCRE2_NEWLINE_LF)
367     pcre1_flags |= G_REGEX_MATCH_NEWLINE_LF;
368   if ((pcre2_flags & PCRE2_NEWLINE_CRLF) == PCRE2_NEWLINE_CRLF)
369     pcre1_flags |= G_REGEX_MATCH_NEWLINE_CRLF;
370   if (pcre2_flags & PCRE2_NEWLINE_ANY)
371     pcre1_flags |= G_REGEX_MATCH_NEWLINE_ANY;
372   if ((pcre2_flags & PCRE2_NEWLINE_ANYCRLF) == PCRE2_NEWLINE_ANYCRLF)
373     pcre1_flags |= G_REGEX_MATCH_NEWLINE_ANYCRLF;
374   if (pcre2_flags & PCRE2_BSR_ANYCRLF)
375     pcre1_flags |= G_REGEX_MATCH_BSR_ANYCRLF;
376   if (pcre2_flags & PCRE2_BSR_UNICODE)
377     pcre1_flags |= G_REGEX_MATCH_BSR_ANY;
378   if (pcre2_flags & PCRE2_PARTIAL_SOFT)
379     pcre1_flags |= G_REGEX_MATCH_PARTIAL_SOFT;
380   if (pcre2_flags & PCRE2_PARTIAL_HARD)
381     pcre1_flags |= G_REGEX_MATCH_PARTIAL_HARD;
382   if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART)
383     pcre1_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART;
384   if (pcre2_flags & PCRE2_UTF)
385     pcre1_flags |= G_REGEX_RAW;
386 
387   return pcre1_flags;
388 }
389 
390 static gint
map_to_gregex_error(gint pcre2_error)391 map_to_gregex_error (gint pcre2_error)
392 {
393   /* Maps error codes from pcre2 to gregex values (which were based on pcre1)
394    */
395   switch (pcre2_error)
396     {
397     case PCRE2_ERROR_END_BACKSLASH:
398       return G_REGEX_ERROR_STRAY_BACKSLASH;
399     case PCRE2_ERROR_END_BACKSLASH_C:
400       return G_REGEX_ERROR_MISSING_CONTROL_CHAR;
401     case PCRE2_ERROR_UNKNOWN_ESCAPE:
402       return G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
403     case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER:
404       return G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER;
405     case PCRE2_ERROR_QUANTIFIER_TOO_BIG:
406       return G_REGEX_ERROR_QUANTIFIER_TOO_BIG;
407     case PCRE2_ERROR_MISSING_SQUARE_BRACKET:
408       return G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS;
409     case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS:
410       return G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS;
411     case PCRE2_ERROR_CLASS_RANGE_ORDER:
412       return G_REGEX_ERROR_RANGE_OUT_OF_ORDER;
413     case PCRE2_ERROR_QUANTIFIER_INVALID:
414       return G_REGEX_ERROR_NOTHING_TO_REPEAT;
415     case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT:
416       return G_REGEX_ERROR_NOTHING_TO_REPEAT;
417     case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY:
418       return G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
419     case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS:
420       return G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS;
421     case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING:
422       return G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED;
423     case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS:
424       return G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
425     case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE:
426       return G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE;
427     case PCRE2_ERROR_MISSING_COMMENT_CLOSING:
428       return G_REGEX_ERROR_UNTERMINATED_COMMENT;
429     case PCRE2_ERROR_PATTERN_TOO_LARGE:
430       return G_REGEX_ERROR_EXPRESSION_TOO_LARGE;
431     case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS:
432       return G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
433     case PCRE2_ERROR_MISSING_CONDITION_CLOSING:
434       return G_REGEX_ERROR_MALFORMED_CONDITION;
435     case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH:
436       return G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND;
437     case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES:
438       return G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES;
439     case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED:
440       return G_REGEX_ERROR_ASSERTION_EXPECTED;
441     case PCRE2_ERROR_BAD_RELATIVE_REFERENCE:
442       return G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE;
443     case PCRE2_ERROR_UNKNOWN_POSIX_CLASS:
444       return G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME;
445     case PCRE2_ERROR_CODE_POINT_TOO_BIG:
446       return G_REGEX_ERROR_HEX_CODE_TOO_LARGE;
447     case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C:
448       return G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND;
449     case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE:
450       return G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
451     case PCRE2_ERROR_MISSING_NAME_TERMINATOR:
452       return G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR;
453     case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME:
454       return G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME;
455     case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY:
456       return G_REGEX_ERROR_MALFORMED_PROPERTY;
457     case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY:
458       return G_REGEX_ERROR_UNKNOWN_PROPERTY;
459     case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG:
460       return G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG;
461     case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS:
462       return G_REGEX_ERROR_TOO_MANY_SUBPATTERNS;
463     case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG:
464       return G_REGEX_ERROR_INVALID_OCTAL_VALUE;
465     case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES:
466       return G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE;
467     case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE:
468       return G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS;
469     case PCRE2_ERROR_BACKSLASH_G_SYNTAX:
470       return G_REGEX_ERROR_MISSING_BACK_REFERENCE;
471     case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING:
472       return G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
473     case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED:
474       return G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN;
475     case PCRE2_ERROR_VERB_UNKNOWN:
476       return G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB;
477     case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG:
478       return G_REGEX_ERROR_NUMBER_TOO_BIG;
479     case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED:
480       return G_REGEX_ERROR_MISSING_SUBPATTERN_NAME;
481     case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH:
482       return G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME;
483     case PCRE2_ERROR_MARK_MISSING_ARGUMENT:
484       return G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED;
485     case PCRE2_ERROR_INVALID_HEXADECIMAL:
486       return G_REGEX_ERROR_HEX_CODE_TOO_LARGE;
487     case PCRE2_ERROR_BACKSLASH_C_SYNTAX:
488       return G_REGEX_ERROR_INVALID_CONTROL_CHAR;
489     case PCRE2_ERROR_BACKSLASH_K_SYNTAX:
490       return G_REGEX_ERROR_MISSING_NAME;
491     case PCRE2_ERROR_BACKSLASH_N_IN_CLASS:
492       return G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS;
493     case PCRE2_ERROR_VERB_NAME_TOO_LONG:
494       return G_REGEX_ERROR_NAME_TOO_LONG;
495     case PCRE2_ERROR_NULL_PATTERN:
496     case PCRE2_ERROR_BAD_OPTIONS:
497     case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP:
498     case PCRE2_ERROR_HEAP_FAILED:
499     case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW:
500     case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE:
501     case PCRE2_ERROR_INTERNAL_STUDY_ERROR:
502     case PCRE2_ERROR_UNICODE_NOT_SUPPORTED:
503     case PCRE2_ERROR_PARENTHESES_STACK_CHECK:
504     case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED:
505     case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG:
506     case PCRE2_ERROR_MISSING_CALLOUT_CLOSING:
507     case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB:
508     case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P:
509     case PCRE2_ERROR_INVALID_SUBPATTERN_NAME:
510     case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE:
511     case PCRE2_ERROR_CLASS_INVALID_RANGE:
512     case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE:
513     case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN:
514     case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE:
515     case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW:
516     case PCRE2_ERROR_INVALID_OCTAL:
517     case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS:
518     case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG:
519     case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT:
520     case PCRE2_ERROR_UTF_IS_DISABLED:
521     case PCRE2_ERROR_UCP_IS_DISABLED:
522     case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG:
523     case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS:
524     case PCRE2_ERROR_VERSION_CONDITION_SYNTAX:
525     case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS:
526     case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER:
527     case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER:
528     case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED:
529     case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP:
530     case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED:
531     case PCRE2_ERROR_PATTERN_TOO_COMPLICATED:
532     case PCRE2_ERROR_LOOKBEHIND_TOO_LONG:
533     case PCRE2_ERROR_PATTERN_STRING_TOO_LONG:
534     case PCRE2_ERROR_INTERNAL_BAD_CODE:
535     case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP:
536     case PCRE2_ERROR_NO_SURROGATES_IN_UTF16:
537     case PCRE2_ERROR_BAD_LITERAL_OPTIONS:
538     default:
539       return G_REGEX_ERROR_COMPILE;
540     }
541 }
542 
543 static const gchar *
match_error(gint errcode)544 match_error (gint errcode)
545 {
546   switch (errcode)
547     {
548     case PCRE2_ERROR_NOMATCH:
549       /* not an error */
550       break;
551     case PCRE2_ERROR_NULL:
552       /* NULL argument, this should not happen in GRegex */
553       g_warning ("A NULL argument was passed to PCRE");
554       break;
555     case PCRE2_ERROR_BADOPTION:
556       return "bad options";
557     case PCRE2_ERROR_BADMAGIC:
558       return _("corrupted object");
559     case PCRE2_ERROR_NOMEMORY:
560       return _("out of memory");
561     case PCRE2_ERROR_NOSUBSTRING:
562       /* not used by pcre_exec() */
563       break;
564     case PCRE2_ERROR_MATCHLIMIT:
565       return _("backtracking limit reached");
566     case PCRE2_ERROR_CALLOUT:
567       /* callouts are not implemented */
568       break;
569     case PCRE2_ERROR_BADUTFOFFSET:
570       /* we do not check if strings are valid */
571       break;
572     case PCRE2_ERROR_PARTIAL:
573       /* not an error */
574       break;
575     case PCRE2_ERROR_INTERNAL:
576       return _("internal error");
577     case PCRE2_ERROR_DFA_UITEM:
578       return _("the pattern contains items not supported for partial matching");
579     case PCRE2_ERROR_DFA_UCOND:
580       return _("back references as conditions are not supported for partial matching");
581     case PCRE2_ERROR_DFA_WSSIZE:
582       /* handled expanding the workspace */
583       break;
584     case PCRE2_ERROR_DFA_RECURSE:
585     case PCRE2_ERROR_RECURSIONLIMIT:
586       return _("recursion limit reached");
587     case PCRE2_ERROR_BADOFFSET:
588       return _("bad offset");
589     case PCRE2_ERROR_RECURSELOOP:
590       return _("recursion loop");
591     default:
592       break;
593     }
594   return _("unknown error");
595 }
596 
597 static void
translate_compile_error(gint * errcode,const gchar ** errmsg)598 translate_compile_error (gint *errcode, const gchar **errmsg)
599 {
600   /* Compile errors are created adding 100 to the error code returned
601    * by PCRE.
602    * If errcode is known we put the translatable error message in
603    * erromsg. If errcode is unknown we put the generic
604    * G_REGEX_ERROR_COMPILE error code in errcode and keep the
605    * untranslated error message returned by PCRE.
606    * Note that there can be more PCRE errors with the same GRegexError
607    * and that some PCRE errors are useless for us.
608    */
609 
610   *errcode = map_to_gregex_error (*errcode);
611 
612   switch (*errcode)
613     {
614     case G_REGEX_ERROR_STRAY_BACKSLASH:
615       *errmsg = _("\\ at end of pattern");
616       break;
617     case G_REGEX_ERROR_MISSING_CONTROL_CHAR:
618       *errmsg = _("\\c at end of pattern");
619       break;
620     case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
621       *errmsg = _("unrecognized character following \\");
622       break;
623     case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
624       *errmsg = _("numbers out of order in {} quantifier");
625       break;
626     case G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
627       *errmsg = _("number too big in {} quantifier");
628       break;
629     case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
630       *errmsg = _("missing terminating ] for character class");
631       break;
632     case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
633       *errmsg = _("invalid escape sequence in character class");
634       break;
635     case G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
636       *errmsg = _("range out of order in character class");
637       break;
638     case G_REGEX_ERROR_NOTHING_TO_REPEAT:
639       *errmsg = _("nothing to repeat");
640       break;
641     case 111: /* internal error: unexpected repeat */
642       *errcode = G_REGEX_ERROR_INTERNAL;
643       *errmsg = _("unexpected repeat");
644       break;
645     case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
646       *errmsg = _("unrecognized character after (? or (?-");
647       break;
648     case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
649       *errmsg = _("POSIX named classes are supported only within a class");
650       break;
651     case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
652       *errmsg = _("missing terminating )");
653       break;
654     case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
655       *errmsg = _("reference to non-existent subpattern");
656       break;
657     case G_REGEX_ERROR_UNTERMINATED_COMMENT:
658       *errmsg = _("missing ) after comment");
659       break;
660     case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
661       *errmsg = _("regular expression is too large");
662       break;
663     case G_REGEX_ERROR_MEMORY_ERROR:
664       *errmsg = _("failed to get memory");
665       break;
666     case 122: /* unmatched parentheses */
667       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
668       *errmsg = _(") without opening (");
669       break;
670     case 123: /* internal error: code overflow */
671       *errcode = G_REGEX_ERROR_INTERNAL;
672       *errmsg = _("code overflow");
673       break;
674     case 124: /* "unrecognized character after (?<\0 */
675       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
676       *errmsg = _("unrecognized character after (?<");
677       break;
678     case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
679       *errmsg = _("lookbehind assertion is not fixed length");
680       break;
681     case G_REGEX_ERROR_MALFORMED_CONDITION:
682       *errmsg = _("malformed number or name after (?(");
683       break;
684     case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
685       *errmsg = _("conditional group contains more than two branches");
686       break;
687     case G_REGEX_ERROR_ASSERTION_EXPECTED:
688       *errmsg = _("assertion expected after (?(");
689       break;
690     case 129:
691       *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
692       /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
693        * sequences here, '(?-54' would be an example for the second group.
694        */
695       *errmsg = _("(?R or (?[+-]digits must be followed by )");
696       break;
697     case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
698       *errmsg = _("unknown POSIX class name");
699       break;
700     case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
701       *errmsg = _("POSIX collating elements are not supported");
702       break;
703     case G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
704       *errmsg = _("character value in \\x{...} sequence is too large");
705       break;
706     case G_REGEX_ERROR_INVALID_CONDITION:
707       *errmsg = _("invalid condition (?(0)");
708       break;
709     case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
710       *errmsg = _("\\C not allowed in lookbehind assertion");
711       break;
712     case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */
713       /* A number of Perl escapes are not handled by PCRE.
714        * Therefore it explicitly raises ERR37.
715        */
716       *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
717       *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported");
718       break;
719     case G_REGEX_ERROR_INFINITE_LOOP:
720       *errmsg = _("recursive call could loop indefinitely");
721       break;
722     case 141: /* unrecognized character after (?P\0 */
723       *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
724       *errmsg = _("unrecognized character after (?P");
725       break;
726     case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
727       *errmsg = _("missing terminator in subpattern name");
728       break;
729     case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
730       *errmsg = _("two named subpatterns have the same name");
731       break;
732     case G_REGEX_ERROR_MALFORMED_PROPERTY:
733       *errmsg = _("malformed \\P or \\p sequence");
734       break;
735     case G_REGEX_ERROR_UNKNOWN_PROPERTY:
736       *errmsg = _("unknown property name after \\P or \\p");
737       break;
738     case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
739       *errmsg = _("subpattern name is too long (maximum 32 characters)");
740       break;
741     case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
742       *errmsg = _("too many named subpatterns (maximum 10,000)");
743       break;
744     case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
745       *errmsg = _("octal value is greater than \\377");
746       break;
747     case 152: /* internal error: overran compiling workspace */
748       *errcode = G_REGEX_ERROR_INTERNAL;
749       *errmsg = _("overran compiling workspace");
750       break;
751     case 153: /* internal error: previously-checked referenced subpattern not found */
752       *errcode = G_REGEX_ERROR_INTERNAL;
753       *errmsg = _("previously-checked referenced subpattern not found");
754       break;
755     case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
756       *errmsg = _("DEFINE group contains more than one branch");
757       break;
758     case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
759       *errmsg = _("inconsistent NEWLINE options");
760       break;
761     case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
762       *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
763                   "number, or by a plain number");
764       break;
765     case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE:
766       *errmsg = _("a numbered reference must not be zero");
767       break;
768     case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN:
769       *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
770       break;
771     case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB:
772       *errmsg = _("(*VERB) not recognized");
773       break;
774     case G_REGEX_ERROR_NUMBER_TOO_BIG:
775       *errmsg = _("number is too big");
776       break;
777     case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME:
778       *errmsg = _("missing subpattern name after (?&");
779       break;
780     case G_REGEX_ERROR_MISSING_DIGIT:
781       *errmsg = _("digit expected after (?+");
782       break;
783     case G_REGEX_ERROR_INVALID_DATA_CHARACTER:
784       *errmsg = _("] is an invalid data character in JavaScript compatibility mode");
785       break;
786     case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
787       *errmsg = _("different names for subpatterns of the same number are not allowed");
788       break;
789     case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED:
790       *errmsg = _("(*MARK) must have an argument");
791       break;
792     case G_REGEX_ERROR_INVALID_CONTROL_CHAR:
793       *errmsg = _( "\\c must be followed by an ASCII character");
794       break;
795     case G_REGEX_ERROR_MISSING_NAME:
796       *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
797       break;
798     case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS:
799       *errmsg = _("\\N is not supported in a class");
800       break;
801     case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES:
802       *errmsg = _("too many forward references");
803       break;
804     case G_REGEX_ERROR_NAME_TOO_LONG:
805       *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
806       break;
807     case G_REGEX_ERROR_CHARACTER_VALUE_TOO_LARGE:
808       *errmsg = _("character value in \\u.... sequence is too large");
809       break;
810 
811     case 116: /* erroffset passed as NULL */
812       /* This should not happen as we never pass a NULL erroffset */
813       g_warning ("erroffset passed as NULL");
814       *errcode = G_REGEX_ERROR_COMPILE;
815       break;
816     case 117: /* unknown option bit(s) set */
817       /* This should not happen as we check options before passing them
818        * to pcre_compile2() */
819       g_warning ("unknown option bit(s) set");
820       *errcode = G_REGEX_ERROR_COMPILE;
821       break;
822     case 132: /* this version of PCRE is compiled without UTF support */
823     case 144: /* invalid UTF-8 string */
824     case 145: /* support for \\P, \\p, and \\X has not been compiled */
825     case 167: /* this version of PCRE is not compiled with Unicode property support */
826     case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */
827     case 174: /* invalid UTF-16 string */
828       /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
829        * and we do not check if strings are valid */
830     case 170: /* internal error: unknown opcode in find_fixedlength() */
831       *errcode = G_REGEX_ERROR_INTERNAL;
832       break;
833 
834     default:
835       *errcode = G_REGEX_ERROR_COMPILE;
836     }
837 }
838 
839 /* GMatchInfo */
840 
841 static GMatchInfo *
match_info_new(const GRegex * regex,const gchar * string,gint string_len,gint start_position,gint match_options,gboolean is_dfa)842 match_info_new (const GRegex *regex,
843                 const gchar  *string,
844                 gint          string_len,
845                 gint          start_position,
846                 gint          match_options,
847                 gboolean      is_dfa)
848 {
849   GMatchInfo *match_info;
850 
851   match_options = map_to_pcre2_match_flags (match_options);
852 
853   if (string_len < 0)
854     string_len = strlen (string);
855 
856   match_info = g_new0 (GMatchInfo, 1);
857   match_info->ref_count = 1;
858   match_info->regex = g_regex_ref ((GRegex *)regex);
859   match_info->string = string;
860   match_info->string_len = string_len;
861   match_info->matches = PCRE2_ERROR_NOMATCH;
862   match_info->pos = start_position;
863   match_info->match_opts = match_options;
864 
865   if (is_dfa)
866     {
867       /* These values should be enough for most cases, if they are not
868        * enough g_regex_match_all_full() will expand them. */
869       match_info->n_offsets = 24;
870       match_info->n_workspace = 100;
871       match_info->workspace = g_new (gint, match_info->n_workspace);
872     }
873   else
874     {
875       gint capture_count;
876       pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT,
877                           &capture_count);
878       match_info->n_offsets = (capture_count + 1) * 3;
879     }
880 
881   match_info->offsets = g_new0 (gint, match_info->n_offsets);
882   /* Set an invalid position for the previous match. */
883   match_info->offsets[0] = -1;
884   match_info->offsets[1] = -1;
885 
886   match_info->match_data = pcre2_match_data_create_from_pattern (
887       match_info->regex->pcre_re,
888       NULL);
889 
890   return match_info;
891 }
892 
893 /**
894  * g_match_info_get_regex:
895  * @match_info: a #GMatchInfo
896  *
897  * Returns #GRegex object used in @match_info. It belongs to Glib
898  * and must not be freed. Use g_regex_ref() if you need to keep it
899  * after you free @match_info object.
900  *
901  * Returns: #GRegex object used in @match_info
902  *
903  * Since: 2.14
904  */
905 GRegex *
g_match_info_get_regex(const GMatchInfo * match_info)906 g_match_info_get_regex (const GMatchInfo *match_info)
907 {
908   g_return_val_if_fail (match_info != NULL, NULL);
909   return match_info->regex;
910 }
911 
912 /**
913  * g_match_info_get_string:
914  * @match_info: a #GMatchInfo
915  *
916  * Returns the string searched with @match_info. This is the
917  * string passed to g_regex_match() or g_regex_replace() so
918  * you may not free it before calling this function.
919  *
920  * Returns: the string searched with @match_info
921  *
922  * Since: 2.14
923  */
924 const gchar *
g_match_info_get_string(const GMatchInfo * match_info)925 g_match_info_get_string (const GMatchInfo *match_info)
926 {
927   g_return_val_if_fail (match_info != NULL, NULL);
928   return match_info->string;
929 }
930 
931 /**
932  * g_match_info_ref:
933  * @match_info: a #GMatchInfo
934  *
935  * Increases reference count of @match_info by 1.
936  *
937  * Returns: @match_info
938  *
939  * Since: 2.30
940  */
941 GMatchInfo       *
g_match_info_ref(GMatchInfo * match_info)942 g_match_info_ref (GMatchInfo *match_info)
943 {
944   g_return_val_if_fail (match_info != NULL, NULL);
945   g_atomic_int_inc (&match_info->ref_count);
946   return match_info;
947 }
948 
949 /**
950  * g_match_info_unref:
951  * @match_info: a #GMatchInfo
952  *
953  * Decreases reference count of @match_info by 1. When reference count drops
954  * to zero, it frees all the memory associated with the match_info structure.
955  *
956  * Since: 2.30
957  */
958 void
g_match_info_unref(GMatchInfo * match_info)959 g_match_info_unref (GMatchInfo *match_info)
960 {
961   if (g_atomic_int_dec_and_test (&match_info->ref_count))
962     {
963       g_regex_unref (match_info->regex);
964       if (match_info->match_data)
965         pcre2_match_data_free (match_info->match_data);
966       g_free (match_info->offsets);
967       g_free (match_info->workspace);
968       g_free (match_info);
969     }
970 }
971 
972 /**
973  * g_match_info_free:
974  * @match_info: (nullable): a #GMatchInfo, or %NULL
975  *
976  * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
977  * nothing.
978  *
979  * Since: 2.14
980  */
981 void
g_match_info_free(GMatchInfo * match_info)982 g_match_info_free (GMatchInfo *match_info)
983 {
984   if (match_info == NULL)
985     return;
986 
987   g_match_info_unref (match_info);
988 }
989 
990 /**
991  * g_match_info_next:
992  * @match_info: a #GMatchInfo structure
993  * @error: location to store the error occurring, or %NULL to ignore errors
994  *
995  * Scans for the next match using the same parameters of the previous
996  * call to g_regex_match_full() or g_regex_match() that returned
997  * @match_info.
998  *
999  * The match is done on the string passed to the match function, so you
1000  * cannot free it before calling this function.
1001  *
1002  * Returns: %TRUE is the string matched, %FALSE otherwise
1003  *
1004  * Since: 2.14
1005  */
1006 gboolean
g_match_info_next(GMatchInfo * match_info,GError ** error)1007 g_match_info_next (GMatchInfo  *match_info,
1008                    GError     **error)
1009 {
1010   gint prev_match_start;
1011   gint prev_match_end;
1012   gint i;
1013   gint opts;
1014   PCRE2_SIZE *ovector;
1015 
1016   g_return_val_if_fail (match_info != NULL, FALSE);
1017   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1018   g_return_val_if_fail (match_info->pos >= 0, FALSE);
1019 
1020   prev_match_start = match_info->offsets[0];
1021   prev_match_end = match_info->offsets[1];
1022 
1023   if (match_info->pos > match_info->string_len)
1024     {
1025       /* we have reached the end of the string */
1026       match_info->pos = -1;
1027       match_info->matches = PCRE2_ERROR_NOMATCH;
1028       return FALSE;
1029     }
1030 
1031   opts = map_to_pcre2_match_flags (match_info->regex->match_opts | match_info->match_opts);
1032   match_info->matches = pcre2_match (match_info->regex->pcre_re,
1033                                      (PCRE2_SPTR)match_info->string,
1034                                      match_info->string_len,
1035                                      match_info->pos,
1036                                      opts & ~G_REGEX_FLAGS_CONVERTED,
1037                                      match_info->match_data,
1038                                      NULL);
1039 
1040   if (IS_PCRE_ERROR (match_info->matches))
1041     {
1042       g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1043                    _("Error while matching regular expression %s: %s"),
1044                    match_info->regex->pattern, match_error (match_info->matches));
1045       return FALSE;
1046     }
1047   else
1048     {
1049       match_info->n_offsets = pcre2_get_ovector_count (match_info->match_data) * 2;
1050       ovector = pcre2_get_ovector_pointer (match_info->match_data);
1051       match_info->offsets = g_realloc_n (match_info->offsets,
1052                                          match_info->n_offsets,
1053                                          sizeof (gint));
1054       for (i = 0; i < match_info->n_offsets; i++)
1055         {
1056           match_info->offsets[i] = (int) ovector[i];
1057         }
1058     }
1059 
1060   /* avoid infinite loops if the pattern is an empty string or something
1061    * equivalent */
1062   if (match_info->pos == match_info->offsets[1])
1063     {
1064       if (match_info->pos > match_info->string_len)
1065         {
1066           /* we have reached the end of the string */
1067           match_info->pos = -1;
1068           match_info->matches = PCRE2_ERROR_NOMATCH;
1069           return FALSE;
1070         }
1071 
1072       match_info->pos = NEXT_CHAR (match_info->regex,
1073                                    &match_info->string[match_info->pos]) -
1074                                    match_info->string;
1075     }
1076   else
1077     {
1078       match_info->pos = match_info->offsets[1];
1079     }
1080 
1081   /* it's possible to get two identical matches when we are matching
1082    * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
1083    * the string is "RegExTest" we have:
1084    *  - search at position 0: match from 0 to 0
1085    *  - search at position 1: match from 3 to 3
1086    *  - search at position 3: match from 3 to 3 (duplicate)
1087    *  - search at position 4: match from 5 to 5
1088    *  - search at position 5: match from 5 to 5 (duplicate)
1089    *  - search at position 6: no match -> stop
1090    * so we have to ignore the duplicates.
1091    * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
1092   if (match_info->matches >= 0 &&
1093       prev_match_start == match_info->offsets[0] &&
1094       prev_match_end == match_info->offsets[1])
1095     {
1096       /* ignore this match and search the next one */
1097       return g_match_info_next (match_info, error);
1098     }
1099 
1100   return match_info->matches >= 0;
1101 }
1102 
1103 /**
1104  * g_match_info_matches:
1105  * @match_info: a #GMatchInfo structure
1106  *
1107  * Returns whether the previous match operation succeeded.
1108  *
1109  * Returns: %TRUE if the previous match operation succeeded,
1110  *   %FALSE otherwise
1111  *
1112  * Since: 2.14
1113  */
1114 gboolean
g_match_info_matches(const GMatchInfo * match_info)1115 g_match_info_matches (const GMatchInfo *match_info)
1116 {
1117   g_return_val_if_fail (match_info != NULL, FALSE);
1118 
1119   return match_info->matches >= 0;
1120 }
1121 
1122 /**
1123  * g_match_info_get_match_count:
1124  * @match_info: a #GMatchInfo structure
1125  *
1126  * Retrieves the number of matched substrings (including substring 0,
1127  * that is the whole matched text), so 1 is returned if the pattern
1128  * has no substrings in it and 0 is returned if the match failed.
1129  *
1130  * If the last match was obtained using the DFA algorithm, that is
1131  * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
1132  * count is not that of the number of capturing parentheses but that of
1133  * the number of matched substrings.
1134  *
1135  * Returns: Number of matched substrings, or -1 if an error occurred
1136  *
1137  * Since: 2.14
1138  */
1139 gint
g_match_info_get_match_count(const GMatchInfo * match_info)1140 g_match_info_get_match_count (const GMatchInfo *match_info)
1141 {
1142   g_return_val_if_fail (match_info, -1);
1143 
1144   if (match_info->matches == PCRE2_ERROR_NOMATCH)
1145     /* no match */
1146     return 0;
1147   else if (match_info->matches < PCRE2_ERROR_NOMATCH)
1148     /* error */
1149     return -1;
1150   else
1151     /* match */
1152     return match_info->matches;
1153 }
1154 
1155 /**
1156  * g_match_info_is_partial_match:
1157  * @match_info: a #GMatchInfo structure
1158  *
1159  * Usually if the string passed to g_regex_match*() matches as far as
1160  * it goes, but is too short to match the entire pattern, %FALSE is
1161  * returned. There are circumstances where it might be helpful to
1162  * distinguish this case from other cases in which there is no match.
1163  *
1164  * Consider, for example, an application where a human is required to
1165  * type in data for a field with specific formatting requirements. An
1166  * example might be a date in the form ddmmmyy, defined by the pattern
1167  * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
1168  * If the application sees the user’s keystrokes one by one, and can
1169  * check that what has been typed so far is potentially valid, it is
1170  * able to raise an error as soon as a mistake is made.
1171  *
1172  * GRegex supports the concept of partial matching by means of the
1173  * #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD flags.
1174  * When they are used, the return code for
1175  * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
1176  * for a complete match, %FALSE otherwise. But, when these functions
1177  * return %FALSE, you can check if the match was partial calling
1178  * g_match_info_is_partial_match().
1179  *
1180  * The difference between #G_REGEX_MATCH_PARTIAL_SOFT and
1181  * #G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered
1182  * with #G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a
1183  * possible complete match, while with #G_REGEX_MATCH_PARTIAL_HARD matching
1184  * stops at the partial match.
1185  * When both #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD
1186  * are set, the latter takes precedence.
1187  *
1188  * There were formerly some restrictions on the pattern for partial matching.
1189  * The restrictions no longer apply.
1190  *
1191  * See pcre2partial(3) for more information on partial matching.
1192  *
1193  * Returns: %TRUE if the match was partial, %FALSE otherwise
1194  *
1195  * Since: 2.14
1196  */
1197 gboolean
g_match_info_is_partial_match(const GMatchInfo * match_info)1198 g_match_info_is_partial_match (const GMatchInfo *match_info)
1199 {
1200   g_return_val_if_fail (match_info != NULL, FALSE);
1201 
1202   return match_info->matches == PCRE2_ERROR_PARTIAL;
1203 }
1204 
1205 /**
1206  * g_match_info_expand_references:
1207  * @match_info: (nullable): a #GMatchInfo or %NULL
1208  * @string_to_expand: the string to expand
1209  * @error: location to store the error occurring, or %NULL to ignore errors
1210  *
1211  * Returns a new string containing the text in @string_to_expand with
1212  * references and escape sequences expanded. References refer to the last
1213  * match done with @string against @regex and have the same syntax used by
1214  * g_regex_replace().
1215  *
1216  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
1217  * passed to g_regex_new().
1218  *
1219  * The backreferences are extracted from the string passed to the match
1220  * function, so you cannot call this function after freeing the string.
1221  *
1222  * @match_info may be %NULL in which case @string_to_expand must not
1223  * contain references. For instance "foo\n" does not refer to an actual
1224  * pattern and '\n' merely will be replaced with \n character,
1225  * while to expand "\0" (whole match) one needs the result of a match.
1226  * Use g_regex_check_replacement() to find out whether @string_to_expand
1227  * contains references.
1228  *
1229  * Returns: (nullable): the expanded string, or %NULL if an error occurred
1230  *
1231  * Since: 2.14
1232  */
1233 gchar *
g_match_info_expand_references(const GMatchInfo * match_info,const gchar * string_to_expand,GError ** error)1234 g_match_info_expand_references (const GMatchInfo  *match_info,
1235                                 const gchar       *string_to_expand,
1236                                 GError           **error)
1237 {
1238   GString *result;
1239   GList *list;
1240   GError *tmp_error = NULL;
1241 
1242   g_return_val_if_fail (string_to_expand != NULL, NULL);
1243   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1244 
1245   list = split_replacement (string_to_expand, &tmp_error);
1246   if (tmp_error != NULL)
1247     {
1248       g_propagate_error (error, tmp_error);
1249       return NULL;
1250     }
1251 
1252   if (!match_info && interpolation_list_needs_match (list))
1253     {
1254       g_critical ("String '%s' contains references to the match, can't "
1255                   "expand references without GMatchInfo object",
1256                   string_to_expand);
1257       return NULL;
1258     }
1259 
1260   result = g_string_sized_new (strlen (string_to_expand));
1261   interpolate_replacement (match_info, result, list);
1262 
1263   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
1264 
1265   return g_string_free (result, FALSE);
1266 }
1267 
1268 /**
1269  * g_match_info_fetch:
1270  * @match_info: #GMatchInfo structure
1271  * @match_num: number of the sub expression
1272  *
1273  * Retrieves the text matching the @match_num'th capturing
1274  * parentheses. 0 is the full text of the match, 1 is the first paren
1275  * set, 2 the second, and so on.
1276  *
1277  * If @match_num is a valid sub pattern but it didn't match anything
1278  * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
1279  * string is returned.
1280  *
1281  * If the match was obtained using the DFA algorithm, that is using
1282  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1283  * string is not that of a set of parentheses but that of a matched
1284  * substring. Substrings are matched in reverse order of length, so
1285  * 0 is the longest match.
1286  *
1287  * The string is fetched from the string passed to the match function,
1288  * so you cannot call this function after freeing the string.
1289  *
1290  * Returns: (nullable): The matched substring, or %NULL if an error
1291  *     occurred. You have to free the string yourself
1292  *
1293  * Since: 2.14
1294  */
1295 gchar *
g_match_info_fetch(const GMatchInfo * match_info,gint match_num)1296 g_match_info_fetch (const GMatchInfo *match_info,
1297                     gint              match_num)
1298 {
1299   /* we cannot use pcre_get_substring() because it allocates the
1300    * string using pcre_malloc(). */
1301   gchar *match = NULL;
1302   gint start, end;
1303 
1304   g_return_val_if_fail (match_info != NULL, NULL);
1305   g_return_val_if_fail (match_num >= 0, NULL);
1306 
1307   /* match_num does not exist or it didn't matched, i.e. matching "b"
1308    * against "(a)?b" then group 0 is empty. */
1309   if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
1310     match = NULL;
1311   else if (start == -1)
1312     match = g_strdup ("");
1313   else
1314     match = g_strndup (&match_info->string[start], end - start);
1315 
1316   return match;
1317 }
1318 
1319 /**
1320  * g_match_info_fetch_pos:
1321  * @match_info: #GMatchInfo structure
1322  * @match_num: number of the sub expression
1323  * @start_pos: (out) (optional): pointer to location where to store
1324  *     the start position, or %NULL
1325  * @end_pos: (out) (optional): pointer to location where to store
1326  *     the end position, or %NULL
1327  *
1328  * Retrieves the position in bytes of the @match_num'th capturing
1329  * parentheses. 0 is the full text of the match, 1 is the first
1330  * paren set, 2 the second, and so on.
1331  *
1332  * If @match_num is a valid sub pattern but it didn't match anything
1333  * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
1334  * and @end_pos are set to -1 and %TRUE is returned.
1335  *
1336  * If the match was obtained using the DFA algorithm, that is using
1337  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1338  * position is not that of a set of parentheses but that of a matched
1339  * substring. Substrings are matched in reverse order of length, so
1340  * 0 is the longest match.
1341  *
1342  * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
1343  *   the position cannot be fetched, @start_pos and @end_pos are left
1344  *   unchanged
1345  *
1346  * Since: 2.14
1347  */
1348 gboolean
g_match_info_fetch_pos(const GMatchInfo * match_info,gint match_num,gint * start_pos,gint * end_pos)1349 g_match_info_fetch_pos (const GMatchInfo *match_info,
1350                         gint              match_num,
1351                         gint             *start_pos,
1352                         gint             *end_pos)
1353 {
1354   g_return_val_if_fail (match_info != NULL, FALSE);
1355   g_return_val_if_fail (match_num >= 0, FALSE);
1356 
1357   /* make sure the sub expression number they're requesting is less than
1358    * the total number of sub expressions that were matched. */
1359   if (match_num >= match_info->matches)
1360     return FALSE;
1361 
1362   if (start_pos != NULL)
1363     *start_pos = match_info->offsets[2 * match_num];
1364 
1365   if (end_pos != NULL)
1366     *end_pos = match_info->offsets[2 * match_num + 1];
1367 
1368   return TRUE;
1369 }
1370 
1371 /*
1372  * Returns number of first matched subpattern with name @name.
1373  * There may be more than one in case when DUPNAMES is used,
1374  * and not all subpatterns with that name match;
1375  * pcre_get_stringnumber() does not work in that case.
1376  */
1377 static gint
get_matched_substring_number(const GMatchInfo * match_info,const gchar * name)1378 get_matched_substring_number (const GMatchInfo *match_info,
1379                               const gchar      *name)
1380 {
1381   gint entrysize;
1382   PCRE2_SPTR first, last;
1383   guchar *entry;
1384 
1385   if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES))
1386     return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR)name);
1387 
1388   /* This code is copied from pcre_get.c: get_first_set() */
1389   entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re,
1390                                               (PCRE2_SPTR)name,
1391                                               &first,
1392                                               &last);
1393 
1394   if (entrysize <= 0)
1395     return entrysize;
1396 
1397   for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
1398     {
1399       gint n = (entry[0] << 8) + entry[1];
1400       if (match_info->offsets[n*2] >= 0)
1401         return n;
1402     }
1403 
1404   return (first[0] << 8) + first[1];
1405 }
1406 
1407 /**
1408  * g_match_info_fetch_named:
1409  * @match_info: #GMatchInfo structure
1410  * @name: name of the subexpression
1411  *
1412  * Retrieves the text matching the capturing parentheses named @name.
1413  *
1414  * If @name is a valid sub pattern name but it didn't match anything
1415  * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
1416  * then an empty string is returned.
1417  *
1418  * The string is fetched from the string passed to the match function,
1419  * so you cannot call this function after freeing the string.
1420  *
1421  * Returns: (nullable): The matched substring, or %NULL if an error
1422  *     occurred. You have to free the string yourself
1423  *
1424  * Since: 2.14
1425  */
1426 gchar *
g_match_info_fetch_named(const GMatchInfo * match_info,const gchar * name)1427 g_match_info_fetch_named (const GMatchInfo *match_info,
1428                           const gchar      *name)
1429 {
1430   /* we cannot use pcre_get_named_substring() because it allocates the
1431    * string using pcre_malloc(). */
1432   gint num;
1433 
1434   g_return_val_if_fail (match_info != NULL, NULL);
1435   g_return_val_if_fail (name != NULL, NULL);
1436 
1437   num = get_matched_substring_number (match_info, name);
1438   if (num < 0)
1439     return NULL;
1440   else
1441     return g_match_info_fetch (match_info, num);
1442 }
1443 
1444 /**
1445  * g_match_info_fetch_named_pos:
1446  * @match_info: #GMatchInfo structure
1447  * @name: name of the subexpression
1448  * @start_pos: (out) (optional): pointer to location where to store
1449  *     the start position, or %NULL
1450  * @end_pos: (out) (optional): pointer to location where to store
1451  *     the end position, or %NULL
1452  *
1453  * Retrieves the position in bytes of the capturing parentheses named @name.
1454  *
1455  * If @name is a valid sub pattern name but it didn't match anything
1456  * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
1457  * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1458  *
1459  * Returns: %TRUE if the position was fetched, %FALSE otherwise.
1460  *     If the position cannot be fetched, @start_pos and @end_pos
1461  *     are left unchanged.
1462  *
1463  * Since: 2.14
1464  */
1465 gboolean
g_match_info_fetch_named_pos(const GMatchInfo * match_info,const gchar * name,gint * start_pos,gint * end_pos)1466 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
1467                               const gchar      *name,
1468                               gint             *start_pos,
1469                               gint             *end_pos)
1470 {
1471   gint num;
1472 
1473   g_return_val_if_fail (match_info != NULL, FALSE);
1474   g_return_val_if_fail (name != NULL, FALSE);
1475 
1476   num = get_matched_substring_number (match_info, name);
1477   if (num < 0)
1478     return FALSE;
1479 
1480   return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
1481 }
1482 
1483 /**
1484  * g_match_info_fetch_all:
1485  * @match_info: a #GMatchInfo structure
1486  *
1487  * Bundles up pointers to each of the matching substrings from a match
1488  * and stores them in an array of gchar pointers. The first element in
1489  * the returned array is the match number 0, i.e. the entire matched
1490  * text.
1491  *
1492  * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1493  * "b" against "(a)?b") then an empty string is inserted.
1494  *
1495  * If the last match was obtained using the DFA algorithm, that is using
1496  * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1497  * strings are not that matched by sets of parentheses but that of the
1498  * matched substring. Substrings are matched in reverse order of length,
1499  * so the first one is the longest match.
1500  *
1501  * The strings are fetched from the string passed to the match function,
1502  * so you cannot call this function after freeing the string.
1503  *
1504  * Returns: (transfer full): a %NULL-terminated array of gchar *
1505  *     pointers.  It must be freed using g_strfreev(). If the previous
1506  *     match failed %NULL is returned
1507  *
1508  * Since: 2.14
1509  */
1510 gchar **
g_match_info_fetch_all(const GMatchInfo * match_info)1511 g_match_info_fetch_all (const GMatchInfo *match_info)
1512 {
1513   /* we cannot use pcre_get_substring_list() because the returned value
1514    * isn't suitable for g_strfreev(). */
1515   gchar **result;
1516   gint i;
1517 
1518   g_return_val_if_fail (match_info != NULL, NULL);
1519 
1520   if (match_info->matches < 0)
1521     return NULL;
1522 
1523   result = g_new (gchar *, match_info->matches + 1);
1524   for (i = 0; i < match_info->matches; i++)
1525     result[i] = g_match_info_fetch (match_info, i);
1526   result[i] = NULL;
1527 
1528   return result;
1529 }
1530 
1531 
1532 /* GRegex */
1533 
/* Expands to the definition of g_regex_error_quark(), which returns the
 * quark for the string "g-regex-error-quark" (presumably what the
 * G_REGEX_ERROR error domain used in this file resolves to). */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1535 
1536 /**
1537  * g_regex_ref:
1538  * @regex: a #GRegex
1539  *
1540  * Increases reference count of @regex by 1.
1541  *
1542  * Returns: @regex
1543  *
1544  * Since: 2.14
1545  */
1546 GRegex *
g_regex_ref(GRegex * regex)1547 g_regex_ref (GRegex *regex)
1548 {
1549   g_return_val_if_fail (regex != NULL, NULL);
1550   g_atomic_int_inc (&regex->ref_count);
1551   return regex;
1552 }
1553 
1554 /**
1555  * g_regex_unref:
1556  * @regex: a #GRegex
1557  *
1558  * Decreases reference count of @regex by 1. When reference count drops
1559  * to zero, it frees all the memory associated with the regex structure.
1560  *
1561  * Since: 2.14
1562  */
1563 void
g_regex_unref(GRegex * regex)1564 g_regex_unref (GRegex *regex)
1565 {
1566   g_return_if_fail (regex != NULL);
1567 
1568   if (g_atomic_int_dec_and_test (&regex->ref_count))
1569     {
1570       g_free (regex->pattern);
1571       if (regex->pcre_re != NULL)
1572         pcre2_code_free (regex->pcre_re);
1573       g_free (regex);
1574     }
1575 }
1576 
/*
 * regex_compile:
 * @pattern: the regular expression to compile
 * @compile_options: compile flags for the pattern
 * @compile_options_out: receives the compile flags the pattern ended up
 *   with, when not %NULL
 * @match_options: (inout) (optional): match flags, which the function
 *   may adjust
 * @error: return location for a #GError
 *
 * Forward declaration: g_regex_new() below calls this helper before its
 * definition appears in the file.
 *
 * Returns: the compiled pattern, or %NULL if compilation failed
 */
static pcre2_code *regex_compile (const gchar *pattern,
                                  GRegexCompileFlags compile_options,
                                  GRegexCompileFlags *compile_options_out,
                                  GRegexMatchFlags *match_options,
                                  GError **error);
1585 
1586 /**
1587  * g_regex_new:
1588  * @pattern: the regular expression
1589  * @compile_options: compile options for the regular expression, or 0
1590  * @match_options: match options for the regular expression, or 0
1591  * @error: return location for a #GError
1592  *
1593  * Compiles the regular expression to an internal form, and does
1594  * the initial setup of the #GRegex structure.
1595  *
1596  * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call
1597  *   g_regex_unref() when you are done with it
1598  *
1599  * Since: 2.14
1600  */
1601 GRegex *
g_regex_new(const gchar * pattern,GRegexCompileFlags compile_options,GRegexMatchFlags match_options,GError ** error)1602 g_regex_new (const gchar         *pattern,
1603              GRegexCompileFlags   compile_options,
1604              GRegexMatchFlags     match_options,
1605              GError             **error)
1606 {
1607   GRegex *regex;
1608   pcre2_code *re;
1609   static gsize initialised = 0;
1610 
1611   compile_options = map_to_pcre2_compile_flags (compile_options);
1612   match_options = map_to_pcre2_match_flags (match_options);
1613 
1614   g_return_val_if_fail (pattern != NULL, NULL);
1615   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1616   g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
1617   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1618 
1619   if (g_once_init_enter (&initialised))
1620     {
1621       int supports_utf8;
1622 
1623       pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8);
1624       if (!supports_utf8)
1625         g_critical (_("PCRE library is compiled without UTF8 support"));
1626 
1627       g_once_init_leave (&initialised, supports_utf8 ? 1 : 2);
1628     }
1629 
1630   if (G_UNLIKELY (initialised != 1))
1631     {
1632       g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE,
1633                            _("PCRE library is compiled with incompatible options"));
1634       return NULL;
1635     }
1636 
1637   re = regex_compile (pattern, compile_options, &compile_options,
1638                       &match_options, error);
1639   if (re == NULL)
1640     return NULL;
1641 
1642   regex = g_new0 (GRegex, 1);
1643   regex->ref_count = 1;
1644   regex->pattern = g_strdup (pattern);
1645   regex->pcre_re = re;
1646   regex->compile_opts = compile_options;
1647   regex->match_opts = match_options;
1648 
1649   return regex;
1650 }
1651 
1652 static pcre2_code *
regex_compile(const gchar * pattern,GRegexCompileFlags compile_options,GRegexCompileFlags * compile_options_out,GRegexMatchFlags * match_options,GError ** error)1653 regex_compile (const gchar *pattern,
1654                GRegexCompileFlags compile_options,
1655                GRegexCompileFlags *compile_options_out,
1656                GRegexMatchFlags *match_options,
1657                GError **error)
1658 {
1659   pcre2_code *re;
1660   const gchar *errmsg;
1661   PCRE2_SIZE erroffset;
1662   gint errcode;
1663   GRegexCompileFlags nonpcre_compile_options;
1664   unsigned long int pcre_compile_options;
1665 
1666   compile_options = map_to_pcre2_compile_flags (compile_options);
1667   *match_options = map_to_pcre2_match_flags (*match_options);
1668 
1669   nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
1670 
1671   /* In GRegex the string are, by default, UTF-8 encoded. PCRE
1672    * instead uses UTF-8 only if required with PCRE_UTF8. */
1673   if (compile_options & PCRE2_UTF)
1674     {
1675       /* disable utf-8 */
1676       compile_options &= ~PCRE2_UTF;
1677     }
1678   else
1679     {
1680       /* enable utf-8 */
1681       compile_options |= PCRE2_UTF | PCRE2_NO_UTF_CHECK;
1682 
1683       if (match_options != NULL)
1684         *match_options |= PCRE2_NO_UTF_CHECK;
1685     }
1686   /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
1687    * not for the system one. */
1688   if (!(compile_options & PCRE2_NEWLINE_CR) &&
1689       !(compile_options & PCRE2_NEWLINE_LF))
1690     {
1691       compile_options |= PCRE2_NEWLINE_ANY;
1692     }
1693 
1694   compile_options |= PCRE2_UCP;
1695 
1696   /* compile the pattern */
1697   re = pcre2_compile ((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, compile_options & ~G_REGEX_FLAGS_CONVERTED,
1698                       &errcode, &erroffset, NULL);
1699 
1700   /* if the compilation failed, set the error member and return
1701    * immediately */
1702   if (re == NULL)
1703     {
1704       GError *tmp_error;
1705 
1706       /* Translate the PCRE error code to GRegexError and use a translated
1707        * error message if possible */
1708       translate_compile_error (&errcode, &errmsg);
1709 
1710       /* PCRE uses byte offsets but we want to show character offsets */
1711       erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
1712 
1713       tmp_error = g_error_new (G_REGEX_ERROR, errcode,
1714                                _ ("Error while compiling regular "
1715                                   "expression %s at char %" G_GSIZE_FORMAT ": %s"),
1716                                pattern, erroffset, errmsg);
1717       g_propagate_error (error, tmp_error);
1718 
1719       return NULL;
1720     }
1721 
1722   /* For options set at the beginning of the pattern, pcre puts them into
1723    * compile options, e.g. "(?i)foo" will make the pcre structure store
1724    * PCRE_CASELESS even though it wasn't explicitly given for compilation. */
1725   pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options);
1726   compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK;
1727 
1728   /* Don't leak PCRE_NEWLINE_ANY, which is part of PCRE_NEWLINE_ANYCRLF */
1729   if ((pcre_compile_options & PCRE2_NEWLINE_ANYCRLF) != PCRE2_NEWLINE_ANYCRLF)
1730     compile_options &= ~PCRE2_NEWLINE_ANY;
1731 
1732   compile_options |= nonpcre_compile_options;
1733 
1734   if (!(compile_options & PCRE2_DUPNAMES))
1735     {
1736       gboolean jchanged = FALSE;
1737       pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged);
1738       if (jchanged)
1739         compile_options |= PCRE2_DUPNAMES;
1740     }
1741 
1742   if (compile_options_out != 0)
1743     *compile_options_out = compile_options;
1744 
1745   return re;
1746 }
1747 
1748 /**
1749  * g_regex_get_pattern:
1750  * @regex: a #GRegex structure
1751  *
1752  * Gets the pattern string associated with @regex, i.e. a copy of
1753  * the string passed to g_regex_new().
1754  *
1755  * Returns: the pattern of @regex
1756  *
1757  * Since: 2.14
1758  */
1759 const gchar *
g_regex_get_pattern(const GRegex * regex)1760 g_regex_get_pattern (const GRegex *regex)
1761 {
1762   g_return_val_if_fail (regex != NULL, NULL);
1763 
1764   return regex->pattern;
1765 }
1766 
1767 /**
1768  * g_regex_get_max_backref:
1769  * @regex: a #GRegex
1770  *
1771  * Returns the number of the highest back reference
1772  * in the pattern, or 0 if the pattern does not contain
1773  * back references.
1774  *
1775  * Returns: the number of the highest back reference
1776  *
1777  * Since: 2.14
1778  */
1779 gint
g_regex_get_max_backref(const GRegex * regex)1780 g_regex_get_max_backref (const GRegex *regex)
1781 {
1782   gint value;
1783 
1784   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value);
1785 
1786   return value;
1787 }
1788 
1789 /**
1790  * g_regex_get_capture_count:
1791  * @regex: a #GRegex
1792  *
1793  * Returns the number of capturing subpatterns in the pattern.
1794  *
1795  * Returns: the number of capturing subpatterns
1796  *
1797  * Since: 2.14
1798  */
1799 gint
g_regex_get_capture_count(const GRegex * regex)1800 g_regex_get_capture_count (const GRegex *regex)
1801 {
1802   gint value;
1803 
1804   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value);
1805 
1806   return value;
1807 }
1808 
1809 /**
1810  * g_regex_get_has_cr_or_lf:
1811  * @regex: a #GRegex structure
1812  *
1813  * Checks whether the pattern contains explicit CR or LF references.
1814  *
1815  * Returns: %TRUE if the pattern contains explicit CR or LF references
1816  *
1817  * Since: 2.34
1818  */
1819 gboolean
g_regex_get_has_cr_or_lf(const GRegex * regex)1820 g_regex_get_has_cr_or_lf (const GRegex *regex)
1821 {
1822   gint value;
1823 
1824   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value);
1825 
1826   return !!value;
1827 }
1828 
1829 /**
1830  * g_regex_get_max_lookbehind:
1831  * @regex: a #GRegex structure
1832  *
1833  * Gets the number of characters in the longest lookbehind assertion in the
1834  * pattern. This information is useful when doing multi-segment matching using
1835  * the partial matching facilities.
1836  *
1837  * Returns: the number of characters in the longest lookbehind assertion.
1838  *
1839  * Since: 2.38
1840  */
1841 gint
g_regex_get_max_lookbehind(const GRegex * regex)1842 g_regex_get_max_lookbehind (const GRegex *regex)
1843 {
1844   gint max_lookbehind;
1845 
1846   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND,
1847                       &max_lookbehind);
1848 
1849   return max_lookbehind;
1850 }
1851 
1852 /**
1853  * g_regex_get_compile_flags:
1854  * @regex: a #GRegex
1855  *
1856  * Returns the compile options that @regex was created with.
1857  *
1858  * Depending on the version of PCRE that is used, this may or may not
1859  * include flags set by option expressions such as `(?i)` found at the
1860  * top-level within the compiled pattern.
1861  *
1862  * Returns: flags from #GRegexCompileFlags
1863  *
1864  * Since: 2.26
1865  */
1866 GRegexCompileFlags
g_regex_get_compile_flags(const GRegex * regex)1867 g_regex_get_compile_flags (const GRegex *regex)
1868 {
1869   g_return_val_if_fail (regex != NULL, 0);
1870 
1871   return map_to_pcre1_compile_flags (regex->compile_opts);
1872 }
1873 
1874 /**
1875  * g_regex_get_match_flags:
1876  * @regex: a #GRegex
1877  *
1878  * Returns the match options that @regex was created with.
1879  *
1880  * Returns: flags from #GRegexMatchFlags
1881  *
1882  * Since: 2.26
1883  */
1884 GRegexMatchFlags
g_regex_get_match_flags(const GRegex * regex)1885 g_regex_get_match_flags (const GRegex *regex)
1886 {
1887   g_return_val_if_fail (regex != NULL, 0);
1888 
1889   return map_to_pcre1_match_flags (regex->match_opts & G_REGEX_MATCH_MASK);
1890 }
1891 
1892 /**
1893  * g_regex_match_simple:
1894  * @pattern: the regular expression
1895  * @string: the string to scan for matches
1896  * @compile_options: compile options for the regular expression, or 0
1897  * @match_options: match options, or 0
1898  *
1899  * Scans for a match in @string for @pattern.
1900  *
1901  * This function is equivalent to g_regex_match() but it does not
1902  * require to compile the pattern with g_regex_new(), avoiding some
1903  * lines of code when you need just to do a match without extracting
1904  * substrings, capture counts, and so on.
1905  *
1906  * If this function is to be called on the same @pattern more than
1907  * once, it's more efficient to compile the pattern once with
1908  * g_regex_new() and then use g_regex_match().
1909  *
1910  * Returns: %TRUE if the string matched, %FALSE otherwise
1911  *
1912  * Since: 2.14
1913  */
1914 gboolean
g_regex_match_simple(const gchar * pattern,const gchar * string,GRegexCompileFlags compile_options,GRegexMatchFlags match_options)1915 g_regex_match_simple (const gchar        *pattern,
1916                       const gchar        *string,
1917                       GRegexCompileFlags  compile_options,
1918                       GRegexMatchFlags    match_options)
1919 {
1920   GRegex *regex;
1921   gboolean result;
1922 
1923   compile_options = map_to_pcre2_compile_flags (compile_options);
1924   match_options = map_to_pcre2_match_flags (match_options);
1925 
1926   regex = g_regex_new (pattern, compile_options, 0, NULL);
1927   if (!regex)
1928     return FALSE;
1929   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
1930   g_regex_unref (regex);
1931   return result;
1932 }
1933 
1934 /**
1935  * g_regex_match:
1936  * @regex: a #GRegex structure from g_regex_new()
1937  * @string: the string to scan for matches
1938  * @match_options: match options
1939  * @match_info: (out) (optional): pointer to location where to store
1940  *     the #GMatchInfo, or %NULL if you do not need it
1941  *
1942  * Scans for a match in @string for the pattern in @regex.
1943  * The @match_options are combined with the match options specified
1944  * when the @regex structure was created, letting you have more
1945  * flexibility in reusing #GRegex structures.
1946  *
1947  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
1948  *
1949  * A #GMatchInfo structure, used to get information on the match,
1950  * is stored in @match_info if not %NULL. Note that if @match_info
1951  * is not %NULL then it is created even if the function returns %FALSE,
1952  * i.e. you must free it regardless if regular expression actually matched.
1953  *
1954  * To retrieve all the non-overlapping matches of the pattern in
1955  * string you can use g_match_info_next().
1956  *
1957  * |[<!-- language="C" -->
1958  * static void
1959  * print_uppercase_words (const gchar *string)
1960  * {
1961  *   // Print all uppercase-only words.
1962  *   GRegex *regex;
1963  *   GMatchInfo *match_info;
1964  *
1965  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1966  *   g_regex_match (regex, string, 0, &match_info);
1967  *   while (g_match_info_matches (match_info))
1968  *     {
1969  *       gchar *word = g_match_info_fetch (match_info, 0);
1970  *       g_print ("Found: %s\n", word);
1971  *       g_free (word);
1972  *       g_match_info_next (match_info, NULL);
1973  *     }
1974  *   g_match_info_free (match_info);
1975  *   g_regex_unref (regex);
1976  * }
1977  * ]|
1978  *
1979  * @string is not copied and is used in #GMatchInfo internally. If
1980  * you use any #GMatchInfo method (except g_match_info_free()) after
1981  * freeing or modifying @string then the behaviour is undefined.
1982  *
 * Returns: %TRUE if the string matched, %FALSE otherwise
1984  *
1985  * Since: 2.14
1986  */
1987 gboolean
g_regex_match(const GRegex * regex,const gchar * string,GRegexMatchFlags match_options,GMatchInfo ** match_info)1988 g_regex_match (const GRegex      *regex,
1989                const gchar       *string,
1990                GRegexMatchFlags   match_options,
1991                GMatchInfo       **match_info)
1992 {
1993   match_options = map_to_pcre2_match_flags (match_options);
1994 
1995   return g_regex_match_full (regex, string, -1, 0, match_options,
1996                              match_info, NULL);
1997 }
1998 
1999 /**
2000  * g_regex_match_full:
2001  * @regex: a #GRegex structure from g_regex_new()
2002  * @string: (array length=string_len): the string to scan for matches
2003  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2004  * @start_position: starting index of the string to match, in bytes
2005  * @match_options: match options
2006  * @match_info: (out) (optional): pointer to location where to store
2007  *     the #GMatchInfo, or %NULL if you do not need it
2008  * @error: location to store the error occurring, or %NULL to ignore errors
2009  *
2010  * Scans for a match in @string for the pattern in @regex.
2011  * The @match_options are combined with the match options specified
2012  * when the @regex structure was created, letting you have more
2013  * flexibility in reusing #GRegex structures.
2014  *
2015  * Setting @start_position differs from just passing over a shortened
2016  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2017  * that begins with any kind of lookbehind assertion, such as "\b".
2018  *
2019  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2020  *
2021  * A #GMatchInfo structure, used to get information on the match, is
2022  * stored in @match_info if not %NULL. Note that if @match_info is
2023  * not %NULL then it is created even if the function returns %FALSE,
2024  * i.e. you must free it regardless if regular expression actually
2025  * matched.
2026  *
2027  * @string is not copied and is used in #GMatchInfo internally. If
2028  * you use any #GMatchInfo method (except g_match_info_free()) after
2029  * freeing or modifying @string then the behaviour is undefined.
2030  *
2031  * To retrieve all the non-overlapping matches of the pattern in
2032  * string you can use g_match_info_next().
2033  *
2034  * |[<!-- language="C" -->
2035  * static void
2036  * print_uppercase_words (const gchar *string)
2037  * {
2038  *   // Print all uppercase-only words.
2039  *   GRegex *regex;
2040  *   GMatchInfo *match_info;
2041  *   GError *error = NULL;
2042  *
2043  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
2044  *   g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
2045  *   while (g_match_info_matches (match_info))
2046  *     {
2047  *       gchar *word = g_match_info_fetch (match_info, 0);
2048  *       g_print ("Found: %s\n", word);
2049  *       g_free (word);
2050  *       g_match_info_next (match_info, &error);
2051  *     }
2052  *   g_match_info_free (match_info);
2053  *   g_regex_unref (regex);
2054  *   if (error != NULL)
2055  *     {
2056  *       g_printerr ("Error while matching: %s\n", error->message);
2057  *       g_error_free (error);
2058  *     }
2059  * }
2060  * ]|
2061  *
 * Returns: %TRUE if the string matched, %FALSE otherwise
2063  *
2064  * Since: 2.14
2065  */
2066 gboolean
g_regex_match_full(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,GRegexMatchFlags match_options,GMatchInfo ** match_info,GError ** error)2067 g_regex_match_full (const GRegex      *regex,
2068                     const gchar       *string,
2069                     gssize             string_len,
2070                     gint               start_position,
2071                     GRegexMatchFlags   match_options,
2072                     GMatchInfo       **match_info,
2073                     GError           **error)
2074 {
2075   GMatchInfo *info;
2076   gboolean match_ok;
2077 
2078   match_options = map_to_pcre2_match_flags (match_options);
2079 
2080   g_return_val_if_fail (regex != NULL, FALSE);
2081   g_return_val_if_fail (string != NULL, FALSE);
2082   g_return_val_if_fail (start_position >= 0, FALSE);
2083   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2084   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2085 
2086   info = match_info_new (regex, string, string_len, start_position,
2087                          match_options, FALSE);
2088   match_ok = g_match_info_next (info, error);
2089   if (match_info != NULL)
2090     *match_info = info;
2091   else
2092     g_match_info_free (info);
2093 
2094   return match_ok;
2095 }
2096 
2097 /**
2098  * g_regex_match_all:
2099  * @regex: a #GRegex structure from g_regex_new()
2100  * @string: the string to scan for matches
2101  * @match_options: match options
2102  * @match_info: (out) (optional): pointer to location where to store
2103  *     the #GMatchInfo, or %NULL if you do not need it
2104  *
2105  * Using the standard algorithm for regular expression matching only
2106  * the longest match in the string is retrieved. This function uses
2107  * a different algorithm so it can retrieve all the possible matches.
2108  * For more documentation see g_regex_match_all_full().
2109  *
2110  * A #GMatchInfo structure, used to get information on the match, is
2111  * stored in @match_info if not %NULL. Note that if @match_info is
2112  * not %NULL then it is created even if the function returns %FALSE,
2113  * i.e. you must free it regardless if regular expression actually
2114  * matched.
2115  *
2116  * @string is not copied and is used in #GMatchInfo internally. If
2117  * you use any #GMatchInfo method (except g_match_info_free()) after
2118  * freeing or modifying @string then the behaviour is undefined.
2119  *
 * Returns: %TRUE if the string matched, %FALSE otherwise
2121  *
2122  * Since: 2.14
2123  */
2124 gboolean
g_regex_match_all(const GRegex * regex,const gchar * string,GRegexMatchFlags match_options,GMatchInfo ** match_info)2125 g_regex_match_all (const GRegex      *regex,
2126                    const gchar       *string,
2127                    GRegexMatchFlags   match_options,
2128                    GMatchInfo       **match_info)
2129 {
2130   match_options = map_to_pcre2_match_flags (match_options);
2131 
2132   return g_regex_match_all_full (regex, string, -1, 0, match_options,
2133                                  match_info, NULL);
2134 }
2135 
2136 /**
2137  * g_regex_match_all_full:
2138  * @regex: a #GRegex structure from g_regex_new()
2139  * @string: (array length=string_len): the string to scan for matches
2140  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2141  * @start_position: starting index of the string to match, in bytes
2142  * @match_options: match options
2143  * @match_info: (out) (optional): pointer to location where to store
2144  *     the #GMatchInfo, or %NULL if you do not need it
2145  * @error: location to store the error occurring, or %NULL to ignore errors
2146  *
2147  * Using the standard algorithm for regular expression matching only
2148  * the longest match in the @string is retrieved, it is not possible
2149  * to obtain all the available matches. For instance matching
2150  * "<a> <b> <c>" against the pattern "<.*>"
2151  * you get "<a> <b> <c>".
2152  *
2153  * This function uses a different algorithm (called DFA, i.e. deterministic
2154  * finite automaton), so it can retrieve all the possible matches, all
2155  * starting at the same point in the string. For instance matching
 * "<a> <b> <c>" against the pattern "<.*>"
2157  * you would obtain three matches: "<a> <b> <c>",
2158  * "<a> <b>" and "<a>".
2159  *
2160  * The number of matched strings is retrieved using
2161  * g_match_info_get_match_count(). To obtain the matched strings and
2162  * their position you can use, respectively, g_match_info_fetch() and
2163  * g_match_info_fetch_pos(). Note that the strings are returned in
2164  * reverse order of length; that is, the longest matching string is
2165  * given first.
2166  *
2167  * Note that the DFA algorithm is slower than the standard one and it
2168  * is not able to capture substrings, so backreferences do not work.
2169  *
2170  * Setting @start_position differs from just passing over a shortened
2171  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2172  * that begins with any kind of lookbehind assertion, such as "\b".
2173  *
2174  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2175  *
2176  * A #GMatchInfo structure, used to get information on the match, is
2177  * stored in @match_info if not %NULL. Note that if @match_info is
2178  * not %NULL then it is created even if the function returns %FALSE,
2179  * i.e. you must free it regardless if regular expression actually
2180  * matched.
2181  *
2182  * @string is not copied and is used in #GMatchInfo internally. If
2183  * you use any #GMatchInfo method (except g_match_info_free()) after
2184  * freeing or modifying @string then the behaviour is undefined.
2185  *
 * Returns: %TRUE if the string matched, %FALSE otherwise
2187  *
2188  * Since: 2.14
2189  */
2190 gboolean
g_regex_match_all_full(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,GRegexMatchFlags match_options,GMatchInfo ** match_info,GError ** error)2191 g_regex_match_all_full (const GRegex      *regex,
2192                         const gchar       *string,
2193                         gssize             string_len,
2194                         gint               start_position,
2195                         GRegexMatchFlags   match_options,
2196                         GMatchInfo       **match_info,
2197                         GError           **error)
2198 {
2199   GMatchInfo *info;
2200   gboolean done;
2201   pcre2_code *pcre_re;
2202   gboolean retval;
2203   PCRE2_SIZE *ovector;
2204   gint i;
2205 
2206   match_options = map_to_pcre2_match_flags (match_options);
2207 
2208   g_return_val_if_fail (regex != NULL, FALSE);
2209   g_return_val_if_fail (string != NULL, FALSE);
2210   g_return_val_if_fail (start_position >= 0, FALSE);
2211   g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2212   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2213 
2214 #ifdef PCRE_NO_AUTO_POSSESS
2215   /* For PCRE >= 8.34 we need to turn off PCRE_NO_AUTO_POSSESS, which
2216    * is an optimization for normal regex matching, but results in omitting
2217    * some shorter matches here, and an observable behaviour change.
2218    *
2219    * DFA matching is rather niche, and very rarely used according to
2220    * codesearch.debian.net, so don't bother caching the recompiled RE. */
2221   pcre_re = regex_compile (regex->pattern,
2222                            regex->compile_opts | PCRE2_NO_AUTO_POSSESS,
2223                            NULL, NULL, error);
2224   if (pcre_re == NULL)
2225     return FALSE;
2226 
2227 #else
2228   /* For PCRE < 8.33 the precompiled regex is fine. */
2229   pcre_re = regex->pcre_re;
2230 #endif
2231 
2232   info = match_info_new (regex, string, string_len, start_position,
2233                          match_options, TRUE);
2234 
2235   done = FALSE;
2236   while (!done)
2237     {
2238       done = TRUE;
2239       info->matches = pcre2_dfa_match (pcre_re,
2240                                        (PCRE2_SPTR)info->string, info->string_len,
2241                                        info->pos,
2242                                        (match_options | PCRE2_NO_UTF_CHECK) & ~G_REGEX_FLAGS_CONVERTED,
2243                                        info->match_data,
2244                                        NULL,
2245                                        info->workspace, info->n_workspace);
2246 
2247       info->n_offsets = pcre2_get_ovector_count (info->match_data) * 2;
2248       ovector = pcre2_get_ovector_pointer (info->match_data);
2249       info->offsets = g_realloc (info->offsets,
2250                                  info->n_offsets * sizeof (gint));
2251       for (i = 0; i < info->n_offsets; i++)
2252         {
2253           info->offsets[i] = (int) ovector[i];
2254         }
2255 
2256       if (info->matches == PCRE2_ERROR_DFA_WSSIZE)
2257         {
2258           /* info->workspace is too small. */
2259           info->n_workspace *= 2;
2260           info->workspace = g_realloc (info->workspace,
2261                                        info->n_workspace * sizeof (gint));
2262           done = FALSE;
2263         }
2264       else if (info->matches == 0)
2265         {
2266           /* info->offsets is too small. */
2267           info->n_offsets *= 2;
2268           info->offsets = g_realloc (info->offsets,
2269                                      info->n_offsets * sizeof (gint));
2270           done = FALSE;
2271         }
2272       else if (IS_PCRE_ERROR (info->matches))
2273         {
2274           g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
2275                        _("Error while matching regular expression %s: %s"),
2276                        regex->pattern, match_error (info->matches));
2277         }
2278     }
2279 
2280 #ifdef PCRE_NO_AUTO_POSSESS
2281   pcre2_code_free (pcre_re);
2282 #endif
2283 
2284   /* set info->pos to -1 so that a call to g_match_info_next() fails. */
2285   info->pos = -1;
2286   retval = info->matches >= 0;
2287 
2288   if (match_info != NULL)
2289     *match_info = info;
2290   else
2291     g_match_info_free (info);
2292 
2293   return retval;
2294 }
2295 
2296 /**
2297  * g_regex_get_string_number:
2298  * @regex: #GRegex structure
2299  * @name: name of the subexpression
2300  *
2301  * Retrieves the number of the subexpression named @name.
2302  *
 * Returns: The number of the subexpression or -1 if @name
 *   does not exist
2305  *
2306  * Since: 2.14
2307  */
2308 gint
g_regex_get_string_number(const GRegex * regex,const gchar * name)2309 g_regex_get_string_number (const GRegex *regex,
2310                            const gchar  *name)
2311 {
2312   gint num;
2313 
2314   g_return_val_if_fail (regex != NULL, -1);
2315   g_return_val_if_fail (name != NULL, -1);
2316 
2317   num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR)name);
2318   if (num == PCRE2_ERROR_NOSUBSTRING)
2319     num = -1;
2320 
2321   return num;
2322 }
2323 
2324 /**
2325  * g_regex_split_simple:
2326  * @pattern: the regular expression
2327  * @string: the string to scan for matches
2328  * @compile_options: compile options for the regular expression, or 0
2329  * @match_options: match options, or 0
2330  *
2331  * Breaks the string on the pattern, and returns an array of
2332  * the tokens. If the pattern contains capturing parentheses,
2333  * then the text for each of the substrings will also be returned.
2334  * If the pattern does not match anywhere in the string, then the
2335  * whole string is returned as the first token.
2336  *
2337  * This function is equivalent to g_regex_split() but it does
2338  * not require compiling the pattern with g_regex_new() first, saving
2339  * some lines of code when you just need to do a split without
2340  * extracting substrings, capture counts, and so on.
2341  *
2342  * If this function is to be called on the same @pattern more than
2343  * once, it's more efficient to compile the pattern once with
2344  * g_regex_new() and then use g_regex_split().
2345  *
2346  * As a special case, the result of splitting the empty string ""
2347  * is an empty vector, not a vector containing a single string.
2348  * The reason for this special case is that being able to represent
2349  * an empty vector is typically more useful than consistent handling
2350  * of empty elements. If you do need to represent empty elements,
2351  * you'll need to check for the empty string before calling this
2352  * function.
2353  *
2354  * A pattern that can match empty strings splits @string into
2355  * separate characters wherever it matches the empty string between
2356  * characters. For example splitting "ab c" using as a separator
2357  * "\s*", you will get "a", "b" and "c".
2358  *
2359  * Returns: (transfer full): a %NULL-terminated array of strings. Free
2360  * it using g_strfreev()
2361  *
2362  * Since: 2.14
2363  **/
2364 gchar **
g_regex_split_simple(const gchar * pattern,const gchar * string,GRegexCompileFlags compile_options,GRegexMatchFlags match_options)2365 g_regex_split_simple (const gchar        *pattern,
2366                       const gchar        *string,
2367                       GRegexCompileFlags  compile_options,
2368                       GRegexMatchFlags    match_options)
2369 {
2370   GRegex *regex;
2371   gchar **result;
2372 
2373   compile_options = map_to_pcre2_compile_flags (compile_options);
2374   match_options = map_to_pcre2_match_flags (match_options);
2375 
2376   regex = g_regex_new (pattern, compile_options, 0, NULL);
2377   if (!regex)
2378     return NULL;
2379 
2380   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
2381   g_regex_unref (regex);
2382   return result;
2383 }
2384 
2385 /**
2386  * g_regex_split:
2387  * @regex: a #GRegex structure
2388  * @string: the string to split with the pattern
2389  * @match_options: match time option flags
2390  *
2391  * Breaks the string on the pattern, and returns an array of the tokens.
2392  * If the pattern contains capturing parentheses, then the text for each
2393  * of the substrings will also be returned. If the pattern does not match
2394  * anywhere in the string, then the whole string is returned as the first
2395  * token.
2396  *
2397  * As a special case, the result of splitting the empty string "" is an
2398  * empty vector, not a vector containing a single string. The reason for
2399  * this special case is that being able to represent an empty vector is
2400  * typically more useful than consistent handling of empty elements. If
2401  * you do need to represent empty elements, you'll need to check for the
2402  * empty string before calling this function.
2403  *
2404  * A pattern that can match empty strings splits @string into separate
2405  * characters wherever it matches the empty string between characters.
2406  * For example splitting "ab c" using as a separator "\s*", you will get
2407  * "a", "b" and "c".
2408  *
2409  * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2410  * it using g_strfreev()
2411  *
2412  * Since: 2.14
2413  **/
2414 gchar **
g_regex_split(const GRegex * regex,const gchar * string,GRegexMatchFlags match_options)2415 g_regex_split (const GRegex     *regex,
2416                const gchar      *string,
2417                GRegexMatchFlags  match_options)
2418 {
2419   match_options = map_to_pcre2_match_flags (match_options);
2420 
2421   return g_regex_split_full (regex, string, -1, 0,
2422                              match_options, 0, NULL);
2423 }
2424 
2425 /**
2426  * g_regex_split_full:
2427  * @regex: a #GRegex structure
2428  * @string: (array length=string_len): the string to split with the pattern
2429  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2430  * @start_position: starting index of the string to match, in bytes
2431  * @match_options: match time option flags
2432  * @max_tokens: the maximum number of tokens to split @string into.
2433  *   If this is less than 1, the string is split completely
2434  * @error: return location for a #GError
2435  *
2436  * Breaks the string on the pattern, and returns an array of the tokens.
2437  * If the pattern contains capturing parentheses, then the text for each
2438  * of the substrings will also be returned. If the pattern does not match
2439  * anywhere in the string, then the whole string is returned as the first
2440  * token.
2441  *
2442  * As a special case, the result of splitting the empty string "" is an
2443  * empty vector, not a vector containing a single string. The reason for
2444  * this special case is that being able to represent an empty vector is
2445  * typically more useful than consistent handling of empty elements. If
2446  * you do need to represent empty elements, you'll need to check for the
2447  * empty string before calling this function.
2448  *
2449  * A pattern that can match empty strings splits @string into separate
2450  * characters wherever it matches the empty string between characters.
2451  * For example splitting "ab c" using as a separator "\s*", you will get
2452  * "a", "b" and "c".
2453  *
2454  * Setting @start_position differs from just passing over a shortened
2455  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2456  * that begins with any kind of lookbehind assertion, such as "\b".
2457  *
2458  * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2459  * it using g_strfreev()
2460  *
2461  * Since: 2.14
2462  **/
2463 gchar **
g_regex_split_full(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,GRegexMatchFlags match_options,gint max_tokens,GError ** error)2464 g_regex_split_full (const GRegex      *regex,
2465                     const gchar       *string,
2466                     gssize             string_len,
2467                     gint               start_position,
2468                     GRegexMatchFlags   match_options,
2469                     gint               max_tokens,
2470                     GError           **error)
2471 {
2472   GError *tmp_error = NULL;
2473   GMatchInfo *match_info;
2474   GList *list, *last;
2475   gint i;
2476   gint token_count;
2477   gboolean match_ok;
2478   /* position of the last separator. */
2479   gint last_separator_end;
2480   /* was the last match 0 bytes long? */
2481   gboolean last_match_is_empty;
2482   /* the returned array of char **s */
2483   gchar **string_list;
2484 
2485   match_options = map_to_pcre2_match_flags (match_options);
2486 
2487   g_return_val_if_fail (regex != NULL, NULL);
2488   g_return_val_if_fail (string != NULL, NULL);
2489   g_return_val_if_fail (start_position >= 0, NULL);
2490   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2491   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2492 
2493   if (max_tokens <= 0)
2494     max_tokens = G_MAXINT;
2495 
2496   if (string_len < 0)
2497     string_len = strlen (string);
2498 
2499   /* zero-length string */
2500   if (string_len - start_position == 0)
2501     return g_new0 (gchar *, 1);
2502 
2503   if (max_tokens == 1)
2504     {
2505       string_list = g_new0 (gchar *, 2);
2506       string_list[0] = g_strndup (&string[start_position],
2507                                   string_len - start_position);
2508       return string_list;
2509     }
2510 
2511   list = NULL;
2512   token_count = 0;
2513   last_separator_end = start_position;
2514   last_match_is_empty = FALSE;
2515 
2516   match_ok = g_regex_match_full (regex, string, string_len, start_position,
2517                                  match_options, &match_info, &tmp_error);
2518 
2519   while (tmp_error == NULL)
2520     {
2521       if (match_ok)
2522         {
2523           last_match_is_empty =
2524                     (match_info->offsets[0] == match_info->offsets[1]);
2525 
2526           /* we need to skip empty separators at the same position of the end
2527            * of another separator. e.g. the string is "a b" and the separator
2528            * is " *", so from 1 to 2 we have a match and at position 2 we have
2529            * an empty match. */
2530           if (last_separator_end != match_info->offsets[1])
2531             {
2532               gchar *token;
2533               gint match_count;
2534 
2535               token = g_strndup (string + last_separator_end,
2536                                  match_info->offsets[0] - last_separator_end);
2537               list = g_list_prepend (list, token);
2538               token_count++;
2539 
2540               /* if there were substrings, these need to be added to
2541                * the list. */
2542               match_count = g_match_info_get_match_count (match_info);
2543               if (match_count > 1)
2544                 {
2545                   for (i = 1; i < match_count; i++)
2546                     list = g_list_prepend (list, g_match_info_fetch (match_info, i));
2547                 }
2548             }
2549         }
2550       else
2551         {
2552           /* if there was no match, copy to end of string. */
2553           if (!last_match_is_empty)
2554             {
2555               gchar *token = g_strndup (string + last_separator_end,
2556                                         match_info->string_len - last_separator_end);
2557               list = g_list_prepend (list, token);
2558             }
2559           /* no more tokens, end the loop. */
2560           break;
2561         }
2562 
2563       /* -1 to leave room for the last part. */
2564       if (token_count >= max_tokens - 1)
2565         {
2566           /* we have reached the maximum number of tokens, so we copy
2567            * the remaining part of the string. */
2568           if (last_match_is_empty)
2569             {
2570               /* the last match was empty, so we have moved one char
2571                * after the real position to avoid empty matches at the
2572                * same position. */
2573               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
2574             }
2575           /* the if is needed in the case we have terminated the available
2576            * tokens, but we are at the end of the string, so there are no
2577            * characters left to copy. */
2578           if (string_len > match_info->pos)
2579             {
2580               gchar *token = g_strndup (string + match_info->pos,
2581                                         string_len - match_info->pos);
2582               list = g_list_prepend (list, token);
2583             }
2584           /* end the loop. */
2585           break;
2586         }
2587 
2588       last_separator_end = match_info->pos;
2589       if (last_match_is_empty)
2590         /* if the last match was empty, g_match_info_next() has moved
2591          * forward to avoid infinite loops, but we still need to copy that
2592          * character. */
2593         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
2594 
2595       match_ok = g_match_info_next (match_info, &tmp_error);
2596     }
2597   g_match_info_free (match_info);
2598   if (tmp_error != NULL)
2599     {
2600       g_propagate_error (error, tmp_error);
2601       g_list_free_full (list, g_free);
2602       return NULL;
2603     }
2604 
2605   string_list = g_new (gchar *, g_list_length (list) + 1);
2606   i = 0;
2607   for (last = g_list_last (list); last; last = g_list_previous (last))
2608     string_list[i++] = last->data;
2609   string_list[i] = NULL;
2610   g_list_free (list);
2611 
2612   return string_list;
2613 }
2614 
/* Kinds of pieces stored in InterpolationData.type. */
enum
{
  REPL_TYPE_STRING             = 0, /* literal text held in .text */
  REPL_TYPE_CHARACTER          = 1, /* single character held in .c */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* group name held in .text */
  REPL_TYPE_NUMERIC_REFERENCE  = 3, /* group number held in .num */
  REPL_TYPE_CHANGE_CASE        = 4  /* case mode held in .change_case */
};
2623 
/* Case-conversion modes used while interpolating a replacement.  Each
 * mode is a distinct bit so that the *_MASK values can test for a group
 * of modes at once. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01,
  CHANGE_CASE_UPPER        = 0x02,
  CHANGE_CASE_LOWER        = 0x04,
  CHANGE_CASE_UPPER_SINGLE = 0x08,
  CHANGE_CASE_LOWER_SINGLE = 0x10,

  /* modes that apply to one character only */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* modes that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* modes that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
2635 
2636 struct _InterpolationData
2637 {
2638   gchar     *text;
2639   gint       type;
2640   gint       num;
2641   gchar      c;
2642   ChangeCase change_case;
2643 };
2644 
2645 static void
free_interpolation_data(InterpolationData * data)2646 free_interpolation_data (InterpolationData *data)
2647 {
2648   g_free (data->text);
2649   g_free (data);
2650 }
2651 
2652 static const gchar *
expand_escape(const gchar * replacement,const gchar * p,InterpolationData * data,GError ** error)2653 expand_escape (const gchar        *replacement,
2654                const gchar        *p,
2655                InterpolationData  *data,
2656                GError            **error)
2657 {
2658   const gchar *q, *r;
2659   gint x, d, h, i;
2660   const gchar *error_detail;
2661   gint base = 0;
2662   GError *tmp_error = NULL;
2663 
2664   p++;
2665   switch (*p)
2666     {
2667     case 't':
2668       p++;
2669       data->c = '\t';
2670       data->type = REPL_TYPE_CHARACTER;
2671       break;
2672     case 'n':
2673       p++;
2674       data->c = '\n';
2675       data->type = REPL_TYPE_CHARACTER;
2676       break;
2677     case 'v':
2678       p++;
2679       data->c = '\v';
2680       data->type = REPL_TYPE_CHARACTER;
2681       break;
2682     case 'r':
2683       p++;
2684       data->c = '\r';
2685       data->type = REPL_TYPE_CHARACTER;
2686       break;
2687     case 'f':
2688       p++;
2689       data->c = '\f';
2690       data->type = REPL_TYPE_CHARACTER;
2691       break;
2692     case 'a':
2693       p++;
2694       data->c = '\a';
2695       data->type = REPL_TYPE_CHARACTER;
2696       break;
2697     case 'b':
2698       p++;
2699       data->c = '\b';
2700       data->type = REPL_TYPE_CHARACTER;
2701       break;
2702     case '\\':
2703       p++;
2704       data->c = '\\';
2705       data->type = REPL_TYPE_CHARACTER;
2706       break;
2707     case 'x':
2708       p++;
2709       x = 0;
2710       if (*p == '{')
2711         {
2712           p++;
2713           do
2714             {
2715               h = g_ascii_xdigit_value (*p);
2716               if (h < 0)
2717                 {
2718                   error_detail = _("hexadecimal digit or “}” expected");
2719                   goto error;
2720                 }
2721               x = x * 16 + h;
2722               p++;
2723             }
2724           while (*p != '}');
2725           p++;
2726         }
2727       else
2728         {
2729           for (i = 0; i < 2; i++)
2730             {
2731               h = g_ascii_xdigit_value (*p);
2732               if (h < 0)
2733                 {
2734                   error_detail = _("hexadecimal digit expected");
2735                   goto error;
2736                 }
2737               x = x * 16 + h;
2738               p++;
2739             }
2740         }
2741       data->type = REPL_TYPE_STRING;
2742       data->text = g_new0 (gchar, 8);
2743       g_unichar_to_utf8 (x, data->text);
2744       break;
2745     case 'l':
2746       p++;
2747       data->type = REPL_TYPE_CHANGE_CASE;
2748       data->change_case = CHANGE_CASE_LOWER_SINGLE;
2749       break;
2750     case 'u':
2751       p++;
2752       data->type = REPL_TYPE_CHANGE_CASE;
2753       data->change_case = CHANGE_CASE_UPPER_SINGLE;
2754       break;
2755     case 'L':
2756       p++;
2757       data->type = REPL_TYPE_CHANGE_CASE;
2758       data->change_case = CHANGE_CASE_LOWER;
2759       break;
2760     case 'U':
2761       p++;
2762       data->type = REPL_TYPE_CHANGE_CASE;
2763       data->change_case = CHANGE_CASE_UPPER;
2764       break;
2765     case 'E':
2766       p++;
2767       data->type = REPL_TYPE_CHANGE_CASE;
2768       data->change_case = CHANGE_CASE_NONE;
2769       break;
2770     case 'g':
2771       p++;
2772       if (*p != '<')
2773         {
2774           error_detail = _("missing “<” in symbolic reference");
2775           goto error;
2776         }
2777       q = p + 1;
2778       do
2779         {
2780           p++;
2781           if (!*p)
2782             {
2783               error_detail = _("unfinished symbolic reference");
2784               goto error;
2785             }
2786         }
2787       while (*p != '>');
2788       if (p - q == 0)
2789         {
2790           error_detail = _("zero-length symbolic reference");
2791           goto error;
2792         }
2793       if (g_ascii_isdigit (*q))
2794         {
2795           x = 0;
2796           do
2797             {
2798               h = g_ascii_digit_value (*q);
2799               if (h < 0)
2800                 {
2801                   error_detail = _("digit expected");
2802                   p = q;
2803                   goto error;
2804                 }
2805               x = x * 10 + h;
2806               q++;
2807             }
2808           while (q != p);
2809           data->num = x;
2810           data->type = REPL_TYPE_NUMERIC_REFERENCE;
2811         }
2812       else
2813         {
2814           r = q;
2815           do
2816             {
2817               if (!g_ascii_isalnum (*r))
2818                 {
2819                   error_detail = _("illegal symbolic reference");
2820                   p = r;
2821                   goto error;
2822                 }
2823               r++;
2824             }
2825           while (r != p);
2826           data->text = g_strndup (q, p - q);
2827           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
2828         }
2829       p++;
2830       break;
2831     case '0':
2832       /* if \0 is followed by a number is an octal number representing a
2833        * character, else it is a numeric reference. */
2834       if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
2835         {
2836           base = 8;
2837           p = g_utf8_next_char (p);
2838         }
2839       G_GNUC_FALLTHROUGH;
2840     case '1':
2841     case '2':
2842     case '3':
2843     case '4':
2844     case '5':
2845     case '6':
2846     case '7':
2847     case '8':
2848     case '9':
2849       x = 0;
2850       d = 0;
2851       for (i = 0; i < 3; i++)
2852         {
2853           h = g_ascii_digit_value (*p);
2854           if (h < 0)
2855             break;
2856           if (h > 7)
2857             {
2858               if (base == 8)
2859                 break;
2860               else
2861                 base = 10;
2862             }
2863           if (i == 2 && base == 10)
2864             break;
2865           x = x * 8 + h;
2866           d = d * 10 + h;
2867           p++;
2868         }
2869       if (base == 8 || i == 3)
2870         {
2871           data->type = REPL_TYPE_STRING;
2872           data->text = g_new0 (gchar, 8);
2873           g_unichar_to_utf8 (x, data->text);
2874         }
2875       else
2876         {
2877           data->type = REPL_TYPE_NUMERIC_REFERENCE;
2878           data->num = d;
2879         }
2880       break;
2881     case 0:
2882       error_detail = _("stray final “\\”");
2883       goto error;
2884       break;
2885     default:
2886       error_detail = _("unknown escape sequence");
2887       goto error;
2888     }
2889 
2890   return p;
2891 
2892  error:
2893   /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
2894   tmp_error = g_error_new (G_REGEX_ERROR,
2895                            G_REGEX_ERROR_REPLACE,
2896                            _("Error while parsing replacement "
2897                              "text “%s” at char %lu: %s"),
2898                            replacement,
2899                            (gulong)(p - replacement),
2900                            error_detail);
2901   g_propagate_error (error, tmp_error);
2902 
2903   return NULL;
2904 }
2905 
2906 static GList *
split_replacement(const gchar * replacement,GError ** error)2907 split_replacement (const gchar  *replacement,
2908                    GError      **error)
2909 {
2910   GList *list = NULL;
2911   InterpolationData *data;
2912   const gchar *p, *start;
2913 
2914   start = p = replacement;
2915   while (*p)
2916     {
2917       if (*p == '\\')
2918         {
2919           data = g_new0 (InterpolationData, 1);
2920           start = p = expand_escape (replacement, p, data, error);
2921           if (p == NULL)
2922             {
2923               g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2924               free_interpolation_data (data);
2925 
2926               return NULL;
2927             }
2928           list = g_list_prepend (list, data);
2929         }
2930       else
2931         {
2932           p++;
2933           if (*p == '\\' || *p == '\0')
2934             {
2935               if (p - start > 0)
2936                 {
2937                   data = g_new0 (InterpolationData, 1);
2938                   data->text = g_strndup (start, p - start);
2939                   data->type = REPL_TYPE_STRING;
2940                   list = g_list_prepend (list, data);
2941                 }
2942             }
2943         }
2944     }
2945 
2946   return g_list_reverse (list);
2947 }
2948 
/* Convert c according to change_case: lower case when a lower-case mode
 * is selected, upper case for every other value of change_case
 * (CHANGE_CASE_NONE included). */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) == 0) ? \
                g_unichar_toupper (c) : \
                g_unichar_tolower (c))
2954 
2955 static void
string_append(GString * string,const gchar * text,ChangeCase * change_case)2956 string_append (GString     *string,
2957                const gchar *text,
2958                ChangeCase  *change_case)
2959 {
2960   gunichar c;
2961 
2962   if (text[0] == '\0')
2963     return;
2964 
2965   if (*change_case == CHANGE_CASE_NONE)
2966     {
2967       g_string_append (string, text);
2968     }
2969   else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2970     {
2971       c = g_utf8_get_char (text);
2972       g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2973       g_string_append (string, g_utf8_next_char (text));
2974       *change_case = CHANGE_CASE_NONE;
2975     }
2976   else
2977     {
2978       while (*text != '\0')
2979         {
2980           c = g_utf8_get_char (text);
2981           g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2982           text = g_utf8_next_char (text);
2983         }
2984     }
2985 }
2986 
2987 static gboolean
interpolate_replacement(const GMatchInfo * match_info,GString * result,gpointer data)2988 interpolate_replacement (const GMatchInfo *match_info,
2989                          GString          *result,
2990                          gpointer          data)
2991 {
2992   GList *list;
2993   InterpolationData *idata;
2994   gchar *match;
2995   ChangeCase change_case = CHANGE_CASE_NONE;
2996 
2997   for (list = data; list; list = list->next)
2998     {
2999       idata = list->data;
3000       switch (idata->type)
3001         {
3002         case REPL_TYPE_STRING:
3003           string_append (result, idata->text, &change_case);
3004           break;
3005         case REPL_TYPE_CHARACTER:
3006           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
3007           if (change_case & CHANGE_CASE_SINGLE_MASK)
3008             change_case = CHANGE_CASE_NONE;
3009           break;
3010         case REPL_TYPE_NUMERIC_REFERENCE:
3011           match = g_match_info_fetch (match_info, idata->num);
3012           if (match)
3013             {
3014               string_append (result, match, &change_case);
3015               g_free (match);
3016             }
3017           break;
3018         case REPL_TYPE_SYMBOLIC_REFERENCE:
3019           match = g_match_info_fetch_named (match_info, idata->text);
3020           if (match)
3021             {
3022               string_append (result, match, &change_case);
3023               g_free (match);
3024             }
3025           break;
3026         case REPL_TYPE_CHANGE_CASE:
3027           change_case = idata->change_case;
3028           break;
3029         }
3030     }
3031 
3032   return FALSE;
3033 }
3034 
3035 /* whether actual match_info is needed for replacement, i.e.
3036  * whether there are references
3037  */
3038 static gboolean
interpolation_list_needs_match(GList * list)3039 interpolation_list_needs_match (GList *list)
3040 {
3041   while (list != NULL)
3042     {
3043       InterpolationData *data = list->data;
3044 
3045       if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
3046           data->type == REPL_TYPE_NUMERIC_REFERENCE)
3047         {
3048           return TRUE;
3049         }
3050 
3051       list = list->next;
3052     }
3053 
3054   return FALSE;
3055 }
3056 
3057 /**
3058  * g_regex_replace:
3059  * @regex: a #GRegex structure
3060  * @string: (array length=string_len): the string to perform matches against
3061  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3062  * @start_position: starting index of the string to match, in bytes
3063  * @replacement: text to replace each match with
3064  * @match_options: options for the match
3065  * @error: location to store the error occurring, or %NULL to ignore errors
3066  *
3067  * Replaces all occurrences of the pattern in @regex with the
3068  * replacement text. Backreferences of the form '\number' or
3069  * '\g<number>' in the replacement text are interpolated by the
3070  * number-th captured subexpression of the match, '\g<name>' refers
3071  * to the captured subexpression with the given name. '\0' refers
3072  * to the complete match, but '\0' followed by a number is the octal
3073  * representation of a character. To include a literal '\' in the
3074  * replacement, write '\\\\'.
3075  *
3076  * There are also escapes that change the case of the following text:
3077  *
3078  * - \l: Convert to lower case the next character
3079  * - \u: Convert to upper case the next character
3080  * - \L: Convert to lower case till \E
3081  * - \U: Convert to upper case till \E
3082  * - \E: End case modification
3083  *
3084  * If you do not need to use backreferences use g_regex_replace_literal().
3085  *
3086  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
3087  * passed to g_regex_new(). If you want to use strings that are not UTF-8
3088  * encoded, use g_regex_replace_literal() instead.
3089  *
3090  * Setting @start_position differs from just passing over a shortened
3091  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that
3092  * begins with any kind of lookbehind assertion, such as "\b".
3093  *
3094  * Returns: a newly allocated string containing the replacements
3095  *
3096  * Since: 2.14
3097  */
3098 gchar *
g_regex_replace(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,const gchar * replacement,GRegexMatchFlags match_options,GError ** error)3099 g_regex_replace (const GRegex      *regex,
3100                  const gchar       *string,
3101                  gssize             string_len,
3102                  gint               start_position,
3103                  const gchar       *replacement,
3104                  GRegexMatchFlags   match_options,
3105                  GError           **error)
3106 {
3107   gchar *result;
3108   GList *list;
3109   GError *tmp_error = NULL;
3110 
3111   match_options = map_to_pcre2_match_flags (match_options);
3112 
3113   g_return_val_if_fail (regex != NULL, NULL);
3114   g_return_val_if_fail (string != NULL, NULL);
3115   g_return_val_if_fail (start_position >= 0, NULL);
3116   g_return_val_if_fail (replacement != NULL, NULL);
3117   g_return_val_if_fail (error == NULL || *error == NULL, NULL);
3118   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3119 
3120   list = split_replacement (replacement, &tmp_error);
3121   if (tmp_error != NULL)
3122     {
3123       g_propagate_error (error, tmp_error);
3124       return NULL;
3125     }
3126 
3127   result = g_regex_replace_eval (regex,
3128                                  string, string_len, start_position,
3129                                  match_options,
3130                                  interpolate_replacement,
3131                                  (gpointer)list,
3132                                  &tmp_error);
3133   if (tmp_error != NULL)
3134     g_propagate_error (error, tmp_error);
3135 
3136   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3137 
3138   return result;
3139 }
3140 
3141 static gboolean
literal_replacement(const GMatchInfo * match_info,GString * result,gpointer data)3142 literal_replacement (const GMatchInfo *match_info,
3143                      GString          *result,
3144                      gpointer          data)
3145 {
3146   g_string_append (result, data);
3147   return FALSE;
3148 }
3149 
3150 /**
3151  * g_regex_replace_literal:
3152  * @regex: a #GRegex structure
3153  * @string: (array length=string_len): the string to perform matches against
3154  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3155  * @start_position: starting index of the string to match, in bytes
3156  * @replacement: text to replace each match with
3157  * @match_options: options for the match
3158  * @error: location to store the error occurring, or %NULL to ignore errors
3159  *
3160  * Replaces all occurrences of the pattern in @regex with the
3161  * replacement text. @replacement is replaced literally, to
3162  * include backreferences use g_regex_replace().
3163  *
3164  * Setting @start_position differs from just passing over a
3165  * shortened string and setting #G_REGEX_MATCH_NOTBOL in the
3166  * case of a pattern that begins with any kind of lookbehind
3167  * assertion, such as "\b".
3168  *
3169  * Returns: a newly allocated string containing the replacements
3170  *
3171  * Since: 2.14
3172  */
3173 gchar *
g_regex_replace_literal(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,const gchar * replacement,GRegexMatchFlags match_options,GError ** error)3174 g_regex_replace_literal (const GRegex      *regex,
3175                          const gchar       *string,
3176                          gssize             string_len,
3177                          gint               start_position,
3178                          const gchar       *replacement,
3179                          GRegexMatchFlags   match_options,
3180                          GError           **error)
3181 {
3182   match_options = map_to_pcre2_match_flags (match_options);
3183 
3184   g_return_val_if_fail (replacement != NULL, NULL);
3185   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3186 
3187   return g_regex_replace_eval (regex,
3188                                string, string_len, start_position,
3189                                match_options,
3190                                literal_replacement,
3191                                (gpointer)replacement,
3192                                error);
3193 }
3194 
3195 /**
3196  * g_regex_replace_eval:
3197  * @regex: a #GRegex structure from g_regex_new()
3198  * @string: (array length=string_len): string to perform matches against
3199  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3200  * @start_position: starting index of the string to match, in bytes
3201  * @match_options: options for the match
3202  * @eval: a function to call for each match
3203  * @user_data: user data to pass to the function
3204  * @error: location to store the error occurring, or %NULL to ignore errors
3205  *
3206  * Replaces occurrences of the pattern in regex with the output of
3207  * @eval for that occurrence.
3208  *
3209  * Setting @start_position differs from just passing over a shortened
3210  * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
3211  * that begins with any kind of lookbehind assertion, such as "\b".
3212  *
3213  * The following example uses g_regex_replace_eval() to replace multiple
3214  * strings at once:
3215  * |[<!-- language="C" -->
3216  * static gboolean
3217  * eval_cb (const GMatchInfo *info,
3218  *          GString          *res,
3219  *          gpointer          data)
3220  * {
3221  *   gchar *match;
3222  *   gchar *r;
3223  *
3224  *    match = g_match_info_fetch (info, 0);
3225  *    r = g_hash_table_lookup ((GHashTable *)data, match);
3226  *    g_string_append (res, r);
3227  *    g_free (match);
3228  *
3229  *    return FALSE;
3230  * }
3231  *
3232  * ...
3233  *
3234  * GRegex *reg;
3235  * GHashTable *h;
3236  * gchar *res;
3237  *
3238  * h = g_hash_table_new (g_str_hash, g_str_equal);
3239  *
3240  * g_hash_table_insert (h, "1", "ONE");
3241  * g_hash_table_insert (h, "2", "TWO");
3242  * g_hash_table_insert (h, "3", "THREE");
3243  * g_hash_table_insert (h, "4", "FOUR");
3244  *
3245  * reg = g_regex_new ("1|2|3|4", 0, 0, NULL);
3246  * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
3247  * g_hash_table_destroy (h);
3248  *
3249  * ...
3250  * ]|
3251  *
3252  * Returns: a newly allocated string containing the replacements
3253  *
3254  * Since: 2.14
3255  */
3256 gchar *
g_regex_replace_eval(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,GRegexMatchFlags match_options,GRegexEvalCallback eval,gpointer user_data,GError ** error)3257 g_regex_replace_eval (const GRegex        *regex,
3258                       const gchar         *string,
3259                       gssize               string_len,
3260                       gint                 start_position,
3261                       GRegexMatchFlags     match_options,
3262                       GRegexEvalCallback   eval,
3263                       gpointer             user_data,
3264                       GError             **error)
3265 {
3266   GMatchInfo *match_info;
3267   GString *result;
3268   gint str_pos = 0;
3269   gboolean done = FALSE;
3270   GError *tmp_error = NULL;
3271 
3272   match_options = map_to_pcre2_match_flags (match_options);
3273 
3274   g_return_val_if_fail (regex != NULL, NULL);
3275   g_return_val_if_fail (string != NULL, NULL);
3276   g_return_val_if_fail (start_position >= 0, NULL);
3277   g_return_val_if_fail (eval != NULL, NULL);
3278   g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3279 
3280   if (string_len < 0)
3281     string_len = strlen (string);
3282 
3283   result = g_string_sized_new (string_len);
3284 
3285   /* run down the string making matches. */
3286   g_regex_match_full (regex, string, string_len, start_position,
3287                       match_options, &match_info, &tmp_error);
3288   while (!done && g_match_info_matches (match_info))
3289     {
3290       g_string_append_len (result,
3291                            string + str_pos,
3292                            match_info->offsets[0] - str_pos);
3293       done = (*eval) (match_info, result, user_data);
3294       str_pos = match_info->offsets[1];
3295       g_match_info_next (match_info, &tmp_error);
3296     }
3297   g_match_info_free (match_info);
3298   if (tmp_error != NULL)
3299     {
3300       g_propagate_error (error, tmp_error);
3301       g_string_free (result, TRUE);
3302       return NULL;
3303     }
3304 
3305   g_string_append_len (result, string + str_pos, string_len - str_pos);
3306   return g_string_free (result, FALSE);
3307 }
3308 
3309 /**
3310  * g_regex_check_replacement:
3311  * @replacement: the replacement string
3312  * @has_references: (out) (optional): location to store information about
3313  *   references in @replacement or %NULL
3314  * @error: location to store error
3315  *
3316  * Checks whether @replacement is a valid replacement string
3317  * (see g_regex_replace()), i.e. that all escape sequences in
3318  * it are valid.
3319  *
3320  * If @has_references is not %NULL then @replacement is checked
3321  * for pattern references. For instance, replacement text 'foo\n'
3322  * does not contain references and may be evaluated without information
3323  * about actual match, but '\0\1' (whole match followed by first
3324  * subpattern) requires valid #GMatchInfo object.
3325  *
3326  * Returns: whether @replacement is a valid replacement string
3327  *
3328  * Since: 2.14
3329  */
3330 gboolean
g_regex_check_replacement(const gchar * replacement,gboolean * has_references,GError ** error)3331 g_regex_check_replacement (const gchar  *replacement,
3332                            gboolean     *has_references,
3333                            GError      **error)
3334 {
3335   GList *list;
3336   GError *tmp = NULL;
3337 
3338   list = split_replacement (replacement, &tmp);
3339 
3340   if (tmp)
3341   {
3342     g_propagate_error (error, tmp);
3343     return FALSE;
3344   }
3345 
3346   if (has_references)
3347     *has_references = interpolation_list_needs_match (list);
3348 
3349   g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3350 
3351   return TRUE;
3352 }
3353 
3354 /**
3355  * g_regex_escape_nul:
3356  * @string: the string to escape
3357  * @length: the length of @string
3358  *
3359  * Escapes the nul characters in @string to "\x00".  It can be used
3360  * to compile a regex with embedded nul characters.
3361  *
3362  * For completeness, @length can be -1 for a nul-terminated string.
3363  * In this case the output string will be of course equal to @string.
3364  *
3365  * Returns: a newly-allocated escaped string
3366  *
3367  * Since: 2.30
3368  */
3369 gchar *
g_regex_escape_nul(const gchar * string,gint length)3370 g_regex_escape_nul (const gchar *string,
3371                     gint         length)
3372 {
3373   GString *escaped;
3374   const gchar *p, *piece_start, *end;
3375   gint backslashes;
3376 
3377   g_return_val_if_fail (string != NULL, NULL);
3378 
3379   if (length < 0)
3380     return g_strdup (string);
3381 
3382   end = string + length;
3383   p = piece_start = string;
3384   escaped = g_string_sized_new (length + 1);
3385 
3386   backslashes = 0;
3387   while (p < end)
3388     {
3389       switch (*p)
3390         {
3391         case '\0':
3392           if (p != piece_start)
3393             {
3394               /* copy the previous piece. */
3395               g_string_append_len (escaped, piece_start, p - piece_start);
3396             }
3397           if ((backslashes & 1) == 0)
3398             g_string_append_c (escaped, '\\');
3399           g_string_append_c (escaped, 'x');
3400           g_string_append_c (escaped, '0');
3401           g_string_append_c (escaped, '0');
3402           piece_start = ++p;
3403           backslashes = 0;
3404           break;
3405         case '\\':
3406           backslashes++;
3407           ++p;
3408           break;
3409         default:
3410           backslashes = 0;
3411           p = g_utf8_next_char (p);
3412           break;
3413         }
3414     }
3415 
3416   if (piece_start < end)
3417     g_string_append_len (escaped, piece_start, end - piece_start);
3418 
3419   return g_string_free (escaped, FALSE);
3420 }
3421 
3422 /**
3423  * g_regex_escape_string:
3424  * @string: (array length=length): the string to escape
3425  * @length: the length of @string, in bytes, or -1 if @string is nul-terminated
3426  *
3427  * Escapes the special characters used for regular expressions
3428  * in @string, for instance "a.b*c" becomes "a\.b\*c". This
3429  * function is useful to dynamically generate regular expressions.
3430  *
3431  * @string can contain nul characters that are replaced with "\0",
3432  * in this case remember to specify the correct length of @string
3433  * in @length.
3434  *
3435  * Returns: a newly-allocated escaped string
3436  *
3437  * Since: 2.14
3438  */
3439 gchar *
g_regex_escape_string(const gchar * string,gint length)3440 g_regex_escape_string (const gchar *string,
3441                        gint         length)
3442 {
3443   GString *escaped;
3444   const char *p, *piece_start, *end;
3445 
3446   g_return_val_if_fail (string != NULL, NULL);
3447 
3448   if (length < 0)
3449     length = strlen (string);
3450 
3451   end = string + length;
3452   p = piece_start = string;
3453   escaped = g_string_sized_new (length + 1);
3454 
3455   while (p < end)
3456     {
3457       switch (*p)
3458         {
3459         case '\0':
3460         case '\\':
3461         case '|':
3462         case '(':
3463         case ')':
3464         case '[':
3465         case ']':
3466         case '{':
3467         case '}':
3468         case '^':
3469         case '$':
3470         case '*':
3471         case '+':
3472         case '?':
3473         case '.':
3474           if (p != piece_start)
3475             /* copy the previous piece. */
3476             g_string_append_len (escaped, piece_start, p - piece_start);
3477           g_string_append_c (escaped, '\\');
3478           if (*p == '\0')
3479             g_string_append_c (escaped, '0');
3480           else
3481             g_string_append_c (escaped, *p);
3482           piece_start = ++p;
3483           break;
3484         default:
3485           p = g_utf8_next_char (p);
3486           break;
3487         }
3488   }
3489 
3490   if (piece_start < end)
3491     g_string_append_len (escaped, piece_start, end - piece_start);
3492 
3493   return g_string_free (escaped, FALSE);
3494 }
3495