1 /* GRegex -- regular expression API wrapper around PCRE.
2 *
3 * Copyright (C) 1999, 2000 Scott Wimer
4 * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com>
5 * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with this library; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "config.h"
22
23 #include <string.h>
24
25 #define PCRE2_CODE_UNIT_WIDTH 8
26 #include <pcre2.h>
27
28 #include "gtypes.h"
29 #include "gregex.h"
30 #include "glibintl.h"
31 #include "glist.h"
32 #include "gmessages.h"
33 #include "gstrfuncs.h"
34 #include "gatomic.h"
35 #include "gthread.h"
36
37 /**
38 * SECTION:gregex
39 * @title: Perl-compatible regular expressions
40 * @short_description: matches strings against regular expressions
41 * @see_also: [Regular expression syntax][glib-regex-syntax]
42 *
43 * The g_regex_*() functions implement regular
44 * expression pattern matching using syntax and semantics similar to
45 * Perl regular expression.
46 *
47 * Some functions accept a @start_position argument, setting it differs
48 * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL
49 * in the case of a pattern that begins with any kind of lookbehind assertion.
50 * For example, consider the pattern "\Biss\B" which finds occurrences of "iss"
51 * in the middle of words. ("\B" matches only if the current position in the
52 * subject is not a word boundary.) When applied to the string "Mississipi"
53 * from the fourth byte, namely "issipi", it does not match, because "\B" is
54 * always false at the start of the subject, which is deemed to be a word
 * boundary. However, if the entire string is passed, but with
56 * @start_position set to 4, it finds the second occurrence of "iss" because
57 * it is able to look behind the starting point to discover that it is
58 * preceded by a letter.
59 *
60 * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed
61 * to these functions must be encoded in UTF-8. The lengths and the positions
62 * inside the strings are in bytes and not in characters, so, for instance,
63 * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a
64 * single character. If you set #G_REGEX_RAW the strings can be non-valid
65 * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
66 * bytes and two characters long.
67 *
68 * When matching a pattern, "\n" matches only against a "\n" character in
69 * the string, and "\r" matches only a "\r" character. To match any newline
70 * sequence use "\R". This particular group matches either the two-character
71 * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,
 * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
73 * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line
74 * separator, U+2028), or PS (paragraph separator, U+2029).
75 *
76 * The behaviour of the dot, circumflex, and dollar metacharacters are
77 * affected by newline characters, the default is to recognize any newline
78 * character (the same characters recognized by "\R"). This can be changed
79 * with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF
80 * compile options, and with #G_REGEX_MATCH_NEWLINE_ANY,
81 * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and
82 * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also
83 * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an
84 * unescaped "#" outside a character class is encountered. This indicates
85 * a comment that lasts until after the next newline.
86 *
87 * When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern
88 * matching is changed to be compatible with the way that regular expressions
89 * work in JavaScript. More precisely, a lonely ']' character in the pattern
90 * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and
91 * you must use the '\u' escape sequence with 4 hex digits to specify a unicode
92 * codepoint instead of '\x' or 'x{....}'. If '\x' or '\u' are not followed by
93 * the specified number of hex digits, they match 'x' and 'u' literally; also
94 * '\U' always matches 'U' instead of being an error in the pattern. Finally,
95 * pattern matching is modified so that back references to an unset subpattern
96 * group produces a match with the empty string instead of an error. See
97 * pcreapi(3) for more information.
98 *
99 * Creating and manipulating the same #GRegex structure from different
100 * threads is not a problem as #GRegex does not modify its internal
101 * state between creation and destruction, on the other hand #GMatchInfo
102 * is not threadsafe.
103 *
104 * The regular expressions low-level functionalities are obtained through
105 * the excellent
106 * [PCRE](http://www.pcre.org/)
107 * library written by Philip Hazel.
108 */
109
/* Marker bit: set on a flags word once it has been converted from the
 * pcre1-style values to the pcre2-style values. */
#define G_REGEX_FLAGS_CONVERTED 0x04000000u

/* Every bit that may legitimately appear in a (converted) set of
 * GRegexCompileFlags. */
#define G_REGEX_COMPILE_MASK (PCRE2_CASELESS          | \
                              PCRE2_MULTILINE         | \
                              PCRE2_DOTALL            | \
                              PCRE2_EXTENDED          | \
                              PCRE2_ANCHORED          | \
                              PCRE2_DOLLAR_ENDONLY    | \
                              PCRE2_UNGREEDY          | \
                              PCRE2_UTF               | \
                              PCRE2_NO_AUTO_CAPTURE   | \
                              PCRE2_FIRSTLINE         | \
                              PCRE2_DUPNAMES          | \
                              PCRE2_NEWLINE_CR        | \
                              PCRE2_NEWLINE_LF        | \
                              PCRE2_NEWLINE_CRLF      | \
                              PCRE2_NEWLINE_ANYCRLF   | \
                              PCRE2_BSR_ANYCRLF       | \
                              G_REGEX_FLAGS_CONVERTED)

/* Compile-flag bits that are NOT passed through to PCRE. */
#define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF | G_REGEX_FLAGS_CONVERTED)

/* Compile-flag bits that ARE passed through to PCRE: everything in
 * G_REGEX_COMPILE_MASK except the non-PCRE bits above. */
#define G_REGEX_COMPILE_PCRE_MASK \
  (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK)
135
/* Every bit that may legitimately appear in a (converted) set of
 * GRegexMatchFlags. */
#define G_REGEX_MATCH_MASK (PCRE2_ANCHORED          | \
                            PCRE2_NOTBOL            | \
                            PCRE2_NOTEOL            | \
                            PCRE2_NOTEMPTY          | \
                            PCRE2_NOTEMPTY_ATSTART  | \
                            PCRE2_PARTIAL_SOFT      | \
                            PCRE2_PARTIAL_HARD      | \
                            PCRE2_NEWLINE_CR        | \
                            PCRE2_NEWLINE_LF        | \
                            PCRE2_NEWLINE_CRLF      | \
                            PCRE2_NEWLINE_ANY       | \
                            PCRE2_NEWLINE_ANYCRLF   | \
                            PCRE2_BSR_ANYCRLF       | \
                            PCRE2_BSR_UNICODE       | \
                            G_REGEX_FLAGS_CONVERTED)
153
/* Step one character forwards (NEXT_CHAR) or backwards (PREV_CHAR) from s.
 * When G_REGEX_RAW is set in re->compile_opts a character is a single byte,
 * so just use +/- 1; otherwise defer to the g_utf8_ helpers. */
#define NEXT_CHAR(re, s) ((((re)->compile_opts & G_REGEX_RAW) == 0) ? \
                          g_utf8_next_char (s) : \
                          ((s) + 1))
#define PREV_CHAR(re, s) ((((re)->compile_opts & G_REGEX_RAW) == 0) ? \
                          g_utf8_prev_char (s) : \
                          ((s) - 1))
162
163 struct _GMatchInfo
164 {
165 gint ref_count; /* the ref count (atomic) */
166 GRegex *regex; /* the regex */
167 GRegexMatchFlags match_opts; /* options used at match time on the regex */
168 gint matches; /* number of matching sub patterns */
169 gint pos; /* position in the string where last match left off */
170 gint n_offsets; /* number of offsets */
171 gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
172 gint *workspace; /* workspace for pcre_dfa_exec() */
173 gint n_workspace; /* number of workspace elements */
174 const gchar *string; /* string passed to the match function */
175 gssize string_len; /* length of string, in bytes */
176 pcre2_match_data *match_data;
177 };
178
179 struct _GRegex
180 {
181 gint ref_count; /* the ref count for the immutable part (atomic) */
182 gchar *pattern; /* the pattern */
183 pcre2_code *pcre_re; /* compiled form of the pattern */
184 GRegexCompileFlags compile_opts; /* options used at compile time on the pattern */
185 GRegexMatchFlags match_opts; /* options used at match time on the regex */
186 };
187
/* TRUE if ret is an error code, FALSE otherwise: anything below
 * PCRE2_ERROR_NOMATCH counts as an error, except PCRE2_ERROR_PARTIAL.
 * Note that ret may be evaluated twice. */
#define IS_PCRE_ERROR(ret) \
  ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL)
190
191 typedef struct _InterpolationData InterpolationData;
192 static gboolean interpolation_list_needs_match (GList *list);
193 static gboolean interpolate_replacement (const GMatchInfo *match_info,
194 GString *result,
195 gpointer data);
196 static GList *split_replacement (const gchar *replacement,
197 GError **error);
198 static void free_interpolation_data (InterpolationData *data);
199
200 static gint
map_to_pcre2_compile_flags(gint pcre1_flags)201 map_to_pcre2_compile_flags (gint pcre1_flags)
202 {
203 /* Maps compile flags from pcre1 to pcre2 values
204 */
205 gint pcre2_flags = G_REGEX_FLAGS_CONVERTED;
206
207 if (pcre1_flags & G_REGEX_FLAGS_CONVERTED)
208 return pcre1_flags;
209
210 if (pcre1_flags & G_REGEX_CASELESS)
211 pcre2_flags |= PCRE2_CASELESS;
212 if (pcre1_flags & G_REGEX_MULTILINE)
213 pcre2_flags |= PCRE2_MULTILINE;
214 if (pcre1_flags & G_REGEX_DOTALL)
215 pcre2_flags |= PCRE2_DOTALL;
216 if (pcre1_flags & G_REGEX_EXTENDED)
217 pcre2_flags |= PCRE2_EXTENDED;
218 if (pcre1_flags & G_REGEX_ANCHORED)
219 pcre2_flags |= PCRE2_ANCHORED;
220 if (pcre1_flags & G_REGEX_DOLLAR_ENDONLY)
221 pcre2_flags |= PCRE2_DOLLAR_ENDONLY;
222 if (pcre1_flags & G_REGEX_UNGREEDY)
223 pcre2_flags |= PCRE2_UNGREEDY;
224 if (pcre1_flags & G_REGEX_RAW)
225 pcre2_flags |= PCRE2_UTF;
226 if (pcre1_flags & G_REGEX_NO_AUTO_CAPTURE)
227 pcre2_flags |= PCRE2_NO_AUTO_CAPTURE;
228 if (pcre1_flags & G_REGEX_FIRSTLINE)
229 pcre2_flags |= PCRE2_FIRSTLINE;
230 if (pcre1_flags & G_REGEX_DUPNAMES)
231 pcre2_flags |= PCRE2_DUPNAMES;
232 if (pcre1_flags & G_REGEX_NEWLINE_CR)
233 pcre2_flags |= PCRE2_NEWLINE_CR;
234 if (pcre1_flags & G_REGEX_NEWLINE_LF)
235 pcre2_flags |= PCRE2_NEWLINE_LF;
236 if ((pcre1_flags & G_REGEX_NEWLINE_CRLF) == G_REGEX_NEWLINE_CRLF)
237 pcre2_flags |= PCRE2_NEWLINE_CRLF;
238 if ((pcre1_flags & G_REGEX_NEWLINE_ANYCRLF) == G_REGEX_NEWLINE_ANYCRLF)
239 pcre2_flags |= PCRE2_NEWLINE_ANYCRLF;
240 if (pcre1_flags & G_REGEX_BSR_ANYCRLF)
241 pcre2_flags |= PCRE2_BSR_ANYCRLF;
242
243 /* these are not available in pcre2 */
244 if (pcre1_flags & G_REGEX_OPTIMIZE)
245 pcre2_flags |= 0;
246 if (pcre1_flags & G_REGEX_JAVASCRIPT_COMPAT)
247 pcre2_flags |= 0;
248
249 return pcre2_flags;
250 }
251
252 static gint
map_to_pcre2_match_flags(gint pcre1_flags)253 map_to_pcre2_match_flags (gint pcre1_flags)
254 {
255 /* Maps match flags from pcre1 to pcre2 values
256 */
257 gint pcre2_flags = G_REGEX_FLAGS_CONVERTED;
258
259 if (pcre1_flags & G_REGEX_FLAGS_CONVERTED)
260 return pcre1_flags;
261
262 if (pcre1_flags & G_REGEX_MATCH_ANCHORED)
263 pcre2_flags |= PCRE2_ANCHORED;
264 if (pcre1_flags & G_REGEX_MATCH_NOTBOL)
265 pcre2_flags |= PCRE2_NOTBOL;
266 if (pcre1_flags & G_REGEX_MATCH_NOTEOL)
267 pcre2_flags |= PCRE2_NOTEOL;
268 if (pcre1_flags & G_REGEX_MATCH_NOTEMPTY)
269 pcre2_flags |= PCRE2_NOTEMPTY;
270 if (pcre1_flags & G_REGEX_MATCH_PARTIAL)
271 pcre2_flags |= PCRE2_PARTIAL_SOFT;
272 if (pcre1_flags & G_REGEX_MATCH_NEWLINE_CR)
273 pcre2_flags |= PCRE2_NEWLINE_CR;
274 if (pcre1_flags & G_REGEX_MATCH_NEWLINE_LF)
275 pcre2_flags |= PCRE2_NEWLINE_LF;
276 if ((pcre1_flags & G_REGEX_MATCH_NEWLINE_CRLF) == G_REGEX_MATCH_NEWLINE_CRLF)
277 pcre2_flags |= PCRE2_NEWLINE_CRLF;
278 if (pcre1_flags & G_REGEX_MATCH_NEWLINE_ANY)
279 pcre2_flags |= PCRE2_NEWLINE_ANY;
280 if ((pcre1_flags & G_REGEX_MATCH_NEWLINE_ANYCRLF) == G_REGEX_MATCH_NEWLINE_ANYCRLF)
281 pcre2_flags |= PCRE2_NEWLINE_ANYCRLF;
282 if (pcre1_flags & G_REGEX_MATCH_BSR_ANYCRLF)
283 pcre2_flags |= PCRE2_BSR_ANYCRLF;
284 if (pcre1_flags & G_REGEX_MATCH_BSR_ANY)
285 pcre2_flags |= PCRE2_BSR_UNICODE;
286 if (pcre1_flags & G_REGEX_MATCH_PARTIAL_SOFT)
287 pcre2_flags |= PCRE2_PARTIAL_SOFT;
288 if (pcre1_flags & G_REGEX_MATCH_PARTIAL_HARD)
289 pcre2_flags |= PCRE2_PARTIAL_HARD;
290 if (pcre1_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART)
291 pcre2_flags |= PCRE2_NOTEMPTY_ATSTART;
292 if (pcre1_flags & G_REGEX_RAW)
293 pcre2_flags |= PCRE2_UTF;
294
295 return pcre2_flags;
296 }
297
298 static gint
map_to_pcre1_compile_flags(gint pcre2_flags)299 map_to_pcre1_compile_flags (gint pcre2_flags)
300 {
301 /* Maps compile flags from pcre2 to pcre1 values
302 */
303 gint pcre1_flags = 0;
304
305 if (!(pcre2_flags & G_REGEX_FLAGS_CONVERTED))
306 return pcre2_flags;
307
308 if (pcre2_flags & PCRE2_CASELESS)
309 pcre1_flags |= G_REGEX_CASELESS;
310 if (pcre2_flags & PCRE2_MULTILINE)
311 pcre1_flags |= G_REGEX_MULTILINE;
312 if (pcre2_flags & PCRE2_DOTALL)
313 pcre1_flags |= G_REGEX_DOTALL;
314 if (pcre2_flags & PCRE2_EXTENDED)
315 pcre1_flags |= G_REGEX_EXTENDED;
316 if (pcre2_flags & PCRE2_ANCHORED)
317 pcre1_flags |= G_REGEX_ANCHORED;
318 if (pcre2_flags & PCRE2_DOLLAR_ENDONLY)
319 pcre1_flags |= G_REGEX_DOLLAR_ENDONLY;
320 if (pcre2_flags & PCRE2_UNGREEDY)
321 pcre1_flags |= G_REGEX_UNGREEDY;
322 if (pcre2_flags & PCRE2_UTF)
323 pcre1_flags |= G_REGEX_RAW;
324 if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE)
325 pcre1_flags |= G_REGEX_NO_AUTO_CAPTURE;
326 if (pcre2_flags & PCRE2_FIRSTLINE)
327 pcre1_flags |= G_REGEX_FIRSTLINE;
328 if (pcre2_flags & PCRE2_DUPNAMES)
329 pcre1_flags |= G_REGEX_DUPNAMES;
330 if (pcre2_flags & PCRE2_NEWLINE_CR)
331 pcre1_flags |= G_REGEX_NEWLINE_CR;
332 if (pcre2_flags & PCRE2_NEWLINE_LF)
333 pcre1_flags |= G_REGEX_NEWLINE_LF;
334 if ((pcre2_flags & PCRE2_NEWLINE_CRLF) == PCRE2_NEWLINE_CRLF)
335 pcre1_flags |= G_REGEX_NEWLINE_CRLF;
336 if ((pcre2_flags & PCRE2_NEWLINE_ANYCRLF) == PCRE2_NEWLINE_ANYCRLF)
337 pcre1_flags |= G_REGEX_NEWLINE_ANYCRLF;
338 if (pcre2_flags & PCRE2_BSR_ANYCRLF)
339 pcre1_flags |= G_REGEX_BSR_ANYCRLF;
340
341 return pcre1_flags;
342 }
343
344 static gint
map_to_pcre1_match_flags(gint pcre2_flags)345 map_to_pcre1_match_flags (gint pcre2_flags)
346 {
347 /* Maps match flags from pcre2 to pcre1 values
348 */
349 gint pcre1_flags = 0;
350
351 if (!(pcre2_flags & G_REGEX_FLAGS_CONVERTED))
352 return pcre2_flags;
353
354 if (pcre2_flags & PCRE2_ANCHORED)
355 pcre1_flags |= G_REGEX_MATCH_ANCHORED;
356 if (pcre2_flags & PCRE2_NOTBOL)
357 pcre1_flags |= G_REGEX_MATCH_NOTBOL;
358 if (pcre2_flags & PCRE2_NOTEOL)
359 pcre1_flags |= G_REGEX_MATCH_NOTEOL;
360 if (pcre2_flags & PCRE2_NOTEMPTY)
361 pcre1_flags |= G_REGEX_MATCH_NOTEMPTY;
362 if (pcre2_flags & PCRE2_PARTIAL_SOFT)
363 pcre1_flags |= G_REGEX_MATCH_PARTIAL;
364 if (pcre2_flags & PCRE2_NEWLINE_CR)
365 pcre1_flags |= G_REGEX_MATCH_NEWLINE_CR;
366 if (pcre2_flags & PCRE2_NEWLINE_LF)
367 pcre1_flags |= G_REGEX_MATCH_NEWLINE_LF;
368 if ((pcre2_flags & PCRE2_NEWLINE_CRLF) == PCRE2_NEWLINE_CRLF)
369 pcre1_flags |= G_REGEX_MATCH_NEWLINE_CRLF;
370 if (pcre2_flags & PCRE2_NEWLINE_ANY)
371 pcre1_flags |= G_REGEX_MATCH_NEWLINE_ANY;
372 if ((pcre2_flags & PCRE2_NEWLINE_ANYCRLF) == PCRE2_NEWLINE_ANYCRLF)
373 pcre1_flags |= G_REGEX_MATCH_NEWLINE_ANYCRLF;
374 if (pcre2_flags & PCRE2_BSR_ANYCRLF)
375 pcre1_flags |= G_REGEX_MATCH_BSR_ANYCRLF;
376 if (pcre2_flags & PCRE2_BSR_UNICODE)
377 pcre1_flags |= G_REGEX_MATCH_BSR_ANY;
378 if (pcre2_flags & PCRE2_PARTIAL_SOFT)
379 pcre1_flags |= G_REGEX_MATCH_PARTIAL_SOFT;
380 if (pcre2_flags & PCRE2_PARTIAL_HARD)
381 pcre1_flags |= G_REGEX_MATCH_PARTIAL_HARD;
382 if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART)
383 pcre1_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART;
384 if (pcre2_flags & PCRE2_UTF)
385 pcre1_flags |= G_REGEX_RAW;
386
387 return pcre1_flags;
388 }
389
390 static gint
map_to_gregex_error(gint pcre2_error)391 map_to_gregex_error (gint pcre2_error)
392 {
393 /* Maps error codes from pcre2 to gregex values (which were based on pcre1)
394 */
395 switch (pcre2_error)
396 {
397 case PCRE2_ERROR_END_BACKSLASH:
398 return G_REGEX_ERROR_STRAY_BACKSLASH;
399 case PCRE2_ERROR_END_BACKSLASH_C:
400 return G_REGEX_ERROR_MISSING_CONTROL_CHAR;
401 case PCRE2_ERROR_UNKNOWN_ESCAPE:
402 return G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
403 case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER:
404 return G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER;
405 case PCRE2_ERROR_QUANTIFIER_TOO_BIG:
406 return G_REGEX_ERROR_QUANTIFIER_TOO_BIG;
407 case PCRE2_ERROR_MISSING_SQUARE_BRACKET:
408 return G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS;
409 case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS:
410 return G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS;
411 case PCRE2_ERROR_CLASS_RANGE_ORDER:
412 return G_REGEX_ERROR_RANGE_OUT_OF_ORDER;
413 case PCRE2_ERROR_QUANTIFIER_INVALID:
414 return G_REGEX_ERROR_NOTHING_TO_REPEAT;
415 case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT:
416 return G_REGEX_ERROR_NOTHING_TO_REPEAT;
417 case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY:
418 return G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
419 case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS:
420 return G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS;
421 case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING:
422 return G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED;
423 case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS:
424 return G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
425 case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE:
426 return G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE;
427 case PCRE2_ERROR_MISSING_COMMENT_CLOSING:
428 return G_REGEX_ERROR_UNTERMINATED_COMMENT;
429 case PCRE2_ERROR_PATTERN_TOO_LARGE:
430 return G_REGEX_ERROR_EXPRESSION_TOO_LARGE;
431 case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS:
432 return G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
433 case PCRE2_ERROR_MISSING_CONDITION_CLOSING:
434 return G_REGEX_ERROR_MALFORMED_CONDITION;
435 case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH:
436 return G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND;
437 case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES:
438 return G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES;
439 case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED:
440 return G_REGEX_ERROR_ASSERTION_EXPECTED;
441 case PCRE2_ERROR_BAD_RELATIVE_REFERENCE:
442 return G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE;
443 case PCRE2_ERROR_UNKNOWN_POSIX_CLASS:
444 return G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME;
445 case PCRE2_ERROR_CODE_POINT_TOO_BIG:
446 return G_REGEX_ERROR_HEX_CODE_TOO_LARGE;
447 case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C:
448 return G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND;
449 case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE:
450 return G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
451 case PCRE2_ERROR_MISSING_NAME_TERMINATOR:
452 return G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR;
453 case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME:
454 return G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME;
455 case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY:
456 return G_REGEX_ERROR_MALFORMED_PROPERTY;
457 case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY:
458 return G_REGEX_ERROR_UNKNOWN_PROPERTY;
459 case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG:
460 return G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG;
461 case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS:
462 return G_REGEX_ERROR_TOO_MANY_SUBPATTERNS;
463 case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG:
464 return G_REGEX_ERROR_INVALID_OCTAL_VALUE;
465 case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES:
466 return G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE;
467 case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE:
468 return G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS;
469 case PCRE2_ERROR_BACKSLASH_G_SYNTAX:
470 return G_REGEX_ERROR_MISSING_BACK_REFERENCE;
471 case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING:
472 return G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
473 case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED:
474 return G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN;
475 case PCRE2_ERROR_VERB_UNKNOWN:
476 return G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB;
477 case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG:
478 return G_REGEX_ERROR_NUMBER_TOO_BIG;
479 case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED:
480 return G_REGEX_ERROR_MISSING_SUBPATTERN_NAME;
481 case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH:
482 return G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME;
483 case PCRE2_ERROR_MARK_MISSING_ARGUMENT:
484 return G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED;
485 case PCRE2_ERROR_INVALID_HEXADECIMAL:
486 return G_REGEX_ERROR_HEX_CODE_TOO_LARGE;
487 case PCRE2_ERROR_BACKSLASH_C_SYNTAX:
488 return G_REGEX_ERROR_INVALID_CONTROL_CHAR;
489 case PCRE2_ERROR_BACKSLASH_K_SYNTAX:
490 return G_REGEX_ERROR_MISSING_NAME;
491 case PCRE2_ERROR_BACKSLASH_N_IN_CLASS:
492 return G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS;
493 case PCRE2_ERROR_VERB_NAME_TOO_LONG:
494 return G_REGEX_ERROR_NAME_TOO_LONG;
495 case PCRE2_ERROR_NULL_PATTERN:
496 case PCRE2_ERROR_BAD_OPTIONS:
497 case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP:
498 case PCRE2_ERROR_HEAP_FAILED:
499 case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW:
500 case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE:
501 case PCRE2_ERROR_INTERNAL_STUDY_ERROR:
502 case PCRE2_ERROR_UNICODE_NOT_SUPPORTED:
503 case PCRE2_ERROR_PARENTHESES_STACK_CHECK:
504 case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED:
505 case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG:
506 case PCRE2_ERROR_MISSING_CALLOUT_CLOSING:
507 case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB:
508 case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P:
509 case PCRE2_ERROR_INVALID_SUBPATTERN_NAME:
510 case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE:
511 case PCRE2_ERROR_CLASS_INVALID_RANGE:
512 case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE:
513 case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN:
514 case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE:
515 case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW:
516 case PCRE2_ERROR_INVALID_OCTAL:
517 case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS:
518 case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG:
519 case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT:
520 case PCRE2_ERROR_UTF_IS_DISABLED:
521 case PCRE2_ERROR_UCP_IS_DISABLED:
522 case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG:
523 case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS:
524 case PCRE2_ERROR_VERSION_CONDITION_SYNTAX:
525 case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS:
526 case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER:
527 case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER:
528 case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED:
529 case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP:
530 case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED:
531 case PCRE2_ERROR_PATTERN_TOO_COMPLICATED:
532 case PCRE2_ERROR_LOOKBEHIND_TOO_LONG:
533 case PCRE2_ERROR_PATTERN_STRING_TOO_LONG:
534 case PCRE2_ERROR_INTERNAL_BAD_CODE:
535 case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP:
536 case PCRE2_ERROR_NO_SURROGATES_IN_UTF16:
537 case PCRE2_ERROR_BAD_LITERAL_OPTIONS:
538 default:
539 return G_REGEX_ERROR_COMPILE;
540 }
541 }
542
543 static const gchar *
match_error(gint errcode)544 match_error (gint errcode)
545 {
546 switch (errcode)
547 {
548 case PCRE2_ERROR_NOMATCH:
549 /* not an error */
550 break;
551 case PCRE2_ERROR_NULL:
552 /* NULL argument, this should not happen in GRegex */
553 g_warning ("A NULL argument was passed to PCRE");
554 break;
555 case PCRE2_ERROR_BADOPTION:
556 return "bad options";
557 case PCRE2_ERROR_BADMAGIC:
558 return _("corrupted object");
559 case PCRE2_ERROR_NOMEMORY:
560 return _("out of memory");
561 case PCRE2_ERROR_NOSUBSTRING:
562 /* not used by pcre_exec() */
563 break;
564 case PCRE2_ERROR_MATCHLIMIT:
565 return _("backtracking limit reached");
566 case PCRE2_ERROR_CALLOUT:
567 /* callouts are not implemented */
568 break;
569 case PCRE2_ERROR_BADUTFOFFSET:
570 /* we do not check if strings are valid */
571 break;
572 case PCRE2_ERROR_PARTIAL:
573 /* not an error */
574 break;
575 case PCRE2_ERROR_INTERNAL:
576 return _("internal error");
577 case PCRE2_ERROR_DFA_UITEM:
578 return _("the pattern contains items not supported for partial matching");
579 case PCRE2_ERROR_DFA_UCOND:
580 return _("back references as conditions are not supported for partial matching");
581 case PCRE2_ERROR_DFA_WSSIZE:
582 /* handled expanding the workspace */
583 break;
584 case PCRE2_ERROR_DFA_RECURSE:
585 case PCRE2_ERROR_RECURSIONLIMIT:
586 return _("recursion limit reached");
587 case PCRE2_ERROR_BADOFFSET:
588 return _("bad offset");
589 case PCRE2_ERROR_RECURSELOOP:
590 return _("recursion loop");
591 default:
592 break;
593 }
594 return _("unknown error");
595 }
596
597 static void
translate_compile_error(gint * errcode,const gchar ** errmsg)598 translate_compile_error (gint *errcode, const gchar **errmsg)
599 {
600 /* Compile errors are created adding 100 to the error code returned
601 * by PCRE.
602 * If errcode is known we put the translatable error message in
603 * erromsg. If errcode is unknown we put the generic
604 * G_REGEX_ERROR_COMPILE error code in errcode and keep the
605 * untranslated error message returned by PCRE.
606 * Note that there can be more PCRE errors with the same GRegexError
607 * and that some PCRE errors are useless for us.
608 */
609
610 *errcode = map_to_gregex_error (*errcode);
611
612 switch (*errcode)
613 {
614 case G_REGEX_ERROR_STRAY_BACKSLASH:
615 *errmsg = _("\\ at end of pattern");
616 break;
617 case G_REGEX_ERROR_MISSING_CONTROL_CHAR:
618 *errmsg = _("\\c at end of pattern");
619 break;
620 case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
621 *errmsg = _("unrecognized character following \\");
622 break;
623 case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
624 *errmsg = _("numbers out of order in {} quantifier");
625 break;
626 case G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
627 *errmsg = _("number too big in {} quantifier");
628 break;
629 case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
630 *errmsg = _("missing terminating ] for character class");
631 break;
632 case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
633 *errmsg = _("invalid escape sequence in character class");
634 break;
635 case G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
636 *errmsg = _("range out of order in character class");
637 break;
638 case G_REGEX_ERROR_NOTHING_TO_REPEAT:
639 *errmsg = _("nothing to repeat");
640 break;
641 case 111: /* internal error: unexpected repeat */
642 *errcode = G_REGEX_ERROR_INTERNAL;
643 *errmsg = _("unexpected repeat");
644 break;
645 case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
646 *errmsg = _("unrecognized character after (? or (?-");
647 break;
648 case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
649 *errmsg = _("POSIX named classes are supported only within a class");
650 break;
651 case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
652 *errmsg = _("missing terminating )");
653 break;
654 case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
655 *errmsg = _("reference to non-existent subpattern");
656 break;
657 case G_REGEX_ERROR_UNTERMINATED_COMMENT:
658 *errmsg = _("missing ) after comment");
659 break;
660 case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
661 *errmsg = _("regular expression is too large");
662 break;
663 case G_REGEX_ERROR_MEMORY_ERROR:
664 *errmsg = _("failed to get memory");
665 break;
666 case 122: /* unmatched parentheses */
667 *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
668 *errmsg = _(") without opening (");
669 break;
670 case 123: /* internal error: code overflow */
671 *errcode = G_REGEX_ERROR_INTERNAL;
672 *errmsg = _("code overflow");
673 break;
674 case 124: /* "unrecognized character after (?<\0 */
675 *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
676 *errmsg = _("unrecognized character after (?<");
677 break;
678 case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
679 *errmsg = _("lookbehind assertion is not fixed length");
680 break;
681 case G_REGEX_ERROR_MALFORMED_CONDITION:
682 *errmsg = _("malformed number or name after (?(");
683 break;
684 case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
685 *errmsg = _("conditional group contains more than two branches");
686 break;
687 case G_REGEX_ERROR_ASSERTION_EXPECTED:
688 *errmsg = _("assertion expected after (?(");
689 break;
690 case 129:
691 *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
692 /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
693 * sequences here, '(?-54' would be an example for the second group.
694 */
695 *errmsg = _("(?R or (?[+-]digits must be followed by )");
696 break;
697 case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
698 *errmsg = _("unknown POSIX class name");
699 break;
700 case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
701 *errmsg = _("POSIX collating elements are not supported");
702 break;
703 case G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
704 *errmsg = _("character value in \\x{...} sequence is too large");
705 break;
706 case G_REGEX_ERROR_INVALID_CONDITION:
707 *errmsg = _("invalid condition (?(0)");
708 break;
709 case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
710 *errmsg = _("\\C not allowed in lookbehind assertion");
711 break;
712 case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */
713 /* A number of Perl escapes are not handled by PCRE.
714 * Therefore it explicitly raises ERR37.
715 */
716 *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
717 *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported");
718 break;
719 case G_REGEX_ERROR_INFINITE_LOOP:
720 *errmsg = _("recursive call could loop indefinitely");
721 break;
722 case 141: /* unrecognized character after (?P\0 */
723 *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
724 *errmsg = _("unrecognized character after (?P");
725 break;
726 case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
727 *errmsg = _("missing terminator in subpattern name");
728 break;
729 case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
730 *errmsg = _("two named subpatterns have the same name");
731 break;
732 case G_REGEX_ERROR_MALFORMED_PROPERTY:
733 *errmsg = _("malformed \\P or \\p sequence");
734 break;
735 case G_REGEX_ERROR_UNKNOWN_PROPERTY:
736 *errmsg = _("unknown property name after \\P or \\p");
737 break;
738 case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
739 *errmsg = _("subpattern name is too long (maximum 32 characters)");
740 break;
741 case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
742 *errmsg = _("too many named subpatterns (maximum 10,000)");
743 break;
744 case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
745 *errmsg = _("octal value is greater than \\377");
746 break;
747 case 152: /* internal error: overran compiling workspace */
748 *errcode = G_REGEX_ERROR_INTERNAL;
749 *errmsg = _("overran compiling workspace");
750 break;
751 case 153: /* internal error: previously-checked referenced subpattern not found */
752 *errcode = G_REGEX_ERROR_INTERNAL;
753 *errmsg = _("previously-checked referenced subpattern not found");
754 break;
755 case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
756 *errmsg = _("DEFINE group contains more than one branch");
757 break;
758 case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
759 *errmsg = _("inconsistent NEWLINE options");
760 break;
761 case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
762 *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
763 "number, or by a plain number");
764 break;
765 case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE:
766 *errmsg = _("a numbered reference must not be zero");
767 break;
768 case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN:
769 *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
770 break;
771 case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB:
772 *errmsg = _("(*VERB) not recognized");
773 break;
774 case G_REGEX_ERROR_NUMBER_TOO_BIG:
775 *errmsg = _("number is too big");
776 break;
777 case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME:
778 *errmsg = _("missing subpattern name after (?&");
779 break;
780 case G_REGEX_ERROR_MISSING_DIGIT:
781 *errmsg = _("digit expected after (?+");
782 break;
783 case G_REGEX_ERROR_INVALID_DATA_CHARACTER:
784 *errmsg = _("] is an invalid data character in JavaScript compatibility mode");
785 break;
786 case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
787 *errmsg = _("different names for subpatterns of the same number are not allowed");
788 break;
789 case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED:
790 *errmsg = _("(*MARK) must have an argument");
791 break;
792 case G_REGEX_ERROR_INVALID_CONTROL_CHAR:
793 *errmsg = _( "\\c must be followed by an ASCII character");
794 break;
795 case G_REGEX_ERROR_MISSING_NAME:
796 *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
797 break;
798 case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS:
799 *errmsg = _("\\N is not supported in a class");
800 break;
801 case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES:
802 *errmsg = _("too many forward references");
803 break;
804 case G_REGEX_ERROR_NAME_TOO_LONG:
805 *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
806 break;
807 case G_REGEX_ERROR_CHARACTER_VALUE_TOO_LARGE:
808 *errmsg = _("character value in \\u.... sequence is too large");
809 break;
810
811 case 116: /* erroffset passed as NULL */
812 /* This should not happen as we never pass a NULL erroffset */
813 g_warning ("erroffset passed as NULL");
814 *errcode = G_REGEX_ERROR_COMPILE;
815 break;
816 case 117: /* unknown option bit(s) set */
817 /* This should not happen as we check options before passing them
818 * to pcre_compile2() */
819 g_warning ("unknown option bit(s) set");
820 *errcode = G_REGEX_ERROR_COMPILE;
821 break;
822 case 132: /* this version of PCRE is compiled without UTF support */
823 case 144: /* invalid UTF-8 string */
824 case 145: /* support for \\P, \\p, and \\X has not been compiled */
825 case 167: /* this version of PCRE is not compiled with Unicode property support */
826 case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */
827 case 174: /* invalid UTF-16 string */
828 /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
829 * and we do not check if strings are valid */
830 case 170: /* internal error: unknown opcode in find_fixedlength() */
831 *errcode = G_REGEX_ERROR_INTERNAL;
832 break;
833
834 default:
835 *errcode = G_REGEX_ERROR_COMPILE;
836 }
837 }
838
839 /* GMatchInfo */
840
841 static GMatchInfo *
match_info_new(const GRegex * regex,const gchar * string,gint string_len,gint start_position,gint match_options,gboolean is_dfa)842 match_info_new (const GRegex *regex,
843 const gchar *string,
844 gint string_len,
845 gint start_position,
846 gint match_options,
847 gboolean is_dfa)
848 {
849 GMatchInfo *match_info;
850
851 match_options = map_to_pcre2_match_flags (match_options);
852
853 if (string_len < 0)
854 string_len = strlen (string);
855
856 match_info = g_new0 (GMatchInfo, 1);
857 match_info->ref_count = 1;
858 match_info->regex = g_regex_ref ((GRegex *)regex);
859 match_info->string = string;
860 match_info->string_len = string_len;
861 match_info->matches = PCRE2_ERROR_NOMATCH;
862 match_info->pos = start_position;
863 match_info->match_opts = match_options;
864
865 if (is_dfa)
866 {
867 /* These values should be enough for most cases, if they are not
868 * enough g_regex_match_all_full() will expand them. */
869 match_info->n_offsets = 24;
870 match_info->n_workspace = 100;
871 match_info->workspace = g_new (gint, match_info->n_workspace);
872 }
873 else
874 {
875 gint capture_count;
876 pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT,
877 &capture_count);
878 match_info->n_offsets = (capture_count + 1) * 3;
879 }
880
881 match_info->offsets = g_new0 (gint, match_info->n_offsets);
882 /* Set an invalid position for the previous match. */
883 match_info->offsets[0] = -1;
884 match_info->offsets[1] = -1;
885
886 match_info->match_data = pcre2_match_data_create_from_pattern (
887 match_info->regex->pcre_re,
888 NULL);
889
890 return match_info;
891 }
892
893 /**
894 * g_match_info_get_regex:
895 * @match_info: a #GMatchInfo
896 *
897 * Returns #GRegex object used in @match_info. It belongs to Glib
898 * and must not be freed. Use g_regex_ref() if you need to keep it
899 * after you free @match_info object.
900 *
901 * Returns: #GRegex object used in @match_info
902 *
903 * Since: 2.14
904 */
905 GRegex *
g_match_info_get_regex(const GMatchInfo * match_info)906 g_match_info_get_regex (const GMatchInfo *match_info)
907 {
908 g_return_val_if_fail (match_info != NULL, NULL);
909 return match_info->regex;
910 }
911
912 /**
913 * g_match_info_get_string:
914 * @match_info: a #GMatchInfo
915 *
916 * Returns the string searched with @match_info. This is the
917 * string passed to g_regex_match() or g_regex_replace() so
918 * you may not free it before calling this function.
919 *
920 * Returns: the string searched with @match_info
921 *
922 * Since: 2.14
923 */
924 const gchar *
g_match_info_get_string(const GMatchInfo * match_info)925 g_match_info_get_string (const GMatchInfo *match_info)
926 {
927 g_return_val_if_fail (match_info != NULL, NULL);
928 return match_info->string;
929 }
930
931 /**
932 * g_match_info_ref:
933 * @match_info: a #GMatchInfo
934 *
935 * Increases reference count of @match_info by 1.
936 *
937 * Returns: @match_info
938 *
939 * Since: 2.30
940 */
941 GMatchInfo *
g_match_info_ref(GMatchInfo * match_info)942 g_match_info_ref (GMatchInfo *match_info)
943 {
944 g_return_val_if_fail (match_info != NULL, NULL);
945 g_atomic_int_inc (&match_info->ref_count);
946 return match_info;
947 }
948
949 /**
950 * g_match_info_unref:
951 * @match_info: a #GMatchInfo
952 *
953 * Decreases reference count of @match_info by 1. When reference count drops
954 * to zero, it frees all the memory associated with the match_info structure.
955 *
956 * Since: 2.30
957 */
958 void
g_match_info_unref(GMatchInfo * match_info)959 g_match_info_unref (GMatchInfo *match_info)
960 {
961 if (g_atomic_int_dec_and_test (&match_info->ref_count))
962 {
963 g_regex_unref (match_info->regex);
964 if (match_info->match_data)
965 pcre2_match_data_free (match_info->match_data);
966 g_free (match_info->offsets);
967 g_free (match_info->workspace);
968 g_free (match_info);
969 }
970 }
971
972 /**
973 * g_match_info_free:
974 * @match_info: (nullable): a #GMatchInfo, or %NULL
975 *
976 * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does
977 * nothing.
978 *
979 * Since: 2.14
980 */
981 void
g_match_info_free(GMatchInfo * match_info)982 g_match_info_free (GMatchInfo *match_info)
983 {
984 if (match_info == NULL)
985 return;
986
987 g_match_info_unref (match_info);
988 }
989
990 /**
991 * g_match_info_next:
992 * @match_info: a #GMatchInfo structure
993 * @error: location to store the error occurring, or %NULL to ignore errors
994 *
995 * Scans for the next match using the same parameters of the previous
996 * call to g_regex_match_full() or g_regex_match() that returned
997 * @match_info.
998 *
999 * The match is done on the string passed to the match function, so you
1000 * cannot free it before calling this function.
1001 *
1002 * Returns: %TRUE is the string matched, %FALSE otherwise
1003 *
1004 * Since: 2.14
1005 */
1006 gboolean
g_match_info_next(GMatchInfo * match_info,GError ** error)1007 g_match_info_next (GMatchInfo *match_info,
1008 GError **error)
1009 {
1010 gint prev_match_start;
1011 gint prev_match_end;
1012 gint i;
1013 gint opts;
1014 PCRE2_SIZE *ovector;
1015
1016 g_return_val_if_fail (match_info != NULL, FALSE);
1017 g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
1018 g_return_val_if_fail (match_info->pos >= 0, FALSE);
1019
1020 prev_match_start = match_info->offsets[0];
1021 prev_match_end = match_info->offsets[1];
1022
1023 if (match_info->pos > match_info->string_len)
1024 {
1025 /* we have reached the end of the string */
1026 match_info->pos = -1;
1027 match_info->matches = PCRE2_ERROR_NOMATCH;
1028 return FALSE;
1029 }
1030
1031 opts = map_to_pcre2_match_flags (match_info->regex->match_opts | match_info->match_opts);
1032 match_info->matches = pcre2_match (match_info->regex->pcre_re,
1033 (PCRE2_SPTR)match_info->string,
1034 match_info->string_len,
1035 match_info->pos,
1036 opts & ~G_REGEX_FLAGS_CONVERTED,
1037 match_info->match_data,
1038 NULL);
1039
1040 if (IS_PCRE_ERROR (match_info->matches))
1041 {
1042 g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
1043 _("Error while matching regular expression %s: %s"),
1044 match_info->regex->pattern, match_error (match_info->matches));
1045 return FALSE;
1046 }
1047 else
1048 {
1049 match_info->n_offsets = pcre2_get_ovector_count (match_info->match_data) * 2;
1050 ovector = pcre2_get_ovector_pointer (match_info->match_data);
1051 match_info->offsets = g_realloc_n (match_info->offsets,
1052 match_info->n_offsets,
1053 sizeof (gint));
1054 for (i = 0; i < match_info->n_offsets; i++)
1055 {
1056 match_info->offsets[i] = (int) ovector[i];
1057 }
1058 }
1059
1060 /* avoid infinite loops if the pattern is an empty string or something
1061 * equivalent */
1062 if (match_info->pos == match_info->offsets[1])
1063 {
1064 if (match_info->pos > match_info->string_len)
1065 {
1066 /* we have reached the end of the string */
1067 match_info->pos = -1;
1068 match_info->matches = PCRE2_ERROR_NOMATCH;
1069 return FALSE;
1070 }
1071
1072 match_info->pos = NEXT_CHAR (match_info->regex,
1073 &match_info->string[match_info->pos]) -
1074 match_info->string;
1075 }
1076 else
1077 {
1078 match_info->pos = match_info->offsets[1];
1079 }
1080
1081 /* it's possible to get two identical matches when we are matching
1082 * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
1083 * the string is "RegExTest" we have:
1084 * - search at position 0: match from 0 to 0
1085 * - search at position 1: match from 3 to 3
1086 * - search at position 3: match from 3 to 3 (duplicate)
1087 * - search at position 4: match from 5 to 5
1088 * - search at position 5: match from 5 to 5 (duplicate)
1089 * - search at position 6: no match -> stop
1090 * so we have to ignore the duplicates.
1091 * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
1092 if (match_info->matches >= 0 &&
1093 prev_match_start == match_info->offsets[0] &&
1094 prev_match_end == match_info->offsets[1])
1095 {
1096 /* ignore this match and search the next one */
1097 return g_match_info_next (match_info, error);
1098 }
1099
1100 return match_info->matches >= 0;
1101 }
1102
1103 /**
1104 * g_match_info_matches:
1105 * @match_info: a #GMatchInfo structure
1106 *
1107 * Returns whether the previous match operation succeeded.
1108 *
1109 * Returns: %TRUE if the previous match operation succeeded,
1110 * %FALSE otherwise
1111 *
1112 * Since: 2.14
1113 */
1114 gboolean
g_match_info_matches(const GMatchInfo * match_info)1115 g_match_info_matches (const GMatchInfo *match_info)
1116 {
1117 g_return_val_if_fail (match_info != NULL, FALSE);
1118
1119 return match_info->matches >= 0;
1120 }
1121
1122 /**
1123 * g_match_info_get_match_count:
1124 * @match_info: a #GMatchInfo structure
1125 *
1126 * Retrieves the number of matched substrings (including substring 0,
1127 * that is the whole matched text), so 1 is returned if the pattern
1128 * has no substrings in it and 0 is returned if the match failed.
1129 *
1130 * If the last match was obtained using the DFA algorithm, that is
1131 * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
1132 * count is not that of the number of capturing parentheses but that of
1133 * the number of matched substrings.
1134 *
1135 * Returns: Number of matched substrings, or -1 if an error occurred
1136 *
1137 * Since: 2.14
1138 */
1139 gint
g_match_info_get_match_count(const GMatchInfo * match_info)1140 g_match_info_get_match_count (const GMatchInfo *match_info)
1141 {
1142 g_return_val_if_fail (match_info, -1);
1143
1144 if (match_info->matches == PCRE2_ERROR_NOMATCH)
1145 /* no match */
1146 return 0;
1147 else if (match_info->matches < PCRE2_ERROR_NOMATCH)
1148 /* error */
1149 return -1;
1150 else
1151 /* match */
1152 return match_info->matches;
1153 }
1154
1155 /**
1156 * g_match_info_is_partial_match:
1157 * @match_info: a #GMatchInfo structure
1158 *
1159 * Usually if the string passed to g_regex_match*() matches as far as
1160 * it goes, but is too short to match the entire pattern, %FALSE is
1161 * returned. There are circumstances where it might be helpful to
1162 * distinguish this case from other cases in which there is no match.
1163 *
1164 * Consider, for example, an application where a human is required to
1165 * type in data for a field with specific formatting requirements. An
1166 * example might be a date in the form ddmmmyy, defined by the pattern
1167 * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
1168 * If the application sees the user’s keystrokes one by one, and can
1169 * check that what has been typed so far is potentially valid, it is
1170 * able to raise an error as soon as a mistake is made.
1171 *
1172 * GRegex supports the concept of partial matching by means of the
1173 * #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD flags.
1174 * When they are used, the return code for
1175 * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
1176 * for a complete match, %FALSE otherwise. But, when these functions
1177 * return %FALSE, you can check if the match was partial calling
1178 * g_match_info_is_partial_match().
1179 *
1180 * The difference between #G_REGEX_MATCH_PARTIAL_SOFT and
1181 * #G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered
1182 * with #G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a
1183 * possible complete match, while with #G_REGEX_MATCH_PARTIAL_HARD matching
1184 * stops at the partial match.
1185 * When both #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD
1186 * are set, the latter takes precedence.
1187 *
1188 * There were formerly some restrictions on the pattern for partial matching.
1189 * The restrictions no longer apply.
1190 *
1191 * See pcrepartial(3) for more information on partial matching.
1192 *
1193 * Returns: %TRUE if the match was partial, %FALSE otherwise
1194 *
1195 * Since: 2.14
1196 */
1197 gboolean
g_match_info_is_partial_match(const GMatchInfo * match_info)1198 g_match_info_is_partial_match (const GMatchInfo *match_info)
1199 {
1200 g_return_val_if_fail (match_info != NULL, FALSE);
1201
1202 return match_info->matches == PCRE2_ERROR_PARTIAL;
1203 }
1204
1205 /**
1206 * g_match_info_expand_references:
1207 * @match_info: (nullable): a #GMatchInfo or %NULL
1208 * @string_to_expand: the string to expand
1209 * @error: location to store the error occurring, or %NULL to ignore errors
1210 *
1211 * Returns a new string containing the text in @string_to_expand with
1212 * references and escape sequences expanded. References refer to the last
1213 * match done with @string against @regex and have the same syntax used by
1214 * g_regex_replace().
1215 *
1216 * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
1217 * passed to g_regex_new().
1218 *
1219 * The backreferences are extracted from the string passed to the match
1220 * function, so you cannot call this function after freeing the string.
1221 *
1222 * @match_info may be %NULL in which case @string_to_expand must not
1223 * contain references. For instance "foo\n" does not refer to an actual
1224 * pattern and '\n' merely will be replaced with \n character,
1225 * while to expand "\0" (whole match) one needs the result of a match.
1226 * Use g_regex_check_replacement() to find out whether @string_to_expand
1227 * contains references.
1228 *
1229 * Returns: (nullable): the expanded string, or %NULL if an error occurred
1230 *
1231 * Since: 2.14
1232 */
1233 gchar *
g_match_info_expand_references(const GMatchInfo * match_info,const gchar * string_to_expand,GError ** error)1234 g_match_info_expand_references (const GMatchInfo *match_info,
1235 const gchar *string_to_expand,
1236 GError **error)
1237 {
1238 GString *result;
1239 GList *list;
1240 GError *tmp_error = NULL;
1241
1242 g_return_val_if_fail (string_to_expand != NULL, NULL);
1243 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1244
1245 list = split_replacement (string_to_expand, &tmp_error);
1246 if (tmp_error != NULL)
1247 {
1248 g_propagate_error (error, tmp_error);
1249 return NULL;
1250 }
1251
1252 if (!match_info && interpolation_list_needs_match (list))
1253 {
1254 g_critical ("String '%s' contains references to the match, can't "
1255 "expand references without GMatchInfo object",
1256 string_to_expand);
1257 return NULL;
1258 }
1259
1260 result = g_string_sized_new (strlen (string_to_expand));
1261 interpolate_replacement (match_info, result, list);
1262
1263 g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
1264
1265 return g_string_free (result, FALSE);
1266 }
1267
1268 /**
1269 * g_match_info_fetch:
1270 * @match_info: #GMatchInfo structure
1271 * @match_num: number of the sub expression
1272 *
1273 * Retrieves the text matching the @match_num'th capturing
1274 * parentheses. 0 is the full text of the match, 1 is the first paren
1275 * set, 2 the second, and so on.
1276 *
1277 * If @match_num is a valid sub pattern but it didn't match anything
1278 * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty
1279 * string is returned.
1280 *
1281 * If the match was obtained using the DFA algorithm, that is using
1282 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1283 * string is not that of a set of parentheses but that of a matched
1284 * substring. Substrings are matched in reverse order of length, so
1285 * 0 is the longest match.
1286 *
1287 * The string is fetched from the string passed to the match function,
1288 * so you cannot call this function after freeing the string.
1289 *
1290 * Returns: (nullable): The matched substring, or %NULL if an error
1291 * occurred. You have to free the string yourself
1292 *
1293 * Since: 2.14
1294 */
1295 gchar *
g_match_info_fetch(const GMatchInfo * match_info,gint match_num)1296 g_match_info_fetch (const GMatchInfo *match_info,
1297 gint match_num)
1298 {
1299 /* we cannot use pcre_get_substring() because it allocates the
1300 * string using pcre_malloc(). */
1301 gchar *match = NULL;
1302 gint start, end;
1303
1304 g_return_val_if_fail (match_info != NULL, NULL);
1305 g_return_val_if_fail (match_num >= 0, NULL);
1306
1307 /* match_num does not exist or it didn't matched, i.e. matching "b"
1308 * against "(a)?b" then group 0 is empty. */
1309 if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))
1310 match = NULL;
1311 else if (start == -1)
1312 match = g_strdup ("");
1313 else
1314 match = g_strndup (&match_info->string[start], end - start);
1315
1316 return match;
1317 }
1318
1319 /**
1320 * g_match_info_fetch_pos:
1321 * @match_info: #GMatchInfo structure
1322 * @match_num: number of the sub expression
1323 * @start_pos: (out) (optional): pointer to location where to store
1324 * the start position, or %NULL
1325 * @end_pos: (out) (optional): pointer to location where to store
1326 * the end position, or %NULL
1327 *
1328 * Retrieves the position in bytes of the @match_num'th capturing
1329 * parentheses. 0 is the full text of the match, 1 is the first
1330 * paren set, 2 the second, and so on.
1331 *
1332 * If @match_num is a valid sub pattern but it didn't match anything
1333 * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
1334 * and @end_pos are set to -1 and %TRUE is returned.
1335 *
1336 * If the match was obtained using the DFA algorithm, that is using
1337 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1338 * position is not that of a set of parentheses but that of a matched
1339 * substring. Substrings are matched in reverse order of length, so
1340 * 0 is the longest match.
1341 *
1342 * Returns: %TRUE if the position was fetched, %FALSE otherwise. If
1343 * the position cannot be fetched, @start_pos and @end_pos are left
1344 * unchanged
1345 *
1346 * Since: 2.14
1347 */
1348 gboolean
g_match_info_fetch_pos(const GMatchInfo * match_info,gint match_num,gint * start_pos,gint * end_pos)1349 g_match_info_fetch_pos (const GMatchInfo *match_info,
1350 gint match_num,
1351 gint *start_pos,
1352 gint *end_pos)
1353 {
1354 g_return_val_if_fail (match_info != NULL, FALSE);
1355 g_return_val_if_fail (match_num >= 0, FALSE);
1356
1357 /* make sure the sub expression number they're requesting is less than
1358 * the total number of sub expressions that were matched. */
1359 if (match_num >= match_info->matches)
1360 return FALSE;
1361
1362 if (start_pos != NULL)
1363 *start_pos = match_info->offsets[2 * match_num];
1364
1365 if (end_pos != NULL)
1366 *end_pos = match_info->offsets[2 * match_num + 1];
1367
1368 return TRUE;
1369 }
1370
1371 /*
1372 * Returns number of first matched subpattern with name @name.
1373 * There may be more than one in case when DUPNAMES is used,
1374 * and not all subpatterns with that name match;
1375 * pcre_get_stringnumber() does not work in that case.
1376 */
1377 static gint
get_matched_substring_number(const GMatchInfo * match_info,const gchar * name)1378 get_matched_substring_number (const GMatchInfo *match_info,
1379 const gchar *name)
1380 {
1381 gint entrysize;
1382 PCRE2_SPTR first, last;
1383 guchar *entry;
1384
1385 if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES))
1386 return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR)name);
1387
1388 /* This code is copied from pcre_get.c: get_first_set() */
1389 entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re,
1390 (PCRE2_SPTR)name,
1391 &first,
1392 &last);
1393
1394 if (entrysize <= 0)
1395 return entrysize;
1396
1397 for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)
1398 {
1399 gint n = (entry[0] << 8) + entry[1];
1400 if (match_info->offsets[n*2] >= 0)
1401 return n;
1402 }
1403
1404 return (first[0] << 8) + first[1];
1405 }
1406
1407 /**
1408 * g_match_info_fetch_named:
1409 * @match_info: #GMatchInfo structure
1410 * @name: name of the subexpression
1411 *
1412 * Retrieves the text matching the capturing parentheses named @name.
1413 *
1414 * If @name is a valid sub pattern name but it didn't match anything
1415 * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
1416 * then an empty string is returned.
1417 *
1418 * The string is fetched from the string passed to the match function,
1419 * so you cannot call this function after freeing the string.
1420 *
1421 * Returns: (nullable): The matched substring, or %NULL if an error
1422 * occurred. You have to free the string yourself
1423 *
1424 * Since: 2.14
1425 */
1426 gchar *
g_match_info_fetch_named(const GMatchInfo * match_info,const gchar * name)1427 g_match_info_fetch_named (const GMatchInfo *match_info,
1428 const gchar *name)
1429 {
1430 /* we cannot use pcre_get_named_substring() because it allocates the
1431 * string using pcre_malloc(). */
1432 gint num;
1433
1434 g_return_val_if_fail (match_info != NULL, NULL);
1435 g_return_val_if_fail (name != NULL, NULL);
1436
1437 num = get_matched_substring_number (match_info, name);
1438 if (num < 0)
1439 return NULL;
1440 else
1441 return g_match_info_fetch (match_info, num);
1442 }
1443
1444 /**
1445 * g_match_info_fetch_named_pos:
1446 * @match_info: #GMatchInfo structure
1447 * @name: name of the subexpression
1448 * @start_pos: (out) (optional): pointer to location where to store
1449 * the start position, or %NULL
1450 * @end_pos: (out) (optional): pointer to location where to store
1451 * the end position, or %NULL
1452 *
1453 * Retrieves the position in bytes of the capturing parentheses named @name.
1454 *
1455 * If @name is a valid sub pattern name but it didn't match anything
1456 * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
1457 * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1458 *
1459 * Returns: %TRUE if the position was fetched, %FALSE otherwise.
1460 * If the position cannot be fetched, @start_pos and @end_pos
1461 * are left unchanged.
1462 *
1463 * Since: 2.14
1464 */
1465 gboolean
g_match_info_fetch_named_pos(const GMatchInfo * match_info,const gchar * name,gint * start_pos,gint * end_pos)1466 g_match_info_fetch_named_pos (const GMatchInfo *match_info,
1467 const gchar *name,
1468 gint *start_pos,
1469 gint *end_pos)
1470 {
1471 gint num;
1472
1473 g_return_val_if_fail (match_info != NULL, FALSE);
1474 g_return_val_if_fail (name != NULL, FALSE);
1475
1476 num = get_matched_substring_number (match_info, name);
1477 if (num < 0)
1478 return FALSE;
1479
1480 return g_match_info_fetch_pos (match_info, num, start_pos, end_pos);
1481 }
1482
1483 /**
1484 * g_match_info_fetch_all:
1485 * @match_info: a #GMatchInfo structure
1486 *
1487 * Bundles up pointers to each of the matching substrings from a match
1488 * and stores them in an array of gchar pointers. The first element in
1489 * the returned array is the match number 0, i.e. the entire matched
1490 * text.
1491 *
1492 * If a sub pattern didn't match anything (e.g. sub pattern 1, matching
1493 * "b" against "(a)?b") then an empty string is inserted.
1494 *
1495 * If the last match was obtained using the DFA algorithm, that is using
1496 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
1497 * strings are not that matched by sets of parentheses but that of the
1498 * matched substring. Substrings are matched in reverse order of length,
1499 * so the first one is the longest match.
1500 *
1501 * The strings are fetched from the string passed to the match function,
1502 * so you cannot call this function after freeing the string.
1503 *
1504 * Returns: (transfer full): a %NULL-terminated array of gchar *
1505 * pointers. It must be freed using g_strfreev(). If the previous
1506 * match failed %NULL is returned
1507 *
1508 * Since: 2.14
1509 */
1510 gchar **
g_match_info_fetch_all(const GMatchInfo * match_info)1511 g_match_info_fetch_all (const GMatchInfo *match_info)
1512 {
1513 /* we cannot use pcre_get_substring_list() because the returned value
1514 * isn't suitable for g_strfreev(). */
1515 gchar **result;
1516 gint i;
1517
1518 g_return_val_if_fail (match_info != NULL, NULL);
1519
1520 if (match_info->matches < 0)
1521 return NULL;
1522
1523 result = g_new (gchar *, match_info->matches + 1);
1524 for (i = 0; i < match_info->matches; i++)
1525 result[i] = g_match_info_fetch (match_info, i);
1526 result[i] = NULL;
1527
1528 return result;
1529 }
1530
1531
1532 /* GRegex */
1533
/* G_DEFINE_QUARK() is GLib's helper for defining a quark accessor; here
 * it is expected to provide g_regex_error_quark() for the string
 * "g-regex-error-quark". Presumably this is the quark behind the
 * G_REGEX_ERROR domain passed to g_set_error() in this file -- confirm
 * against gregex.h. */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1535
1536 /**
1537 * g_regex_ref:
1538 * @regex: a #GRegex
1539 *
1540 * Increases reference count of @regex by 1.
1541 *
1542 * Returns: @regex
1543 *
1544 * Since: 2.14
1545 */
1546 GRegex *
g_regex_ref(GRegex * regex)1547 g_regex_ref (GRegex *regex)
1548 {
1549 g_return_val_if_fail (regex != NULL, NULL);
1550 g_atomic_int_inc (®ex->ref_count);
1551 return regex;
1552 }
1553
1554 /**
1555 * g_regex_unref:
1556 * @regex: a #GRegex
1557 *
1558 * Decreases reference count of @regex by 1. When reference count drops
1559 * to zero, it frees all the memory associated with the regex structure.
1560 *
1561 * Since: 2.14
1562 */
1563 void
g_regex_unref(GRegex * regex)1564 g_regex_unref (GRegex *regex)
1565 {
1566 g_return_if_fail (regex != NULL);
1567
1568 if (g_atomic_int_dec_and_test (®ex->ref_count))
1569 {
1570 g_free (regex->pattern);
1571 if (regex->pcre_re != NULL)
1572 pcre2_code_free (regex->pcre_re);
1573 g_free (regex);
1574 }
1575 }
1576
1577 /*
1578 * @match_options: (inout) (optional):
1579 */
1580 static pcre2_code *regex_compile (const gchar *pattern,
1581 GRegexCompileFlags compile_options,
1582 GRegexCompileFlags *compile_options_out,
1583 GRegexMatchFlags *match_options,
1584 GError **error);
1585
1586 /**
1587 * g_regex_new:
1588 * @pattern: the regular expression
1589 * @compile_options: compile options for the regular expression, or 0
1590 * @match_options: match options for the regular expression, or 0
1591 * @error: return location for a #GError
1592 *
1593 * Compiles the regular expression to an internal form, and does
1594 * the initial setup of the #GRegex structure.
1595 *
1596 * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call
1597 * g_regex_unref() when you are done with it
1598 *
1599 * Since: 2.14
1600 */
1601 GRegex *
g_regex_new(const gchar * pattern,GRegexCompileFlags compile_options,GRegexMatchFlags match_options,GError ** error)1602 g_regex_new (const gchar *pattern,
1603 GRegexCompileFlags compile_options,
1604 GRegexMatchFlags match_options,
1605 GError **error)
1606 {
1607 GRegex *regex;
1608 pcre2_code *re;
1609 static gsize initialised = 0;
1610
1611 compile_options = map_to_pcre2_compile_flags (compile_options);
1612 match_options = map_to_pcre2_match_flags (match_options);
1613
1614 g_return_val_if_fail (pattern != NULL, NULL);
1615 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
1616 g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);
1617 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
1618
1619 if (g_once_init_enter (&initialised))
1620 {
1621 int supports_utf8;
1622
1623 pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8);
1624 if (!supports_utf8)
1625 g_critical (_("PCRE library is compiled without UTF8 support"));
1626
1627 g_once_init_leave (&initialised, supports_utf8 ? 1 : 2);
1628 }
1629
1630 if (G_UNLIKELY (initialised != 1))
1631 {
1632 g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE,
1633 _("PCRE library is compiled with incompatible options"));
1634 return NULL;
1635 }
1636
1637 re = regex_compile (pattern, compile_options, &compile_options,
1638 &match_options, error);
1639 if (re == NULL)
1640 return NULL;
1641
1642 regex = g_new0 (GRegex, 1);
1643 regex->ref_count = 1;
1644 regex->pattern = g_strdup (pattern);
1645 regex->pcre_re = re;
1646 regex->compile_opts = compile_options;
1647 regex->match_opts = match_options;
1648
1649 return regex;
1650 }
1651
1652 static pcre2_code *
regex_compile(const gchar * pattern,GRegexCompileFlags compile_options,GRegexCompileFlags * compile_options_out,GRegexMatchFlags * match_options,GError ** error)1653 regex_compile (const gchar *pattern,
1654 GRegexCompileFlags compile_options,
1655 GRegexCompileFlags *compile_options_out,
1656 GRegexMatchFlags *match_options,
1657 GError **error)
1658 {
1659 pcre2_code *re;
1660 const gchar *errmsg;
1661 PCRE2_SIZE erroffset;
1662 gint errcode;
1663 GRegexCompileFlags nonpcre_compile_options;
1664 unsigned long int pcre_compile_options;
1665
1666 compile_options = map_to_pcre2_compile_flags (compile_options);
1667 *match_options = map_to_pcre2_match_flags (*match_options);
1668
1669 nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
1670
1671 /* In GRegex the string are, by default, UTF-8 encoded. PCRE
1672 * instead uses UTF-8 only if required with PCRE_UTF8. */
1673 if (compile_options & PCRE2_UTF)
1674 {
1675 /* disable utf-8 */
1676 compile_options &= ~PCRE2_UTF;
1677 }
1678 else
1679 {
1680 /* enable utf-8 */
1681 compile_options |= PCRE2_UTF | PCRE2_NO_UTF_CHECK;
1682
1683 if (match_options != NULL)
1684 *match_options |= PCRE2_NO_UTF_CHECK;
1685 }
1686 /* PCRE_NEWLINE_ANY is the default for the internal PCRE but
1687 * not for the system one. */
1688 if (!(compile_options & PCRE2_NEWLINE_CR) &&
1689 !(compile_options & PCRE2_NEWLINE_LF))
1690 {
1691 compile_options |= PCRE2_NEWLINE_ANY;
1692 }
1693
1694 compile_options |= PCRE2_UCP;
1695
1696 /* compile the pattern */
1697 re = pcre2_compile ((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, compile_options & ~G_REGEX_FLAGS_CONVERTED,
1698 &errcode, &erroffset, NULL);
1699
1700 /* if the compilation failed, set the error member and return
1701 * immediately */
1702 if (re == NULL)
1703 {
1704 GError *tmp_error;
1705
1706 /* Translate the PCRE error code to GRegexError and use a translated
1707 * error message if possible */
1708 translate_compile_error (&errcode, &errmsg);
1709
1710 /* PCRE uses byte offsets but we want to show character offsets */
1711 erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);
1712
1713 tmp_error = g_error_new (G_REGEX_ERROR, errcode,
1714 _ ("Error while compiling regular "
1715 "expression %s at char %" G_GSIZE_FORMAT ": %s"),
1716 pattern, erroffset, errmsg);
1717 g_propagate_error (error, tmp_error);
1718
1719 return NULL;
1720 }
1721
1722 /* For options set at the beginning of the pattern, pcre puts them into
1723 * compile options, e.g. "(?i)foo" will make the pcre structure store
1724 * PCRE_CASELESS even though it wasn't explicitly given for compilation. */
1725 pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options);
1726 compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK;
1727
1728 /* Don't leak PCRE_NEWLINE_ANY, which is part of PCRE_NEWLINE_ANYCRLF */
1729 if ((pcre_compile_options & PCRE2_NEWLINE_ANYCRLF) != PCRE2_NEWLINE_ANYCRLF)
1730 compile_options &= ~PCRE2_NEWLINE_ANY;
1731
1732 compile_options |= nonpcre_compile_options;
1733
1734 if (!(compile_options & PCRE2_DUPNAMES))
1735 {
1736 gboolean jchanged = FALSE;
1737 pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged);
1738 if (jchanged)
1739 compile_options |= PCRE2_DUPNAMES;
1740 }
1741
1742 if (compile_options_out != 0)
1743 *compile_options_out = compile_options;
1744
1745 return re;
1746 }
1747
1748 /**
1749 * g_regex_get_pattern:
1750 * @regex: a #GRegex structure
1751 *
1752 * Gets the pattern string associated with @regex, i.e. a copy of
1753 * the string passed to g_regex_new().
1754 *
1755 * Returns: the pattern of @regex
1756 *
1757 * Since: 2.14
1758 */
1759 const gchar *
g_regex_get_pattern(const GRegex * regex)1760 g_regex_get_pattern (const GRegex *regex)
1761 {
1762 g_return_val_if_fail (regex != NULL, NULL);
1763
1764 return regex->pattern;
1765 }
1766
1767 /**
1768 * g_regex_get_max_backref:
1769 * @regex: a #GRegex
1770 *
1771 * Returns the number of the highest back reference
1772 * in the pattern, or 0 if the pattern does not contain
1773 * back references.
1774 *
1775 * Returns: the number of the highest back reference
1776 *
1777 * Since: 2.14
1778 */
1779 gint
g_regex_get_max_backref(const GRegex * regex)1780 g_regex_get_max_backref (const GRegex *regex)
1781 {
1782 gint value;
1783
1784 pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value);
1785
1786 return value;
1787 }
1788
1789 /**
1790 * g_regex_get_capture_count:
1791 * @regex: a #GRegex
1792 *
1793 * Returns the number of capturing subpatterns in the pattern.
1794 *
1795 * Returns: the number of capturing subpatterns
1796 *
1797 * Since: 2.14
1798 */
1799 gint
g_regex_get_capture_count(const GRegex * regex)1800 g_regex_get_capture_count (const GRegex *regex)
1801 {
1802 gint value;
1803
1804 pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value);
1805
1806 return value;
1807 }
1808
1809 /**
1810 * g_regex_get_has_cr_or_lf:
1811 * @regex: a #GRegex structure
1812 *
1813 * Checks whether the pattern contains explicit CR or LF references.
1814 *
1815 * Returns: %TRUE if the pattern contains explicit CR or LF references
1816 *
1817 * Since: 2.34
1818 */
1819 gboolean
g_regex_get_has_cr_or_lf(const GRegex * regex)1820 g_regex_get_has_cr_or_lf (const GRegex *regex)
1821 {
1822 gint value;
1823
1824 pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value);
1825
1826 return !!value;
1827 }
1828
1829 /**
1830 * g_regex_get_max_lookbehind:
1831 * @regex: a #GRegex structure
1832 *
1833 * Gets the number of characters in the longest lookbehind assertion in the
1834 * pattern. This information is useful when doing multi-segment matching using
1835 * the partial matching facilities.
1836 *
1837 * Returns: the number of characters in the longest lookbehind assertion.
1838 *
1839 * Since: 2.38
1840 */
1841 gint
g_regex_get_max_lookbehind(const GRegex * regex)1842 g_regex_get_max_lookbehind (const GRegex *regex)
1843 {
1844 gint max_lookbehind;
1845
1846 pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND,
1847 &max_lookbehind);
1848
1849 return max_lookbehind;
1850 }
1851
1852 /**
1853 * g_regex_get_compile_flags:
1854 * @regex: a #GRegex
1855 *
1856 * Returns the compile options that @regex was created with.
1857 *
1858 * Depending on the version of PCRE that is used, this may or may not
1859 * include flags set by option expressions such as `(?i)` found at the
1860 * top-level within the compiled pattern.
1861 *
1862 * Returns: flags from #GRegexCompileFlags
1863 *
1864 * Since: 2.26
1865 */
1866 GRegexCompileFlags
g_regex_get_compile_flags(const GRegex * regex)1867 g_regex_get_compile_flags (const GRegex *regex)
1868 {
1869 g_return_val_if_fail (regex != NULL, 0);
1870
1871 return map_to_pcre1_compile_flags (regex->compile_opts);
1872 }
1873
1874 /**
1875 * g_regex_get_match_flags:
1876 * @regex: a #GRegex
1877 *
1878 * Returns the match options that @regex was created with.
1879 *
1880 * Returns: flags from #GRegexMatchFlags
1881 *
1882 * Since: 2.26
1883 */
1884 GRegexMatchFlags
g_regex_get_match_flags(const GRegex * regex)1885 g_regex_get_match_flags (const GRegex *regex)
1886 {
1887 g_return_val_if_fail (regex != NULL, 0);
1888
1889 return map_to_pcre1_match_flags (regex->match_opts & G_REGEX_MATCH_MASK);
1890 }
1891
1892 /**
1893 * g_regex_match_simple:
1894 * @pattern: the regular expression
1895 * @string: the string to scan for matches
1896 * @compile_options: compile options for the regular expression, or 0
1897 * @match_options: match options, or 0
1898 *
1899 * Scans for a match in @string for @pattern.
1900 *
 * This function is equivalent to g_regex_match(), but it does not
 * require compiling the pattern with g_regex_new() first, which saves
 * some lines of code when you just need to do a match without
 * extracting substrings, capture counts, and so on.
1905 *
1906 * If this function is to be called on the same @pattern more than
1907 * once, it's more efficient to compile the pattern once with
1908 * g_regex_new() and then use g_regex_match().
1909 *
1910 * Returns: %TRUE if the string matched, %FALSE otherwise
1911 *
1912 * Since: 2.14
1913 */
1914 gboolean
g_regex_match_simple(const gchar * pattern,const gchar * string,GRegexCompileFlags compile_options,GRegexMatchFlags match_options)1915 g_regex_match_simple (const gchar *pattern,
1916 const gchar *string,
1917 GRegexCompileFlags compile_options,
1918 GRegexMatchFlags match_options)
1919 {
1920 GRegex *regex;
1921 gboolean result;
1922
1923 compile_options = map_to_pcre2_compile_flags (compile_options);
1924 match_options = map_to_pcre2_match_flags (match_options);
1925
1926 regex = g_regex_new (pattern, compile_options, 0, NULL);
1927 if (!regex)
1928 return FALSE;
1929 result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);
1930 g_regex_unref (regex);
1931 return result;
1932 }
1933
1934 /**
1935 * g_regex_match:
1936 * @regex: a #GRegex structure from g_regex_new()
1937 * @string: the string to scan for matches
1938 * @match_options: match options
1939 * @match_info: (out) (optional): pointer to location where to store
1940 * the #GMatchInfo, or %NULL if you do not need it
1941 *
1942 * Scans for a match in @string for the pattern in @regex.
1943 * The @match_options are combined with the match options specified
1944 * when the @regex structure was created, letting you have more
1945 * flexibility in reusing #GRegex structures.
1946 *
1947 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
1948 *
1949 * A #GMatchInfo structure, used to get information on the match,
1950 * is stored in @match_info if not %NULL. Note that if @match_info
1951 * is not %NULL then it is created even if the function returns %FALSE,
1952 * i.e. you must free it regardless if regular expression actually matched.
1953 *
1954 * To retrieve all the non-overlapping matches of the pattern in
1955 * string you can use g_match_info_next().
1956 *
1957 * |[<!-- language="C" -->
1958 * static void
1959 * print_uppercase_words (const gchar *string)
1960 * {
1961 * // Print all uppercase-only words.
1962 * GRegex *regex;
1963 * GMatchInfo *match_info;
1964 *
1965 * regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1966 * g_regex_match (regex, string, 0, &match_info);
1967 * while (g_match_info_matches (match_info))
1968 * {
1969 * gchar *word = g_match_info_fetch (match_info, 0);
1970 * g_print ("Found: %s\n", word);
1971 * g_free (word);
1972 * g_match_info_next (match_info, NULL);
1973 * }
1974 * g_match_info_free (match_info);
1975 * g_regex_unref (regex);
1976 * }
1977 * ]|
1978 *
1979 * @string is not copied and is used in #GMatchInfo internally. If
1980 * you use any #GMatchInfo method (except g_match_info_free()) after
1981 * freeing or modifying @string then the behaviour is undefined.
1982 *
 * Returns: %TRUE if the string matched, %FALSE otherwise
1984 *
1985 * Since: 2.14
1986 */
1987 gboolean
g_regex_match(const GRegex * regex,const gchar * string,GRegexMatchFlags match_options,GMatchInfo ** match_info)1988 g_regex_match (const GRegex *regex,
1989 const gchar *string,
1990 GRegexMatchFlags match_options,
1991 GMatchInfo **match_info)
1992 {
1993 match_options = map_to_pcre2_match_flags (match_options);
1994
1995 return g_regex_match_full (regex, string, -1, 0, match_options,
1996 match_info, NULL);
1997 }
1998
1999 /**
2000 * g_regex_match_full:
2001 * @regex: a #GRegex structure from g_regex_new()
2002 * @string: (array length=string_len): the string to scan for matches
2003 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2004 * @start_position: starting index of the string to match, in bytes
2005 * @match_options: match options
2006 * @match_info: (out) (optional): pointer to location where to store
2007 * the #GMatchInfo, or %NULL if you do not need it
2008 * @error: location to store the error occurring, or %NULL to ignore errors
2009 *
2010 * Scans for a match in @string for the pattern in @regex.
2011 * The @match_options are combined with the match options specified
2012 * when the @regex structure was created, letting you have more
2013 * flexibility in reusing #GRegex structures.
2014 *
2015 * Setting @start_position differs from just passing over a shortened
2016 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2017 * that begins with any kind of lookbehind assertion, such as "\b".
2018 *
2019 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2020 *
2021 * A #GMatchInfo structure, used to get information on the match, is
2022 * stored in @match_info if not %NULL. Note that if @match_info is
2023 * not %NULL then it is created even if the function returns %FALSE,
2024 * i.e. you must free it regardless if regular expression actually
2025 * matched.
2026 *
2027 * @string is not copied and is used in #GMatchInfo internally. If
2028 * you use any #GMatchInfo method (except g_match_info_free()) after
2029 * freeing or modifying @string then the behaviour is undefined.
2030 *
2031 * To retrieve all the non-overlapping matches of the pattern in
2032 * string you can use g_match_info_next().
2033 *
2034 * |[<!-- language="C" -->
2035 * static void
2036 * print_uppercase_words (const gchar *string)
2037 * {
2038 * // Print all uppercase-only words.
2039 * GRegex *regex;
2040 * GMatchInfo *match_info;
2041 * GError *error = NULL;
2042 *
2043 * regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
2044 * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
2045 * while (g_match_info_matches (match_info))
2046 * {
2047 * gchar *word = g_match_info_fetch (match_info, 0);
2048 * g_print ("Found: %s\n", word);
2049 * g_free (word);
2050 * g_match_info_next (match_info, &error);
2051 * }
2052 * g_match_info_free (match_info);
2053 * g_regex_unref (regex);
2054 * if (error != NULL)
2055 * {
2056 * g_printerr ("Error while matching: %s\n", error->message);
2057 * g_error_free (error);
2058 * }
2059 * }
2060 * ]|
2061 *
 * Returns: %TRUE if the string matched, %FALSE otherwise
2063 *
2064 * Since: 2.14
2065 */
2066 gboolean
g_regex_match_full(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,GRegexMatchFlags match_options,GMatchInfo ** match_info,GError ** error)2067 g_regex_match_full (const GRegex *regex,
2068 const gchar *string,
2069 gssize string_len,
2070 gint start_position,
2071 GRegexMatchFlags match_options,
2072 GMatchInfo **match_info,
2073 GError **error)
2074 {
2075 GMatchInfo *info;
2076 gboolean match_ok;
2077
2078 match_options = map_to_pcre2_match_flags (match_options);
2079
2080 g_return_val_if_fail (regex != NULL, FALSE);
2081 g_return_val_if_fail (string != NULL, FALSE);
2082 g_return_val_if_fail (start_position >= 0, FALSE);
2083 g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2084 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2085
2086 info = match_info_new (regex, string, string_len, start_position,
2087 match_options, FALSE);
2088 match_ok = g_match_info_next (info, error);
2089 if (match_info != NULL)
2090 *match_info = info;
2091 else
2092 g_match_info_free (info);
2093
2094 return match_ok;
2095 }
2096
2097 /**
2098 * g_regex_match_all:
2099 * @regex: a #GRegex structure from g_regex_new()
2100 * @string: the string to scan for matches
2101 * @match_options: match options
2102 * @match_info: (out) (optional): pointer to location where to store
2103 * the #GMatchInfo, or %NULL if you do not need it
2104 *
2105 * Using the standard algorithm for regular expression matching only
2106 * the longest match in the string is retrieved. This function uses
2107 * a different algorithm so it can retrieve all the possible matches.
2108 * For more documentation see g_regex_match_all_full().
2109 *
2110 * A #GMatchInfo structure, used to get information on the match, is
2111 * stored in @match_info if not %NULL. Note that if @match_info is
2112 * not %NULL then it is created even if the function returns %FALSE,
2113 * i.e. you must free it regardless if regular expression actually
2114 * matched.
2115 *
2116 * @string is not copied and is used in #GMatchInfo internally. If
2117 * you use any #GMatchInfo method (except g_match_info_free()) after
2118 * freeing or modifying @string then the behaviour is undefined.
2119 *
 * Returns: %TRUE if the string matched, %FALSE otherwise
2121 *
2122 * Since: 2.14
2123 */
2124 gboolean
g_regex_match_all(const GRegex * regex,const gchar * string,GRegexMatchFlags match_options,GMatchInfo ** match_info)2125 g_regex_match_all (const GRegex *regex,
2126 const gchar *string,
2127 GRegexMatchFlags match_options,
2128 GMatchInfo **match_info)
2129 {
2130 match_options = map_to_pcre2_match_flags (match_options);
2131
2132 return g_regex_match_all_full (regex, string, -1, 0, match_options,
2133 match_info, NULL);
2134 }
2135
2136 /**
2137 * g_regex_match_all_full:
2138 * @regex: a #GRegex structure from g_regex_new()
2139 * @string: (array length=string_len): the string to scan for matches
2140 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2141 * @start_position: starting index of the string to match, in bytes
2142 * @match_options: match options
2143 * @match_info: (out) (optional): pointer to location where to store
2144 * the #GMatchInfo, or %NULL if you do not need it
2145 * @error: location to store the error occurring, or %NULL to ignore errors
2146 *
2147 * Using the standard algorithm for regular expression matching only
2148 * the longest match in the @string is retrieved, it is not possible
2149 * to obtain all the available matches. For instance matching
2150 * "<a> <b> <c>" against the pattern "<.*>"
2151 * you get "<a> <b> <c>".
2152 *
2153 * This function uses a different algorithm (called DFA, i.e. deterministic
2154 * finite automaton), so it can retrieve all the possible matches, all
2155 * starting at the same point in the string. For instance matching
 * "<a> <b> <c>" against the pattern "<.*>"
2157 * you would obtain three matches: "<a> <b> <c>",
2158 * "<a> <b>" and "<a>".
2159 *
2160 * The number of matched strings is retrieved using
2161 * g_match_info_get_match_count(). To obtain the matched strings and
2162 * their position you can use, respectively, g_match_info_fetch() and
2163 * g_match_info_fetch_pos(). Note that the strings are returned in
2164 * reverse order of length; that is, the longest matching string is
2165 * given first.
2166 *
2167 * Note that the DFA algorithm is slower than the standard one and it
2168 * is not able to capture substrings, so backreferences do not work.
2169 *
2170 * Setting @start_position differs from just passing over a shortened
2171 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2172 * that begins with any kind of lookbehind assertion, such as "\b".
2173 *
2174 * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2175 *
2176 * A #GMatchInfo structure, used to get information on the match, is
2177 * stored in @match_info if not %NULL. Note that if @match_info is
2178 * not %NULL then it is created even if the function returns %FALSE,
2179 * i.e. you must free it regardless if regular expression actually
2180 * matched.
2181 *
2182 * @string is not copied and is used in #GMatchInfo internally. If
2183 * you use any #GMatchInfo method (except g_match_info_free()) after
2184 * freeing or modifying @string then the behaviour is undefined.
2185 *
 * Returns: %TRUE if the string matched, %FALSE otherwise
2187 *
2188 * Since: 2.14
2189 */
2190 gboolean
g_regex_match_all_full(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,GRegexMatchFlags match_options,GMatchInfo ** match_info,GError ** error)2191 g_regex_match_all_full (const GRegex *regex,
2192 const gchar *string,
2193 gssize string_len,
2194 gint start_position,
2195 GRegexMatchFlags match_options,
2196 GMatchInfo **match_info,
2197 GError **error)
2198 {
2199 GMatchInfo *info;
2200 gboolean done;
2201 pcre2_code *pcre_re;
2202 gboolean retval;
2203 PCRE2_SIZE *ovector;
2204 gint i;
2205
2206 match_options = map_to_pcre2_match_flags (match_options);
2207
2208 g_return_val_if_fail (regex != NULL, FALSE);
2209 g_return_val_if_fail (string != NULL, FALSE);
2210 g_return_val_if_fail (start_position >= 0, FALSE);
2211 g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
2212 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
2213
2214 #ifdef PCRE_NO_AUTO_POSSESS
2215 /* For PCRE >= 8.34 we need to turn off PCRE_NO_AUTO_POSSESS, which
2216 * is an optimization for normal regex matching, but results in omitting
2217 * some shorter matches here, and an observable behaviour change.
2218 *
2219 * DFA matching is rather niche, and very rarely used according to
2220 * codesearch.debian.net, so don't bother caching the recompiled RE. */
2221 pcre_re = regex_compile (regex->pattern,
2222 regex->compile_opts | PCRE2_NO_AUTO_POSSESS,
2223 NULL, NULL, error);
2224 if (pcre_re == NULL)
2225 return FALSE;
2226
2227 #else
2228 /* For PCRE < 8.33 the precompiled regex is fine. */
2229 pcre_re = regex->pcre_re;
2230 #endif
2231
2232 info = match_info_new (regex, string, string_len, start_position,
2233 match_options, TRUE);
2234
2235 done = FALSE;
2236 while (!done)
2237 {
2238 done = TRUE;
2239 info->matches = pcre2_dfa_match (pcre_re,
2240 (PCRE2_SPTR)info->string, info->string_len,
2241 info->pos,
2242 (match_options | PCRE2_NO_UTF_CHECK) & ~G_REGEX_FLAGS_CONVERTED,
2243 info->match_data,
2244 NULL,
2245 info->workspace, info->n_workspace);
2246
2247 info->n_offsets = pcre2_get_ovector_count (info->match_data) * 2;
2248 ovector = pcre2_get_ovector_pointer (info->match_data);
2249 info->offsets = g_realloc (info->offsets,
2250 info->n_offsets * sizeof (gint));
2251 for (i = 0; i < info->n_offsets; i++)
2252 {
2253 info->offsets[i] = (int) ovector[i];
2254 }
2255
2256 if (info->matches == PCRE2_ERROR_DFA_WSSIZE)
2257 {
2258 /* info->workspace is too small. */
2259 info->n_workspace *= 2;
2260 info->workspace = g_realloc (info->workspace,
2261 info->n_workspace * sizeof (gint));
2262 done = FALSE;
2263 }
2264 else if (info->matches == 0)
2265 {
2266 /* info->offsets is too small. */
2267 info->n_offsets *= 2;
2268 info->offsets = g_realloc (info->offsets,
2269 info->n_offsets * sizeof (gint));
2270 done = FALSE;
2271 }
2272 else if (IS_PCRE_ERROR (info->matches))
2273 {
2274 g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
2275 _("Error while matching regular expression %s: %s"),
2276 regex->pattern, match_error (info->matches));
2277 }
2278 }
2279
2280 #ifdef PCRE_NO_AUTO_POSSESS
2281 pcre2_code_free (pcre_re);
2282 #endif
2283
2284 /* set info->pos to -1 so that a call to g_match_info_next() fails. */
2285 info->pos = -1;
2286 retval = info->matches >= 0;
2287
2288 if (match_info != NULL)
2289 *match_info = info;
2290 else
2291 g_match_info_free (info);
2292
2293 return retval;
2294 }
2295
2296 /**
2297 * g_regex_get_string_number:
2298 * @regex: #GRegex structure
2299 * @name: name of the subexpression
2300 *
2301 * Retrieves the number of the subexpression named @name.
2302 *
 * Returns: The number of the subexpression or -1 if @name
 *   does not exist
2305 *
2306 * Since: 2.14
2307 */
2308 gint
g_regex_get_string_number(const GRegex * regex,const gchar * name)2309 g_regex_get_string_number (const GRegex *regex,
2310 const gchar *name)
2311 {
2312 gint num;
2313
2314 g_return_val_if_fail (regex != NULL, -1);
2315 g_return_val_if_fail (name != NULL, -1);
2316
2317 num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR)name);
2318 if (num == PCRE2_ERROR_NOSUBSTRING)
2319 num = -1;
2320
2321 return num;
2322 }
2323
2324 /**
2325 * g_regex_split_simple:
2326 * @pattern: the regular expression
2327 * @string: the string to scan for matches
2328 * @compile_options: compile options for the regular expression, or 0
2329 * @match_options: match options, or 0
2330 *
2331 * Breaks the string on the pattern, and returns an array of
2332 * the tokens. If the pattern contains capturing parentheses,
2333 * then the text for each of the substrings will also be returned.
2334 * If the pattern does not match anywhere in the string, then the
2335 * whole string is returned as the first token.
2336 *
 * This function is equivalent to g_regex_split(), but it does
 * not require compiling the pattern with g_regex_new() first, which
 * saves some lines of code when you just need to do a split without
 * extracting substrings, capture counts, and so on.
2341 *
2342 * If this function is to be called on the same @pattern more than
2343 * once, it's more efficient to compile the pattern once with
2344 * g_regex_new() and then use g_regex_split().
2345 *
2346 * As a special case, the result of splitting the empty string ""
2347 * is an empty vector, not a vector containing a single string.
2348 * The reason for this special case is that being able to represent
2349 * an empty vector is typically more useful than consistent handling
2350 * of empty elements. If you do need to represent empty elements,
2351 * you'll need to check for the empty string before calling this
2352 * function.
2353 *
2354 * A pattern that can match empty strings splits @string into
2355 * separate characters wherever it matches the empty string between
2356 * characters. For example splitting "ab c" using as a separator
2357 * "\s*", you will get "a", "b" and "c".
2358 *
2359 * Returns: (transfer full): a %NULL-terminated array of strings. Free
2360 * it using g_strfreev()
2361 *
2362 * Since: 2.14
2363 **/
2364 gchar **
g_regex_split_simple(const gchar * pattern,const gchar * string,GRegexCompileFlags compile_options,GRegexMatchFlags match_options)2365 g_regex_split_simple (const gchar *pattern,
2366 const gchar *string,
2367 GRegexCompileFlags compile_options,
2368 GRegexMatchFlags match_options)
2369 {
2370 GRegex *regex;
2371 gchar **result;
2372
2373 compile_options = map_to_pcre2_compile_flags (compile_options);
2374 match_options = map_to_pcre2_match_flags (match_options);
2375
2376 regex = g_regex_new (pattern, compile_options, 0, NULL);
2377 if (!regex)
2378 return NULL;
2379
2380 result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);
2381 g_regex_unref (regex);
2382 return result;
2383 }
2384
2385 /**
2386 * g_regex_split:
2387 * @regex: a #GRegex structure
2388 * @string: the string to split with the pattern
2389 * @match_options: match time option flags
2390 *
2391 * Breaks the string on the pattern, and returns an array of the tokens.
2392 * If the pattern contains capturing parentheses, then the text for each
2393 * of the substrings will also be returned. If the pattern does not match
2394 * anywhere in the string, then the whole string is returned as the first
2395 * token.
2396 *
2397 * As a special case, the result of splitting the empty string "" is an
2398 * empty vector, not a vector containing a single string. The reason for
2399 * this special case is that being able to represent an empty vector is
2400 * typically more useful than consistent handling of empty elements. If
2401 * you do need to represent empty elements, you'll need to check for the
2402 * empty string before calling this function.
2403 *
2404 * A pattern that can match empty strings splits @string into separate
2405 * characters wherever it matches the empty string between characters.
2406 * For example splitting "ab c" using as a separator "\s*", you will get
2407 * "a", "b" and "c".
2408 *
2409 * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2410 * it using g_strfreev()
2411 *
2412 * Since: 2.14
2413 **/
2414 gchar **
g_regex_split(const GRegex * regex,const gchar * string,GRegexMatchFlags match_options)2415 g_regex_split (const GRegex *regex,
2416 const gchar *string,
2417 GRegexMatchFlags match_options)
2418 {
2419 match_options = map_to_pcre2_match_flags (match_options);
2420
2421 return g_regex_split_full (regex, string, -1, 0,
2422 match_options, 0, NULL);
2423 }
2424
2425 /**
2426 * g_regex_split_full:
2427 * @regex: a #GRegex structure
2428 * @string: (array length=string_len): the string to split with the pattern
2429 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2430 * @start_position: starting index of the string to match, in bytes
2431 * @match_options: match time option flags
2432 * @max_tokens: the maximum number of tokens to split @string into.
2433 * If this is less than 1, the string is split completely
2434 * @error: return location for a #GError
2435 *
2436 * Breaks the string on the pattern, and returns an array of the tokens.
2437 * If the pattern contains capturing parentheses, then the text for each
2438 * of the substrings will also be returned. If the pattern does not match
2439 * anywhere in the string, then the whole string is returned as the first
2440 * token.
2441 *
2442 * As a special case, the result of splitting the empty string "" is an
2443 * empty vector, not a vector containing a single string. The reason for
2444 * this special case is that being able to represent an empty vector is
2445 * typically more useful than consistent handling of empty elements. If
2446 * you do need to represent empty elements, you'll need to check for the
2447 * empty string before calling this function.
2448 *
2449 * A pattern that can match empty strings splits @string into separate
2450 * characters wherever it matches the empty string between characters.
2451 * For example splitting "ab c" using as a separator "\s*", you will get
2452 * "a", "b" and "c".
2453 *
2454 * Setting @start_position differs from just passing over a shortened
2455 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
2456 * that begins with any kind of lookbehind assertion, such as "\b".
2457 *
2458 * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2459 * it using g_strfreev()
2460 *
2461 * Since: 2.14
2462 **/
2463 gchar **
g_regex_split_full(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,GRegexMatchFlags match_options,gint max_tokens,GError ** error)2464 g_regex_split_full (const GRegex *regex,
2465 const gchar *string,
2466 gssize string_len,
2467 gint start_position,
2468 GRegexMatchFlags match_options,
2469 gint max_tokens,
2470 GError **error)
2471 {
2472 GError *tmp_error = NULL;
2473 GMatchInfo *match_info;
2474 GList *list, *last;
2475 gint i;
2476 gint token_count;
2477 gboolean match_ok;
2478 /* position of the last separator. */
2479 gint last_separator_end;
2480 /* was the last match 0 bytes long? */
2481 gboolean last_match_is_empty;
2482 /* the returned array of char **s */
2483 gchar **string_list;
2484
2485 match_options = map_to_pcre2_match_flags (match_options);
2486
2487 g_return_val_if_fail (regex != NULL, NULL);
2488 g_return_val_if_fail (string != NULL, NULL);
2489 g_return_val_if_fail (start_position >= 0, NULL);
2490 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
2491 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
2492
2493 if (max_tokens <= 0)
2494 max_tokens = G_MAXINT;
2495
2496 if (string_len < 0)
2497 string_len = strlen (string);
2498
2499 /* zero-length string */
2500 if (string_len - start_position == 0)
2501 return g_new0 (gchar *, 1);
2502
2503 if (max_tokens == 1)
2504 {
2505 string_list = g_new0 (gchar *, 2);
2506 string_list[0] = g_strndup (&string[start_position],
2507 string_len - start_position);
2508 return string_list;
2509 }
2510
2511 list = NULL;
2512 token_count = 0;
2513 last_separator_end = start_position;
2514 last_match_is_empty = FALSE;
2515
2516 match_ok = g_regex_match_full (regex, string, string_len, start_position,
2517 match_options, &match_info, &tmp_error);
2518
2519 while (tmp_error == NULL)
2520 {
2521 if (match_ok)
2522 {
2523 last_match_is_empty =
2524 (match_info->offsets[0] == match_info->offsets[1]);
2525
2526 /* we need to skip empty separators at the same position of the end
2527 * of another separator. e.g. the string is "a b" and the separator
2528 * is " *", so from 1 to 2 we have a match and at position 2 we have
2529 * an empty match. */
2530 if (last_separator_end != match_info->offsets[1])
2531 {
2532 gchar *token;
2533 gint match_count;
2534
2535 token = g_strndup (string + last_separator_end,
2536 match_info->offsets[0] - last_separator_end);
2537 list = g_list_prepend (list, token);
2538 token_count++;
2539
2540 /* if there were substrings, these need to be added to
2541 * the list. */
2542 match_count = g_match_info_get_match_count (match_info);
2543 if (match_count > 1)
2544 {
2545 for (i = 1; i < match_count; i++)
2546 list = g_list_prepend (list, g_match_info_fetch (match_info, i));
2547 }
2548 }
2549 }
2550 else
2551 {
2552 /* if there was no match, copy to end of string. */
2553 if (!last_match_is_empty)
2554 {
2555 gchar *token = g_strndup (string + last_separator_end,
2556 match_info->string_len - last_separator_end);
2557 list = g_list_prepend (list, token);
2558 }
2559 /* no more tokens, end the loop. */
2560 break;
2561 }
2562
2563 /* -1 to leave room for the last part. */
2564 if (token_count >= max_tokens - 1)
2565 {
2566 /* we have reached the maximum number of tokens, so we copy
2567 * the remaining part of the string. */
2568 if (last_match_is_empty)
2569 {
2570 /* the last match was empty, so we have moved one char
2571 * after the real position to avoid empty matches at the
2572 * same position. */
2573 match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
2574 }
2575 /* the if is needed in the case we have terminated the available
2576 * tokens, but we are at the end of the string, so there are no
2577 * characters left to copy. */
2578 if (string_len > match_info->pos)
2579 {
2580 gchar *token = g_strndup (string + match_info->pos,
2581 string_len - match_info->pos);
2582 list = g_list_prepend (list, token);
2583 }
2584 /* end the loop. */
2585 break;
2586 }
2587
2588 last_separator_end = match_info->pos;
2589 if (last_match_is_empty)
2590 /* if the last match was empty, g_match_info_next() has moved
2591 * forward to avoid infinite loops, but we still need to copy that
2592 * character. */
2593 last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;
2594
2595 match_ok = g_match_info_next (match_info, &tmp_error);
2596 }
2597 g_match_info_free (match_info);
2598 if (tmp_error != NULL)
2599 {
2600 g_propagate_error (error, tmp_error);
2601 g_list_free_full (list, g_free);
2602 return NULL;
2603 }
2604
2605 string_list = g_new (gchar *, g_list_length (list) + 1);
2606 i = 0;
2607 for (last = g_list_last (list); last; last = g_list_previous (last))
2608 string_list[i++] = last->data;
2609 string_list[i] = NULL;
2610 g_list_free (list);
2611
2612 return string_list;
2613 }
2614
/* Kinds of pieces a parsed replacement text is made of; the value is
 * stored in the "type" member of InterpolationData. */
enum
{
  REPL_TYPE_STRING = 0,             /* literal text, kept in "text" */
  REPL_TYPE_CHARACTER = 1,          /* one control character, kept in "c" */
  REPL_TYPE_SYMBOLIC_REFERENCE = 2, /* \g<name>, the name is kept in "text" */
  REPL_TYPE_NUMERIC_REFERENCE = 3,  /* \N or \g<N>, N is kept in "num" */
  REPL_TYPE_CHANGE_CASE = 4         /* \l \u \L \U \E, see "change_case" */
};
2623
/* Case conversion modes selected by the \l, \u, \L, \U and \E escapes of a
 * replacement text.  Each mode is a distinct bit so that the *_MASK values
 * can test for a whole family of modes at once. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01, /* \E: leave the text untouched */
  CHANGE_CASE_UPPER        = 0x02, /* \U: upper case until further notice */
  CHANGE_CASE_LOWER        = 0x04, /* \L: lower case until further notice */
  CHANGE_CASE_UPPER_SINGLE = 0x08, /* \u: upper case the next character */
  CHANGE_CASE_LOWER_SINGLE = 0x10, /* \l: lower case the next character */

  /* modes that apply to one character only */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_UPPER_SINGLE,
  /* modes that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER_SINGLE | CHANGE_CASE_LOWER,
  /* modes that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_UPPER
} ChangeCase;
2635
2636 struct _InterpolationData
2637 {
2638 gchar *text;
2639 gint type;
2640 gint num;
2641 gchar c;
2642 ChangeCase change_case;
2643 };
2644
2645 static void
free_interpolation_data(InterpolationData * data)2646 free_interpolation_data (InterpolationData *data)
2647 {
2648 g_free (data->text);
2649 g_free (data);
2650 }
2651
2652 static const gchar *
expand_escape(const gchar * replacement,const gchar * p,InterpolationData * data,GError ** error)2653 expand_escape (const gchar *replacement,
2654 const gchar *p,
2655 InterpolationData *data,
2656 GError **error)
2657 {
2658 const gchar *q, *r;
2659 gint x, d, h, i;
2660 const gchar *error_detail;
2661 gint base = 0;
2662 GError *tmp_error = NULL;
2663
2664 p++;
2665 switch (*p)
2666 {
2667 case 't':
2668 p++;
2669 data->c = '\t';
2670 data->type = REPL_TYPE_CHARACTER;
2671 break;
2672 case 'n':
2673 p++;
2674 data->c = '\n';
2675 data->type = REPL_TYPE_CHARACTER;
2676 break;
2677 case 'v':
2678 p++;
2679 data->c = '\v';
2680 data->type = REPL_TYPE_CHARACTER;
2681 break;
2682 case 'r':
2683 p++;
2684 data->c = '\r';
2685 data->type = REPL_TYPE_CHARACTER;
2686 break;
2687 case 'f':
2688 p++;
2689 data->c = '\f';
2690 data->type = REPL_TYPE_CHARACTER;
2691 break;
2692 case 'a':
2693 p++;
2694 data->c = '\a';
2695 data->type = REPL_TYPE_CHARACTER;
2696 break;
2697 case 'b':
2698 p++;
2699 data->c = '\b';
2700 data->type = REPL_TYPE_CHARACTER;
2701 break;
2702 case '\\':
2703 p++;
2704 data->c = '\\';
2705 data->type = REPL_TYPE_CHARACTER;
2706 break;
2707 case 'x':
2708 p++;
2709 x = 0;
2710 if (*p == '{')
2711 {
2712 p++;
2713 do
2714 {
2715 h = g_ascii_xdigit_value (*p);
2716 if (h < 0)
2717 {
2718 error_detail = _("hexadecimal digit or “}” expected");
2719 goto error;
2720 }
2721 x = x * 16 + h;
2722 p++;
2723 }
2724 while (*p != '}');
2725 p++;
2726 }
2727 else
2728 {
2729 for (i = 0; i < 2; i++)
2730 {
2731 h = g_ascii_xdigit_value (*p);
2732 if (h < 0)
2733 {
2734 error_detail = _("hexadecimal digit expected");
2735 goto error;
2736 }
2737 x = x * 16 + h;
2738 p++;
2739 }
2740 }
2741 data->type = REPL_TYPE_STRING;
2742 data->text = g_new0 (gchar, 8);
2743 g_unichar_to_utf8 (x, data->text);
2744 break;
2745 case 'l':
2746 p++;
2747 data->type = REPL_TYPE_CHANGE_CASE;
2748 data->change_case = CHANGE_CASE_LOWER_SINGLE;
2749 break;
2750 case 'u':
2751 p++;
2752 data->type = REPL_TYPE_CHANGE_CASE;
2753 data->change_case = CHANGE_CASE_UPPER_SINGLE;
2754 break;
2755 case 'L':
2756 p++;
2757 data->type = REPL_TYPE_CHANGE_CASE;
2758 data->change_case = CHANGE_CASE_LOWER;
2759 break;
2760 case 'U':
2761 p++;
2762 data->type = REPL_TYPE_CHANGE_CASE;
2763 data->change_case = CHANGE_CASE_UPPER;
2764 break;
2765 case 'E':
2766 p++;
2767 data->type = REPL_TYPE_CHANGE_CASE;
2768 data->change_case = CHANGE_CASE_NONE;
2769 break;
2770 case 'g':
2771 p++;
2772 if (*p != '<')
2773 {
2774 error_detail = _("missing “<” in symbolic reference");
2775 goto error;
2776 }
2777 q = p + 1;
2778 do
2779 {
2780 p++;
2781 if (!*p)
2782 {
2783 error_detail = _("unfinished symbolic reference");
2784 goto error;
2785 }
2786 }
2787 while (*p != '>');
2788 if (p - q == 0)
2789 {
2790 error_detail = _("zero-length symbolic reference");
2791 goto error;
2792 }
2793 if (g_ascii_isdigit (*q))
2794 {
2795 x = 0;
2796 do
2797 {
2798 h = g_ascii_digit_value (*q);
2799 if (h < 0)
2800 {
2801 error_detail = _("digit expected");
2802 p = q;
2803 goto error;
2804 }
2805 x = x * 10 + h;
2806 q++;
2807 }
2808 while (q != p);
2809 data->num = x;
2810 data->type = REPL_TYPE_NUMERIC_REFERENCE;
2811 }
2812 else
2813 {
2814 r = q;
2815 do
2816 {
2817 if (!g_ascii_isalnum (*r))
2818 {
2819 error_detail = _("illegal symbolic reference");
2820 p = r;
2821 goto error;
2822 }
2823 r++;
2824 }
2825 while (r != p);
2826 data->text = g_strndup (q, p - q);
2827 data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
2828 }
2829 p++;
2830 break;
2831 case '0':
2832 /* if \0 is followed by a number is an octal number representing a
2833 * character, else it is a numeric reference. */
2834 if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
2835 {
2836 base = 8;
2837 p = g_utf8_next_char (p);
2838 }
2839 G_GNUC_FALLTHROUGH;
2840 case '1':
2841 case '2':
2842 case '3':
2843 case '4':
2844 case '5':
2845 case '6':
2846 case '7':
2847 case '8':
2848 case '9':
2849 x = 0;
2850 d = 0;
2851 for (i = 0; i < 3; i++)
2852 {
2853 h = g_ascii_digit_value (*p);
2854 if (h < 0)
2855 break;
2856 if (h > 7)
2857 {
2858 if (base == 8)
2859 break;
2860 else
2861 base = 10;
2862 }
2863 if (i == 2 && base == 10)
2864 break;
2865 x = x * 8 + h;
2866 d = d * 10 + h;
2867 p++;
2868 }
2869 if (base == 8 || i == 3)
2870 {
2871 data->type = REPL_TYPE_STRING;
2872 data->text = g_new0 (gchar, 8);
2873 g_unichar_to_utf8 (x, data->text);
2874 }
2875 else
2876 {
2877 data->type = REPL_TYPE_NUMERIC_REFERENCE;
2878 data->num = d;
2879 }
2880 break;
2881 case 0:
2882 error_detail = _("stray final “\\”");
2883 goto error;
2884 break;
2885 default:
2886 error_detail = _("unknown escape sequence");
2887 goto error;
2888 }
2889
2890 return p;
2891
2892 error:
2893 /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
2894 tmp_error = g_error_new (G_REGEX_ERROR,
2895 G_REGEX_ERROR_REPLACE,
2896 _("Error while parsing replacement "
2897 "text “%s” at char %lu: %s"),
2898 replacement,
2899 (gulong)(p - replacement),
2900 error_detail);
2901 g_propagate_error (error, tmp_error);
2902
2903 return NULL;
2904 }
2905
2906 static GList *
split_replacement(const gchar * replacement,GError ** error)2907 split_replacement (const gchar *replacement,
2908 GError **error)
2909 {
2910 GList *list = NULL;
2911 InterpolationData *data;
2912 const gchar *p, *start;
2913
2914 start = p = replacement;
2915 while (*p)
2916 {
2917 if (*p == '\\')
2918 {
2919 data = g_new0 (InterpolationData, 1);
2920 start = p = expand_escape (replacement, p, data, error);
2921 if (p == NULL)
2922 {
2923 g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
2924 free_interpolation_data (data);
2925
2926 return NULL;
2927 }
2928 list = g_list_prepend (list, data);
2929 }
2930 else
2931 {
2932 p++;
2933 if (*p == '\\' || *p == '\0')
2934 {
2935 if (p - start > 0)
2936 {
2937 data = g_new0 (InterpolationData, 1);
2938 data->text = g_strndup (start, p - start);
2939 data->type = REPL_TYPE_STRING;
2940 list = g_list_prepend (list, data);
2941 }
2942 }
2943 }
2944 }
2945
2946 return g_list_reverse (list);
2947 }
2948
/* Converts the character c to lower case when change_case contains one of
 * the lower-case modes, and to upper case for any other value. */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) == 0) ? \
                g_unichar_toupper (c) : \
                g_unichar_tolower (c))
2954
2955 static void
string_append(GString * string,const gchar * text,ChangeCase * change_case)2956 string_append (GString *string,
2957 const gchar *text,
2958 ChangeCase *change_case)
2959 {
2960 gunichar c;
2961
2962 if (text[0] == '\0')
2963 return;
2964
2965 if (*change_case == CHANGE_CASE_NONE)
2966 {
2967 g_string_append (string, text);
2968 }
2969 else if (*change_case & CHANGE_CASE_SINGLE_MASK)
2970 {
2971 c = g_utf8_get_char (text);
2972 g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2973 g_string_append (string, g_utf8_next_char (text));
2974 *change_case = CHANGE_CASE_NONE;
2975 }
2976 else
2977 {
2978 while (*text != '\0')
2979 {
2980 c = g_utf8_get_char (text);
2981 g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
2982 text = g_utf8_next_char (text);
2983 }
2984 }
2985 }
2986
2987 static gboolean
interpolate_replacement(const GMatchInfo * match_info,GString * result,gpointer data)2988 interpolate_replacement (const GMatchInfo *match_info,
2989 GString *result,
2990 gpointer data)
2991 {
2992 GList *list;
2993 InterpolationData *idata;
2994 gchar *match;
2995 ChangeCase change_case = CHANGE_CASE_NONE;
2996
2997 for (list = data; list; list = list->next)
2998 {
2999 idata = list->data;
3000 switch (idata->type)
3001 {
3002 case REPL_TYPE_STRING:
3003 string_append (result, idata->text, &change_case);
3004 break;
3005 case REPL_TYPE_CHARACTER:
3006 g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
3007 if (change_case & CHANGE_CASE_SINGLE_MASK)
3008 change_case = CHANGE_CASE_NONE;
3009 break;
3010 case REPL_TYPE_NUMERIC_REFERENCE:
3011 match = g_match_info_fetch (match_info, idata->num);
3012 if (match)
3013 {
3014 string_append (result, match, &change_case);
3015 g_free (match);
3016 }
3017 break;
3018 case REPL_TYPE_SYMBOLIC_REFERENCE:
3019 match = g_match_info_fetch_named (match_info, idata->text);
3020 if (match)
3021 {
3022 string_append (result, match, &change_case);
3023 g_free (match);
3024 }
3025 break;
3026 case REPL_TYPE_CHANGE_CASE:
3027 change_case = idata->change_case;
3028 break;
3029 }
3030 }
3031
3032 return FALSE;
3033 }
3034
3035 /* whether actual match_info is needed for replacement, i.e.
3036 * whether there are references
3037 */
3038 static gboolean
interpolation_list_needs_match(GList * list)3039 interpolation_list_needs_match (GList *list)
3040 {
3041 while (list != NULL)
3042 {
3043 InterpolationData *data = list->data;
3044
3045 if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||
3046 data->type == REPL_TYPE_NUMERIC_REFERENCE)
3047 {
3048 return TRUE;
3049 }
3050
3051 list = list->next;
3052 }
3053
3054 return FALSE;
3055 }
3056
3057 /**
3058 * g_regex_replace:
3059 * @regex: a #GRegex structure
3060 * @string: (array length=string_len): the string to perform matches against
3061 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3062 * @start_position: starting index of the string to match, in bytes
3063 * @replacement: text to replace each match with
3064 * @match_options: options for the match
3065 * @error: location to store the error occurring, or %NULL to ignore errors
3066 *
3067 * Replaces all occurrences of the pattern in @regex with the
3068 * replacement text. Backreferences of the form '\number' or
3069 * '\g<number>' in the replacement text are interpolated by the
3070 * number-th captured subexpression of the match, '\g<name>' refers
3071 * to the captured subexpression with the given name. '\0' refers
3072 * to the complete match, but '\0' followed by a number is the octal
3073 * representation of a character. To include a literal '\' in the
3074 * replacement, write '\\\\'.
3075 *
3076 * There are also escapes that changes the case of the following text:
3077 *
3078 * - \l: Convert to lower case the next character
3079 * - \u: Convert to upper case the next character
3080 * - \L: Convert to lower case till \E
3081 * - \U: Convert to upper case till \E
3082 * - \E: End case modification
3083 *
3084 * If you do not need to use backreferences use g_regex_replace_literal().
3085 *
3086 * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
3087 * passed to g_regex_new(). If you want to use not UTF-8 encoded strings
3088 * you can use g_regex_replace_literal().
3089 *
3090 * Setting @start_position differs from just passing over a shortened
3091 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that
3092 * begins with any kind of lookbehind assertion, such as "\b".
3093 *
3094 * Returns: a newly allocated string containing the replacements
3095 *
3096 * Since: 2.14
3097 */
3098 gchar *
g_regex_replace(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,const gchar * replacement,GRegexMatchFlags match_options,GError ** error)3099 g_regex_replace (const GRegex *regex,
3100 const gchar *string,
3101 gssize string_len,
3102 gint start_position,
3103 const gchar *replacement,
3104 GRegexMatchFlags match_options,
3105 GError **error)
3106 {
3107 gchar *result;
3108 GList *list;
3109 GError *tmp_error = NULL;
3110
3111 match_options = map_to_pcre2_match_flags (match_options);
3112
3113 g_return_val_if_fail (regex != NULL, NULL);
3114 g_return_val_if_fail (string != NULL, NULL);
3115 g_return_val_if_fail (start_position >= 0, NULL);
3116 g_return_val_if_fail (replacement != NULL, NULL);
3117 g_return_val_if_fail (error == NULL || *error == NULL, NULL);
3118 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3119
3120 list = split_replacement (replacement, &tmp_error);
3121 if (tmp_error != NULL)
3122 {
3123 g_propagate_error (error, tmp_error);
3124 return NULL;
3125 }
3126
3127 result = g_regex_replace_eval (regex,
3128 string, string_len, start_position,
3129 match_options,
3130 interpolate_replacement,
3131 (gpointer)list,
3132 &tmp_error);
3133 if (tmp_error != NULL)
3134 g_propagate_error (error, tmp_error);
3135
3136 g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3137
3138 return result;
3139 }
3140
3141 static gboolean
literal_replacement(const GMatchInfo * match_info,GString * result,gpointer data)3142 literal_replacement (const GMatchInfo *match_info,
3143 GString *result,
3144 gpointer data)
3145 {
3146 g_string_append (result, data);
3147 return FALSE;
3148 }
3149
3150 /**
3151 * g_regex_replace_literal:
3152 * @regex: a #GRegex structure
3153 * @string: (array length=string_len): the string to perform matches against
3154 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3155 * @start_position: starting index of the string to match, in bytes
3156 * @replacement: text to replace each match with
3157 * @match_options: options for the match
3158 * @error: location to store the error occurring, or %NULL to ignore errors
3159 *
3160 * Replaces all occurrences of the pattern in @regex with the
3161 * replacement text. @replacement is replaced literally, to
3162 * include backreferences use g_regex_replace().
3163 *
3164 * Setting @start_position differs from just passing over a
3165 * shortened string and setting #G_REGEX_MATCH_NOTBOL in the
3166 * case of a pattern that begins with any kind of lookbehind
3167 * assertion, such as "\b".
3168 *
3169 * Returns: a newly allocated string containing the replacements
3170 *
3171 * Since: 2.14
3172 */
3173 gchar *
g_regex_replace_literal(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,const gchar * replacement,GRegexMatchFlags match_options,GError ** error)3174 g_regex_replace_literal (const GRegex *regex,
3175 const gchar *string,
3176 gssize string_len,
3177 gint start_position,
3178 const gchar *replacement,
3179 GRegexMatchFlags match_options,
3180 GError **error)
3181 {
3182 match_options = map_to_pcre2_match_flags (match_options);
3183
3184 g_return_val_if_fail (replacement != NULL, NULL);
3185 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3186
3187 return g_regex_replace_eval (regex,
3188 string, string_len, start_position,
3189 match_options,
3190 literal_replacement,
3191 (gpointer)replacement,
3192 error);
3193 }
3194
3195 /**
3196 * g_regex_replace_eval:
3197 * @regex: a #GRegex structure from g_regex_new()
3198 * @string: (array length=string_len): string to perform matches against
3199 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3200 * @start_position: starting index of the string to match, in bytes
3201 * @match_options: options for the match
3202 * @eval: a function to call for each match
3203 * @user_data: user data to pass to the function
3204 * @error: location to store the error occurring, or %NULL to ignore errors
3205 *
3206 * Replaces occurrences of the pattern in regex with the output of
3207 * @eval for that occurrence.
3208 *
3209 * Setting @start_position differs from just passing over a shortened
3210 * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern
3211 * that begins with any kind of lookbehind assertion, such as "\b".
3212 *
3213 * The following example uses g_regex_replace_eval() to replace multiple
3214 * strings at once:
3215 * |[<!-- language="C" -->
3216 * static gboolean
3217 * eval_cb (const GMatchInfo *info,
3218 * GString *res,
3219 * gpointer data)
3220 * {
3221 * gchar *match;
3222 * gchar *r;
3223 *
3224 * match = g_match_info_fetch (info, 0);
3225 * r = g_hash_table_lookup ((GHashTable *)data, match);
3226 * g_string_append (res, r);
3227 * g_free (match);
3228 *
3229 * return FALSE;
3230 * }
3231 *
3232 * ...
3233 *
3234 * GRegex *reg;
3235 * GHashTable *h;
3236 * gchar *res;
3237 *
3238 * h = g_hash_table_new (g_str_hash, g_str_equal);
3239 *
3240 * g_hash_table_insert (h, "1", "ONE");
3241 * g_hash_table_insert (h, "2", "TWO");
3242 * g_hash_table_insert (h, "3", "THREE");
3243 * g_hash_table_insert (h, "4", "FOUR");
3244 *
3245 * reg = g_regex_new ("1|2|3|4", 0, 0, NULL);
3246 * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
3247 * g_hash_table_destroy (h);
3248 *
3249 * ...
3250 * ]|
3251 *
3252 * Returns: a newly allocated string containing the replacements
3253 *
3254 * Since: 2.14
3255 */
3256 gchar *
g_regex_replace_eval(const GRegex * regex,const gchar * string,gssize string_len,gint start_position,GRegexMatchFlags match_options,GRegexEvalCallback eval,gpointer user_data,GError ** error)3257 g_regex_replace_eval (const GRegex *regex,
3258 const gchar *string,
3259 gssize string_len,
3260 gint start_position,
3261 GRegexMatchFlags match_options,
3262 GRegexEvalCallback eval,
3263 gpointer user_data,
3264 GError **error)
3265 {
3266 GMatchInfo *match_info;
3267 GString *result;
3268 gint str_pos = 0;
3269 gboolean done = FALSE;
3270 GError *tmp_error = NULL;
3271
3272 match_options = map_to_pcre2_match_flags (match_options);
3273
3274 g_return_val_if_fail (regex != NULL, NULL);
3275 g_return_val_if_fail (string != NULL, NULL);
3276 g_return_val_if_fail (start_position >= 0, NULL);
3277 g_return_val_if_fail (eval != NULL, NULL);
3278 g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);
3279
3280 if (string_len < 0)
3281 string_len = strlen (string);
3282
3283 result = g_string_sized_new (string_len);
3284
3285 /* run down the string making matches. */
3286 g_regex_match_full (regex, string, string_len, start_position,
3287 match_options, &match_info, &tmp_error);
3288 while (!done && g_match_info_matches (match_info))
3289 {
3290 g_string_append_len (result,
3291 string + str_pos,
3292 match_info->offsets[0] - str_pos);
3293 done = (*eval) (match_info, result, user_data);
3294 str_pos = match_info->offsets[1];
3295 g_match_info_next (match_info, &tmp_error);
3296 }
3297 g_match_info_free (match_info);
3298 if (tmp_error != NULL)
3299 {
3300 g_propagate_error (error, tmp_error);
3301 g_string_free (result, TRUE);
3302 return NULL;
3303 }
3304
3305 g_string_append_len (result, string + str_pos, string_len - str_pos);
3306 return g_string_free (result, FALSE);
3307 }
3308
3309 /**
3310 * g_regex_check_replacement:
3311 * @replacement: the replacement string
3312 * @has_references: (out) (optional): location to store information about
3313 * references in @replacement or %NULL
3314 * @error: location to store error
3315 *
3316 * Checks whether @replacement is a valid replacement string
3317 * (see g_regex_replace()), i.e. that all escape sequences in
3318 * it are valid.
3319 *
3320 * If @has_references is not %NULL then @replacement is checked
3321 * for pattern references. For instance, replacement text 'foo\n'
3322 * does not contain references and may be evaluated without information
3323 * about actual match, but '\0\1' (whole match followed by first
3324 * subpattern) requires valid #GMatchInfo object.
3325 *
3326 * Returns: whether @replacement is a valid replacement string
3327 *
3328 * Since: 2.14
3329 */
3330 gboolean
g_regex_check_replacement(const gchar * replacement,gboolean * has_references,GError ** error)3331 g_regex_check_replacement (const gchar *replacement,
3332 gboolean *has_references,
3333 GError **error)
3334 {
3335 GList *list;
3336 GError *tmp = NULL;
3337
3338 list = split_replacement (replacement, &tmp);
3339
3340 if (tmp)
3341 {
3342 g_propagate_error (error, tmp);
3343 return FALSE;
3344 }
3345
3346 if (has_references)
3347 *has_references = interpolation_list_needs_match (list);
3348
3349 g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
3350
3351 return TRUE;
3352 }
3353
3354 /**
3355 * g_regex_escape_nul:
3356 * @string: the string to escape
3357 * @length: the length of @string
3358 *
3359 * Escapes the nul characters in @string to "\x00". It can be used
3360 * to compile a regex with embedded nul characters.
3361 *
3362 * For completeness, @length can be -1 for a nul-terminated string.
3363 * In this case the output string will be of course equal to @string.
3364 *
3365 * Returns: a newly-allocated escaped string
3366 *
3367 * Since: 2.30
3368 */
3369 gchar *
g_regex_escape_nul(const gchar * string,gint length)3370 g_regex_escape_nul (const gchar *string,
3371 gint length)
3372 {
3373 GString *escaped;
3374 const gchar *p, *piece_start, *end;
3375 gint backslashes;
3376
3377 g_return_val_if_fail (string != NULL, NULL);
3378
3379 if (length < 0)
3380 return g_strdup (string);
3381
3382 end = string + length;
3383 p = piece_start = string;
3384 escaped = g_string_sized_new (length + 1);
3385
3386 backslashes = 0;
3387 while (p < end)
3388 {
3389 switch (*p)
3390 {
3391 case '\0':
3392 if (p != piece_start)
3393 {
3394 /* copy the previous piece. */
3395 g_string_append_len (escaped, piece_start, p - piece_start);
3396 }
3397 if ((backslashes & 1) == 0)
3398 g_string_append_c (escaped, '\\');
3399 g_string_append_c (escaped, 'x');
3400 g_string_append_c (escaped, '0');
3401 g_string_append_c (escaped, '0');
3402 piece_start = ++p;
3403 backslashes = 0;
3404 break;
3405 case '\\':
3406 backslashes++;
3407 ++p;
3408 break;
3409 default:
3410 backslashes = 0;
3411 p = g_utf8_next_char (p);
3412 break;
3413 }
3414 }
3415
3416 if (piece_start < end)
3417 g_string_append_len (escaped, piece_start, end - piece_start);
3418
3419 return g_string_free (escaped, FALSE);
3420 }
3421
3422 /**
3423 * g_regex_escape_string:
3424 * @string: (array length=length): the string to escape
3425 * @length: the length of @string, in bytes, or -1 if @string is nul-terminated
3426 *
3427 * Escapes the special characters used for regular expressions
3428 * in @string, for instance "a.b*c" becomes "a\.b\*c". This
3429 * function is useful to dynamically generate regular expressions.
3430 *
3431 * @string can contain nul characters that are replaced with "\0",
3432 * in this case remember to specify the correct length of @string
3433 * in @length.
3434 *
3435 * Returns: a newly-allocated escaped string
3436 *
3437 * Since: 2.14
3438 */
3439 gchar *
g_regex_escape_string(const gchar * string,gint length)3440 g_regex_escape_string (const gchar *string,
3441 gint length)
3442 {
3443 GString *escaped;
3444 const char *p, *piece_start, *end;
3445
3446 g_return_val_if_fail (string != NULL, NULL);
3447
3448 if (length < 0)
3449 length = strlen (string);
3450
3451 end = string + length;
3452 p = piece_start = string;
3453 escaped = g_string_sized_new (length + 1);
3454
3455 while (p < end)
3456 {
3457 switch (*p)
3458 {
3459 case '\0':
3460 case '\\':
3461 case '|':
3462 case '(':
3463 case ')':
3464 case '[':
3465 case ']':
3466 case '{':
3467 case '}':
3468 case '^':
3469 case '$':
3470 case '*':
3471 case '+':
3472 case '?':
3473 case '.':
3474 if (p != piece_start)
3475 /* copy the previous piece. */
3476 g_string_append_len (escaped, piece_start, p - piece_start);
3477 g_string_append_c (escaped, '\\');
3478 if (*p == '\0')
3479 g_string_append_c (escaped, '0');
3480 else
3481 g_string_append_c (escaped, *p);
3482 piece_start = ++p;
3483 break;
3484 default:
3485 p = g_utf8_next_char (p);
3486 break;
3487 }
3488 }
3489
3490 if (piece_start < end)
3491 g_string_append_len (escaped, piece_start, end - piece_start);
3492
3493 return g_string_free (escaped, FALSE);
3494 }
3495