gregex.c - OpenGrok cross reference for /third

Lines Matching +full:no +full:- +full:useless +full:- +full:escape
1 /* GRegex -- regular expression API wrapper around PCRE.
5  * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org>
39  * @title: Perl-compatible regular expressions
41  * @see_also: [Regular expression syntax][glib-regex-syntax]
61  * to these functions must be encoded in UTF-8. The lengths and the positions
64  * single character. If you set #G_REGEX_RAW the strings can be non-valid
65  * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two
70  * sequence use "\R". This particular group matches either the two-character
90  * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and
91  * you must use the '\u' escape sequence with 4 hex digits to specify a unicode
104  * The regular expressions low-level functionalities are obtained through
154 /* if the string is in UTF-8 use g_utf8_ functions, else
155  * use just +/- 1. */
156 #define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
159 #define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \
160                                 ((s) - 1) : \
607    * and that some PCRE errors are useless for us.  in translate_compile_error()
633       *errmsg = _("invalid escape sequence in character class");  in translate_compile_error()
646       *errmsg = _("unrecognized character after (? or (?-");  in translate_compile_error()
655       *errmsg = _("reference to non-existent subpattern");  in translate_compile_error()
692       /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)  in translate_compile_error()
693        * sequences here, '(?-54' would be an example for the second group.  in translate_compile_error()
695       *errmsg = _("(?R or (?[+-]digits must be followed by )");  in translate_compile_error()
751     case 153: /* internal error: previously-checked referenced subpattern not found */  in translate_compile_error()
753       *errmsg = _("previously-checked referenced subpattern not found");  in translate_compile_error()
762       *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "  in translate_compile_error()
796       *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");  in translate_compile_error()
823     case 144: /* invalid UTF-8 string */  in translate_compile_error()
827     case 174: /* invalid UTF-16 string */  in translate_compile_error()
828       /* These errors should not happen as we are using a UTF-8 and UCP-enabled PCRE  in translate_compile_error()
857   match_info->ref_count = 1;  in match_info_new()
858   match_info->regex = g_regex_ref ((GRegex *)regex);  in match_info_new()
859   match_info->string = string;  in match_info_new()
860   match_info->string_len = string_len;  in match_info_new()
861   match_info->matches = PCRE2_ERROR_NOMATCH;  in match_info_new()
862   match_info->pos = start_position;  in match_info_new()
863   match_info->match_opts = match_options;  in match_info_new()
869       match_info->n_offsets = 24;  in match_info_new()
870       match_info->n_workspace = 100;  in match_info_new()
871       match_info->workspace = g_new (gint, match_info->n_workspace);  in match_info_new()
876       pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT,   in match_info_new()
878       match_info->n_offsets = (capture_count + 1) * 3;  in match_info_new()
881   match_info->offsets = g_new0 (gint, match_info->n_offsets);  in match_info_new()
883   match_info->offsets[0] = -1;  in match_info_new()
884   match_info->offsets[1] = -1;  in match_info_new()
886   match_info->match_data = pcre2_match_data_create_from_pattern (  in match_info_new()
887       match_info->regex->pcre_re,  in match_info_new()
909   return match_info->regex;  in g_match_info_get_regex()
928   return match_info->string;  in g_match_info_get_string()
945   g_atomic_int_inc (&match_info->ref_count);  in g_match_info_ref()
961   if (g_atomic_int_dec_and_test (&match_info->ref_count))  in g_match_info_unref()
963       g_regex_unref (match_info->regex);  in g_match_info_unref()
964       if (match_info->match_data)  in g_match_info_unref()
965         pcre2_match_data_free (match_info->match_data);  in g_match_info_unref()
966       g_free (match_info->offsets);  in g_match_info_unref()
967       g_free (match_info->workspace);  in g_match_info_unref()
1018   g_return_val_if_fail (match_info->pos >= 0, FALSE);  in g_match_info_next()
1020   prev_match_start = match_info->offsets[0];  in g_match_info_next()
1021   prev_match_end = match_info->offsets[1];  in g_match_info_next()
1023   if (match_info->pos > match_info->string_len)  in g_match_info_next()
1026       match_info->pos = -1;  in g_match_info_next()
1027       match_info->matches = PCRE2_ERROR_NOMATCH;  in g_match_info_next()
1031   opts = map_to_pcre2_match_flags (match_info->regex->match_opts | match_info->match_opts);  in g_match_info_next()
1032   match_info->matches = pcre2_match (match_info->regex->pcre_re,  in g_match_info_next()
1033                                      (PCRE2_SPTR)match_info->string,  in g_match_info_next()
1034                                      match_info->string_len,  in g_match_info_next()
1035                                      match_info->pos,  in g_match_info_next()
1037                                      match_info->match_data,  in g_match_info_next()
1040   if (IS_PCRE_ERROR (match_info->matches))  in g_match_info_next()
1044                    match_info->regex->pattern, match_error (match_info->matches));  in g_match_info_next()
1049       match_info->n_offsets = pcre2_get_ovector_count (match_info->match_data) * 2;  in g_match_info_next()
1050       ovector = pcre2_get_ovector_pointer (match_info->match_data);  in g_match_info_next()
1051       match_info->offsets = g_realloc_n (match_info->offsets,  in g_match_info_next()
1052                                          match_info->n_offsets,  in g_match_info_next()
1054       for (i = 0; i < match_info->n_offsets; i++)  in g_match_info_next()
1056           match_info->offsets[i] = (int) ovector[i];  in g_match_info_next()
1062   if (match_info->pos == match_info->offsets[1])  in g_match_info_next()
1064       if (match_info->pos > match_info->string_len)  in g_match_info_next()
1067           match_info->pos = -1;  in g_match_info_next()
1068           match_info->matches = PCRE2_ERROR_NOMATCH;  in g_match_info_next()
1072       match_info->pos = NEXT_CHAR (match_info->regex,  in g_match_info_next()
1073                                    &match_info->string[match_info->pos]) -  in g_match_info_next()
1074                                    match_info->string;  in g_match_info_next()
1078       match_info->pos = match_info->offsets[1];  in g_match_info_next()
1082    * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and  in g_match_info_next()
1084    *  - search at position 0: match from 0 to 0  in g_match_info_next()
1085    *  - search at position 1: match from 3 to 3  in g_match_info_next()
1086    *  - search at position 3: match from 3 to 3 (duplicate)  in g_match_info_next()
1087    *  - search at position 4: match from 5 to 5  in g_match_info_next()
1088    *  - search at position 5: match from 5 to 5 (duplicate)  in g_match_info_next()
1089    *  - search at position 6: no match -> stop  in g_match_info_next()
1092   if (match_info->matches >= 0 &&  in g_match_info_next()
1093       prev_match_start == match_info->offsets[0] &&  in g_match_info_next()
1094       prev_match_end == match_info->offsets[1])  in g_match_info_next()
1100   return match_info->matches >= 0;  in g_match_info_next()
1119   return match_info->matches >= 0;  in g_match_info_matches()
1128  * has no substrings in it and 0 is returned if the match failed.
1135  * Returns: Number of matched substrings, or -1 if an error occurred
1142   g_return_val_if_fail (match_info, -1);  in g_match_info_get_match_count()
1144   if (match_info->matches == PCRE2_ERROR_NOMATCH)  in g_match_info_get_match_count()
1145     /* no match */  in g_match_info_get_match_count()
1147   else if (match_info->matches < PCRE2_ERROR_NOMATCH)  in g_match_info_get_match_count()
1149     return -1;  in g_match_info_get_match_count()
1152     return match_info->matches;  in g_match_info_get_match_count()
1162  * distinguish this case from other cases in which there is no match.
1189  * The restrictions no longer apply.
1202   return match_info->matches == PCRE2_ERROR_PARTIAL;  in g_match_info_is_partial_match()
1212  * references and escape sequences expanded. References refer to the last
1216  * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was
1311   else if (start == -1)  in g_match_info_fetch()
1314     match = g_strndup (&match_info->string[start], end - start);  in g_match_info_fetch()
1334  * and @end_pos are set to -1 and %TRUE is returned.
1359   if (match_num >= match_info->matches)  in g_match_info_fetch_pos()
1363     *start_pos = match_info->offsets[2 * match_num];  in g_match_info_fetch_pos()
1366     *end_pos = match_info->offsets[2 * match_num + 1];  in g_match_info_fetch_pos()
1385   if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES))  in get_matched_substring_number()
1386     return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR)name);  in get_matched_substring_number()
1389   entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re,  in get_matched_substring_number()
1400       if (match_info->offsets[n*2] >= 0)  in get_matched_substring_number()
1457  * then @start_pos and @end_pos are set to -1 and %TRUE is returned.
1504  * Returns: (transfer full): a %NULL-terminated array of gchar *
1520   if (match_info->matches < 0)  in g_match_info_fetch_all()
1523   result = g_new (gchar *, match_info->matches + 1);  in g_match_info_fetch_all()
1524   for (i = 0; i < match_info->matches; i++)  in g_match_info_fetch_all()
1534 G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1550   g_atomic_int_inc (&regex->ref_count);  in g_regex_ref()
1568   if (g_atomic_int_dec_and_test (&regex->ref_count))  in g_regex_unref()
1570       g_free (regex->pattern);  in g_regex_unref()
1571       if (regex->pcre_re != NULL)  in g_regex_unref()
1572         pcre2_code_free (regex->pcre_re);  in g_regex_unref()
1643   regex->ref_count = 1;  in g_regex_new()
1644   regex->pattern = g_strdup (pattern);  in g_regex_new()
1645   regex->pcre_re = re;  in g_regex_new()
1646   regex->compile_opts = compile_options;  in g_regex_new()
1647   regex->match_opts = match_options;  in g_regex_new()
1671   /* In GRegex the strings are, by default, UTF-8 encoded. PCRE  in regex_compile()
1672    * instead uses UTF-8 only if required with PCRE_UTF8. */  in regex_compile()
1675       /* disable utf-8 */  in regex_compile()
1680       /* enable utf-8 */  in regex_compile()
1764   return regex->pattern;  in g_regex_get_pattern()
1784   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value);  in g_regex_get_max_backref()
1804   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value);  in g_regex_get_capture_count()
1824   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value);  in g_regex_get_has_cr_or_lf()
1834  * pattern. This information is useful when doing multi-segment matching using
1846   pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND,  in g_regex_get_max_lookbehind()
1860  * top-level within the compiled pattern.
1871   return map_to_pcre1_compile_flags (regex->compile_opts);  in g_regex_get_compile_flags()
1889   return map_to_pcre1_match_flags (regex->match_opts & G_REGEX_MATCH_MASK);  in g_regex_get_match_flags()
1929   result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);  in g_regex_match_simple()
1947  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
1954  * To retrieve all the non-overlapping matches of the pattern in
1957  * |[<!-- language="C" --> 
1961  *   // Print all uppercase-only words.
1965  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
1995   return g_regex_match_full (regex, string, -1, 0, match_options,  in g_regex_match()
2003  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2019  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2031  * To retrieve all the non-overlapping matches of the pattern in
2034  * |[<!-- language="C" --> 
2038  *   // Print all uppercase-only words.
2043  *   regex = g_regex_new ("[A-Z]+", 0, 0, NULL);
2044  *   g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);
2056  *       g_printerr ("Error while matching: %s\n", error->message);
2132   return g_regex_match_all_full (regex, string, -1, 0, match_options,  in g_regex_match_all()
2140  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2174  * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8.
2221   pcre_re = regex_compile (regex->pattern,  in g_regex_match_all_full()
2222                            regex->compile_opts | PCRE2_NO_AUTO_POSSESS,  in g_regex_match_all_full()
2229   pcre_re = regex->pcre_re;  in g_regex_match_all_full()
2239       info->matches = pcre2_dfa_match (pcre_re,  in g_regex_match_all_full()
2240                                        (PCRE2_SPTR)info->string, info->string_len,  in g_regex_match_all_full()
2241                                        info->pos,  in g_regex_match_all_full()
2243                                        info->match_data,  in g_regex_match_all_full()
2245                                        info->workspace, info->n_workspace);  in g_regex_match_all_full()
2247       info->n_offsets = pcre2_get_ovector_count (info->match_data) * 2;  in g_regex_match_all_full()
2248       ovector = pcre2_get_ovector_pointer (info->match_data);  in g_regex_match_all_full()
2249       info->offsets = g_realloc (info->offsets,  in g_regex_match_all_full()
2250                                  info->n_offsets * sizeof (gint));  in g_regex_match_all_full()
2251       for (i = 0; i < info->n_offsets; i++)  in g_regex_match_all_full()
2253           info->offsets[i] = (int) ovector[i];  in g_regex_match_all_full()
2256       if (info->matches == PCRE2_ERROR_DFA_WSSIZE)  in g_regex_match_all_full()
2258           /* info->workspace is too small. */  in g_regex_match_all_full()
2259           info->n_workspace *= 2;  in g_regex_match_all_full()
2260           info->workspace = g_realloc (info->workspace,  in g_regex_match_all_full()
2261                                        info->n_workspace * sizeof (gint));  in g_regex_match_all_full()
2264       else if (info->matches == 0)  in g_regex_match_all_full()
2266           /* info->offsets is too small. */  in g_regex_match_all_full()
2267           info->n_offsets *= 2;  in g_regex_match_all_full()
2268           info->offsets = g_realloc (info->offsets,  in g_regex_match_all_full()
2269                                      info->n_offsets * sizeof (gint));  in g_regex_match_all_full()
2272       else if (IS_PCRE_ERROR (info->matches))  in g_regex_match_all_full()
2276                        regex->pattern, match_error (info->matches));  in g_regex_match_all_full()
2284   /* set info->pos to -1 so that a call to g_match_info_next() fails. */  in g_regex_match_all_full()
2285   info->pos = -1;  in g_regex_match_all_full()
2286   retval = info->matches >= 0;  in g_regex_match_all_full()
2303  * Returns: The number of the subexpression or -1 if @name
2314   g_return_val_if_fail (regex != NULL, -1);  in g_regex_get_string_number()
2315   g_return_val_if_fail (name != NULL, -1);  in g_regex_get_string_number()
2317   num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR)name);  in g_regex_get_string_number()
2319     num = -1;  in g_regex_get_string_number()
2359  * Returns: (transfer full): a %NULL-terminated array of strings. Free
2380   result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);  in g_regex_split_simple()
2409  * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2421   return g_regex_split_full (regex, string, -1, 0,  in g_regex_split()
2429  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
2458  * Returns: (transfer full): a %NULL-terminated gchar ** array. Free
2499   /* zero-length string */  in g_regex_split_full()
2500   if (string_len - start_position == 0)  in g_regex_split_full()
2507                                   string_len - start_position);  in g_regex_split_full()
2524                     (match_info->offsets[0] == match_info->offsets[1]);  in g_regex_split_full()
2530           if (last_separator_end != match_info->offsets[1])  in g_regex_split_full()
2536                                  match_info->offsets[0] - last_separator_end);  in g_regex_split_full()
2552           /* if there was no match, copy to end of string. */  in g_regex_split_full()
2556                                         match_info->string_len - last_separator_end);  in g_regex_split_full()
2559           /* no more tokens, end the loop. */  in g_regex_split_full()
2563       /* -1 to leave room for the last part. */  in g_regex_split_full()
2564       if (token_count >= max_tokens - 1)  in g_regex_split_full()
2573               match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;  in g_regex_split_full()
2576            * tokens, but we are at the end of the string, so there are no  in g_regex_split_full()
2578           if (string_len > match_info->pos)  in g_regex_split_full()
2580               gchar *token = g_strndup (string + match_info->pos,  in g_regex_split_full()
2581                                         string_len - match_info->pos);  in g_regex_split_full()
2588       last_separator_end = match_info->pos;  in g_regex_split_full()
2593         last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;  in g_regex_split_full()
2608     string_list[i++] = last->data;  in g_regex_split_full()
2648   g_free (data->text);  in free_interpolation_data()
2669       data->c = '\t';  in expand_escape()
2670       data->type = REPL_TYPE_CHARACTER;  in expand_escape()
2674       data->c = '\n';  in expand_escape()
2675       data->type = REPL_TYPE_CHARACTER;  in expand_escape()
2679       data->c = '\v';  in expand_escape()
2680       data->type = REPL_TYPE_CHARACTER;  in expand_escape()
2684       data->c = '\r';  in expand_escape()
2685       data->type = REPL_TYPE_CHARACTER;  in expand_escape()
2689       data->c = '\f';  in expand_escape()
2690       data->type = REPL_TYPE_CHARACTER;  in expand_escape()
2694       data->c = '\a';  in expand_escape()
2695       data->type = REPL_TYPE_CHARACTER;  in expand_escape()
2699       data->c = '\b';  in expand_escape()
2700       data->type = REPL_TYPE_CHARACTER;  in expand_escape()
2704       data->c = '\\';  in expand_escape()
2705       data->type = REPL_TYPE_CHARACTER;  in expand_escape()
2741       data->type = REPL_TYPE_STRING;  in expand_escape()
2742       data->text = g_new0 (gchar, 8);  in expand_escape()
2743       g_unichar_to_utf8 (x, data->text);  in expand_escape()
2747       data->type = REPL_TYPE_CHANGE_CASE;  in expand_escape()
2748       data->change_case = CHANGE_CASE_LOWER_SINGLE;  in expand_escape()
2752       data->type = REPL_TYPE_CHANGE_CASE;  in expand_escape()
2753       data->change_case = CHANGE_CASE_UPPER_SINGLE;  in expand_escape()
2757       data->type = REPL_TYPE_CHANGE_CASE;  in expand_escape()
2758       data->change_case = CHANGE_CASE_LOWER;  in expand_escape()
2762       data->type = REPL_TYPE_CHANGE_CASE;  in expand_escape()
2763       data->change_case = CHANGE_CASE_UPPER;  in expand_escape()
2767       data->type = REPL_TYPE_CHANGE_CASE;  in expand_escape()
2768       data->change_case = CHANGE_CASE_NONE;  in expand_escape()
2788       if (p - q == 0)  in expand_escape()
2790           error_detail = _("zero-length symbolic reference");  in expand_escape()
2809           data->num = x;  in expand_escape()
2810           data->type = REPL_TYPE_NUMERIC_REFERENCE;  in expand_escape()
2826           data->text = g_strndup (q, p - q);  in expand_escape()
2827           data->type = REPL_TYPE_SYMBOLIC_REFERENCE;  in expand_escape()
2871           data->type = REPL_TYPE_STRING;  in expand_escape()
2872           data->text = g_new0 (gchar, 8);  in expand_escape()
2873           g_unichar_to_utf8 (x, data->text);  in expand_escape()
2877           data->type = REPL_TYPE_NUMERIC_REFERENCE;  in expand_escape()
2878           data->num = d;  in expand_escape()
2886       error_detail = _("unknown escape sequence");  in expand_escape()
2899                            (gulong)(p - replacement),  in expand_escape()
2935               if (p - start > 0)  in split_replacement()
2938                   data->text = g_strndup (start, p - start);  in split_replacement()
2939                   data->type = REPL_TYPE_STRING;  in split_replacement()
2997   for (list = data; list; list = list->next)  in interpolate_replacement()
2999       idata = list->data;  in interpolate_replacement()
3000       switch (idata->type)  in interpolate_replacement()
3003           string_append (result, idata->text, &change_case);  in interpolate_replacement()
3006           g_string_append_c (result, CHANGE_CASE (idata->c, change_case));  in interpolate_replacement()
3011           match = g_match_info_fetch (match_info, idata->num);  in interpolate_replacement()
3019           match = g_match_info_fetch_named (match_info, idata->text);  in interpolate_replacement()
3027           change_case = idata->change_case;  in interpolate_replacement()
3043       InterpolationData *data = list->data;  in interpolation_list_needs_match()
3045       if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||  in interpolation_list_needs_match()
3046           data->type == REPL_TYPE_NUMERIC_REFERENCE)  in interpolation_list_needs_match()
3051       list = list->next;  in interpolation_list_needs_match()
3061  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3070  * number-th captured subexpression of the match, '\g<name>' refers
3078  * - \l: Convert to lower case the next character
3079  * - \u: Convert to upper case the next character
3080  * - \L: Convert to lower case till \E
3081  * - \U: Convert to upper case till \E
3082  * - \E: End case modification
3086  * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was
3087  * passed to g_regex_new(). If you want to use not UTF-8 encoded strings
3154  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3199  * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
3215  * |[<!-- language="C" --> 
3246  * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);
3292                            match_info->offsets[0] - str_pos);  in g_regex_replace_eval()
3294       str_pos = match_info->offsets[1];  in g_regex_replace_eval()
3305   g_string_append_len (result, string + str_pos, string_len - str_pos);  in g_regex_replace_eval()
3317  * (see g_regex_replace()), i.e. that all escape sequences in
3356  * @string: the string to escape
3362  * For completeness, @length can be -1 for a nul-terminated string.
3365  * Returns: a newly-allocated escaped string
3395               g_string_append_len (escaped, piece_start, p - piece_start);  in g_regex_escape_nul()
3417     g_string_append_len (escaped, piece_start, end - piece_start);  in g_regex_escape_nul()
3424  * @string: (array length=length): the string to escape
3425  * @length: the length of @string, in bytes, or -1 if @string is nul-terminated
3435  * Returns: a newly-allocated escaped string
3476             g_string_append_len (escaped, piece_start, p - piece_start);  in g_regex_escape_string()
3491     g_string_append_len (escaped, piece_start, end - piece_start);  in g_regex_escape_string()