• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GLIB - Library of useful routines for C programming
2  * Copyright (C) 1995-1997, 1999  Peter Mattis, Red Hat, Inc.
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "config.h"
19 
20 #include <string.h>
21 
22 #include "gpattern.h"
23 
24 #include "gmacros.h"
25 #include "gmessages.h"
26 #include "gmem.h"
27 #include "gunicode.h"
28 #include "gutils.h"
29 
30 /**
31  * SECTION:patterns
32  * @title: Glob-style pattern matching
33  * @short_description: matches strings against patterns containing '*'
34  *                     (wildcard) and '?' (joker)
35  *
36  * The g_pattern_match* functions match a string
37  * against a pattern containing '*' and '?' wildcards with similar
38  * semantics as the standard glob() function: '*' matches an arbitrary,
39  * possibly empty, string, '?' matches an arbitrary character.
40  *
41  * Note that in contrast to glob(), the '/' character can be matched by
42  * the wildcards, there are no '[...]' character ranges and '*' and '?'
43  * can not be escaped to include them literally in a pattern.
44  *
45  * When multiple strings must be matched against the same pattern, it
46  * is better to compile the pattern to a #GPatternSpec using
47  * g_pattern_spec_new() and use g_pattern_match_string() instead of
48  * g_pattern_match_simple(). This avoids the overhead of repeated
49  * pattern compilation.
50  **/
51 
52 /**
53  * GPatternSpec:
54  *
55  * A GPatternSpec struct is the 'compiled' form of a pattern. This
56  * structure is opaque and its fields cannot be accessed directly.
57  */
58 
59 /* keep enum and structure of gpattern.c and patterntest.c in sync */
60 typedef enum
61 {
62   G_MATCH_ALL,       /* "*A?A*" */
63   G_MATCH_ALL_TAIL,  /* "*A?AA" */
64   G_MATCH_HEAD,      /* "AAAA*" */
65   G_MATCH_TAIL,      /* "*AAAA" */
66   G_MATCH_EXACT,     /* "AAAAA" */
67   G_MATCH_LAST
68 } GMatchType;
69 
70 struct _GPatternSpec
71 {
72   GMatchType match_type;
73   guint      pattern_length;
74   guint      min_length;
75   guint      max_length;
76   gchar     *pattern;
77 };
78 
79 
80 /* --- functions --- */
81 static inline gboolean
g_pattern_ph_match(const gchar * match_pattern,const gchar * match_string,gboolean * wildcard_reached_p)82 g_pattern_ph_match (const gchar *match_pattern,
83 		    const gchar *match_string,
84 		    gboolean    *wildcard_reached_p)
85 {
86   const gchar *pattern, *string;
87   gchar ch;
88 
89   pattern = match_pattern;
90   string = match_string;
91 
92   ch = *pattern;
93   pattern++;
94   while (ch)
95     {
96       switch (ch)
97 	{
98 	case '?':
99 	  if (!*string)
100 	    return FALSE;
101 	  string = g_utf8_next_char (string);
102 	  break;
103 
104 	case '*':
105 	  *wildcard_reached_p = TRUE;
106 	  do
107 	    {
108 	      ch = *pattern;
109 	      pattern++;
110 	      if (ch == '?')
111 		{
112 		  if (!*string)
113 		    return FALSE;
114 		  string = g_utf8_next_char (string);
115 		}
116 	    }
117 	  while (ch == '*' || ch == '?');
118 	  if (!ch)
119 	    return TRUE;
120 	  do
121 	    {
122               gboolean next_wildcard_reached = FALSE;
123 	      while (ch != *string)
124 		{
125 		  if (!*string)
126 		    return FALSE;
127 		  string = g_utf8_next_char (string);
128 		}
129 	      string++;
130 	      if (g_pattern_ph_match (pattern, string, &next_wildcard_reached))
131 		return TRUE;
132               if (next_wildcard_reached)
133                 /* the forthcoming pattern substring up to the next wildcard has
134                  * been matched, but a mismatch occurred for the rest of the
135                  * pattern, following the next wildcard.
136                  * there's no need to advance the current match position any
137                  * further if the rest pattern will not match.
138                  */
139 		return FALSE;
140 	    }
141 	  while (*string);
142 	  break;
143 
144 	default:
145 	  if (ch == *string)
146 	    string++;
147 	  else
148 	    return FALSE;
149 	  break;
150 	}
151 
152       ch = *pattern;
153       pattern++;
154     }
155 
156   return *string == 0;
157 }
158 
159 /**
160  * g_pattern_match:
161  * @pspec: a #GPatternSpec
162  * @string_length: the length of @string (in bytes, i.e. strlen(),
163  *     not g_utf8_strlen())
164  * @string: the UTF-8 encoded string to match
165  * @string_reversed: (nullable): the reverse of @string or %NULL
166  *
167  * Matches a string against a compiled pattern. Passing the correct
168  * length of the string given is mandatory. The reversed string can be
169  * omitted by passing %NULL, this is more efficient if the reversed
170  * version of the string to be matched is not at hand, as
171  * g_pattern_match() will only construct it if the compiled pattern
172  * requires reverse matches.
173  *
174  * Note that, if the user code will (possibly) match a string against a
175  * multitude of patterns containing wildcards, chances are high that
176  * some patterns will require a reversed string. In this case, it's
177  * more efficient to provide the reversed string to avoid multiple
178  * constructions thereof in the various calls to g_pattern_match().
179  *
180  * Note also that the reverse of a UTF-8 encoded string can in general
181  * not be obtained by g_strreverse(). This works only if the string
182  * does not contain any multibyte characters. GLib offers the
183  * g_utf8_strreverse() function to reverse UTF-8 encoded strings.
184  *
185  * Returns: %TRUE if @string matches @pspec
186  **/
187 gboolean
g_pattern_match(GPatternSpec * pspec,guint string_length,const gchar * string,const gchar * string_reversed)188 g_pattern_match (GPatternSpec *pspec,
189 		 guint         string_length,
190 		 const gchar  *string,
191 		 const gchar  *string_reversed)
192 {
193   g_return_val_if_fail (pspec != NULL, FALSE);
194   g_return_val_if_fail (string != NULL, FALSE);
195 
196   if (string_length < pspec->min_length ||
197       string_length > pspec->max_length)
198     return FALSE;
199 
200   switch (pspec->match_type)
201     {
202       gboolean dummy;
203     case G_MATCH_ALL:
204       return g_pattern_ph_match (pspec->pattern, string, &dummy);
205     case G_MATCH_ALL_TAIL:
206       if (string_reversed)
207 	return g_pattern_ph_match (pspec->pattern, string_reversed, &dummy);
208       else
209 	{
210           gboolean result;
211           gchar *tmp;
212 	  tmp = g_utf8_strreverse (string, string_length);
213 	  result = g_pattern_ph_match (pspec->pattern, tmp, &dummy);
214 	  g_free (tmp);
215 	  return result;
216 	}
217     case G_MATCH_HEAD:
218       if (pspec->pattern_length == string_length)
219 	return strcmp (pspec->pattern, string) == 0;
220       else if (pspec->pattern_length)
221 	return strncmp (pspec->pattern, string, pspec->pattern_length) == 0;
222       else
223 	return TRUE;
224     case G_MATCH_TAIL:
225       if (pspec->pattern_length)
226         return strcmp (pspec->pattern, string + (string_length - pspec->pattern_length)) == 0;
227       else
228 	return TRUE;
229     case G_MATCH_EXACT:
230       if (pspec->pattern_length != string_length)
231         return FALSE;
232       else
233         return strcmp (pspec->pattern, string) == 0;
234     default:
235       g_return_val_if_fail (pspec->match_type < G_MATCH_LAST, FALSE);
236       return FALSE;
237     }
238 }
239 
240 /**
241  * g_pattern_spec_new:
242  * @pattern: a zero-terminated UTF-8 encoded string
243  *
244  * Compiles a pattern to a #GPatternSpec.
245  *
246  * Returns: a newly-allocated #GPatternSpec
247  **/
248 GPatternSpec*
g_pattern_spec_new(const gchar * pattern)249 g_pattern_spec_new (const gchar *pattern)
250 {
251   GPatternSpec *pspec;
252   gboolean seen_joker = FALSE, seen_wildcard = FALSE, more_wildcards = FALSE;
253   gint hw_pos = -1, tw_pos = -1, hj_pos = -1, tj_pos = -1;
254   gboolean follows_wildcard = FALSE;
255   guint pending_jokers = 0;
256   const gchar *s;
257   gchar *d;
258   guint i;
259 
260   g_return_val_if_fail (pattern != NULL, NULL);
261 
262   /* canonicalize pattern and collect necessary stats */
263   pspec = g_new (GPatternSpec, 1);
264   pspec->pattern_length = strlen (pattern);
265   pspec->min_length = 0;
266   pspec->max_length = 0;
267   pspec->pattern = g_new (gchar, pspec->pattern_length + 1);
268   d = pspec->pattern;
269   for (i = 0, s = pattern; *s != 0; s++)
270     {
271       switch (*s)
272 	{
273 	case '*':
274 	  if (follows_wildcard)	/* compress multiple wildcards */
275 	    {
276 	      pspec->pattern_length--;
277 	      continue;
278 	    }
279 	  follows_wildcard = TRUE;
280 	  if (hw_pos < 0)
281 	    hw_pos = i;
282 	  tw_pos = i;
283 	  break;
284 	case '?':
285 	  pending_jokers++;
286 	  pspec->min_length++;
287 	  pspec->max_length += 4; /* maximum UTF-8 character length */
288 	  continue;
289 	default:
290 	  for (; pending_jokers; pending_jokers--, i++) {
291 	    *d++ = '?';
292   	    if (hj_pos < 0)
293 	     hj_pos = i;
294 	    tj_pos = i;
295 	  }
296 	  follows_wildcard = FALSE;
297 	  pspec->min_length++;
298 	  pspec->max_length++;
299 	  break;
300 	}
301       *d++ = *s;
302       i++;
303     }
304   for (; pending_jokers; pending_jokers--) {
305     *d++ = '?';
306     if (hj_pos < 0)
307       hj_pos = i;
308     tj_pos = i;
309   }
310   *d++ = 0;
311   seen_joker = hj_pos >= 0;
312   seen_wildcard = hw_pos >= 0;
313   more_wildcards = seen_wildcard && hw_pos != tw_pos;
314   if (seen_wildcard)
315     pspec->max_length = G_MAXUINT;
316 
317   /* special case sole head/tail wildcard or exact matches */
318   if (!seen_joker && !more_wildcards)
319     {
320       if (pspec->pattern[0] == '*')
321 	{
322 	  pspec->match_type = G_MATCH_TAIL;
323           memmove (pspec->pattern, pspec->pattern + 1, --pspec->pattern_length);
324 	  pspec->pattern[pspec->pattern_length] = 0;
325 	  return pspec;
326 	}
327       if (pspec->pattern_length > 0 &&
328 	  pspec->pattern[pspec->pattern_length - 1] == '*')
329 	{
330 	  pspec->match_type = G_MATCH_HEAD;
331 	  pspec->pattern[--pspec->pattern_length] = 0;
332 	  return pspec;
333 	}
334       if (!seen_wildcard)
335 	{
336 	  pspec->match_type = G_MATCH_EXACT;
337 	  return pspec;
338 	}
339     }
340 
341   /* now just need to distinguish between head or tail match start */
342   tw_pos = pspec->pattern_length - 1 - tw_pos;	/* last pos to tail distance */
343   tj_pos = pspec->pattern_length - 1 - tj_pos;	/* last pos to tail distance */
344   if (seen_wildcard)
345     pspec->match_type = tw_pos > hw_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL;
346   else /* seen_joker */
347     pspec->match_type = tj_pos > hj_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL;
348   if (pspec->match_type == G_MATCH_ALL_TAIL) {
349     gchar *tmp = pspec->pattern;
350     pspec->pattern = g_utf8_strreverse (pspec->pattern, pspec->pattern_length);
351     g_free (tmp);
352   }
353   return pspec;
354 }
355 
356 /**
357  * g_pattern_spec_free:
358  * @pspec: a #GPatternSpec
359  *
360  * Frees the memory allocated for the #GPatternSpec.
361  **/
362 void
g_pattern_spec_free(GPatternSpec * pspec)363 g_pattern_spec_free (GPatternSpec *pspec)
364 {
365   g_return_if_fail (pspec != NULL);
366 
367   g_free (pspec->pattern);
368   g_free (pspec);
369 }
370 
371 /**
372  * g_pattern_spec_equal:
373  * @pspec1: a #GPatternSpec
374  * @pspec2: another #GPatternSpec
375  *
376  * Compares two compiled pattern specs and returns whether they will
377  * match the same set of strings.
378  *
379  * Returns: Whether the compiled patterns are equal
380  **/
381 gboolean
g_pattern_spec_equal(GPatternSpec * pspec1,GPatternSpec * pspec2)382 g_pattern_spec_equal (GPatternSpec *pspec1,
383 		      GPatternSpec *pspec2)
384 {
385   g_return_val_if_fail (pspec1 != NULL, FALSE);
386   g_return_val_if_fail (pspec2 != NULL, FALSE);
387 
388   return (pspec1->pattern_length == pspec2->pattern_length &&
389 	  pspec1->match_type == pspec2->match_type &&
390 	  strcmp (pspec1->pattern, pspec2->pattern) == 0);
391 }
392 
393 /**
394  * g_pattern_match_string:
395  * @pspec: a #GPatternSpec
396  * @string: the UTF-8 encoded string to match
397  *
398  * Matches a string against a compiled pattern. If the string is to be
399  * matched against more than one pattern, consider using
400  * g_pattern_match() instead while supplying the reversed string.
401  *
402  * Returns: %TRUE if @string matches @pspec
403  **/
404 gboolean
g_pattern_match_string(GPatternSpec * pspec,const gchar * string)405 g_pattern_match_string (GPatternSpec *pspec,
406 			const gchar  *string)
407 {
408   g_return_val_if_fail (pspec != NULL, FALSE);
409   g_return_val_if_fail (string != NULL, FALSE);
410 
411   return g_pattern_match (pspec, strlen (string), string, NULL);
412 }
413 
414 /**
415  * g_pattern_match_simple:
416  * @pattern: the UTF-8 encoded pattern
417  * @string: the UTF-8 encoded string to match
418  *
419  * Matches a string against a pattern given as a string. If this
420  * function is to be called in a loop, it's more efficient to compile
421  * the pattern once with g_pattern_spec_new() and call
422  * g_pattern_match_string() repeatedly.
423  *
424  * Returns: %TRUE if @string matches @pspec
425  **/
426 gboolean
g_pattern_match_simple(const gchar * pattern,const gchar * string)427 g_pattern_match_simple (const gchar *pattern,
428 			const gchar *string)
429 {
430   GPatternSpec *pspec;
431   gboolean ergo;
432 
433   g_return_val_if_fail (pattern != NULL, FALSE);
434   g_return_val_if_fail (string != NULL, FALSE);
435 
436   pspec = g_pattern_spec_new (pattern);
437   ergo = g_pattern_match (pspec, strlen (string), string, NULL);
438   g_pattern_spec_free (pspec);
439 
440   return ergo;
441 }
442