1 /* -*- buffer-read-only: t -*- vi: set ro: */
2 /* DO NOT EDIT! GENERATED AUTOMATICALLY! */
3 /* Extended regular expression matching and search library.
4 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
5 Free Software Foundation, Inc.
6 This file is part of the GNU C Library.
7 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3, or (at your option)
12 any later version.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License along
20 with this program; if not, write to the Free Software Foundation,
21 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23 static void re_string_construct_common (const char *str, Idx len,
24 re_string_t *pstr,
25 RE_TRANSLATE_TYPE trans, bool icase,
26 const re_dfa_t *dfa) internal_function;
27 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 re_hashval_t hash) internal_function;
30 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
31 const re_node_set *nodes,
32 unsigned int context,
33 re_hashval_t hash) internal_function;
34
35 /* Functions for string operation. */
36
37 /* This function allocate the buffers. It is necessary to call
38 re_string_reconstruct before using the object. */
39
40 static reg_errcode_t
41 internal_function
re_string_allocate(re_string_t * pstr,const char * str,Idx len,Idx init_len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)42 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
43 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44 {
45 reg_errcode_t ret;
46 Idx init_buf_len;
47
48 /* Ensure at least one character fits into the buffers. */
49 if (init_len < dfa->mb_cur_max)
50 init_len = dfa->mb_cur_max;
51 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52 re_string_construct_common (str, len, pstr, trans, icase, dfa);
53
54 ret = re_string_realloc_buffers (pstr, init_buf_len);
55 if (BE (ret != REG_NOERROR, 0))
56 return ret;
57
58 pstr->word_char = dfa->word_char;
59 pstr->word_ops_used = dfa->word_ops_used;
60 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
62 pstr->valid_raw_len = pstr->valid_len;
63 return REG_NOERROR;
64 }
65
66 /* This function allocate the buffers, and initialize them. */
67
68 static reg_errcode_t
69 internal_function
re_string_construct(re_string_t * pstr,const char * str,Idx len,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)70 re_string_construct (re_string_t *pstr, const char *str, Idx len,
71 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72 {
73 reg_errcode_t ret;
74 memset (pstr, '\0', sizeof (re_string_t));
75 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76
77 if (len > 0)
78 {
79 ret = re_string_realloc_buffers (pstr, len + 1);
80 if (BE (ret != REG_NOERROR, 0))
81 return ret;
82 }
83 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84
85 if (icase)
86 {
87 #ifdef RE_ENABLE_I18N
88 if (dfa->mb_cur_max > 1)
89 {
90 while (1)
91 {
92 ret = build_wcs_upper_buffer (pstr);
93 if (BE (ret != REG_NOERROR, 0))
94 return ret;
95 if (pstr->valid_raw_len >= len)
96 break;
97 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98 break;
99 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100 if (BE (ret != REG_NOERROR, 0))
101 return ret;
102 }
103 }
104 else
105 #endif /* RE_ENABLE_I18N */
106 build_upper_buffer (pstr);
107 }
108 else
109 {
110 #ifdef RE_ENABLE_I18N
111 if (dfa->mb_cur_max > 1)
112 build_wcs_buffer (pstr);
113 else
114 #endif /* RE_ENABLE_I18N */
115 {
116 if (trans != NULL)
117 re_string_translate_buffer (pstr);
118 else
119 {
120 pstr->valid_len = pstr->bufs_len;
121 pstr->valid_raw_len = pstr->bufs_len;
122 }
123 }
124 }
125
126 return REG_NOERROR;
127 }
128
129 /* Helper functions for re_string_allocate, and re_string_construct. */
130
131 static reg_errcode_t
132 internal_function
re_string_realloc_buffers(re_string_t * pstr,Idx new_buf_len)133 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134 {
135 #ifdef RE_ENABLE_I18N
136 if (pstr->mb_cur_max > 1)
137 {
138 wint_t *new_wcs;
139
140 /* Avoid overflow. */
141 size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
142 if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
143 return REG_ESPACE;
144
145 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
146 if (BE (new_wcs == NULL, 0))
147 return REG_ESPACE;
148 pstr->wcs = new_wcs;
149 if (pstr->offsets != NULL)
150 {
151 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
152 if (BE (new_offsets == NULL, 0))
153 return REG_ESPACE;
154 pstr->offsets = new_offsets;
155 }
156 }
157 #endif /* RE_ENABLE_I18N */
158 if (pstr->mbs_allocated)
159 {
160 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
161 new_buf_len);
162 if (BE (new_mbs == NULL, 0))
163 return REG_ESPACE;
164 pstr->mbs = new_mbs;
165 }
166 pstr->bufs_len = new_buf_len;
167 return REG_NOERROR;
168 }
169
170
171 static void
172 internal_function
re_string_construct_common(const char * str,Idx len,re_string_t * pstr,RE_TRANSLATE_TYPE trans,bool icase,const re_dfa_t * dfa)173 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
174 RE_TRANSLATE_TYPE trans, bool icase,
175 const re_dfa_t *dfa)
176 {
177 pstr->raw_mbs = (const unsigned char *) str;
178 pstr->len = len;
179 pstr->raw_len = len;
180 pstr->trans = trans;
181 pstr->icase = icase;
182 pstr->mbs_allocated = (trans != NULL || icase);
183 pstr->mb_cur_max = dfa->mb_cur_max;
184 pstr->is_utf8 = dfa->is_utf8;
185 pstr->map_notascii = dfa->map_notascii;
186 pstr->stop = pstr->len;
187 pstr->raw_stop = pstr->stop;
188 }
189
190 #ifdef RE_ENABLE_I18N
191
192 /* Build wide character buffer PSTR->WCS.
193 If the byte sequence of the string are:
194 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
195 Then wide character buffer will be:
196 <wc1> , WEOF , <wc2> , WEOF , <wc3>
197 We use WEOF for padding, they indicate that the position isn't
198 a first byte of a multibyte character.
199
200 Note that this function assumes PSTR->VALID_LEN elements are already
201 built and starts from PSTR->VALID_LEN. */
202
203 static void
204 internal_function
build_wcs_buffer(re_string_t * pstr)205 build_wcs_buffer (re_string_t *pstr)
206 {
207 #ifdef _LIBC
208 unsigned char buf[MB_LEN_MAX];
209 assert (MB_LEN_MAX >= pstr->mb_cur_max);
210 #else
211 unsigned char buf[64];
212 #endif
213 mbstate_t prev_st;
214 Idx byte_idx, end_idx, remain_len;
215 size_t mbclen;
216
217 /* Build the buffers from pstr->valid_len to either pstr->len or
218 pstr->bufs_len. */
219 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
220 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
221 {
222 wchar_t wc;
223 const char *p;
224
225 remain_len = end_idx - byte_idx;
226 prev_st = pstr->cur_state;
227 /* Apply the translation if we need. */
228 if (BE (pstr->trans != NULL, 0))
229 {
230 int i, ch;
231
232 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
233 {
234 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
235 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
236 }
237 p = (const char *) buf;
238 }
239 else
240 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
241 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
242 if (BE (mbclen == (size_t) -2, 0))
243 {
244 /* The buffer doesn't have enough space, finish to build. */
245 pstr->cur_state = prev_st;
246 break;
247 }
248 else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
249 {
250 /* We treat these cases as a singlebyte character. */
251 mbclen = 1;
252 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
253 if (BE (pstr->trans != NULL, 0))
254 wc = pstr->trans[wc];
255 pstr->cur_state = prev_st;
256 }
257
258 /* Write wide character and padding. */
259 pstr->wcs[byte_idx++] = wc;
260 /* Write paddings. */
261 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
262 pstr->wcs[byte_idx++] = WEOF;
263 }
264 pstr->valid_len = byte_idx;
265 pstr->valid_raw_len = byte_idx;
266 }
267
268 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
269 but for REG_ICASE. */
270
271 static reg_errcode_t
272 internal_function
build_wcs_upper_buffer(re_string_t * pstr)273 build_wcs_upper_buffer (re_string_t *pstr)
274 {
275 mbstate_t prev_st;
276 Idx src_idx, byte_idx, end_idx, remain_len;
277 size_t mbclen;
278 #ifdef _LIBC
279 char buf[MB_LEN_MAX];
280 assert (MB_LEN_MAX >= pstr->mb_cur_max);
281 #else
282 char buf[64];
283 #endif
284
285 byte_idx = pstr->valid_len;
286 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
287
288 /* The following optimization assumes that ASCII characters can be
289 mapped to wide characters with a simple cast. */
290 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
291 {
292 while (byte_idx < end_idx)
293 {
294 wchar_t wc;
295
296 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
297 && mbsinit (&pstr->cur_state))
298 {
299 /* In case of a singlebyte character. */
300 pstr->mbs[byte_idx]
301 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
302 /* The next step uses the assumption that wchar_t is encoded
303 ASCII-safe: all ASCII values can be converted like this. */
304 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
305 ++byte_idx;
306 continue;
307 }
308
309 remain_len = end_idx - byte_idx;
310 prev_st = pstr->cur_state;
311 mbclen = __mbrtowc (&wc,
312 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
313 + byte_idx), remain_len, &pstr->cur_state);
314 if (BE (mbclen < (size_t) -2, 1))
315 {
316 wchar_t wcu = wc;
317 if (iswlower (wc))
318 {
319 size_t mbcdlen;
320
321 wcu = towupper (wc);
322 mbcdlen = wcrtomb (buf, wcu, &prev_st);
323 if (BE (mbclen == mbcdlen, 1))
324 memcpy (pstr->mbs + byte_idx, buf, mbclen);
325 else
326 {
327 src_idx = byte_idx;
328 goto offsets_needed;
329 }
330 }
331 else
332 memcpy (pstr->mbs + byte_idx,
333 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334 pstr->wcs[byte_idx++] = wcu;
335 /* Write paddings. */
336 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337 pstr->wcs[byte_idx++] = WEOF;
338 }
339 else if (mbclen == (size_t) -1 || mbclen == 0)
340 {
341 /* It is an invalid character or '\0'. Just use the byte. */
342 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
343 pstr->mbs[byte_idx] = ch;
344 /* And also cast it to wide char. */
345 pstr->wcs[byte_idx++] = (wchar_t) ch;
346 if (BE (mbclen == (size_t) -1, 0))
347 pstr->cur_state = prev_st;
348 }
349 else
350 {
351 /* The buffer doesn't have enough space, finish to build. */
352 pstr->cur_state = prev_st;
353 break;
354 }
355 }
356 pstr->valid_len = byte_idx;
357 pstr->valid_raw_len = byte_idx;
358 return REG_NOERROR;
359 }
360 else
361 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
362 {
363 wchar_t wc;
364 const char *p;
365 offsets_needed:
366 remain_len = end_idx - byte_idx;
367 prev_st = pstr->cur_state;
368 if (BE (pstr->trans != NULL, 0))
369 {
370 int i, ch;
371
372 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
373 {
374 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
375 buf[i] = pstr->trans[ch];
376 }
377 p = (const char *) buf;
378 }
379 else
380 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
381 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
382 if (BE (mbclen < (size_t) -2, 1))
383 {
384 wchar_t wcu = wc;
385 if (iswlower (wc))
386 {
387 size_t mbcdlen;
388
389 wcu = towupper (wc);
390 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
391 if (BE (mbclen == mbcdlen, 1))
392 memcpy (pstr->mbs + byte_idx, buf, mbclen);
393 else if (mbcdlen != (size_t) -1)
394 {
395 size_t i;
396
397 if (byte_idx + mbcdlen > pstr->bufs_len)
398 {
399 pstr->cur_state = prev_st;
400 break;
401 }
402
403 if (pstr->offsets == NULL)
404 {
405 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
406
407 if (pstr->offsets == NULL)
408 return REG_ESPACE;
409 }
410 if (!pstr->offsets_needed)
411 {
412 for (i = 0; i < (size_t) byte_idx; ++i)
413 pstr->offsets[i] = i;
414 pstr->offsets_needed = 1;
415 }
416
417 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418 pstr->wcs[byte_idx] = wcu;
419 pstr->offsets[byte_idx] = src_idx;
420 for (i = 1; i < mbcdlen; ++i)
421 {
422 pstr->offsets[byte_idx + i]
423 = src_idx + (i < mbclen ? i : mbclen - 1);
424 pstr->wcs[byte_idx + i] = WEOF;
425 }
426 pstr->len += mbcdlen - mbclen;
427 if (pstr->raw_stop > src_idx)
428 pstr->stop += mbcdlen - mbclen;
429 end_idx = (pstr->bufs_len > pstr->len)
430 ? pstr->len : pstr->bufs_len;
431 byte_idx += mbcdlen;
432 src_idx += mbclen;
433 continue;
434 }
435 else
436 memcpy (pstr->mbs + byte_idx, p, mbclen);
437 }
438 else
439 memcpy (pstr->mbs + byte_idx, p, mbclen);
440
441 if (BE (pstr->offsets_needed != 0, 0))
442 {
443 size_t i;
444 for (i = 0; i < mbclen; ++i)
445 pstr->offsets[byte_idx + i] = src_idx + i;
446 }
447 src_idx += mbclen;
448
449 pstr->wcs[byte_idx++] = wcu;
450 /* Write paddings. */
451 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
452 pstr->wcs[byte_idx++] = WEOF;
453 }
454 else if (mbclen == (size_t) -1 || mbclen == 0)
455 {
456 /* It is an invalid character or '\0'. Just use the byte. */
457 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
458
459 if (BE (pstr->trans != NULL, 0))
460 ch = pstr->trans [ch];
461 pstr->mbs[byte_idx] = ch;
462
463 if (BE (pstr->offsets_needed != 0, 0))
464 pstr->offsets[byte_idx] = src_idx;
465 ++src_idx;
466
467 /* And also cast it to wide char. */
468 pstr->wcs[byte_idx++] = (wchar_t) ch;
469 if (BE (mbclen == (size_t) -1, 0))
470 pstr->cur_state = prev_st;
471 }
472 else
473 {
474 /* The buffer doesn't have enough space, finish to build. */
475 pstr->cur_state = prev_st;
476 break;
477 }
478 }
479 pstr->valid_len = byte_idx;
480 pstr->valid_raw_len = src_idx;
481 return REG_NOERROR;
482 }
483
484 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
485 Return the index. */
486
487 static Idx
488 internal_function
re_string_skip_chars(re_string_t * pstr,Idx new_raw_idx,wint_t * last_wc)489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
490 {
491 mbstate_t prev_st;
492 Idx rawbuf_idx;
493 size_t mbclen;
494 wint_t wc = WEOF;
495
496 /* Skip the characters which are not necessary to check. */
497 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
498 rawbuf_idx < new_raw_idx;)
499 {
500 wchar_t wc2;
501 Idx remain_len;
502 remain_len = pstr->len - rawbuf_idx;
503 prev_st = pstr->cur_state;
504 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505 remain_len, &pstr->cur_state);
506 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
507 {
508 /* We treat these cases as a single byte character. */
509 if (mbclen == 0 || remain_len == 0)
510 wc = L'\0';
511 else
512 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
513 mbclen = 1;
514 pstr->cur_state = prev_st;
515 }
516 else
517 wc = wc2;
518 /* Then proceed the next character. */
519 rawbuf_idx += mbclen;
520 }
521 *last_wc = wc;
522 return rawbuf_idx;
523 }
524 #endif /* RE_ENABLE_I18N */
525
526 /* Build the buffer PSTR->MBS, and apply the translation if we need.
527 This function is used in case of REG_ICASE. */
528
529 static void
530 internal_function
build_upper_buffer(re_string_t * pstr)531 build_upper_buffer (re_string_t *pstr)
532 {
533 Idx char_idx, end_idx;
534 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535
536 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537 {
538 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
539 if (BE (pstr->trans != NULL, 0))
540 ch = pstr->trans[ch];
541 if (islower (ch))
542 pstr->mbs[char_idx] = toupper (ch);
543 else
544 pstr->mbs[char_idx] = ch;
545 }
546 pstr->valid_len = char_idx;
547 pstr->valid_raw_len = char_idx;
548 }
549
550 /* Apply TRANS to the buffer in PSTR. */
551
552 static void
553 internal_function
re_string_translate_buffer(re_string_t * pstr)554 re_string_translate_buffer (re_string_t *pstr)
555 {
556 Idx buf_idx, end_idx;
557 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560 {
561 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562 pstr->mbs[buf_idx] = pstr->trans[ch];
563 }
564
565 pstr->valid_len = buf_idx;
566 pstr->valid_raw_len = buf_idx;
567 }
568
569 /* This function re-construct the buffers.
570 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571 convert to upper case in case of REG_ICASE, apply translation. */
572
573 static reg_errcode_t
574 internal_function
re_string_reconstruct(re_string_t * pstr,Idx idx,int eflags)575 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
576 {
577 Idx offset;
578
579 if (BE (pstr->raw_mbs_idx <= idx, 0))
580 offset = idx - pstr->raw_mbs_idx;
581 else
582 {
583 /* Reset buffer. */
584 #ifdef RE_ENABLE_I18N
585 if (pstr->mb_cur_max > 1)
586 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
587 #endif /* RE_ENABLE_I18N */
588 pstr->len = pstr->raw_len;
589 pstr->stop = pstr->raw_stop;
590 pstr->valid_len = 0;
591 pstr->raw_mbs_idx = 0;
592 pstr->valid_raw_len = 0;
593 pstr->offsets_needed = 0;
594 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
595 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
596 if (!pstr->mbs_allocated)
597 pstr->mbs = (unsigned char *) pstr->raw_mbs;
598 offset = idx;
599 }
600
601 if (BE (offset != 0, 1))
602 {
603 /* Should the already checked characters be kept? */
604 if (BE (offset < pstr->valid_raw_len, 1))
605 {
606 /* Yes, move them to the front of the buffer. */
607 #ifdef RE_ENABLE_I18N
608 if (BE (pstr->offsets_needed, 0))
609 {
610 Idx low = 0, high = pstr->valid_len, mid;
611 do
612 {
613 mid = (high + low) / 2;
614 if (pstr->offsets[mid] > offset)
615 high = mid;
616 else if (pstr->offsets[mid] < offset)
617 low = mid + 1;
618 else
619 break;
620 }
621 while (low < high);
622 if (pstr->offsets[mid] < offset)
623 ++mid;
624 pstr->tip_context = re_string_context_at (pstr, mid - 1,
625 eflags);
626 /* This can be quite complicated, so handle specially
627 only the common and easy case where the character with
628 different length representation of lower and upper
629 case is present at or after offset. */
630 if (pstr->valid_len > offset
631 && mid == offset && pstr->offsets[mid] == offset)
632 {
633 memmove (pstr->wcs, pstr->wcs + offset,
634 (pstr->valid_len - offset) * sizeof (wint_t));
635 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636 pstr->valid_len -= offset;
637 pstr->valid_raw_len -= offset;
638 for (low = 0; low < pstr->valid_len; low++)
639 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640 }
641 else
642 {
643 /* Otherwise, just find out how long the partial multibyte
644 character at offset is and fill it with WEOF/255. */
645 pstr->len = pstr->raw_len - idx + offset;
646 pstr->stop = pstr->raw_stop - idx + offset;
647 pstr->offsets_needed = 0;
648 while (mid > 0 && pstr->offsets[mid - 1] == offset)
649 --mid;
650 while (mid < pstr->valid_len)
651 if (pstr->wcs[mid] != WEOF)
652 break;
653 else
654 ++mid;
655 if (mid == pstr->valid_len)
656 pstr->valid_len = 0;
657 else
658 {
659 pstr->valid_len = pstr->offsets[mid] - offset;
660 if (pstr->valid_len)
661 {
662 for (low = 0; low < pstr->valid_len; ++low)
663 pstr->wcs[low] = WEOF;
664 memset (pstr->mbs, 255, pstr->valid_len);
665 }
666 }
667 pstr->valid_raw_len = pstr->valid_len;
668 }
669 }
670 else
671 #endif
672 {
673 pstr->tip_context = re_string_context_at (pstr, offset - 1,
674 eflags);
675 #ifdef RE_ENABLE_I18N
676 if (pstr->mb_cur_max > 1)
677 memmove (pstr->wcs, pstr->wcs + offset,
678 (pstr->valid_len - offset) * sizeof (wint_t));
679 #endif /* RE_ENABLE_I18N */
680 if (BE (pstr->mbs_allocated, 0))
681 memmove (pstr->mbs, pstr->mbs + offset,
682 pstr->valid_len - offset);
683 pstr->valid_len -= offset;
684 pstr->valid_raw_len -= offset;
685 #if DEBUG
686 assert (pstr->valid_len > 0);
687 #endif
688 }
689 }
690 else
691 {
692 #ifdef RE_ENABLE_I18N
693 /* No, skip all characters until IDX. */
694 Idx prev_valid_len = pstr->valid_len;
695
696 if (BE (pstr->offsets_needed, 0))
697 {
698 pstr->len = pstr->raw_len - idx + offset;
699 pstr->stop = pstr->raw_stop - idx + offset;
700 pstr->offsets_needed = 0;
701 }
702 #endif
703 pstr->valid_len = 0;
704 #ifdef RE_ENABLE_I18N
705 if (pstr->mb_cur_max > 1)
706 {
707 Idx wcs_idx;
708 wint_t wc = WEOF;
709
710 if (pstr->is_utf8)
711 {
712 const unsigned char *raw, *p, *end;
713
714 /* Special case UTF-8. Multi-byte chars start with any
715 byte other than 0x80 - 0xbf. */
716 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
717 end = raw + (offset - pstr->mb_cur_max);
718 if (end < pstr->raw_mbs)
719 end = pstr->raw_mbs;
720 p = raw + offset - 1;
721 #ifdef _LIBC
722 /* We know the wchar_t encoding is UCS4, so for the simple
723 case, ASCII characters, skip the conversion step. */
724 if (isascii (*p) && BE (pstr->trans == NULL, 1))
725 {
726 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
727 /* pstr->valid_len = 0; */
728 wc = (wchar_t) *p;
729 }
730 else
731 #endif
732 for (; p >= end; --p)
733 if ((*p & 0xc0) != 0x80)
734 {
735 mbstate_t cur_state;
736 wchar_t wc2;
737 Idx mlen = raw + pstr->len - p;
738 unsigned char buf[6];
739 size_t mbclen;
740
741 if (BE (pstr->trans != NULL, 0))
742 {
743 int i = mlen < 6 ? mlen : 6;
744 while (--i >= 0)
745 buf[i] = pstr->trans[p[i]];
746 }
747 /* XXX Don't use mbrtowc, we know which conversion
748 to use (UTF-8 -> UCS4). */
749 memset (&cur_state, 0, sizeof (cur_state));
750 mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
751 &cur_state);
752 if (raw + offset - p <= mbclen
753 && mbclen < (size_t) -2)
754 {
755 memset (&pstr->cur_state, '\0',
756 sizeof (mbstate_t));
757 pstr->valid_len = mbclen - (raw + offset - p);
758 wc = wc2;
759 }
760 break;
761 }
762 }
763
764 if (wc == WEOF)
765 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
766 if (wc == WEOF)
767 pstr->tip_context
768 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
769 else
770 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
771 && IS_WIDE_WORD_CHAR (wc))
772 ? CONTEXT_WORD
773 : ((IS_WIDE_NEWLINE (wc)
774 && pstr->newline_anchor)
775 ? CONTEXT_NEWLINE : 0));
776 if (BE (pstr->valid_len, 0))
777 {
778 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
779 pstr->wcs[wcs_idx] = WEOF;
780 if (pstr->mbs_allocated)
781 memset (pstr->mbs, 255, pstr->valid_len);
782 }
783 pstr->valid_raw_len = pstr->valid_len;
784 }
785 else
786 #endif /* RE_ENABLE_I18N */
787 {
788 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
789 pstr->valid_raw_len = 0;
790 if (pstr->trans)
791 c = pstr->trans[c];
792 pstr->tip_context = (bitset_contain (pstr->word_char, c)
793 ? CONTEXT_WORD
794 : ((IS_NEWLINE (c) && pstr->newline_anchor)
795 ? CONTEXT_NEWLINE : 0));
796 }
797 }
798 if (!BE (pstr->mbs_allocated, 0))
799 pstr->mbs += offset;
800 }
801 pstr->raw_mbs_idx = idx;
802 pstr->len -= offset;
803 pstr->stop -= offset;
804
805 /* Then build the buffers. */
806 #ifdef RE_ENABLE_I18N
807 if (pstr->mb_cur_max > 1)
808 {
809 if (pstr->icase)
810 {
811 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
812 if (BE (ret != REG_NOERROR, 0))
813 return ret;
814 }
815 else
816 build_wcs_buffer (pstr);
817 }
818 else
819 #endif /* RE_ENABLE_I18N */
820 if (BE (pstr->mbs_allocated, 0))
821 {
822 if (pstr->icase)
823 build_upper_buffer (pstr);
824 else if (pstr->trans != NULL)
825 re_string_translate_buffer (pstr);
826 }
827 else
828 pstr->valid_len = pstr->len;
829
830 pstr->cur_idx = 0;
831 return REG_NOERROR;
832 }
833
834 static unsigned char
internal_function(pure)835 internal_function __attribute ((pure))
836 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
837 {
838 int ch;
839 Idx off;
840
841 /* Handle the common (easiest) cases first. */
842 if (BE (!pstr->mbs_allocated, 1))
843 return re_string_peek_byte (pstr, idx);
844
845 #ifdef RE_ENABLE_I18N
846 if (pstr->mb_cur_max > 1
847 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
848 return re_string_peek_byte (pstr, idx);
849 #endif
850
851 off = pstr->cur_idx + idx;
852 #ifdef RE_ENABLE_I18N
853 if (pstr->offsets_needed)
854 off = pstr->offsets[off];
855 #endif
856
857 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
858
859 #ifdef RE_ENABLE_I18N
860 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
861 this function returns CAPITAL LETTER I instead of first byte of
862 DOTLESS SMALL LETTER I. The latter would confuse the parser,
863 since peek_byte_case doesn't advance cur_idx in any way. */
864 if (pstr->offsets_needed && !isascii (ch))
865 return re_string_peek_byte (pstr, idx);
866 #endif
867
868 return ch;
869 }
870
871 static unsigned char
internal_function(pure)872 internal_function __attribute ((pure))
873 re_string_fetch_byte_case (re_string_t *pstr)
874 {
875 if (BE (!pstr->mbs_allocated, 1))
876 return re_string_fetch_byte (pstr);
877
878 #ifdef RE_ENABLE_I18N
879 if (pstr->offsets_needed)
880 {
881 Idx off;
882 int ch;
883
884 /* For tr_TR.UTF-8 [[:islower:]] there is
885 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
886 in that case the whole multi-byte character and return
887 the original letter. On the other side, with
888 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889 anything else would complicate things too much. */
890
891 if (!re_string_first_byte (pstr, pstr->cur_idx))
892 return re_string_fetch_byte (pstr);
893
894 off = pstr->offsets[pstr->cur_idx];
895 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896
897 if (! isascii (ch))
898 return re_string_fetch_byte (pstr);
899
900 re_string_skip_bytes (pstr,
901 re_string_char_size_at (pstr, pstr->cur_idx));
902 return ch;
903 }
904 #endif
905
906 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907 }
908
909 static void
910 internal_function
re_string_destruct(re_string_t * pstr)911 re_string_destruct (re_string_t *pstr)
912 {
913 #ifdef RE_ENABLE_I18N
914 re_free (pstr->wcs);
915 re_free (pstr->offsets);
916 #endif /* RE_ENABLE_I18N */
917 if (pstr->mbs_allocated)
918 re_free (pstr->mbs);
919 }
920
921 /* Return the context at IDX in INPUT. */
922
923 static unsigned int
924 internal_function
re_string_context_at(const re_string_t * input,Idx idx,int eflags)925 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
926 {
927 int c;
928 if (BE (! REG_VALID_INDEX (idx), 0))
929 /* In this case, we use the value stored in input->tip_context,
930 since we can't know the character in input->mbs[-1] here. */
931 return input->tip_context;
932 if (BE (idx == input->len, 0))
933 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935 #ifdef RE_ENABLE_I18N
936 if (input->mb_cur_max > 1)
937 {
938 wint_t wc;
939 Idx wc_idx = idx;
940 while(input->wcs[wc_idx] == WEOF)
941 {
942 #ifdef DEBUG
943 /* It must not happen. */
944 assert (REG_VALID_INDEX (wc_idx));
945 #endif
946 --wc_idx;
947 if (! REG_VALID_INDEX (wc_idx))
948 return input->tip_context;
949 }
950 wc = input->wcs[wc_idx];
951 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952 return CONTEXT_WORD;
953 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954 ? CONTEXT_NEWLINE : 0);
955 }
956 else
957 #endif
958 {
959 c = re_string_byte_at (input, idx);
960 if (bitset_contain (input->word_char, c))
961 return CONTEXT_WORD;
962 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963 }
964 }
965
966 /* Functions for set operation. */
967
968 static reg_errcode_t
969 internal_function
re_node_set_alloc(re_node_set * set,Idx size)970 re_node_set_alloc (re_node_set *set, Idx size)
971 {
972 set->alloc = size;
973 set->nelem = 0;
974 set->elems = re_malloc (Idx, size);
975 if (BE (set->elems == NULL, 0))
976 return REG_ESPACE;
977 return REG_NOERROR;
978 }
979
980 static reg_errcode_t
981 internal_function
re_node_set_init_1(re_node_set * set,Idx elem)982 re_node_set_init_1 (re_node_set *set, Idx elem)
983 {
984 set->alloc = 1;
985 set->nelem = 1;
986 set->elems = re_malloc (Idx, 1);
987 if (BE (set->elems == NULL, 0))
988 {
989 set->alloc = set->nelem = 0;
990 return REG_ESPACE;
991 }
992 set->elems[0] = elem;
993 return REG_NOERROR;
994 }
995
996 static reg_errcode_t
997 internal_function
re_node_set_init_2(re_node_set * set,Idx elem1,Idx elem2)998 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
999 {
1000 set->alloc = 2;
1001 set->elems = re_malloc (Idx, 2);
1002 if (BE (set->elems == NULL, 0))
1003 return REG_ESPACE;
1004 if (elem1 == elem2)
1005 {
1006 set->nelem = 1;
1007 set->elems[0] = elem1;
1008 }
1009 else
1010 {
1011 set->nelem = 2;
1012 if (elem1 < elem2)
1013 {
1014 set->elems[0] = elem1;
1015 set->elems[1] = elem2;
1016 }
1017 else
1018 {
1019 set->elems[0] = elem2;
1020 set->elems[1] = elem1;
1021 }
1022 }
1023 return REG_NOERROR;
1024 }
1025
1026 static reg_errcode_t
1027 internal_function
re_node_set_init_copy(re_node_set * dest,const re_node_set * src)1028 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1029 {
1030 dest->nelem = src->nelem;
1031 if (src->nelem > 0)
1032 {
1033 dest->alloc = dest->nelem;
1034 dest->elems = re_malloc (Idx, dest->alloc);
1035 if (BE (dest->elems == NULL, 0))
1036 {
1037 dest->alloc = dest->nelem = 0;
1038 return REG_ESPACE;
1039 }
1040 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1041 }
1042 else
1043 re_node_set_init_empty (dest);
1044 return REG_NOERROR;
1045 }
1046
1047 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1048 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1049 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1050
1051 static reg_errcode_t
1052 internal_function
re_node_set_add_intersect(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1053 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1054 const re_node_set *src2)
1055 {
1056 Idx i1, i2, is, id, delta, sbase;
1057 if (src1->nelem == 0 || src2->nelem == 0)
1058 return REG_NOERROR;
1059
1060 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1061 conservative estimate. */
1062 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1063 {
1064 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1065 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1066 if (BE (new_elems == NULL, 0))
1067 return REG_ESPACE;
1068 dest->elems = new_elems;
1069 dest->alloc = new_alloc;
1070 }
1071
1072 /* Find the items in the intersection of SRC1 and SRC2, and copy
1073 into the top of DEST those that are not already in DEST itself. */
1074 sbase = dest->nelem + src1->nelem + src2->nelem;
1075 i1 = src1->nelem - 1;
1076 i2 = src2->nelem - 1;
1077 id = dest->nelem - 1;
1078 for (;;)
1079 {
1080 if (src1->elems[i1] == src2->elems[i2])
1081 {
1082 /* Try to find the item in DEST. Maybe we could binary search? */
1083 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1084 --id;
1085
1086 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1087 dest->elems[--sbase] = src1->elems[i1];
1088
1089 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1090 break;
1091 }
1092
1093 /* Lower the highest of the two items. */
1094 else if (src1->elems[i1] < src2->elems[i2])
1095 {
1096 if (! REG_VALID_INDEX (--i2))
1097 break;
1098 }
1099 else
1100 {
1101 if (! REG_VALID_INDEX (--i1))
1102 break;
1103 }
1104 }
1105
1106 id = dest->nelem - 1;
1107 is = dest->nelem + src1->nelem + src2->nelem - 1;
1108 delta = is - sbase + 1;
1109
1110 /* Now copy. When DELTA becomes zero, the remaining
1111 DEST elements are already in place; this is more or
1112 less the same loop that is in re_node_set_merge. */
1113 dest->nelem += delta;
1114 if (delta > 0 && REG_VALID_INDEX (id))
1115 for (;;)
1116 {
1117 if (dest->elems[is] > dest->elems[id])
1118 {
1119 /* Copy from the top. */
1120 dest->elems[id + delta--] = dest->elems[is--];
1121 if (delta == 0)
1122 break;
1123 }
1124 else
1125 {
1126 /* Slide from the bottom. */
1127 dest->elems[id + delta] = dest->elems[id];
1128 if (! REG_VALID_INDEX (--id))
1129 break;
1130 }
1131 }
1132
1133 /* Copy remaining SRC elements. */
1134 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1135
1136 return REG_NOERROR;
1137 }
1138
1139 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1140 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1141
1142 static reg_errcode_t
1143 internal_function
re_node_set_init_union(re_node_set * dest,const re_node_set * src1,const re_node_set * src2)1144 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1145 const re_node_set *src2)
1146 {
1147 Idx i1, i2, id;
1148 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1149 {
1150 dest->alloc = src1->nelem + src2->nelem;
1151 dest->elems = re_malloc (Idx, dest->alloc);
1152 if (BE (dest->elems == NULL, 0))
1153 return REG_ESPACE;
1154 }
1155 else
1156 {
1157 if (src1 != NULL && src1->nelem > 0)
1158 return re_node_set_init_copy (dest, src1);
1159 else if (src2 != NULL && src2->nelem > 0)
1160 return re_node_set_init_copy (dest, src2);
1161 else
1162 re_node_set_init_empty (dest);
1163 return REG_NOERROR;
1164 }
1165 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1166 {
1167 if (src1->elems[i1] > src2->elems[i2])
1168 {
1169 dest->elems[id++] = src2->elems[i2++];
1170 continue;
1171 }
1172 if (src1->elems[i1] == src2->elems[i2])
1173 ++i2;
1174 dest->elems[id++] = src1->elems[i1++];
1175 }
1176 if (i1 < src1->nelem)
1177 {
1178 memcpy (dest->elems + id, src1->elems + i1,
1179 (src1->nelem - i1) * sizeof (Idx));
1180 id += src1->nelem - i1;
1181 }
1182 else if (i2 < src2->nelem)
1183 {
1184 memcpy (dest->elems + id, src2->elems + i2,
1185 (src2->nelem - i2) * sizeof (Idx));
1186 id += src2->nelem - i2;
1187 }
1188 dest->nelem = id;
1189 return REG_NOERROR;
1190 }
1191
1192 /* Calculate the union set of the sets DEST and SRC. And store it to
1193 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1194
1195 static reg_errcode_t
1196 internal_function
re_node_set_merge(re_node_set * dest,const re_node_set * src)1197 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1198 {
1199 Idx is, id, sbase, delta;
1200 if (src == NULL || src->nelem == 0)
1201 return REG_NOERROR;
1202 if (dest->alloc < 2 * src->nelem + dest->nelem)
1203 {
1204 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1205 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1206 if (BE (new_buffer == NULL, 0))
1207 return REG_ESPACE;
1208 dest->elems = new_buffer;
1209 dest->alloc = new_alloc;
1210 }
1211
1212 if (BE (dest->nelem == 0, 0))
1213 {
1214 dest->nelem = src->nelem;
1215 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1216 return REG_NOERROR;
1217 }
1218
1219 /* Copy into the top of DEST the items of SRC that are not
1220 found in DEST. Maybe we could binary search in DEST? */
1221 for (sbase = dest->nelem + 2 * src->nelem,
1222 is = src->nelem - 1, id = dest->nelem - 1;
1223 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1224 {
1225 if (dest->elems[id] == src->elems[is])
1226 is--, id--;
1227 else if (dest->elems[id] < src->elems[is])
1228 dest->elems[--sbase] = src->elems[is--];
1229 else /* if (dest->elems[id] > src->elems[is]) */
1230 --id;
1231 }
1232
1233 if (REG_VALID_INDEX (is))
1234 {
1235 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1236 sbase -= is + 1;
1237 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1238 }
1239
1240 id = dest->nelem - 1;
1241 is = dest->nelem + 2 * src->nelem - 1;
1242 delta = is - sbase + 1;
1243 if (delta == 0)
1244 return REG_NOERROR;
1245
1246 /* Now copy. When DELTA becomes zero, the remaining
1247 DEST elements are already in place. */
1248 dest->nelem += delta;
1249 for (;;)
1250 {
1251 if (dest->elems[is] > dest->elems[id])
1252 {
1253 /* Copy from the top. */
1254 dest->elems[id + delta--] = dest->elems[is--];
1255 if (delta == 0)
1256 break;
1257 }
1258 else
1259 {
1260 /* Slide from the bottom. */
1261 dest->elems[id + delta] = dest->elems[id];
1262 if (! REG_VALID_INDEX (--id))
1263 {
1264 /* Copy remaining SRC elements. */
1265 memcpy (dest->elems, dest->elems + sbase,
1266 delta * sizeof (Idx));
1267 break;
1268 }
1269 }
1270 }
1271
1272 return REG_NOERROR;
1273 }
1274
1275 /* Insert the new element ELEM to the re_node_set* SET.
1276 SET should not already have ELEM.
1277 Return true if successful. */
1278
1279 static bool
1280 internal_function
re_node_set_insert(re_node_set * set,Idx elem)1281 re_node_set_insert (re_node_set *set, Idx elem)
1282 {
1283 Idx idx;
1284 /* In case the set is empty. */
1285 if (set->alloc == 0)
1286 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1287
1288 if (BE (set->nelem, 0) == 0)
1289 {
1290 /* We already guaranteed above that set->alloc != 0. */
1291 set->elems[0] = elem;
1292 ++set->nelem;
1293 return true;
1294 }
1295
1296 /* Realloc if we need. */
1297 if (set->alloc == set->nelem)
1298 {
1299 Idx *new_elems;
1300 set->alloc = set->alloc * 2;
1301 new_elems = re_realloc (set->elems, Idx, set->alloc);
1302 if (BE (new_elems == NULL, 0))
1303 return false;
1304 set->elems = new_elems;
1305 }
1306
1307 /* Move the elements which follows the new element. Test the
1308 first element separately to skip a check in the inner loop. */
1309 if (elem < set->elems[0])
1310 {
1311 idx = 0;
1312 for (idx = set->nelem; idx > 0; idx--)
1313 set->elems[idx] = set->elems[idx - 1];
1314 }
1315 else
1316 {
1317 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1318 set->elems[idx] = set->elems[idx - 1];
1319 }
1320
1321 /* Insert the new element. */
1322 set->elems[idx] = elem;
1323 ++set->nelem;
1324 return true;
1325 }
1326
1327 /* Insert the new element ELEM to the re_node_set* SET.
1328 SET should not already have any element greater than or equal to ELEM.
1329 Return true if successful. */
1330
1331 static bool
1332 internal_function
re_node_set_insert_last(re_node_set * set,Idx elem)1333 re_node_set_insert_last (re_node_set *set, Idx elem)
1334 {
1335 /* Realloc if we need. */
1336 if (set->alloc == set->nelem)
1337 {
1338 Idx *new_elems;
1339 set->alloc = (set->alloc + 1) * 2;
1340 new_elems = re_realloc (set->elems, Idx, set->alloc);
1341 if (BE (new_elems == NULL, 0))
1342 return false;
1343 set->elems = new_elems;
1344 }
1345
1346 /* Insert the new element. */
1347 set->elems[set->nelem++] = elem;
1348 return true;
1349 }
1350
1351 /* Compare two node sets SET1 and SET2.
1352 Return true if SET1 and SET2 are equivalent. */
1353
1354 static bool
internal_function(pure)1355 internal_function __attribute ((pure))
1356 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1357 {
1358 Idx i;
1359 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1360 return false;
1361 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1362 if (set1->elems[i] != set2->elems[i])
1363 return false;
1364 return true;
1365 }
1366
1367 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1368
1369 static Idx
internal_function(pure)1370 internal_function __attribute ((pure))
1371 re_node_set_contains (const re_node_set *set, Idx elem)
1372 {
1373 __re_size_t idx, right, mid;
1374 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1375 return 0;
1376
1377 /* Binary search the element. */
1378 idx = 0;
1379 right = set->nelem - 1;
1380 while (idx < right)
1381 {
1382 mid = (idx + right) / 2;
1383 if (set->elems[mid] < elem)
1384 idx = mid + 1;
1385 else
1386 right = mid;
1387 }
1388 return set->elems[idx] == elem ? idx + 1 : 0;
1389 }
1390
1391 static void
1392 internal_function
re_node_set_remove_at(re_node_set * set,Idx idx)1393 re_node_set_remove_at (re_node_set *set, Idx idx)
1394 {
1395 if (idx < 0 || idx >= set->nelem)
1396 return;
1397 --set->nelem;
1398 for (; idx < set->nelem; idx++)
1399 set->elems[idx] = set->elems[idx + 1];
1400 }
1401
1402
1403 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1404 Or return REG_MISSING if an error occurred. */
1405
1406 static Idx
1407 internal_function
re_dfa_add_node(re_dfa_t * dfa,re_token_t token)1408 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1409 {
1410 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1411 {
1412 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1413 Idx *new_nexts, *new_indices;
1414 re_node_set *new_edests, *new_eclosures;
1415 re_token_t *new_nodes;
1416 size_t max_object_size =
1417 MAX (sizeof (re_token_t),
1418 MAX (sizeof (re_node_set),
1419 sizeof (Idx)));
1420
1421 /* Avoid overflows. */
1422 if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
1423 return REG_MISSING;
1424
1425 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1426 if (BE (new_nodes == NULL, 0))
1427 return REG_MISSING;
1428 dfa->nodes = new_nodes;
1429 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1430 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1431 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1432 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1433 if (BE (new_nexts == NULL || new_indices == NULL
1434 || new_edests == NULL || new_eclosures == NULL, 0))
1435 return REG_MISSING;
1436 dfa->nexts = new_nexts;
1437 dfa->org_indices = new_indices;
1438 dfa->edests = new_edests;
1439 dfa->eclosures = new_eclosures;
1440 dfa->nodes_alloc = new_nodes_alloc;
1441 }
1442 dfa->nodes[dfa->nodes_len] = token;
1443 dfa->nodes[dfa->nodes_len].constraint = 0;
1444 #ifdef RE_ENABLE_I18N
1445 {
1446 int type = token.type;
1447 dfa->nodes[dfa->nodes_len].accept_mb =
1448 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1449 }
1450 #endif
1451 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1452 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1454 return dfa->nodes_len++;
1455 }
1456
1457 static inline re_hashval_t
1458 internal_function
calc_state_hash(const re_node_set * nodes,unsigned int context)1459 calc_state_hash (const re_node_set *nodes, unsigned int context)
1460 {
1461 re_hashval_t hash = nodes->nelem + context;
1462 Idx i;
1463 for (i = 0 ; i < nodes->nelem ; i++)
1464 hash += nodes->elems[i];
1465 return hash;
1466 }
1467
1468 /* Search for the state whose node_set is equivalent to NODES.
1469 Return the pointer to the state, if we found it in the DFA.
1470 Otherwise create the new one and return it. In case of an error
1471 return NULL and set the error code in ERR.
1472 Note: - We assume NULL as the invalid state, then it is possible that
1473 return value is NULL and ERR is REG_NOERROR.
1474 - We never return non-NULL value in case of any errors, it is for
1475 optimization. */
1476
1477 static re_dfastate_t *
1478 internal_function
re_acquire_state(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes)1479 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1480 const re_node_set *nodes)
1481 {
1482 re_hashval_t hash;
1483 re_dfastate_t *new_state;
1484 struct re_state_table_entry *spot;
1485 Idx i;
1486 #ifdef lint
1487 /* Suppress bogus uninitialized-variable warnings. */
1488 *err = REG_NOERROR;
1489 #endif
1490 if (BE (nodes->nelem == 0, 0))
1491 {
1492 *err = REG_NOERROR;
1493 return NULL;
1494 }
1495 hash = calc_state_hash (nodes, 0);
1496 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1497
1498 for (i = 0 ; i < spot->num ; i++)
1499 {
1500 re_dfastate_t *state = spot->array[i];
1501 if (hash != state->hash)
1502 continue;
1503 if (re_node_set_compare (&state->nodes, nodes))
1504 return state;
1505 }
1506
1507 /* There are no appropriate state in the dfa, create the new one. */
1508 new_state = create_ci_newstate (dfa, nodes, hash);
1509 if (BE (new_state == NULL, 0))
1510 *err = REG_ESPACE;
1511
1512 return new_state;
1513 }
1514
1515 /* Search for the state whose node_set is equivalent to NODES and
1516 whose context is equivalent to CONTEXT.
1517 Return the pointer to the state, if we found it in the DFA.
1518 Otherwise create the new one and return it. In case of an error
1519 return NULL and set the error code in ERR.
1520 Note: - We assume NULL as the invalid state, then it is possible that
1521 return value is NULL and ERR is REG_NOERROR.
1522 - We never return non-NULL value in case of any errors, it is for
1523 optimization. */
1524
1525 static re_dfastate_t *
1526 internal_function
re_acquire_state_context(reg_errcode_t * err,const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context)1527 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1528 const re_node_set *nodes, unsigned int context)
1529 {
1530 re_hashval_t hash;
1531 re_dfastate_t *new_state;
1532 struct re_state_table_entry *spot;
1533 Idx i;
1534 #ifdef lint
1535 /* Suppress bogus uninitialized-variable warnings. */
1536 *err = REG_NOERROR;
1537 #endif
1538 if (nodes->nelem == 0)
1539 {
1540 *err = REG_NOERROR;
1541 return NULL;
1542 }
1543 hash = calc_state_hash (nodes, context);
1544 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1545
1546 for (i = 0 ; i < spot->num ; i++)
1547 {
1548 re_dfastate_t *state = spot->array[i];
1549 if (state->hash == hash
1550 && state->context == context
1551 && re_node_set_compare (state->entrance_nodes, nodes))
1552 return state;
1553 }
1554 /* There are no appropriate state in `dfa', create the new one. */
1555 new_state = create_cd_newstate (dfa, nodes, context, hash);
1556 if (BE (new_state == NULL, 0))
1557 *err = REG_ESPACE;
1558
1559 return new_state;
1560 }
1561
1562 /* Finish initialization of the new state NEWSTATE, and using its hash value
1563 HASH put in the appropriate bucket of DFA's state table. Return value
1564 indicates the error code if failed. */
1565
1566 static reg_errcode_t
register_state(const re_dfa_t * dfa,re_dfastate_t * newstate,re_hashval_t hash)1567 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568 re_hashval_t hash)
1569 {
1570 struct re_state_table_entry *spot;
1571 reg_errcode_t err;
1572 Idx i;
1573
1574 newstate->hash = hash;
1575 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576 if (BE (err != REG_NOERROR, 0))
1577 return REG_ESPACE;
1578 for (i = 0; i < newstate->nodes.nelem; i++)
1579 {
1580 Idx elem = newstate->nodes.elems[i];
1581 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582 if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
1583 return REG_ESPACE;
1584 }
1585
1586 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587 if (BE (spot->alloc <= spot->num, 0))
1588 {
1589 Idx new_alloc = 2 * spot->num + 2;
1590 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591 new_alloc);
1592 if (BE (new_array == NULL, 0))
1593 return REG_ESPACE;
1594 spot->array = new_array;
1595 spot->alloc = new_alloc;
1596 }
1597 spot->array[spot->num++] = newstate;
1598 return REG_NOERROR;
1599 }
1600
1601 static void
free_state(re_dfastate_t * state)1602 free_state (re_dfastate_t *state)
1603 {
1604 re_node_set_free (&state->non_eps_nodes);
1605 re_node_set_free (&state->inveclosure);
1606 if (state->entrance_nodes != &state->nodes)
1607 {
1608 re_node_set_free (state->entrance_nodes);
1609 re_free (state->entrance_nodes);
1610 }
1611 re_node_set_free (&state->nodes);
1612 re_free (state->word_trtable);
1613 re_free (state->trtable);
1614 re_free (state);
1615 }
1616
1617 /* Create the new state which is independ of contexts.
1618 Return the new state if succeeded, otherwise return NULL. */
1619
1620 static re_dfastate_t *
1621 internal_function
create_ci_newstate(const re_dfa_t * dfa,const re_node_set * nodes,re_hashval_t hash)1622 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623 re_hashval_t hash)
1624 {
1625 Idx i;
1626 reg_errcode_t err;
1627 re_dfastate_t *newstate;
1628
1629 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630 if (BE (newstate == NULL, 0))
1631 return NULL;
1632 err = re_node_set_init_copy (&newstate->nodes, nodes);
1633 if (BE (err != REG_NOERROR, 0))
1634 {
1635 re_free (newstate);
1636 return NULL;
1637 }
1638
1639 newstate->entrance_nodes = &newstate->nodes;
1640 for (i = 0 ; i < nodes->nelem ; i++)
1641 {
1642 re_token_t *node = dfa->nodes + nodes->elems[i];
1643 re_token_type_t type = node->type;
1644 if (type == CHARACTER && !node->constraint)
1645 continue;
1646 #ifdef RE_ENABLE_I18N
1647 newstate->accept_mb |= node->accept_mb;
1648 #endif /* RE_ENABLE_I18N */
1649
1650 /* If the state has the halt node, the state is a halt state. */
1651 if (type == END_OF_RE)
1652 newstate->halt = 1;
1653 else if (type == OP_BACK_REF)
1654 newstate->has_backref = 1;
1655 else if (type == ANCHOR || node->constraint)
1656 newstate->has_constraint = 1;
1657 }
1658 err = register_state (dfa, newstate, hash);
1659 if (BE (err != REG_NOERROR, 0))
1660 {
1661 free_state (newstate);
1662 newstate = NULL;
1663 }
1664 return newstate;
1665 }
1666
1667 /* Create the new state which is depend on the context CONTEXT.
1668 Return the new state if succeeded, otherwise return NULL. */
1669
1670 static re_dfastate_t *
1671 internal_function
create_cd_newstate(const re_dfa_t * dfa,const re_node_set * nodes,unsigned int context,re_hashval_t hash)1672 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673 unsigned int context, re_hashval_t hash)
1674 {
1675 Idx i, nctx_nodes = 0;
1676 reg_errcode_t err;
1677 re_dfastate_t *newstate;
1678
1679 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680 if (BE (newstate == NULL, 0))
1681 return NULL;
1682 err = re_node_set_init_copy (&newstate->nodes, nodes);
1683 if (BE (err != REG_NOERROR, 0))
1684 {
1685 re_free (newstate);
1686 return NULL;
1687 }
1688
1689 newstate->context = context;
1690 newstate->entrance_nodes = &newstate->nodes;
1691
1692 for (i = 0 ; i < nodes->nelem ; i++)
1693 {
1694 re_token_t *node = dfa->nodes + nodes->elems[i];
1695 re_token_type_t type = node->type;
1696 unsigned int constraint = node->constraint;
1697
1698 if (type == CHARACTER && !constraint)
1699 continue;
1700 #ifdef RE_ENABLE_I18N
1701 newstate->accept_mb |= node->accept_mb;
1702 #endif /* RE_ENABLE_I18N */
1703
1704 /* If the state has the halt node, the state is a halt state. */
1705 if (type == END_OF_RE)
1706 newstate->halt = 1;
1707 else if (type == OP_BACK_REF)
1708 newstate->has_backref = 1;
1709
1710 if (constraint)
1711 {
1712 if (newstate->entrance_nodes == &newstate->nodes)
1713 {
1714 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1715 if (BE (newstate->entrance_nodes == NULL, 0))
1716 {
1717 free_state (newstate);
1718 return NULL;
1719 }
1720 re_node_set_init_copy (newstate->entrance_nodes, nodes);
1721 nctx_nodes = 0;
1722 newstate->has_constraint = 1;
1723 }
1724
1725 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1726 {
1727 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1728 ++nctx_nodes;
1729 }
1730 }
1731 }
1732 err = register_state (dfa, newstate, hash);
1733 if (BE (err != REG_NOERROR, 0))
1734 {
1735 free_state (newstate);
1736 newstate = NULL;
1737 }
1738 return newstate;
1739 }
1740