1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 #ifdef HAVE_CONFIG_H
42 #include "config.h"
43 #endif
44
45 #include <stdio.h>
46 #include <string.h>
47
48 #define PCRE2_CODE_UNIT_WIDTH 0
49 #include "pcre2.h"
50
51 /*
52 Letter characters:
53 \xe6\x92\xad = 0x64ad = 25773 (kanji)
54 Non-letter characters:
55 \xc2\xa1 = 0xa1 = (Inverted Exclamation Mark)
56 \xf3\xa9\xb7\x80 = 0xe9dc0 = 957888
57 \xed\xa0\x80 = 55296 = 0xd800 (Invalid UTF character)
58 \xed\xb0\x80 = 56320 = 0xdc00 (Invalid UTF character)
59 Newlines:
\xc2\x85 = 0x85 = 133 (Next Line = NEL)
61 \xe2\x80\xa8 = 0x2028 = 8232 (Line Separator)
62 Othercase pairs:
63 \xc3\xa9 = 0xe9 = 233 (e')
64 \xc3\x89 = 0xc9 = 201 (E')
65 \xc3\xa1 = 0xe1 = 225 (a')
66 \xc3\x81 = 0xc1 = 193 (A')
67 \x53 = 0x53 = S
68 \x73 = 0x73 = s
69 \xc5\xbf = 0x17f = 383 (long S)
70 \xc8\xba = 0x23a = 570
71 \xe2\xb1\xa5 = 0x2c65 = 11365
72 \xe1\xbd\xb8 = 0x1f78 = 8056
73 \xe1\xbf\xb8 = 0x1ff8 = 8184
74 \xf0\x90\x90\x80 = 0x10400 = 66560
75 \xf0\x90\x90\xa8 = 0x10428 = 66600
76 \xc7\x84 = 0x1c4 = 452
77 \xc7\x85 = 0x1c5 = 453
78 \xc7\x86 = 0x1c6 = 454
79 Caseless sets:
80 ucp_Armenian - \x{531}-\x{556} -> \x{561}-\x{586}
81 ucp_Coptic - \x{2c80}-\x{2ce3} -> caseless: XOR 0x1
ucp_Latin - \x{ff21}-\x{ff3a} -> \x{ff41}-\x{ff5a}
83
84 Mark property:
85 \xcc\x8d = 0x30d = 781
86 Special:
87 \xc2\x80 = 0x80 = 128 (lowest 2 byte character)
88 \xdf\xbf = 0x7ff = 2047 (highest 2 byte character)
\xe0\xa0\x80 = 0x800 = 2048 (lowest 3 byte character)
90 \xef\xbf\xbf = 0xffff = 65535 (highest 3 byte character)
91 \xf0\x90\x80\x80 = 0x10000 = 65536 (lowest 4 byte character)
92 \xf4\x8f\xbf\xbf = 0x10ffff = 1114111 (highest allowed utf character)
93 */
94
/* Test suites invoked from main(). They have internal linkage, so their
   definitions live later in this translation unit. Each returns an int
   status that main() folds into the process exit status. */
static int regression_tests(void);
static int invalid_utf8_regression_tests(void);
static int invalid_utf16_regression_tests(void);
static int invalid_utf32_regression_tests(void);
99
/*
 * Test driver entry point.
 *
 * Reads the PCRE2_CONFIG_JIT setting through the first configured code-unit
 * width (8, then 16, then 32). If the reported value is zero, prints a
 * message and returns 1 without running anything. Otherwise runs every
 * regression suite and returns the bitwise OR of their results, so a
 * non-zero result from any suite produces a non-zero exit status.
 */
int main(void)
{
	int jit = 0;
	int status;

#if defined SUPPORT_PCRE2_8
	pcre2_config_8(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_16
	pcre2_config_16(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_32
	pcre2_config_32(PCRE2_CONFIG_JIT, &jit);
#endif
	if (!jit) {
		printf("JIT must be enabled to run pcre_jit_test\n");
		return 1;
	}

	/* Call the suites as separate statements. The operands of '|' are not
	   sequenced relative to each other, so combining the calls in a single
	   expression would leave the run order (and the order of any output)
	   up to the compiler. Every suite still runs even if an earlier one
	   reported a failure. */
	status = regression_tests();
	status |= invalid_utf8_regression_tests();
	status |= invalid_utf16_regression_tests();
	status |= invalid_utf32_regression_tests();
	return status;
}
119
120 /* --------------------------------------------------------------------------------------- */
121
122 #if !(defined SUPPORT_PCRE2_8) && !(defined SUPPORT_PCRE2_16) && !(defined SUPPORT_PCRE2_32)
123 #error SUPPORT_PCRE2_8 or SUPPORT_PCRE2_16 or SUPPORT_PCRE2_32 must be defined
124 #endif
125
/* Shorthands for the compile-option combinations used in the test table.
   Letters: C = CASELESS, M = MULTILINE, U = UTF, P = UCP. */
#define M	(PCRE2_MULTILINE)
#define U	(PCRE2_UTF)
#define MP	(PCRE2_MULTILINE | PCRE2_UCP)
#define MU	(PCRE2_MULTILINE | PCRE2_UTF)
#define MUP	(PCRE2_MULTILINE | PCRE2_UTF | PCRE2_UCP)
#define CM	(PCRE2_CASELESS | PCRE2_MULTILINE)
#define CMU	(PCRE2_CASELESS | PCRE2_MULTILINE | PCRE2_UTF)
#define CMUP	(PCRE2_CASELESS | PCRE2_MULTILINE | PCRE2_UTF | PCRE2_UCP)

/* Shorthand for the newline convention most table entries use. */
#define A	PCRE2_NEWLINE_ANYCRLF

/* A single int carries two settings: the newline value in the low 16 bits
   and the \R (BSR) value in the bits above. BSR() packs the upper part;
   the GET_* macros unpack each half. */
#define BSR(v)		((v) << 16)
#define GET_BSR(v)	((v) >> 16)
#define GET_NEWLINE(v)	((v) & 0xffff)

/* The table ORs these F_* flags into the start-offset field. They occupy
   bits 16 and up, above the low 16 bits covered by OFFSET_MASK.
   NOTE(review): F_NO16 and F_NO32 have the same value; the code that
   consumes them is not visible here, so confirm that this is intended. */
#define OFFSET_MASK	0xffff
#define F_NO8		(1 << 16)
#define F_NO16		(1 << 17)
#define F_NO32		(1 << 17)
#define F_NOMATCH	(1 << 18)
#define F_DIFF		(1 << 19)
#define F_FORCECONV	(1 << 20)
#define F_PROPERTY	(1 << 21)
149
/* One entry of the regression table: a pattern, the subject string it is
   matched against, and the settings to use. The table uses positional
   initializers, so the order of the fields below is significant. */
struct regression_test_case {
	/* Compile option bits, e.g. one of the MU / CMU / ... shorthands. */
	int compile_options;
	/* Newline setting, optionally OR-ed with a BSR(x) value in the upper
	   bits (see GET_NEWLINE / GET_BSR). */
	int newline;
	/* Match-time option bits (the table uses values such as PCRE2_NOTBOL). */
	int match_options;
	/* Start offset OR-ed with F_* flags; presumably the offset is recovered
	   with OFFSET_MASK - the consuming code is not visible here. */
	int start_offset;
	/* Pattern text. */
	const char *pattern;
	/* Subject text the pattern is applied to. */
	const char *input;
};
158
159 static struct regression_test_case regression_test_cases[] = {
160 /* Constant strings. */
161 { MU, A, 0, 0, "AbC", "AbAbC" },
162 { MU, A, 0, 0, "ACCEPT", "AACACCACCEACCEPACCEPTACCEPTT" },
163 { CMU, A, 0, 0, "aA#\xc3\xa9\xc3\x81", "aA#Aa#\xc3\x89\xc3\xa1" },
164 { M, A, 0, 0, "[^a]", "aAbB" },
165 { CM, A, 0, 0, "[^m]", "mMnN" },
166 { M, A, 0, 0, "a[^b][^#]", "abacd" },
167 { CM, A, 0, 0, "A[^B][^E]", "abacd" },
168 { CMU, A, 0, 0, "[^x][^#]", "XxBll" },
169 { MU, A, 0, 0, "[^a]", "aaa\xc3\xa1#Ab" },
170 { CMU, A, 0, 0, "[^A]", "aA\xe6\x92\xad" },
171 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\n+bc" },
172 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\r+bc" },
173 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\r+bc" },
174 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\n+bc" },
175 { MU, A, 0, 0, "[axd]", "sAXd" },
176 { CMU, A, 0, 0, "[axd]", "sAXd" },
177 { CMU, A, 0, 0 | F_NOMATCH, "[^axd]", "DxA" },
178 { MU, A, 0, 0, "[a-dA-C]", "\xe6\x92\xad\xc3\xa9.B" },
179 { MU, A, 0, 0, "[^a-dA-C]", "\xe6\x92\xad\xc3\xa9" },
180 { CMU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
181 { MU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
182 { MU, A, 0, 0, "[^a]", "\xc2\x80[]" },
183 { CMU, A, 0, 0, "\xf0\x90\x90\xa7", "\xf0\x90\x91\x8f" },
184 { CM, A, 0, 0, "1a2b3c4", "1a2B3c51A2B3C4" },
185 { PCRE2_CASELESS, 0, 0, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
186 { PCRE2_CASELESS, 0, 0, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
187 { PCRE2_CASELESS, 0, 0, 0, "a1", "Aa1" },
188 #ifndef NEVER_BACKSLASH_C
189 { M, A, 0, 0, "\\Ca", "cda" },
190 { CM, A, 0, 0, "\\Ca", "CDA" },
191 { M, A, 0, 0 | F_NOMATCH, "\\Cx", "cda" },
192 { CM, A, 0, 0 | F_NOMATCH, "\\Cx", "CDA" },
193 #endif /* !NEVER_BACKSLASH_C */
194 { CMUP, A, 0, 0, "\xf0\x90\x90\x80\xf0\x90\x90\xa8", "\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
195 { CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
196 { CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
197 { CMUP, A, 0, 0, "\xe1\xbd\xb8\xe1\xbf\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
198 { M, A, 0, 0, "[3-57-9]", "5" },
199 { PCRE2_AUTO_CALLOUT, A, 0, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890",
200 "12345678901234567890123456789012345678901234567890123456789012345678901234567890" },
201
202 /* Assertions. */
203 { MU, A, 0, 0, "\\b[^A]", "A_B#" },
204 { M, A, 0, 0 | F_NOMATCH, "\\b\\W", "\n*" },
205 { MU, A, 0, 0, "\\B[^,]\\b[^s]\\b", "#X" },
206 { MP, A, 0, 0, "\\B", "_\xa1" },
207 { MP, A, 0, 0 | F_PROPERTY, "\\b_\\b[,A]\\B", "_," },
208 { MUP, A, 0, 0, "\\b", "\xe6\x92\xad!" },
209 { MUP, A, 0, 0, "\\B", "_\xc2\xa1\xc3\xa1\xc2\x85" },
210 { MUP, A, 0, 0, "\\b[^A]\\B[^c]\\b[^_]\\B", "_\xc3\xa1\xe2\x80\xa8" },
211 { MUP, A, 0, 0, "\\b\\w+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
212 { MU, A, 0, 0 | F_NOMATCH, "\\b.", "\xcd\xbe" },
213 { CMUP, A, 0, 0, "\\By", "\xf0\x90\x90\xa8y" },
214 { M, A, 0, 0 | F_NOMATCH, "\\R^", "\n" },
215 { M, A, 0, 1 | F_NOMATCH, "^", "\n" },
216 { 0, 0, 0, 0, "^ab", "ab" },
217 { 0, 0, 0, 0 | F_NOMATCH, "^ab", "aab" },
218 { M, PCRE2_NEWLINE_CRLF, 0, 0, "^a", "\r\raa\n\naa\r\naa" },
219 { MU, A, 0, 0, "^-", "\xe2\x80\xa8--\xc2\x85-\r\n-" },
220 { M, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--b--\x85--" },
221 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xe2\x80\xa8--" },
222 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xc2\x85--" },
223 { 0, 0, 0, 0, "ab$", "ab" },
224 { 0, 0, 0, 0 | F_NOMATCH, "ab$", "abab\n\n" },
225 { PCRE2_DOLLAR_ENDONLY, 0, 0, 0 | F_NOMATCH, "ab$", "abab\r\n" },
226 { M, PCRE2_NEWLINE_CRLF, 0, 0, "a$", "\r\raa\n\naa\r\naa" },
227 { M, PCRE2_NEWLINE_ANY, 0, 0, "a$", "aaa" },
228 { MU, PCRE2_NEWLINE_ANYCRLF, 0, 0, "#$", "#\xc2\x85###\r#" },
229 { MU, PCRE2_NEWLINE_ANY, 0, 0, "#$", "#\xe2\x80\xa9" },
230 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0 | F_NOMATCH, "^a", "aa\naa" },
231 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0, "^a", "aa\naa" },
232 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\naa" },
233 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\r\n" },
234 { U | PCRE2_DOLLAR_ENDONLY, PCRE2_NEWLINE_ANY, 0, 0 | F_PROPERTY, "\\p{Any}{2,}$", "aa\r\n" },
235 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0, "a$", "aa\naa" },
236 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa" },
237 { U, PCRE2_NEWLINE_CR, 0, 0, "a\\Z", "aaa\r" },
238 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa\n" },
239 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r" },
240 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\n" },
241 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r\n" },
242 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
243 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
244 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
245 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
246 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
247 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
248 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
249 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
250 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
251 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xc2\x85" },
252 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
253 { M, A, 0, 0, "\\Aa", "aaa" },
254 { M, A, 0, 1 | F_NOMATCH, "\\Aa", "aaa" },
255 { M, A, 0, 1, "\\Ga", "aaa" },
256 { M, A, 0, 1 | F_NOMATCH, "\\Ga", "aba" },
257 { M, A, 0, 0, "a\\z", "aaa" },
258 { M, A, 0, 0 | F_NOMATCH, "a\\z", "aab" },
259
260 /* Brackets and alternatives. */
261 { MU, A, 0, 0, "(ab|bb|cd)", "bacde" },
262 { MU, A, 0, 0, "(?:ab|a)(bc|c)", "ababc" },
263 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|efg))", "abac" },
264 { CMU, A, 0, 0, "((aB|(Cc))|(bB)|(?:cd|EFg))", "AcCe" },
265 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|ebg))", "acebebg" },
266 { MU, A, 0, 0, "(?:(a)|(?:b))(cc|(?:d|e))(a|b)k", "accabdbbccbk" },
267 { MU, A, 0, 0, "\xc7\x82|\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
268 { MU, A, 0, 0, "=\xc7\x82|#\xc6\x82", "\xf1\x83\x82\x82=\xc7\x82\xc7\x83" },
269 { MU, A, 0, 0, "\xc7\x82\xc7\x83|\xc6\x82\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
270 { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" },
271 { U, A, 0, 0, "\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80", "\xdf\xbf\xc2\x80\xe4\x84\x80" },
272 { U, A, 0, 0, "(?:\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80)#", "\xdf\xbf\xc2\x80#\xe4\x84\x80#" },
273 { CM, A, 0, 0, "ab|cd", "CD" },
274 { CM, A, 0, 0, "a1277|a1377|bX487", "bx487" },
275 { CM, A, 0, 0, "a1277|a1377|bx487", "bX487" },
276
277 /* Greedy and non-greedy ? operators. */
278 { MU, A, 0, 0, "(?:a)?a", "laab" },
279 { CMU, A, 0, 0, "(A)?A", "llaab" },
280 { MU, A, 0, 0, "(a)?\?a", "aab" }, /* ?? is the prefix of trygraphs in GCC. */
281 { MU, A, 0, 0, "(a)?a", "manm" },
282 { CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
283 { MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
284 { MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
285
286 /* Greedy and non-greedy + operators */
287 { MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
288 { MU, A, 0, 0, "(aa)+?aa", "aaaaaaa" },
289 { MU, A, 0, 0, "(?:aba|ab|a)+l", "ababamababal" },
290 { MU, A, 0, 0, "(?:aba|ab|a)+?l", "ababamababal" },
291 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+?|ss)+e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
292 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+|ss)+?e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
293 { MU, A, 0, 0, "(?:(b(c)+?)+)?\?(?:(bc)+|(cb)+)+(?:m)+", "bccbcccbcbccbcbPbccbcccbcbccbcbmmn" },
294
295 /* Greedy and non-greedy * operators */
296 { CMU, A, 0, 0, "(?:AA)*AB", "aaaaaaamaaaaaaab" },
297 { MU, A, 0, 0, "(?:aa)*?ab", "aaaaaaamaaaaaaab" },
298 { MU, A, 0, 0, "(aa|ab)*ab", "aaabaaab" },
299 { CMU, A, 0, 0, "(aa|Ab)*?aB", "aaabaaab" },
300 { MU, A, 0, 0, "(a|b)*(?:a)*(?:b)*m", "abbbaaababanabbbaaababamm" },
301 { MU, A, 0, 0, "(a|b)*?(?:a)*?(?:b)*?m", "abbbaaababanabbbaaababamm" },
302 { M, A, 0, 0, "a(a(\\1*)a|(b)b+){0}a", "aa" },
303 { M, A, 0, 0, "((?:a|)*){0}a", "a" },
304
305 /* Combining ? + * operators */
306 { MU, A, 0, 0, "((bm)+)?\?(?:a)*(bm)+n|((am)+?)?(?:a)+(am)*n", "bmbmabmamaaamambmaman" },
307 { MU, A, 0, 0, "(((ab)?cd)*ef)+g", "abcdcdefcdefefmabcdcdefcdefefgg" },
308 { MU, A, 0, 0, "(((ab)?\?cd)*?ef)+?g", "abcdcdefcdefefmabcdcdefcdefefgg" },
309 { MU, A, 0, 0, "(?:(ab)?c|(?:ab)+?d)*g", "ababcdccababddg" },
310 { MU, A, 0, 0, "(?:(?:ab)?\?c|(ab)+d)*?g", "ababcdccababddg" },
311
312 /* Single character iterators. */
313 { MU, A, 0, 0, "(a+aab)+aaaab", "aaaabcaaaabaabcaabcaaabaaaab" },
314 { MU, A, 0, 0, "(a*a*aab)+x", "aaaaabaabaaabmaabx" },
315 { MU, A, 0, 0, "(a*?(b|ab)a*?)+x", "aaaabcxbbaabaacbaaabaabax" },
316 { MU, A, 0, 0, "(a+(ab|ad)a+)+x", "aaabaaaadaabaaabaaaadaaax" },
317 { MU, A, 0, 0, "(a?(a)a?)+(aaa)", "abaaabaaaaaaaa" },
318 { MU, A, 0, 0, "(a?\?(a)a?\?)+(b)", "aaaacaaacaacacbaaab" },
319 { MU, A, 0, 0, "(a{0,4}(b))+d", "aaaaaabaabcaaaaabaaaaabd" },
320 { MU, A, 0, 0, "(a{0,4}?[^b])+d+(a{0,4}[^b])d+", "aaaaadaaaacaadddaaddd" },
321 { MU, A, 0, 0, "(ba{2})+c", "baabaaabacbaabaac" },
322 { MU, A, 0, 0, "(a*+bc++)+", "aaabbcaaabcccab" },
323 { MU, A, 0, 0, "(a?+[^b])+", "babaacacb" },
324 { MU, A, 0, 0, "(a{0,3}+b)(a{0,3}+b)(a{0,3}+)[^c]", "abaabaaacbaabaaaac" },
325 { CMU, A, 0, 0, "([a-c]+[d-f]+?)+?g", "aBdacdehAbDaFgA" },
326 { CMU, A, 0, 0, "[c-f]+k", "DemmFke" },
327 { MU, A, 0, 0, "([DGH]{0,4}M)+", "GGDGHDGMMHMDHHGHM" },
328 { MU, A, 0, 0, "([a-c]{4,}s)+", "abasabbasbbaabsbba" },
329 { CMU, A, 0, 0, "[ace]{3,7}", "AcbDAcEEcEd" },
330 { CMU, A, 0, 0, "[ace]{3,7}?", "AcbDAcEEcEd" },
331 { CMU, A, 0, 0, "[ace]{3,}", "AcbDAcEEcEd" },
332 { CMU, A, 0, 0, "[ace]{3,}?", "AcbDAcEEcEd" },
333 { MU, A, 0, 0, "[ckl]{2,}?g", "cdkkmlglglkcg" },
334 { CMU, A, 0, 0, "[ace]{5}?", "AcCebDAcEEcEd" },
335 { MU, A, 0, 0, "([AbC]{3,5}?d)+", "BACaAbbAEAACCbdCCbdCCAAbb" },
336 { MU, A, 0, 0, "([^ab]{0,}s){2}", "abaabcdsABamsDDs" },
337 { MU, A, 0, 0, "\\b\\w+\\B", "x,a_cd" },
338 { MUP, A, 0, 0, "\\b[^\xc2\xa1]+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
339 { CMU, A, 0, 0, "[^b]+(a*)([^c]?d{3})", "aaaaddd" },
340 { CMUP, A, 0, 0, "\xe1\xbd\xb8{2}", "\xe1\xbf\xb8#\xe1\xbf\xb8\xe1\xbd\xb8" },
341 { CMU, A, 0, 0, "[^\xf0\x90\x90\x80]{2,4}@", "\xf0\x90\x90\xa8\xf0\x90\x90\x80###\xf0\x90\x90\x80@@@" },
342 { CMU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
343 { MU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
344 { MU, A, 0, 0, "[^\xe1\xbd\xb8]{3,}?", "##\xe1\xbd\xb8#\xe1\xbd\xb8#\xc3\x89#\xe1\xbd\xb8" },
345 { MU, A, 0, 0, "\\d+123", "987654321,01234" },
346 { MU, A, 0, 0, "abcd*|\\w+xy", "aaaaa,abxyz" },
347 { MU, A, 0, 0, "(?:abc|((?:amc|\\b\\w*xy)))", "aaaaa,abxyz" },
348 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.abcd#."},
349 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.mbcd#."},
350 { MU, A, 0, 0, ".[ab]*.", "xx" },
351 { MU, A, 0, 0, ".[ab]*a", "xxa" },
352 { MU, A, 0, 0, ".[ab]?.", "xx" },
353 { MU, A, 0, 0, "_[ab]+_*a", "_aa" },
354 { MU, A, 0, 0, "#(A+)#\\d+", "#A#A#0" },
355 { MU, A, 0, 0, "(?P<size>\\d+)m|M", "4M" },
356
357 /* Bracket repeats with limit. */
358 { MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" },
359 { MU, A, 0, 0, "(?:ab|abab){1,5}M", "abababababababababababM" },
360 { MU, A, 0, 0, "(?>ab|abab){1,5}M", "abababababababababababM" },
361 { MU, A, 0, 0, "(?:ab|abab){1,5}?M", "abababababababababababM" },
362 { MU, A, 0, 0, "(?>ab|abab){1,5}?M", "abababababababababababM" },
363 { MU, A, 0, 0, "(?:(ab){1,4}?){1,3}?M", "abababababababababababababM" },
364 { MU, A, 0, 0, "(?:(ab){1,4}){1,3}abababababababababababM", "ababababababababababababM" },
365 { MU, A, 0, 0 | F_NOMATCH, "(?:(ab){1,4}){1,3}abababababababababababM", "abababababababababababM" },
366 { MU, A, 0, 0, "(ab){4,6}?M", "abababababababM" },
367
368 /* Basic character sets. */
369 { MU, A, 0, 0, "(?:\\s)+(?:\\S)+", "ab \t\xc3\xa9\xe6\x92\xad " },
370 { MU, A, 0, 0, "(\\w)*(k)(\\W)?\?", "abcdef abck11" },
371 { MU, A, 0, 0, "\\((\\d)+\\)\\D", "a() (83 (8)2 (9)ab" },
372 { MU, A, 0, 0, "\\w(\\s|(?:\\d)*,)+\\w\\wb", "a 5, 4,, bb 5, 4,, aab" },
373 { MU, A, 0, 0, "(\\v+)(\\V+)", "\x0e\xc2\x85\xe2\x80\xa8\x0b\x09\xe2\x80\xa9" },
374 { MU, A, 0, 0, "(\\h+)(\\H+)", "\xe2\x80\xa8\xe2\x80\x80\x20\xe2\x80\x8a\xe2\x81\x9f\xe3\x80\x80\x09\x20\xc2\xa0\x0a" },
375 { MU, A, 0, 0, "x[bcef]+", "xaxdxecbfg" },
376 { MU, A, 0, 0, "x[bcdghij]+", "xaxexfxdgbjk" },
377 { MU, A, 0, 0, "x[^befg]+", "xbxexacdhg" },
378 { MU, A, 0, 0, "x[^bcdl]+", "xlxbxaekmd" },
379 { MU, A, 0, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
380 { MU, A, 0, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },
381 { CMU, A, 0, 0, "\\x{e9}+", "#\xf0\x90\x90\xa8\xc3\xa8\xc3\xa9\xc3\x89\xc3\x88" },
382 { CMU, A, 0, 0, "[^\\x{e9}]+", "\xc3\xa9#\xf0\x90\x90\xa8\xc3\xa8\xc3\x88\xc3\x89" },
383 { MU, A, 0, 0, "[\\x02\\x7e]+", "\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x02\x7e\x7f" },
384 { MU, A, 0, 0, "[^\\x02\\x7e]+", "\x02\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x7f\x7e" },
385 { MU, A, 0, 0, "[\\x{81}-\\x{7fe}]+", "#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xc2\x81\xdf\xbe\xdf\xbf" },
386 { MU, A, 0, 0, "[^\\x{81}-\\x{7fe}]+", "\xc2\x81#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xdf\xbf\xdf\xbe" },
387 { MU, A, 0, 0, "[\\x{801}-\\x{fffe}]+", "#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xe0\xa0\x81\xef\xbf\xbe\xef\xbf\xbf" },
388 { MU, A, 0, 0, "[^\\x{801}-\\x{fffe}]+", "\xe0\xa0\x81#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xef\xbf\xbf\xef\xbf\xbe" },
389 { MU, A, 0, 0, "[\\x{10001}-\\x{10fffe}]+", "#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf0\x90\x80\x81\xf4\x8f\xbf\xbe\xf4\x8f\xbf\xbf" },
390 { MU, A, 0, 0, "[^\\x{10001}-\\x{10fffe}]+", "\xf0\x90\x80\x81#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbe" },
391 { CMU, A, 0, 0 | F_NOMATCH, "^[\\x{0100}-\\x{017f}]", " " },
392
393 /* Unicode properties. */
394 { MUP, A, 0, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },
395 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\x81\\p{Ll}]", "A_\xc3\x89\xc3\xa1" },
396 { MUP, A, 0, 0, "[\\Wd-h_x-z]+", "a\xc2\xa1#_yhzdxi" },
397 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}]", "abc" },
398 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}]", "abc" },
399 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}\xc3\xa1-\xc3\xa8]", "abc" },
400 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}\xc3\xa1-\xc3\xa8]", "abc" },
401 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
402 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
403 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
404 { MUP, A, 0, 0 | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
405 { MUP, A, 0, 0, "[b-\xc3\xa9\\s]", "a\xc\xe6\x92\xad" },
406 { CMUP, A, 0, 0, "[\xc2\x85-\xc2\x89\xc3\x89]", "\xc2\x84\xc3\xa9" },
407 { MUP, A, 0, 0, "[^b-d^&\\s]{3,}", "db^ !a\xe2\x80\xa8_ae" },
408 { MUP, A, 0, 0 | F_PROPERTY, "[^\\S\\P{Any}][\\sN]{1,3}[\\P{N}]{4}", "\xe2\x80\xaa\xa N\x9\xc3\xa9_0" },
409 { MU, A, 0, 0 | F_PROPERTY, "[^\\P{L}\x9!D-F\xa]{2,3}", "\x9,.DF\xa.CG\xc3\x81" },
410 { CMUP, A, 0, 0, "[\xc3\xa1-\xc3\xa9_\xe2\x80\xa0-\xe2\x80\xaf]{1,5}[^\xe2\x80\xa0-\xe2\x80\xaf]", "\xc2\xa1\xc3\x89\xc3\x89\xe2\x80\xaf_\xe2\x80\xa0" },
411 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa2-\xc3\xa6\xc3\x81-\xc3\x84\xe2\x80\xa8-\xe2\x80\xa9\xe6\x92\xad\\p{Zs}]{2,}", "\xe2\x80\xa7\xe2\x80\xa9\xe6\x92\xad \xe6\x92\xae" },
412 { MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
413 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
414 { MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
415 { CMUP, 0, 0, 0, "[^S]\\B", "\xe2\x80\x8a" },
416
417 /* Possible empty brackets. */
418 { MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
419 { MU, A, 0, 0, "(|ab||bc|a)+d", "abcxabcabd" },
420 { MU, A, 0, 0, "(?:|ab||bc|a)*d", "abcxabcabd" },
421 { MU, A, 0, 0, "(|ab||bc|a)*d", "abcxabcabd" },
422 { MU, A, 0, 0, "(?:|ab||bc|a)+?d", "abcxabcabd" },
423 { MU, A, 0, 0, "(|ab||bc|a)+?d", "abcxabcabd" },
424 { MU, A, 0, 0, "(?:|ab||bc|a)*?d", "abcxabcabd" },
425 { MU, A, 0, 0, "(|ab||bc|a)*?d", "abcxabcabd" },
426 { MU, A, 0, 0, "(((a)*?|(?:ba)+)+?|(?:|c|ca)*)*m", "abaacaccabacabalabaacaccabacabamm" },
427 { MU, A, 0, 0, "(?:((?:a)*|(ba)+?)+|(|c|ca)*?)*?m", "abaacaccabacabalabaacaccabacabamm" },
428
429 /* Start offset. */
430 { MU, A, 0, 3, "(\\d|(?:\\w)*\\w)+", "0ac01Hb" },
431 { MU, A, 0, 4 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
432 { MU, A, 0, 2 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
433 { MU, A, 0, 1, "(\\w\\W\\w)+", "ab#d" },
434
435 /* Newline. */
436 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
437 { M, PCRE2_NEWLINE_CR, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
438 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{1,3}[^#]", "\r\n##...." },
439 { MU, A, PCRE2_NO_UTF_CHECK, 1, "^.a", "\n\x80\nxa" },
440 { MU, A, 0, 1, "^", "\r\n" },
441 { M, PCRE2_NEWLINE_CRLF, 0, 1 | F_NOMATCH, "^", "\r\n" },
442 { M, PCRE2_NEWLINE_CRLF, 0, 1, "^", "\r\na" },
443
444 /* Any character except newline or any newline. */
445 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
446 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".(.).", "a\xc3\xa1\r\n\n\r\r" },
447 { 0, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
448 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
449 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.).", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa9$de" },
450 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0 | F_NOMATCH, ".(.).", "\xe2\x80\xa8\nb\r" },
451 { 0, PCRE2_NEWLINE_ANY, 0, 0, "(.)(.)", "#\x85#\r#\n#\r\n#\x84" },
452 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.+)#", "#\rMn\xc2\x85#\n###" },
453 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\r" },
454 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\x85#\r\n#" },
455 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\xe2\x80\xa8#c" },
456 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\r\nc" },
457 { U, PCRE2_NEWLINE_CRLF | BSR(PCRE2_BSR_UNICODE), 0, 0, "(\\R.)+", "\xc2\x85\r\n#\xe2\x80\xa8\n\r\n\r" },
458 { MU, A, 0, 0 | F_NOMATCH, "\\R+", "ab" },
459 { MU, A, 0, 0, "\\R+", "ab\r\n\r" },
460 { MU, A, 0, 0, "\\R*", "ab\r\n\r" },
461 { MU, A, 0, 0, "\\R*", "\r\n\r" },
462 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\r\r" },
463 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\n\n\n\r\r\r" },
464 { MU, A, 0, 0, "\\R{2,}", "\r\nab\n\n\n\r\r\r" },
465 { MU, A, 0, 0, "\\R{0,3}", "\r\n\r\n\r\n\r\n\r\n" },
466 { MU, A, 0, 0 | F_NOMATCH, "\\R+\\R\\R", "\r\n\r\n" },
467 { MU, A, 0, 0, "\\R+\\R\\R", "\r\r\r" },
468 { MU, A, 0, 0, "\\R*\\R\\R", "\n\r" },
469 { MU, A, 0, 0 | F_NOMATCH, "\\R{2,4}\\R\\R", "\r\r\r" },
470 { MU, A, 0, 0, "\\R{2,4}\\R\\R", "\r\r\r\r" },
471
472 /* Atomic groups (no fallback from "next" direction). */
473 { MU, A, 0, 0 | F_NOMATCH, "(?>ab)ab", "bab" },
474 { MU, A, 0, 0 | F_NOMATCH, "(?>(ab))ab", "bab" },
475 { MU, A, 0, 0, "(?>ab)+abc(?>de)*def(?>gh)?ghe(?>ij)+?k(?>lm)*?n(?>op)?\?op",
476 "bababcdedefgheijijklmlmnop" },
477 { MU, A, 0, 0, "(?>a(b)+a|(ab)?\?(b))an", "abban" },
478 { MU, A, 0, 0, "(?>ab+a|(?:ab)?\?b)an", "abban" },
479 { MU, A, 0, 0, "((?>ab|ad|)*?)(?>|c)*abad", "abababcababad" },
480 { MU, A, 0, 0, "(?>(aa|b|)*+(?>(##)|###)*d|(aa)(?>(baa)?)m)", "aabaa#####da" },
481 { MU, A, 0, 0, "((?>a|)+?)b", "aaacaaab" },
482 { MU, A, 0, 0, "(?>x|)*$", "aaa" },
483 { MU, A, 0, 0, "(?>(x)|)*$", "aaa" },
484 { MU, A, 0, 0, "(?>x|())*$", "aaa" },
485 { MU, A, 0, 0, "((?>[cxy]a|[a-d])*?)b", "aaa+ aaab" },
486 { MU, A, 0, 0, "((?>[cxy](a)|[a-d])*?)b", "aaa+ aaab" },
487 { MU, A, 0, 0, "(?>((?>(a+))))bab|(?>((?>(a+))))bb", "aaaabaaabaabab" },
488 { MU, A, 0, 0, "(?>(?>a+))bab|(?>(?>a+))bb", "aaaabaaabaabab" },
489 { MU, A, 0, 0, "(?>(a)c|(?>(c)|(a))a)b*?bab", "aaaabaaabaabab" },
490 { MU, A, 0, 0, "(?>ac|(?>c|a)a)b*?bab", "aaaabaaabaabab" },
491 { MU, A, 0, 0, "(?>(b)b|(a))*b(?>(c)|d)?x", "ababcaaabdbx" },
492 { MU, A, 0, 0, "(?>bb|a)*b(?>c|d)?x", "ababcaaabdbx" },
493 { MU, A, 0, 0, "(?>(bb)|a)*b(?>c|(d))?x", "ababcaaabdbx" },
494 { MU, A, 0, 0, "(?>(a))*?(?>(a))+?(?>(a))??x", "aaaaaacccaaaaabax" },
495 { MU, A, 0, 0, "(?>a)*?(?>a)+?(?>a)??x", "aaaaaacccaaaaabax" },
496 { MU, A, 0, 0, "(?>(a)|)*?(?>(a)|)+?(?>(a)|)??x", "aaaaaacccaaaaabax" },
497 { MU, A, 0, 0, "(?>a|)*?(?>a|)+?(?>a|)??x", "aaaaaacccaaaaabax" },
498 { MU, A, 0, 0, "(?>a(?>(a{0,2}))*?b|aac)+b", "aaaaaaacaaaabaaaaacaaaabaacaaabb" },
499 { CM, A, 0, 0, "(?>((?>a{32}|b+|(a*))?(?>c+|d*)?\?)+e)+?f", "aaccebbdde bbdaaaccebbdee bbdaaaccebbdeef" },
500 { MU, A, 0, 0, "(?>(?:(?>aa|a||x)+?b|(?>aa|a||(x))+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
501 { MU, A, 0, 0, "(?>(?:(?>aa|a||(x))+?b|(?>aa|a||x)+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
502 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d" },
503 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d#\xcc\x8d\xcc\x8d" },
504 { MU, A, 0, 0 | F_PROPERTY, "\\X+..", "\xcc\x8d#\xcc\x8d#\xcc\x8d\xcc\x8d" },
505 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}", "abcdef" },
506 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}?", "abcdef" },
507 { MU, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d##" },
508 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d#\xcc\x8d##" },
509 { MU, A, 0, 0, "(c(ab)?+ab)+", "cabcababcab" },
510 { MU, A, 0, 0, "(?>(a+)b)+aabab", "aaaabaaabaabab" },
511
512 /* Possessive quantifiers. */
513 { MU, A, 0, 0, "(?:a|b)++m", "mababbaaxababbaam" },
514 { MU, A, 0, 0, "(?:a|b)*+m", "mababbaaxababbaam" },
515 { MU, A, 0, 0, "(?:a|b)*+m", "ababbaaxababbaam" },
516 { MU, A, 0, 0, "(a|b)++m", "mababbaaxababbaam" },
517 { MU, A, 0, 0, "(a|b)*+m", "mababbaaxababbaam" },
518 { MU, A, 0, 0, "(a|b)*+m", "ababbaaxababbaam" },
519 { MU, A, 0, 0, "(a|b(*ACCEPT))++m", "maaxab" },
520 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxm" },
521 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxbbm" },
522 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxm" },
523 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxbbm" },
524 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxm" },
525 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxbbm" },
526 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxm" },
527 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxbbm" },
528 { MU, A, 0, 0, "(?:a|(b))++m", "mababbaaxababbaam" },
529 { MU, A, 0, 0, "(?:(a)|b)*+m", "mababbaaxababbaam" },
530 { MU, A, 0, 0, "(?:(a)|(b))*+m", "ababbaaxababbaam" },
531 { MU, A, 0, 0, "(a|(b))++m", "mababbaaxababbaam" },
532 { MU, A, 0, 0, "((a)|b)*+m", "mababbaaxababbaam" },
533 { MU, A, 0, 0, "((a)|(b))*+m", "ababbaaxababbaam" },
534 { MU, A, 0, 0, "(a|(b)(*ACCEPT))++m", "maaxab" },
535 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxm" },
536 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxbbm" },
537 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxm" },
538 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxbbm" },
539 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxm" },
540 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxbbm" },
541 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxm" },
542 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxbbm" },
543 { MU, A, 0, 0 | F_NOMATCH, "(?>(b{2,4}))(?:(?:(aa|c))++m|(?:(aa|c))+n)", "bbaacaaccaaaacxbbbmbn" },
544 { MU, A, 0, 0, "((?:b)++a)+(cd)*+m", "bbababbacdcdnbbababbacdcdm" },
545 { MU, A, 0, 0, "((?:(b))++a)+((c)d)*+m", "bbababbacdcdnbbababbacdcdm" },
546 { MU, A, 0, 0, "(?:(?:(?:ab)*+k)++(?:n(?:cd)++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
547 { MU, A, 0, 0, "(?:((ab)*+(k))++(n(?:c(d))++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
548
549 /* Back references. */
550 { MU, A, 0, 0, "(aa|bb)(\\1*)(ll|)(\\3*)bbbbbbc", "aaaaaabbbbbbbbc" },
551 { CMU, A, 0, 0, "(aa|bb)(\\1+)(ll|)(\\3+)bbbbbbc", "bBbbBbCbBbbbBbbcbbBbbbBBbbC" },
552 { CM, A, 0, 0, "(a{2,4})\\1", "AaAaaAaA" },
553 { MU, A, 0, 0, "(aa|bb)(\\1?)aa(\\1?)(ll|)(\\4+)bbc", "aaaaaaaabbaabbbbaabbbbc" },
554 { MU, A, 0, 0, "(aa|bb)(\\1{0,5})(ll|)(\\3{0,5})cc", "bbxxbbbbxxaaaaaaaaaaaaaaaacc" },
555 { MU, A, 0, 0, "(aa|bb)(\\1{3,5})(ll|)(\\3{3,5})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
556 { MU, A, 0, 0, "(aa|bb)(\\1{3,})(ll|)(\\3{3,})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
557 { MU, A, 0, 0, "(\\w+)b(\\1+)c", "GabGaGaDbGaDGaDc" },
558 { MU, A, 0, 0, "(?:(aa)|b)\\1?b", "bb" },
559 { CMU, A, 0, 0, "(aa|bb)(\\1*?)aa(\\1+?)", "bBBbaaAAaaAAaa" },
560 { MU, A, 0, 0, "(aa|bb)(\\1*?)(dd|)cc(\\3+?)", "aaaaaccdd" },
561 { CMU, A, 0, 0, "(?:(aa|bb)(\\1?\?)cc){2}(\\1?\?)", "aAaABBbbAAaAcCaAcCaA" },
562 { MU, A, 0, 0, "(?:(aa|bb)(\\1{3,5}?)){2}(dd|)(\\3{3,5}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
563 { CM, A, 0, 0, "(?:(aa|bb)(\\1{3,}?)){2}(dd|)(\\3{3,}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
564 { MU, A, 0, 0, "(?:(aa|bb)(\\1{0,3}?)){2}(dd|)(\\3{0,3}?)b(\\1{0,3}?)(\\1{0,3})", "aaaaaaaaaaaaaaabaaaaa" },
565 { MU, A, 0, 0, "(a(?:\\1|)a){3}b", "aaaaaaaaaaab" },
566 { M, A, 0, 0, "(a?)b(\\1\\1*\\1+\\1?\\1*?\\1+?\\1??\\1*+\\1++\\1?+\\1{4}\\1{3,5}\\1{4,}\\1{0,5}\\1{3,5}?\\1{4,}?\\1{0,5}?\\1{3,5}+\\1{4,}+\\1{0,5}+#){2}d", "bb#b##d" },
567 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
568 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{0,2}", "wwwww." },
569 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwww" },
570 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwwww" },
571 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
572 { CMUP, A, 0, 0, "(\xf0\x90\x90\x80)\\1", "\xf0\x90\x90\xa8\xf0\x90\x90\xa8" },
573 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
574 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
575 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>*(?<A>aa)(?<A>bb)", "aabb" },
576 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{0,3}aaaaaa", "aabbaaaaaa" },
577 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{2,5}bb", "aabbaaaabb" },
578 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}m", "aaaaaaaabbbbaabbbbm" },
579 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
580 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
581 { MU | PCRE2_DUPNAMES, A, 0, 0, "\\k<A>*?(?<A>aa)(?<A>bb)", "aabb" },
582 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
583 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>*?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
584 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
585 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}M", "aaaaaaaabbbbaabbbbm" },
586 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{1,3}M", "aaaaaaaabbbbaabbbbm" },
587 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}?M", "aaaaaabbbbbbaabbbbbbbbbbm" },
588 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
589
590 /* Assertions. */
591 { MU, A, 0, 0, "(?=xx|yy|zz)\\w{4}", "abczzdefg" },
592 { MU, A, 0, 0, "(?=((\\w+)b){3}|ab)", "dbbbb ab" },
593 { MU, A, 0, 0, "(?!ab|bc|cd)[a-z]{2}", "Xabcdef" },
594 { MU, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
595 { MU, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
596 { M, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
597 { M, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
598 { MU, A, 0, 0, "(\\d{2})(?!\\w+c|(((\\w?)m){2}n)+|\\1)", "x5656" },
599 { MU, A, 0, 0, "((?=((\\d{2,6}\\w){2,}))\\w{5,20}K){2,}", "567v09708K12l00M00 567v09708K12l00M00K45K" },
600 { MU, A, 0, 0, "(?=(?:(?=\\S+a)\\w*(b)){3})\\w+\\d", "bba bbab nbbkba nbbkba0kl" },
601 { MU, A, 0, 0, "(?>a(?>(b+))a(?=(..)))*?k", "acabbcabbaabacabaabbakk" },
602 { MU, A, 0, 0, "((?(?=(a))a)+k)", "bbak" },
603 { MU, A, 0, 0, "((?(?=a)a)+k)", "bbak" },
604 { MU, A, 0, 0 | F_NOMATCH, "(?=(?>(a))m)amk", "a k" },
605 { MU, A, 0, 0 | F_NOMATCH, "(?!(?>(a))m)amk", "a k" },
606 { MU, A, 0, 0 | F_NOMATCH, "(?>(?=(a))am)amk", "a k" },
607 { MU, A, 0, 0, "(?=(?>a|(?=(?>(b+))a|c)[a-c]+)*?m)[a-cm]+k", "aaam bbam baaambaam abbabba baaambaamk" },
608 { MU, A, 0, 0, "(?> ?\?\\b(?(?=\\w{1,4}(a))m)\\w{0,8}bc){2,}?", "bca ssbc mabd ssbc mabc" },
609 { MU, A, 0, 0, "(?:(?=ab)?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
610 { MU, A, 0, 0, "(?:(?=a(b))?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
611 { MU, A, 0, 0, "(?:(?=.(.))??\\1.)+m", "aabbbcbacccanaabbbcbacccam" },
612 { MU, A, 0, 0, "(?:(?=.)??[a-c])+m", "abacdcbacacdcaccam" },
613 { MU, A, 0, 0, "((?!a)?(?!([^a]))?)+$", "acbab" },
614 { MU, A, 0, 0, "((?!a)?\?(?!([^a]))?\?)+$", "acbab" },
615 { MU, A, 0, 0, "a(?=(?C)\\B(?C`x`))b", "ab" },
616 { MU, A, 0, 0, "a(?!(?C)\\B(?C`x`))bb|ab", "abb" },
617 { MU, A, 0, 0, "a(?=\\b|(?C)\\B(?C`x`))b", "ab" },
618 { MU, A, 0, 0, "a(?!\\b|(?C)\\B(?C`x`))bb|ab", "abb" },
619 { MU, A, 0, 0, "c(?(?=(?C)\\B(?C`x`))ab|a)", "cab" },
620 { MU, A, 0, 0, "c(?(?!(?C)\\B(?C`x`))ab|a)", "cab" },
621 { MU, A, 0, 0, "c(?(?=\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
622 { MU, A, 0, 0, "c(?(?!\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
623 { MU, A, 0, 0, "a(?=)b", "ab" },
624 { MU, A, 0, 0 | F_NOMATCH, "a(?!)b", "ab" },
625
626 /* Not empty, ACCEPT, FAIL */
627 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*", "bcx" },
628 { MU, A, PCRE2_NOTEMPTY, 0, "a*", "bcaad" },
629 { MU, A, PCRE2_NOTEMPTY, 0, "a*?", "bcaad" },
630 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*", "bcaad" },
631 { MU, A, 0, 0, "a(*ACCEPT)b", "ab" },
632 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*(*ACCEPT)b", "bcx" },
633 { MU, A, PCRE2_NOTEMPTY, 0, "a*(*ACCEPT)b", "bcaad" },
634 { MU, A, PCRE2_NOTEMPTY, 0, "a*?(*ACCEPT)b", "bcaad" },
635 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?:z|a*(*ACCEPT)b)", "bcx" },
636 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*(*ACCEPT)b)", "bcaad" },
637 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*?(*ACCEPT)b)", "bcaad" },
638 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*(*ACCEPT)b", "bcx" },
639 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0 | F_NOMATCH, "a*(*ACCEPT)b", "" },
640 { MU, A, 0, 0, "((a(*ACCEPT)b))", "ab" },
641 { MU, A, 0, 0, "(a(*FAIL)a|a)", "aaa" },
642 { MU, A, 0, 0, "(?=ab(*ACCEPT)b)a", "ab" },
643 { MU, A, 0, 0, "(?=(?:x|ab(*ACCEPT)b))", "ab" },
644 { MU, A, 0, 0, "(?=(a(b(*ACCEPT)b)))a", "ab" },
645 { MU, A, PCRE2_NOTEMPTY, 0, "(?=a*(*ACCEPT))c", "c" },
646 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?=A)", "AB" },
647
648 /* Conditional blocks. */
649 { MU, A, 0, 0, "(?(?=(a))a|b)+k", "ababbalbbadabak" },
650 { MU, A, 0, 0, "(?(?!(b))a|b)+k", "ababbalbbadabak" },
651 { MU, A, 0, 0, "(?(?=a)a|b)+k", "ababbalbbadabak" },
652 { MU, A, 0, 0, "(?(?!b)a|b)+k", "ababbalbbadabak" },
653 { MU, A, 0, 0, "(?(?=(a))a*|b*)+k", "ababbalbbadabak" },
654 { MU, A, 0, 0, "(?(?!(b))a*|b*)+k", "ababbalbbadabak" },
655 { MU, A, 0, 0, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
656 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
657 { MU, A, 0, 0 | F_DIFF, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
658 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
659 { MU, A, 0, 0, "(?(?=a)a*|b*)+k", "ababbalbbadabak" },
660 { MU, A, 0, 0, "(?(?!b)a*|b*)+k", "ababbalbbadabak" },
661 { MU, A, 0, 0, "(?(?=a)ab)", "a" },
662 { MU, A, 0, 0, "(?(?<!b)c)", "b" },
663 { MU, A, 0, 0, "(?(DEFINE)a(b))", "a" },
664 { MU, A, 0, 0, "a(?(DEFINE)(?:b|(?:c?)+)*)", "a" },
665 { MU, A, 0, 0, "(?(?=.[a-c])[k-l]|[A-D])", "kdB" },
666 { MU, A, 0, 0, "(?(?!.{0,4}[cd])(aa|bb)|(cc|dd))+", "aabbccddaa" },
667 { MU, A, 0, 0, "(?(?=[^#@]*@)(aaab|aa|aba)|(aba|aab)){3,}", "aaabaaaba#aaabaaaba#aaabaaaba@" },
668 { MU, A, 0, 0, "((?=\\w{5})\\w(?(?=\\w*k)\\d|[a-f_])*\\w\\s)+", "mol m10kk m088k _f_a_ mbkkl" },
669 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cdcaa" },
670 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cbb" },
671 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))(aaaa|a?))+aak", "aaaaab aaaaak" },
672 { MU, A, 0, 0, "(?(?=a)(aaaa|a?))+aak", "aaaaab aaaaak" },
673 { MU, A, 0, 0, "(?(?!(b))(aaaa|a?))+aak", "aaaaab aaaaak" },
674 { MU, A, 0, 0, "(?(?!b)(aaaa|a?))+aak", "aaaaab aaaaak" },
675 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))a*)+aak", "aaaaab aaaaak" },
676 { MU, A, 0, 0, "(?(?=a)a*)+aak", "aaaaab aaaaak" },
677 { MU, A, 0, 0, "(?(?!(b))a*)+aak", "aaaaab aaaaak" },
678 { MU, A, 0, 0, "(?(?!b)a*)+aak", "aaaaab aaaaak" },
679 { MU, A, 0, 0, "(?(?=(?=(?!(x))a)aa)aaa|(?(?=(?!y)bb)bbb))*k", "abaabbaaabbbaaabbb abaabbaaabbbaaabbbk" },
680 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)*l", "bc ddd abccabccl" },
681 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+?dd", "bcabcacdb bdddd" },
682 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+l", "ababccddabdbccd abcccl" },
683 { MU, A, 0, 0, "((?:a|aa)(?(1)aaa))x", "aax" },
684 { MU, A, 0, 0, "(?(?!)a|b)", "ab" },
685 { MU, A, 0, 0, "(?(?!)a)", "ab" },
686 { MU, A, 0, 0 | F_NOMATCH, "(?(?!)a|b)", "ac" },
687
688 /* Set start of match. */
689 { MU, A, 0, 0, "(?:\\Ka)*aaaab", "aaaaaaaa aaaaaaabb" },
690 { MU, A, 0, 0, "(?>\\Ka\\Ka)*aaaab", "aaaaaaaa aaaaaaaaaabb" },
691 { MU, A, 0, 0, "a+\\K(?<=\\Gaa)a", "aaaaaa" },
692 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a\\K(*ACCEPT)b", "aa" },
693 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a\\K(*ACCEPT)b", "aa" },
694
695 /* First line. */
696 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_PROPERTY, "\\p{Any}a", "bb\naaa" },
697 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}a", "bb\r\naaa" },
698 { MU | PCRE2_FIRSTLINE, A, 0, 0, "(?<=a)", "a" },
699 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[^a][^b]", "ab" },
700 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "a", "\na" },
701 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[abc]", "\na" },
702 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^a", "\na" },
703 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^(?<=\n)", "\na" },
704 { MU | PCRE2_FIRSTLINE, A, 0, 0, "\xf0\x90\x90\x80", "\xf0\x90\x90\x80" },
705 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\xc2\x85#" },
706 { M | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\x85#" },
707 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "^#", "\xe2\x80\xa8#" },
708 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_PROPERTY, "\\p{Any}", "\r\na" },
709 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
710 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, "a", "\ra" },
711 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH, "ba", "bbb\r\nba" },
712 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}{4}|a", "\r\na" },
713 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 1, ".", "\r\n" },
714 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_LF, 0, 0 | F_NOMATCH, "ab.", "ab" },
715 { MU | PCRE2_FIRSTLINE, A, 0, 1 | F_NOMATCH, "^[a-d0-9]", "\nxx\nd" },
716 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_ANY, 0, 0, "....a", "012\n0a" },
717 { MU | PCRE2_FIRSTLINE, A, 0, 0, "[aC]", "a" },
718
719 /* Recurse. */
720 { MU, A, 0, 0, "(a)(?1)", "aa" },
721 { MU, A, 0, 0, "((a))(?1)", "aa" },
722 { MU, A, 0, 0, "(b|a)(?1)", "aa" },
723 { MU, A, 0, 0, "(b|(a))(?1)", "aa" },
724 { MU, A, 0, 0 | F_NOMATCH, "((a)(b)(?:a*))(?1)", "aba" },
725 { MU, A, 0, 0, "((a)(b)(?:a*))(?1)", "abab" },
726 { MU, A, 0, 0, "((a+)c(?2))b(?1)", "aacaabaca" },
727 { MU, A, 0, 0, "((?2)b|(a)){2}(?1)", "aabab" },
728 { MU, A, 0, 0, "(?1)(a)*+(?2)(b(?1))", "aababa" },
729 { MU, A, 0, 0, "(?1)(((a(*ACCEPT)))b)", "axaa" },
730 { MU, A, 0, 0, "(?1)(?(DEFINE) (((ac(*ACCEPT)))b) )", "akaac" },
731 { MU, A, 0, 0, "(a+)b(?1)b\\1", "abaaabaaaaa" },
732 { MU, A, 0, 0, "(?(DEFINE)(aa|a))(?1)ab", "aab" },
733 { MU, A, 0, 0, "(?(DEFINE)(a\\Kb))(?1)+ababc", "abababxabababc" },
734 { MU, A, 0, 0, "(a\\Kb)(?1)+ababc", "abababxababababc" },
735 { MU, A, 0, 0 | F_NOMATCH, "(a\\Kb)(?1)+ababc", "abababxababababxc" },
736 { MU, A, 0, 0, "b|<(?R)*>", "<<b>" },
737 { MU, A, 0, 0, "(a\\K){0}(?:(?1)b|ac)", "ac" },
738 { MU, A, 0, 0, "(?(DEFINE)(a(?2)|b)(b(?1)|(a)))(?:(?1)|(?2))m", "ababababnababababaam" },
739 { MU, A, 0, 0, "(a)((?(R)a|b))(?2)", "aabbabaa" },
740 { MU, A, 0, 0, "(a)((?(R2)a|b))(?2)", "aabbabaa" },
741 { MU, A, 0, 0, "(a)((?(R1)a|b))(?2)", "ababba" },
742 { MU, A, 0, 0, "(?(R0)aa|bb(?R))", "abba aabb bbaa" },
743 { MU, A, 0, 0, "((?(R)(?:aaaa|a)|(?:(aaaa)|(a)))+)(?1)$", "aaaaaaaaaa aaaa" },
744 { MU, A, 0, 0, "(?P<Name>a(?(R&Name)a|b))(?1)", "aab abb abaa" },
745 { MU, A, 0, 0, "((?(R)a|(?1)){3})", "XaaaaaaaaaX" },
746 { MU, A, 0, 0, "((?:(?(R)a|(?1))){3})", "XaaaaaaaaaX" },
747 { MU, A, 0, 0, "((?(R)a|(?1)){1,3})aaaaaa", "aaaaaaaaXaaaaaaaaa" },
748 { MU, A, 0, 0, "((?(R)a|(?1)){1,3}?)M", "aaaM" },
749 { MU, A, 0, 0, "((.)(?:.|\\2(?1))){0}#(?1)#", "#aabbccdde# #aabbccddee#" },
750 { MU, A, 0, 0, "((.)(?:\\2|\\2{4}b)){0}#(?:(?1))+#", "#aaaab# #aaaaab#" },
751 { MU, A, 0, 0 | F_NOMATCH, "(?1)$((.|\\2xx){1,2})", "abc" },
752
753 /* 16 bit specific tests. */
754 { CM, A, 0, 0 | F_FORCECONV, "\xc3\xa1", "\xc3\x81\xc3\xa1" },
755 { CM, A, 0, 0 | F_FORCECONV, "\xe1\xbd\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
756 { CM, A, 0, 0 | F_FORCECONV, "[\xc3\xa1]", "\xc3\x81\xc3\xa1" },
757 { CM, A, 0, 0 | F_FORCECONV, "[\xe1\xbd\xb8]", "\xe1\xbf\xb8\xe1\xbd\xb8" },
758 { CM, A, 0, 0 | F_FORCECONV, "[a-\xed\xb0\x80]", "A" },
759 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[a-\\x{dc00}]", "B" },
760 { CM, A, 0, 0 | F_NO8 | F_NOMATCH | F_FORCECONV, "[b-\\x{dc00}]", "a" },
761 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "\xed\xa0\x80\\x{d800}\xed\xb0\x80\\x{dc00}", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80" },
762 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\xed\xa0\x80\\x{d800}]{1,2}?[\xed\xb0\x80\\x{dc00}]{1,2}?#", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80#" },
763 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80\xed\xb0\x80#]{0,3}(?<=\xed\xb0\x80.)", "\xed\xa0\x80#\xed\xa0\x80##\xed\xb0\x80\xed\xa0\x80" },
764 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\x9f\xbf\xed\xa0\x83" },
765 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\xb4\x80\xed\xb3\xb0" },
766 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\x9f\xbf\xed\xa0\x83" },
767 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\xb4\x80\xed\xb3\xb0" },
768 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xef\xbf\xbf]+[\x1-\xed\xb0\x80]+#", "\xed\xa0\x85\xc3\x81\xed\xa0\x85\xef\xbf\xb0\xc2\x85\xed\xa9\x89#" },
769 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80][\xed\xb0\x80]{2,}", "\xed\xa0\x80\xed\xb0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80\xed\xb0\x80" },
770 { M, A, 0, 0 | F_FORCECONV, "[^\xed\xb0\x80]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
771 { M, A, 0, 0 | F_NO8 | F_FORCECONV, "[^\\x{dc00}]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
772 { CM, A, 0, 0 | F_FORCECONV, ".\\B.", "\xed\xa0\x80\xed\xb0\x80" },
773 { CM, A, 0, 0 | F_FORCECONV, "\\D+(?:\\d+|.)\\S+(?:\\s+|.)\\W+(?:\\w+|.)\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80" },
774 { CM, A, 0, 0 | F_FORCECONV, "\\d*\\s*\\w*\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80" },
775 { CM, A, 0, 0 | F_FORCECONV | F_NOMATCH, "\\d*?\\D*?\\s*?\\S*?\\w*?\\W*?##", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80#" },
776 { CM | PCRE2_EXTENDED, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80 \xed\xb0\x80 !", "\xed\xa0\x80\xed\xb0\x80!" },
777 { CM, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80+#[^#]+\xed\xa0\x80", "\xed\xa0\x80#a\xed\xa0\x80" },
778 { CM, A, 0, 0 | F_FORCECONV, "(\xed\xa0\x80+)#\\1", "\xed\xa0\x80\xed\xa0\x80#\xed\xa0\x80\xed\xa0\x80" },
779 { M, PCRE2_NEWLINE_ANY, 0, 0 | F_NO8 | F_FORCECONV, "^-", "a--\xe2\x80\xa8--" },
780 { 0, BSR(PCRE2_BSR_UNICODE), 0, 0 | F_NO8 | F_FORCECONV, "\\R", "ab\xe2\x80\xa8" },
781 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v", "ab\xe2\x80\xa9" },
782 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h", "ab\xe1\xa0\x8e" },
783 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v+?\\V+?#", "\xe2\x80\xa9\xe2\x80\xa9\xef\xbf\xbf\xef\xbf\xbf#" },
784 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h+?\\H+?#", "\xe1\xa0\x8e\xe1\xa0\x8e\xef\xbf\xbf\xef\xbf\xbf#" },
785
786 /* Partial matching. */
787 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab", "a" },
788 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab|a", "a" },
789 { MU, A, PCRE2_PARTIAL_HARD, 0, "ab|a", "a" },
790 { MU, A, PCRE2_PARTIAL_SOFT, 0, "\\b#", "a" },
791 { MU, A, PCRE2_PARTIAL_SOFT, 0, "(?<=a)b", "a" },
792 { MU, A, PCRE2_PARTIAL_SOFT, 0, "abc|(?<=xxa)bc", "xxab" },
793 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a\\B", "a" },
794 { MU, A, PCRE2_PARTIAL_HARD, 0, "a\\b", "a" },
795
796 /* (*MARK) verb. */
797 { MU, A, 0, 0, "a(*MARK:aa)a", "ababaa" },
798 { MU, A, 0, 0 | F_NOMATCH, "a(*:aa)a", "abab" },
799 { MU, A, 0, 0, "a(*:aa)(b(*:bb)b|bc)", "abc" },
800 { MU, A, 0, 0 | F_NOMATCH, "a(*:1)x|b(*:2)y", "abc" },
801 { MU, A, 0, 0, "(?>a(*:aa))b|ac", "ac" },
802 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))(?1)", "a" },
803 { MU, A, 0, 0 | F_NOMATCH, "(?(DEFINE)((a)(*:aa)))(?1)b", "aa" },
804 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))a(?1)b|aac", "aac" },
805 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
806 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b)+", "babba" },
807 { MU, A, 0, 0 | F_NOMATCH, "(a(*:aa)){0}(?:b(?1)b)+", "ba" },
808 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
809 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b)+", "babba" },
810 { MU, A, 0, 0 | F_NOMATCH, "(a\\K(*:aa)){0}(?:b(?1)b)+", "ba" },
811 { MU, A, 0, 0 | F_NOMATCH, "(*:mark)m", "a" },
812
813 /* (*COMMIT) verb. */
814 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)b", "ac" },
815 { MU, A, 0, 0, "aa(*COMMIT)b", "xaxaab" },
816 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)(*:msg)b|ac", "ac" },
817 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b)++", "abac" },
818 { MU, A, 0, 0 | F_NOMATCH, "((a)(*COMMIT)b)++", "abac" },
819 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*COMMIT)b)ab|ad", "ad" },
820
821 /* (*PRUNE) verb. */
822 { MU, A, 0, 0, "aa\\K(*PRUNE)b", "aaab" },
823 { MU, A, 0, 0, "aa(*PRUNE:bb)b|a", "aa" },
824 { MU, A, 0, 0, "(a)(a)(*PRUNE)b|(a)", "aa" },
825 { MU, A, 0, 0, "(a)(a)(a)(a)(a)(a)(a)(a)(*PRUNE)b|(a)", "aaaaaaaa" },
826 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|", "a" },
827 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|m", "a" },
828 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*PRUNE)b)ab|ad", "ad" },
829 { MU, A, 0, 0, "a(*COMMIT)(*PRUNE)d|bc", "abc" },
830 { MU, A, 0, 0, "(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
831 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
832 { MU, A, 0, 0, "(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
833 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
834 { MU, A, 0, 0, "(a(*COMMIT)b){0}a(?1)(*PRUNE)c|bc", "abc" },
835 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b){0}a(*COMMIT)(?1)(*PRUNE)c|bc", "abc" },
836 { MU, A, 0, 0, "(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
837 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
838 { MU, A, 0, 0, "((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
839 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
840 { MU, A, 0, 0, "(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
841 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
842 { MU, A, 0, 0, "(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
843 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
844 { MU, A, 0, 0, "(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
845 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
846 { MU, A, 0, 0, "(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
847 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
848 { MU, A, 0, 0, "(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
849 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
850 { MU, A, 0, 0, "(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
851 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
852
853 /* (*SKIP) verb. */
854 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*SKIP)b)ab|ad", "ad" },
855 { MU, A, 0, 0, "(\\w+(*SKIP)#)", "abcd,xyz#," },
856 { MU, A, 0, 0, "\\w+(*SKIP)#|mm", "abcd,xyz#," },
857 { MU, A, 0, 0 | F_NOMATCH, "b+(?<=(*SKIP)#c)|b+", "#bbb" },
858
859 /* (*THEN) verb. */
860 { MU, A, 0, 0, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcaabcaabcaabcnacm" },
861 { MU, A, 0, 0 | F_NOMATCH, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcm" },
862 { MU, A, 0, 0, "((?:a(*THEN)|aab)c|a+)+m", "aabcaabcnmaabcaabcm" },
863 { MU, A, 0, 0, "((?:a|aab)(*THEN)c|a+)+m", "aam" },
864 { MU, A, 0, 0, "((?:a(*COMMIT)|aab)(*THEN)c|a+)+m", "aam" },
865 { MU, A, 0, 0, "(?(?=a(*THEN)b)ab|ad)", "ad" },
866 { MU, A, 0, 0, "(?(?!a(*THEN)b)ad|add)", "add" },
867 { MU, A, 0, 0 | F_NOMATCH, "(?(?=a)a(*THEN)b|ad)", "ad" },
868 { MU, A, 0, 0, "(?!(?(?=a)ab|b(*THEN)d))bn|bnn", "bnn" },
869 { MU, A, 0, 0, "(?=(*THEN: ))* ", " " },
870 { MU, A, 0, 0, "a(*THEN)(?R) |", "a" },
871
872 /* Recurse and control verbs. */
873 { MU, A, 0, 0, "(a(*ACCEPT)b){0}a(?1)b", "aacaabb" },
874 { MU, A, 0, 0, "((a)\\2(*ACCEPT)b){0}a(?1)b", "aaacaaabb" },
875 { MU, A, 0, 0, "((ab|a(*ACCEPT)x)+|ababababax){0}_(?1)_", "_ababababax_ _ababababa_" },
876 { MU, A, 0, 0, "((.)(?:A(*ACCEPT)|(?1)\\2)){0}_(?1)_", "_bcdaAdcb_bcdaAdcb_" },
877 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_", "_ab_" },
878 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_|(_aa_)", "_aa_" },
879 { MU, A, 0, 0, "(a(*COMMIT)(?:b|bb)|c(*ACCEPT)d|dd){0}_(?1)+_", "_ax_ _cd_ _abbb_ _abcd_ _abbcdd_" },
880 { MU, A, 0, 0, "((.)(?:.|(*COMMIT)\\2{3}(*ACCEPT).*|.*)){0}_(?1){0,4}_", "_aaaabbbbccccddd_ _aaaabbbbccccdddd_" },
881
882 #ifdef SUPPORT_UNICODE
883 /* Script runs and iterations. */
884 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
885 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
886 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
887 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
888 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
889 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)++#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
890 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)?#", "!ab!abc!ab!ab#" },
891 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)??#", "!ab!abc!ab!ab#" },
892 #endif
893
894 /* Deep recursion. */
895 { MU, A, 0, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " },
896 { MU, A, 0, 0, "(?:((?:(?:(?:\\w*?)+)??|(?>\\w)?|\\w*+)*)+)+?\\s", "aa+ " },
897 { MU, A, 0, 0, "((a?)+)+b", "aaaaaaaaaaaa b" },
898
899 /* Deep recursion: Stack limit reached. */
900 { M, A, 0, 0 | F_NOMATCH, "a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaa" },
901 { M, A, 0, 0 | F_NOMATCH, "(?:a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
902 { M, A, 0, 0 | F_NOMATCH, "(?:a+?)+?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
903 { M, A, 0, 0 | F_NOMATCH, "(?:a*)*b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
904 { M, A, 0, 0 | F_NOMATCH, "(?:a*?)*?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
905
906 { 0, 0, 0, 0, NULL, NULL }
907 };
908
909 #ifdef SUPPORT_PCRE2_8
callback8(void * arg)910 static pcre2_jit_stack_8* callback8(void *arg)
911 {
912 return (pcre2_jit_stack_8 *)arg;
913 }
914 #endif
915
916 #ifdef SUPPORT_PCRE2_16
callback16(void * arg)917 static pcre2_jit_stack_16* callback16(void *arg)
918 {
919 return (pcre2_jit_stack_16 *)arg;
920 }
921 #endif
922
923 #ifdef SUPPORT_PCRE2_32
callback32(void * arg)924 static pcre2_jit_stack_32* callback32(void *arg)
925 {
926 return (pcre2_jit_stack_32 *)arg;
927 }
928 #endif
929
930 #ifdef SUPPORT_PCRE2_8
931 static pcre2_jit_stack_8 *stack8;
932
getstack8(void)933 static pcre2_jit_stack_8 *getstack8(void)
934 {
935 if (!stack8)
936 stack8 = pcre2_jit_stack_create_8(1, 1024 * 1024, NULL);
937 return stack8;
938 }
939
/* Two modes, selected by the argument:
   - mcontext != NULL: registers callback8 on the match context, with the
     shared 8-bit JIT stack (created on demand) as the callback data.
   - mcontext == NULL: frees the shared 8-bit JIT stack, if there is one, and
     resets the pointer so a later call creates a fresh stack. */
static void setstack8(pcre2_match_context_8 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_8(mcontext, callback8, getstack8());
		return;
	}

	if (stack8 != NULL) {
		pcre2_jit_stack_free_8(stack8);
		stack8 = NULL;
	}
}
951 #endif /* SUPPORT_PCRE2_8 */
952
953 #ifdef SUPPORT_PCRE2_16
954 static pcre2_jit_stack_16 *stack16;
955
getstack16(void)956 static pcre2_jit_stack_16 *getstack16(void)
957 {
958 if (!stack16)
959 stack16 = pcre2_jit_stack_create_16(1, 1024 * 1024, NULL);
960 return stack16;
961 }
962
/* Two modes, selected by the argument:
   - mcontext != NULL: registers callback16 on the match context, with the
     shared 16-bit JIT stack (created on demand) as the callback data.
   - mcontext == NULL: frees the shared 16-bit JIT stack, if there is one, and
     resets the pointer so a later call creates a fresh stack. */
static void setstack16(pcre2_match_context_16 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_16(mcontext, callback16, getstack16());
		return;
	}

	if (stack16 != NULL) {
		pcre2_jit_stack_free_16(stack16);
		stack16 = NULL;
	}
}
974 #endif /* SUPPORT_PCRE2_16 */
975
976 #ifdef SUPPORT_PCRE2_32
977 static pcre2_jit_stack_32 *stack32;
978
getstack32(void)979 static pcre2_jit_stack_32 *getstack32(void)
980 {
981 if (!stack32)
982 stack32 = pcre2_jit_stack_create_32(1, 1024 * 1024, NULL);
983 return stack32;
984 }
985
/* Two modes, selected by the argument:
   - mcontext != NULL: registers callback32 on the match context, with the
     shared 32-bit JIT stack (created on demand) as the callback data.
   - mcontext == NULL: frees the shared 32-bit JIT stack, if there is one, and
     resets the pointer so a later call creates a fresh stack. */
static void setstack32(pcre2_match_context_32 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_32(mcontext, callback32, getstack32());
		return;
	}

	if (stack32 != NULL) {
		pcre2_jit_stack_free_32(stack32);
		stack32 = NULL;
	}
}
997 #endif /* SUPPORT_PCRE2_32 */
998
999 #ifdef SUPPORT_PCRE2_16
1000
/* Converts a zero-terminated UTF-8 byte string to zero-terminated UTF-16.

   input       source bytes, read up to the first zero byte
   output      destination buffer
   offsetmap   optional (may be NULL). For every character converted, the
               entry at the index of its first output unit receives the byte
               offset of that character in input. The entry belonging to the
               second unit of a surrogate pair is skipped, not written. One
               more entry after the last unit records the byte offset at
               which conversion stopped.
   max_length  capacity of output in 16-bit units, terminator included

   Returns the number of 16-bit units stored, not counting the terminator.
   When max_length is 0 nothing at all is written and 0 is returned.

   The input is decoded, not validated: a byte below 0xc0 is copied as a
   single unit, continuation bytes are masked without being checked, and a
   lead byte of 0xf8 or above stores a zero unit without consuming input. */
static int convert_utf8_to_utf16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 src = input;
	PCRE2_UCHAR16 *dst = output;
	int room = max_length;

	if (room == 0)
		return 0;

	/* "room > 1" always keeps one unit free for the terminator. */
	while (*src && room > 1) {
		unsigned int lead = *src;
		unsigned int cp = 0;

		if (offsetmap)
			*offsetmap++ = (int)(src - input);

		/* The sequence length is selected by the high bits of the lead byte. */
		if (lead < 0xc0) {
			cp = lead;
			src++;
		} else if ((lead & 0x20) == 0) {
			cp = ((lead & 0x1f) << 6) | (src[1] & 0x3f);
			src += 2;
		} else if ((lead & 0x10) == 0) {
			cp = ((lead & 0x0f) << 12) | ((src[1] & 0x3f) << 6) | (src[2] & 0x3f);
			src += 3;
		} else if ((lead & 0x08) == 0) {
			cp = ((lead & 0x07) << 18) | ((src[1] & 0x3f) << 12) | ((src[2] & 0x3f) << 6) | (src[3] & 0x3f);
			src += 4;
		}

		if (cp >= 65536) {
			/* A surrogate pair plus the terminator needs three units. Stop
			here if they do not fit; the offsetmap entry stored above for
			this character is then the last one written. */
			if (room <= 2) {
				*dst = '\0';
				return (int)(dst - output);
			}

			cp -= 0x10000;
			*dst++ = 0xd800 | ((cp >> 10) & 0x3ff);
			*dst++ = 0xdc00 | (cp & 0x3ff);
			room -= 2;
			if (offsetmap)
				offsetmap++;
			continue;
		}

		*dst++ = cp;
		room--;
	}

	if (offsetmap)
		*offsetmap = (int)(src - input);
	*dst = '\0';
	return (int)(dst - output);
}
1048
/* Widens a zero-terminated byte string to 16-bit units, one unit per byte,
with no decoding. At most max_length - 1 units are copied so the terminating
zero always fits. Returns the number of units copied, terminator excluded;
when max_length is 0 nothing is written and 0 is returned. */
static int copy_char8_to_char16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int max_length)
{
	int copied = 0;
	int room;

	if (max_length == 0)
		return 0;

	for (room = max_length; input[copied] && room > 1; room--) {
		output[copied] = input[copied];
		copied++;
	}

	output[copied] = '\0';
	return copied;
}
1064
/* Number of elements in each of the two 16-bit scratch arrays below. */
#define REGTEST_MAX_LENGTH16 4096
/* Scratch buffer that receives the 16-bit form of a test pattern before it is
compiled. */
static PCRE2_UCHAR16 regtest_buf16[REGTEST_MAX_LENGTH16];
/* NOTE(review): presumably the offsetmap target for converted subject strings;
its use lies outside this part of the file - confirm. */
static int regtest_offsetmap16[REGTEST_MAX_LENGTH16];
1068
1069 #endif /* SUPPORT_PCRE2_16 */
1070
1071 #ifdef SUPPORT_PCRE2_32
1072
/* Converts a zero-terminated UTF-8 byte string to zero-terminated UTF-32.

   input       source bytes, read up to the first zero byte
   output      destination buffer
   offsetmap   optional (may be NULL). The entry for each output unit receives
               the byte offset in input of the character it came from. One
               more entry after the last unit records the byte offset at
               which conversion stopped.
   max_length  capacity of output in 32-bit units, terminator included

   Returns the number of 32-bit units stored, not counting the terminator.
   When max_length is 0 nothing at all is written and 0 is returned.

   The input is decoded, not validated: a byte below 0xc0 is copied as a
   single unit, continuation bytes are masked without being checked, and a
   lead byte of 0xf8 or above stores a zero unit without consuming input. */
static int convert_utf8_to_utf32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 src = input;
	PCRE2_UCHAR32 *dst = output;
	int room = max_length;

	if (room == 0)
		return 0;

	/* "room > 1" always keeps one unit free for the terminator. */
	while (*src && room > 1) {
		unsigned int lead = *src;
		unsigned int cp = 0;

		if (offsetmap)
			*offsetmap++ = (int)(src - input);

		/* The sequence length is selected by the high bits of the lead byte. */
		if (lead < 0xc0) {
			cp = lead;
			src++;
		} else if ((lead & 0x20) == 0) {
			cp = ((lead & 0x1f) << 6) | (src[1] & 0x3f);
			src += 2;
		} else if ((lead & 0x10) == 0) {
			cp = ((lead & 0x0f) << 12) | ((src[1] & 0x3f) << 6) | (src[2] & 0x3f);
			src += 3;
		} else if ((lead & 0x08) == 0) {
			cp = ((lead & 0x07) << 18) | ((src[1] & 0x3f) << 12) | ((src[2] & 0x3f) << 6) | (src[3] & 0x3f);
			src += 4;
		}

		*dst++ = cp;
		room--;
	}

	if (offsetmap)
		*offsetmap = (int)(src - input);
	*dst = 0;
	return (int)(dst - output);
}
1108
/* Widens a zero-terminated byte string to 32-bit units, one unit per byte,
with no decoding. At most max_length - 1 units are copied so the terminating
zero always fits. Returns the number of units copied, terminator excluded;
when max_length is 0 nothing is written and 0 is returned. */
static int copy_char8_to_char32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int max_length)
{
	int copied = 0;
	int room;

	if (max_length == 0)
		return 0;

	for (room = max_length; input[copied] && room > 1; room--) {
		output[copied] = input[copied];
		copied++;
	}

	output[copied] = '\0';
	return copied;
}
1124
/* Number of elements in each of the two 32-bit scratch arrays below. */
#define REGTEST_MAX_LENGTH32 4096
/* Scratch buffer that receives the 32-bit form of a test pattern before it is
compiled. */
static PCRE2_UCHAR32 regtest_buf32[REGTEST_MAX_LENGTH32];
/* NOTE(review): presumably the offsetmap target for converted subject strings;
its use lies outside this part of the file - confirm. */
static int regtest_offsetmap32[REGTEST_MAX_LENGTH32];
1128
1129 #endif /* SUPPORT_PCRE2_32 */
1130
/* Reports whether a zero-terminated string is pure 7-bit ASCII.
Returns 1 when every byte before the terminator is <= 127 (an empty string
qualifies) and 0 as soon as a byte above 127 is found. Bytes are examined as
unsigned values, so the result does not depend on the signedness of char. */
static int check_ascii(const char *input)
{
	const unsigned char *byte;

	for (byte = (const unsigned char *)input; *byte != 0; byte++) {
		if (*byte > 127)
			return 0;
	}
	return 1;
}
1141
1142 #define OVECTOR_SIZE 15
1143
regression_tests(void)1144 static int regression_tests(void)
1145 {
1146 struct regression_test_case *current = regression_test_cases;
1147 int error;
1148 PCRE2_SIZE err_offs;
1149 int is_successful;
1150 int is_ascii;
1151 int total = 0;
1152 int successful = 0;
1153 int successful_row = 0;
1154 int counter = 0;
1155 int jit_compile_mode;
1156 int utf = 0;
1157 int disabled_options = 0;
1158 int i;
1159 #ifdef SUPPORT_PCRE2_8
1160 pcre2_code_8 *re8;
1161 pcre2_compile_context_8 *ccontext8;
1162 pcre2_match_data_8 *mdata8_1;
1163 pcre2_match_data_8 *mdata8_2;
1164 pcre2_match_context_8 *mcontext8;
1165 PCRE2_SIZE *ovector8_1 = NULL;
1166 PCRE2_SIZE *ovector8_2 = NULL;
1167 int return_value8[2];
1168 #endif
1169 #ifdef SUPPORT_PCRE2_16
1170 pcre2_code_16 *re16;
1171 pcre2_compile_context_16 *ccontext16;
1172 pcre2_match_data_16 *mdata16_1;
1173 pcre2_match_data_16 *mdata16_2;
1174 pcre2_match_context_16 *mcontext16;
1175 PCRE2_SIZE *ovector16_1 = NULL;
1176 PCRE2_SIZE *ovector16_2 = NULL;
1177 int return_value16[2];
1178 int length16;
1179 #endif
1180 #ifdef SUPPORT_PCRE2_32
1181 pcre2_code_32 *re32;
1182 pcre2_compile_context_32 *ccontext32;
1183 pcre2_match_data_32 *mdata32_1;
1184 pcre2_match_data_32 *mdata32_2;
1185 pcre2_match_context_32 *mcontext32;
1186 PCRE2_SIZE *ovector32_1 = NULL;
1187 PCRE2_SIZE *ovector32_2 = NULL;
1188 int return_value32[2];
1189 int length32;
1190 #endif
1191
1192 #if defined SUPPORT_PCRE2_8
1193 PCRE2_UCHAR8 cpu_info[128];
1194 #elif defined SUPPORT_PCRE2_16
1195 PCRE2_UCHAR16 cpu_info[128];
1196 #elif defined SUPPORT_PCRE2_32
1197 PCRE2_UCHAR32 cpu_info[128];
1198 #endif
1199 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1200 int return_value;
1201 #endif
1202
1203 /* This test compares the behaviour of interpreter and JIT. Although disabling
1204 utf or ucp may make tests fail, if the pcre_exec result is the SAME, it is
1205 still considered successful from pcre_jit_test point of view. */
1206
1207 #if defined SUPPORT_PCRE2_8
1208 pcre2_config_8(PCRE2_CONFIG_JITTARGET, &cpu_info);
1209 #elif defined SUPPORT_PCRE2_16
1210 pcre2_config_16(PCRE2_CONFIG_JITTARGET, &cpu_info);
1211 #elif defined SUPPORT_PCRE2_32
1212 pcre2_config_32(PCRE2_CONFIG_JITTARGET, &cpu_info);
1213 #endif
1214
1215 printf("Running JIT regression tests\n");
1216 printf(" target CPU of SLJIT compiler: ");
1217 for (i = 0; cpu_info[i]; i++)
1218 printf("%c", (char)(cpu_info[i]));
1219 printf("\n");
1220
1221 #if defined SUPPORT_PCRE2_8
1222 pcre2_config_8(PCRE2_CONFIG_UNICODE, &utf);
1223 #elif defined SUPPORT_PCRE2_16
1224 pcre2_config_16(PCRE2_CONFIG_UNICODE, &utf);
1225 #elif defined SUPPORT_PCRE2_32
1226 pcre2_config_32(PCRE2_CONFIG_UNICODE, &utf);
1227 #endif
1228
1229 if (!utf)
1230 disabled_options |= PCRE2_UTF;
1231 #ifdef SUPPORT_PCRE2_8
1232 printf(" in 8 bit mode with UTF-8 %s:\n", utf ? "enabled" : "disabled");
1233 #endif
1234 #ifdef SUPPORT_PCRE2_16
1235 printf(" in 16 bit mode with UTF-16 %s:\n", utf ? "enabled" : "disabled");
1236 #endif
1237 #ifdef SUPPORT_PCRE2_32
1238 printf(" in 32 bit mode with UTF-32 %s:\n", utf ? "enabled" : "disabled");
1239 #endif
1240
1241 while (current->pattern) {
1242 /* printf("\nPattern: %s :\n", current->pattern); */
1243 total++;
1244 is_ascii = 0;
1245 if (!(current->start_offset & F_PROPERTY))
1246 is_ascii = check_ascii(current->pattern) && check_ascii(current->input);
1247
1248 if (current->match_options & PCRE2_PARTIAL_SOFT)
1249 jit_compile_mode = PCRE2_JIT_PARTIAL_SOFT;
1250 else if (current->match_options & PCRE2_PARTIAL_HARD)
1251 jit_compile_mode = PCRE2_JIT_PARTIAL_HARD;
1252 else
1253 jit_compile_mode = PCRE2_JIT_COMPLETE;
1254 error = 0;
1255 #ifdef SUPPORT_PCRE2_8
1256 re8 = NULL;
1257 ccontext8 = pcre2_compile_context_create_8(NULL);
1258 if (ccontext8) {
1259 if (GET_NEWLINE(current->newline))
1260 pcre2_set_newline_8(ccontext8, GET_NEWLINE(current->newline));
1261 if (GET_BSR(current->newline))
1262 pcre2_set_bsr_8(ccontext8, GET_BSR(current->newline));
1263
1264 if (!(current->start_offset & F_NO8)) {
1265 re8 = pcre2_compile_8((PCRE2_SPTR8)current->pattern, PCRE2_ZERO_TERMINATED,
1266 current->compile_options & ~disabled_options,
1267 &error, &err_offs, ccontext8);
1268
1269 if (!re8 && (utf || is_ascii))
1270 printf("\n8 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1271 }
1272 pcre2_compile_context_free_8(ccontext8);
1273 }
1274 else
1275 printf("\n8 bit: Cannot allocate compile context\n");
1276 #endif
1277 #ifdef SUPPORT_PCRE2_16
1278 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1279 convert_utf8_to_utf16((PCRE2_SPTR8)current->pattern, regtest_buf16, NULL, REGTEST_MAX_LENGTH16);
1280 else
1281 copy_char8_to_char16((PCRE2_SPTR8)current->pattern, regtest_buf16, REGTEST_MAX_LENGTH16);
1282
1283 re16 = NULL;
1284 ccontext16 = pcre2_compile_context_create_16(NULL);
1285 if (ccontext16) {
1286 if (GET_NEWLINE(current->newline))
1287 pcre2_set_newline_16(ccontext16, GET_NEWLINE(current->newline));
1288 if (GET_BSR(current->newline))
1289 pcre2_set_bsr_16(ccontext16, GET_BSR(current->newline));
1290
1291 if (!(current->start_offset & F_NO16)) {
1292 re16 = pcre2_compile_16(regtest_buf16, PCRE2_ZERO_TERMINATED,
1293 current->compile_options & ~disabled_options,
1294 &error, &err_offs, ccontext16);
1295
1296 if (!re16 && (utf || is_ascii))
1297 printf("\n16 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1298 }
1299 pcre2_compile_context_free_16(ccontext16);
1300 }
1301 else
1302 printf("\n16 bit: Cannot allocate compile context\n");
1303 #endif
1304 #ifdef SUPPORT_PCRE2_32
1305 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1306 convert_utf8_to_utf32((PCRE2_SPTR8)current->pattern, regtest_buf32, NULL, REGTEST_MAX_LENGTH32);
1307 else
1308 copy_char8_to_char32((PCRE2_SPTR8)current->pattern, regtest_buf32, REGTEST_MAX_LENGTH32);
1309
1310 re32 = NULL;
1311 ccontext32 = pcre2_compile_context_create_32(NULL);
1312 if (ccontext32) {
1313 if (GET_NEWLINE(current->newline))
1314 pcre2_set_newline_32(ccontext32, GET_NEWLINE(current->newline));
1315 if (GET_BSR(current->newline))
1316 pcre2_set_bsr_32(ccontext32, GET_BSR(current->newline));
1317
1318 if (!(current->start_offset & F_NO32)) {
1319 re32 = pcre2_compile_32(regtest_buf32, PCRE2_ZERO_TERMINATED,
1320 current->compile_options & ~disabled_options,
1321 &error, &err_offs, ccontext32);
1322
1323 if (!re32 && (utf || is_ascii))
1324 printf("\n32 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1325 }
1326 pcre2_compile_context_free_32(ccontext32);
1327 }
1328 else
1329 printf("\n32 bit: Cannot allocate compile context\n");
1330 #endif
1331
1332 counter++;
1333 if ((counter & 0x3) != 0) {
1334 #ifdef SUPPORT_PCRE2_8
1335 setstack8(NULL);
1336 #endif
1337 #ifdef SUPPORT_PCRE2_16
1338 setstack16(NULL);
1339 #endif
1340 #ifdef SUPPORT_PCRE2_32
1341 setstack32(NULL);
1342 #endif
1343 }
1344
1345 #ifdef SUPPORT_PCRE2_8
1346 return_value8[0] = -1000;
1347 return_value8[1] = -1000;
1348 mdata8_1 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1349 mdata8_2 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1350 mcontext8 = pcre2_match_context_create_8(NULL);
1351 if (!mdata8_1 || !mdata8_2 || !mcontext8) {
1352 printf("\n8 bit: Cannot allocate match data\n");
1353 pcre2_match_data_free_8(mdata8_1);
1354 pcre2_match_data_free_8(mdata8_2);
1355 pcre2_match_context_free_8(mcontext8);
1356 pcre2_code_free_8(re8);
1357 re8 = NULL;
1358 } else {
1359 ovector8_1 = pcre2_get_ovector_pointer_8(mdata8_1);
1360 ovector8_2 = pcre2_get_ovector_pointer_8(mdata8_2);
1361 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1362 ovector8_1[i] = -2;
1363 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1364 ovector8_2[i] = -2;
1365 pcre2_set_match_limit_8(mcontext8, 10000000);
1366 }
1367 if (re8) {
1368 return_value8[1] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1369 current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, mcontext8);
1370
1371 if (pcre2_jit_compile_8(re8, jit_compile_mode)) {
1372 printf("\n8 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1373 } else if ((counter & 0x1) != 0) {
1374 setstack8(mcontext8);
1375 return_value8[0] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1376 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1377 } else {
1378 pcre2_jit_stack_assign_8(mcontext8, NULL, getstack8());
1379 return_value8[0] = pcre2_jit_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1380 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1381 }
1382 }
1383 #endif
1384
1385 #ifdef SUPPORT_PCRE2_16
1386 return_value16[0] = -1000;
1387 return_value16[1] = -1000;
1388 mdata16_1 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1389 mdata16_2 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1390 mcontext16 = pcre2_match_context_create_16(NULL);
1391 if (!mdata16_1 || !mdata16_2 || !mcontext16) {
1392 printf("\n16 bit: Cannot allocate match data\n");
1393 pcre2_match_data_free_16(mdata16_1);
1394 pcre2_match_data_free_16(mdata16_2);
1395 pcre2_match_context_free_16(mcontext16);
1396 pcre2_code_free_16(re16);
1397 re16 = NULL;
1398 } else {
1399 ovector16_1 = pcre2_get_ovector_pointer_16(mdata16_1);
1400 ovector16_2 = pcre2_get_ovector_pointer_16(mdata16_2);
1401 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1402 ovector16_1[i] = -2;
1403 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1404 ovector16_2[i] = -2;
1405 pcre2_set_match_limit_16(mcontext16, 10000000);
1406 }
1407 if (re16) {
1408 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1409 length16 = convert_utf8_to_utf16((PCRE2_SPTR8)current->input, regtest_buf16, regtest_offsetmap16, REGTEST_MAX_LENGTH16);
1410 else
1411 length16 = copy_char8_to_char16((PCRE2_SPTR8)current->input, regtest_buf16, REGTEST_MAX_LENGTH16);
1412
1413 return_value16[1] = pcre2_match_16(re16, regtest_buf16, length16,
1414 current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, mcontext16);
1415
1416 if (pcre2_jit_compile_16(re16, jit_compile_mode)) {
1417 printf("\n16 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1418 } else if ((counter & 0x1) != 0) {
1419 setstack16(mcontext16);
1420 return_value16[0] = pcre2_match_16(re16, regtest_buf16, length16,
1421 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1422 } else {
1423 pcre2_jit_stack_assign_16(mcontext16, NULL, getstack16());
1424 return_value16[0] = pcre2_jit_match_16(re16, regtest_buf16, length16,
1425 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1426 }
1427 }
1428 #endif
1429
1430 #ifdef SUPPORT_PCRE2_32
1431 return_value32[0] = -1000;
1432 return_value32[1] = -1000;
1433 mdata32_1 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1434 mdata32_2 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1435 mcontext32 = pcre2_match_context_create_32(NULL);
1436 if (!mdata32_1 || !mdata32_2 || !mcontext32) {
1437 printf("\n32 bit: Cannot allocate match data\n");
1438 pcre2_match_data_free_32(mdata32_1);
1439 pcre2_match_data_free_32(mdata32_2);
1440 pcre2_match_context_free_32(mcontext32);
1441 pcre2_code_free_32(re32);
1442 re32 = NULL;
1443 } else {
1444 ovector32_1 = pcre2_get_ovector_pointer_32(mdata32_1);
1445 ovector32_2 = pcre2_get_ovector_pointer_32(mdata32_2);
1446 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1447 ovector32_1[i] = -2;
1448 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1449 ovector32_2[i] = -2;
1450 pcre2_set_match_limit_32(mcontext32, 10000000);
1451 }
1452 if (re32) {
1453 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1454 length32 = convert_utf8_to_utf32((PCRE2_SPTR8)current->input, regtest_buf32, regtest_offsetmap32, REGTEST_MAX_LENGTH32);
1455 else
1456 length32 = copy_char8_to_char32((PCRE2_SPTR8)current->input, regtest_buf32, REGTEST_MAX_LENGTH32);
1457
1458 return_value32[1] = pcre2_match_32(re32, regtest_buf32, length32,
1459 current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, mcontext32);
1460
1461 if (pcre2_jit_compile_32(re32, jit_compile_mode)) {
1462 printf("\n32 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1463 } else if ((counter & 0x1) != 0) {
1464 setstack32(mcontext32);
1465 return_value32[0] = pcre2_match_32(re32, regtest_buf32, length32,
1466 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1467 } else {
1468 pcre2_jit_stack_assign_32(mcontext32, NULL, getstack32());
1469 return_value32[0] = pcre2_jit_match_32(re32, regtest_buf32, length32,
1470 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1471 }
1472 }
1473 #endif
1474
1475 /* printf("[%d-%d-%d|%d-%d|%d-%d|%d-%d]%s",
1476 return_value8[0], return_value16[0], return_value32[0],
1477 (int)ovector8_1[0], (int)ovector8_1[1],
1478 (int)ovector16_1[0], (int)ovector16_1[1],
1479 (int)ovector32_1[0], (int)ovector32_1[1],
1480 (current->compile_options & PCRE2_CASELESS) ? "C" : ""); */
1481
1482 /* If F_DIFF is set, just run the test, but do not compare the results.
1483 Segfaults can still be captured. */
1484
1485 is_successful = 1;
1486 if (!(current->start_offset & F_DIFF)) {
1487 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1488 if (!(current->start_offset & F_FORCECONV)) {
1489
1490 /* All results must be the same. */
1491 #ifdef SUPPORT_PCRE2_8
1492 if ((return_value = return_value8[0]) != return_value8[1]) {
1493 printf("\n8 bit: Return value differs(J8:%d,I8:%d): [%d] '%s' @ '%s'\n",
1494 return_value8[0], return_value8[1], total, current->pattern, current->input);
1495 is_successful = 0;
1496 } else
1497 #endif
1498 #ifdef SUPPORT_PCRE2_16
1499 if ((return_value = return_value16[0]) != return_value16[1]) {
1500 printf("\n16 bit: Return value differs(J16:%d,I16:%d): [%d] '%s' @ '%s'\n",
1501 return_value16[0], return_value16[1], total, current->pattern, current->input);
1502 is_successful = 0;
1503 } else
1504 #endif
1505 #ifdef SUPPORT_PCRE2_32
1506 if ((return_value = return_value32[0]) != return_value32[1]) {
1507 printf("\n32 bit: Return value differs(J32:%d,I32:%d): [%d] '%s' @ '%s'\n",
1508 return_value32[0], return_value32[1], total, current->pattern, current->input);
1509 is_successful = 0;
1510 } else
1511 #endif
1512 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1513 if (return_value8[0] != return_value16[0]) {
1514 printf("\n8 and 16 bit: Return value differs(J8:%d,J16:%d): [%d] '%s' @ '%s'\n",
1515 return_value8[0], return_value16[0],
1516 total, current->pattern, current->input);
1517 is_successful = 0;
1518 } else
1519 #endif
1520 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1521 if (return_value8[0] != return_value32[0]) {
1522 printf("\n8 and 32 bit: Return value differs(J8:%d,J32:%d): [%d] '%s' @ '%s'\n",
1523 return_value8[0], return_value32[0],
1524 total, current->pattern, current->input);
1525 is_successful = 0;
1526 } else
1527 #endif
1528 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1529 if (return_value16[0] != return_value32[0]) {
1530 printf("\n16 and 32 bit: Return value differs(J16:%d,J32:%d): [%d] '%s' @ '%s'\n",
1531 return_value16[0], return_value32[0],
1532 total, current->pattern, current->input);
1533 is_successful = 0;
1534 } else
1535 #endif
1536 if (return_value >= 0 || return_value == PCRE2_ERROR_PARTIAL) {
1537 if (return_value == PCRE2_ERROR_PARTIAL) {
1538 return_value = 2;
1539 } else {
1540 return_value *= 2;
1541 }
1542 #ifdef SUPPORT_PCRE2_8
1543 return_value8[0] = return_value;
1544 #endif
1545 #ifdef SUPPORT_PCRE2_16
1546 return_value16[0] = return_value;
1547 #endif
1548 #ifdef SUPPORT_PCRE2_32
1549 return_value32[0] = return_value;
1550 #endif
1551 /* Transform back the results. */
1552 if (current->compile_options & PCRE2_UTF) {
1553 #ifdef SUPPORT_PCRE2_16
1554 for (i = 0; i < return_value; ++i) {
1555 if (ovector16_1[i] != PCRE2_UNSET)
1556 ovector16_1[i] = regtest_offsetmap16[ovector16_1[i]];
1557 if (ovector16_2[i] != PCRE2_UNSET)
1558 ovector16_2[i] = regtest_offsetmap16[ovector16_2[i]];
1559 }
1560 #endif
1561 #ifdef SUPPORT_PCRE2_32
1562 for (i = 0; i < return_value; ++i) {
1563 if (ovector32_1[i] != PCRE2_UNSET)
1564 ovector32_1[i] = regtest_offsetmap32[ovector32_1[i]];
1565 if (ovector32_2[i] != PCRE2_UNSET)
1566 ovector32_2[i] = regtest_offsetmap32[ovector32_2[i]];
1567 }
1568 #endif
1569 }
1570
1571 for (i = 0; i < return_value; ++i) {
1572 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1573 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector16_1[i] || ovector8_1[i] != ovector16_2[i]) {
1574 printf("\n8 and 16 bit: Ovector[%d] value differs(J8:%d,I8:%d,J16:%d,I16:%d): [%d] '%s' @ '%s' \n",
1575 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector16_1[i], (int)ovector16_2[i],
1576 total, current->pattern, current->input);
1577 is_successful = 0;
1578 }
1579 #endif
1580 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1581 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector32_1[i] || ovector8_1[i] != ovector32_2[i]) {
1582 printf("\n8 and 32 bit: Ovector[%d] value differs(J8:%d,I8:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1583 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1584 total, current->pattern, current->input);
1585 is_successful = 0;
1586 }
1587 #endif
1588 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1589 if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) {
1590 printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1591 i, (int)ovector16_1[i], (int)ovector16_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1592 total, current->pattern, current->input);
1593 is_successful = 0;
1594 }
1595 #endif
1596 }
1597 }
1598 } else
1599 #endif /* more than one of SUPPORT_PCRE2_8, SUPPORT_PCRE2_16 and SUPPORT_PCRE2_32 */
1600 {
1601 #ifdef SUPPORT_PCRE2_8
1602 if (return_value8[0] != return_value8[1]) {
1603 printf("\n8 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1604 return_value8[0], return_value8[1], total, current->pattern, current->input);
1605 is_successful = 0;
1606 } else if (return_value8[0] >= 0 || return_value8[0] == PCRE2_ERROR_PARTIAL) {
1607 if (return_value8[0] == PCRE2_ERROR_PARTIAL)
1608 return_value8[0] = 2;
1609 else
1610 return_value8[0] *= 2;
1611
1612 for (i = 0; i < return_value8[0]; ++i)
1613 if (ovector8_1[i] != ovector8_2[i]) {
1614 printf("\n8 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1615 i, (int)ovector8_1[i], (int)ovector8_2[i], total, current->pattern, current->input);
1616 is_successful = 0;
1617 }
1618 }
1619 #endif
1620
1621 #ifdef SUPPORT_PCRE2_16
1622 if (return_value16[0] != return_value16[1]) {
1623 printf("\n16 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1624 return_value16[0], return_value16[1], total, current->pattern, current->input);
1625 is_successful = 0;
1626 } else if (return_value16[0] >= 0 || return_value16[0] == PCRE2_ERROR_PARTIAL) {
1627 if (return_value16[0] == PCRE2_ERROR_PARTIAL)
1628 return_value16[0] = 2;
1629 else
1630 return_value16[0] *= 2;
1631
1632 for (i = 0; i < return_value16[0]; ++i)
1633 if (ovector16_1[i] != ovector16_2[i]) {
1634 printf("\n16 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1635 i, (int)ovector16_1[i], (int)ovector16_2[i], total, current->pattern, current->input);
1636 is_successful = 0;
1637 }
1638 }
1639 #endif
1640
1641 #ifdef SUPPORT_PCRE2_32
1642 if (return_value32[0] != return_value32[1]) {
1643 printf("\n32 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1644 return_value32[0], return_value32[1], total, current->pattern, current->input);
1645 is_successful = 0;
1646 } else if (return_value32[0] >= 0 || return_value32[0] == PCRE2_ERROR_PARTIAL) {
1647 if (return_value32[0] == PCRE2_ERROR_PARTIAL)
1648 return_value32[0] = 2;
1649 else
1650 return_value32[0] *= 2;
1651
1652 for (i = 0; i < return_value32[0]; ++i)
1653 if (ovector32_1[i] != ovector32_2[i]) {
1654 printf("\n32 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1655 i, (int)ovector32_1[i], (int)ovector32_2[i], total, current->pattern, current->input);
1656 is_successful = 0;
1657 }
1658 }
1659 #endif
1660 }
1661 }
1662
1663 if (is_successful) {
1664 #ifdef SUPPORT_PCRE2_8
1665 if (!(current->start_offset & F_NO8) && (utf || is_ascii)) {
1666 if (return_value8[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1667 printf("8 bit: Test should match: [%d] '%s' @ '%s'\n",
1668 total, current->pattern, current->input);
1669 is_successful = 0;
1670 }
1671
1672 if (return_value8[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1673 printf("8 bit: Test should not match: [%d] '%s' @ '%s'\n",
1674 total, current->pattern, current->input);
1675 is_successful = 0;
1676 }
1677 }
1678 #endif
1679 #ifdef SUPPORT_PCRE2_16
1680 if (!(current->start_offset & F_NO16) && (utf || is_ascii)) {
1681 if (return_value16[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1682 printf("16 bit: Test should match: [%d] '%s' @ '%s'\n",
1683 total, current->pattern, current->input);
1684 is_successful = 0;
1685 }
1686
1687 if (return_value16[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1688 printf("16 bit: Test should not match: [%d] '%s' @ '%s'\n",
1689 total, current->pattern, current->input);
1690 is_successful = 0;
1691 }
1692 }
1693 #endif
1694 #ifdef SUPPORT_PCRE2_32
1695 if (!(current->start_offset & F_NO32) && (utf || is_ascii)) {
1696 if (return_value32[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1697 printf("32 bit: Test should match: [%d] '%s' @ '%s'\n",
1698 total, current->pattern, current->input);
1699 is_successful = 0;
1700 }
1701
1702 if (return_value32[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1703 printf("32 bit: Test should not match: [%d] '%s' @ '%s'\n",
1704 total, current->pattern, current->input);
1705 is_successful = 0;
1706 }
1707 }
1708 #endif
1709 }
1710
1711 if (is_successful) {
1712 #ifdef SUPPORT_PCRE2_8
1713 if (re8 && !(current->start_offset & F_NO8) && pcre2_get_mark_8(mdata8_1) != pcre2_get_mark_8(mdata8_2)) {
1714 printf("8 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1715 total, current->pattern, current->input);
1716 is_successful = 0;
1717 }
1718 #endif
1719 #ifdef SUPPORT_PCRE2_16
1720 if (re16 && !(current->start_offset & F_NO16) && pcre2_get_mark_16(mdata16_1) != pcre2_get_mark_16(mdata16_2)) {
1721 printf("16 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1722 total, current->pattern, current->input);
1723 is_successful = 0;
1724 }
1725 #endif
1726 #ifdef SUPPORT_PCRE2_32
1727 if (re32 && !(current->start_offset & F_NO32) && pcre2_get_mark_32(mdata32_1) != pcre2_get_mark_32(mdata32_2)) {
1728 printf("32 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1729 total, current->pattern, current->input);
1730 is_successful = 0;
1731 }
1732 #endif
1733 }
1734
1735 #ifdef SUPPORT_PCRE2_8
1736 pcre2_code_free_8(re8);
1737 pcre2_match_data_free_8(mdata8_1);
1738 pcre2_match_data_free_8(mdata8_2);
1739 pcre2_match_context_free_8(mcontext8);
1740 #endif
1741 #ifdef SUPPORT_PCRE2_16
1742 pcre2_code_free_16(re16);
1743 pcre2_match_data_free_16(mdata16_1);
1744 pcre2_match_data_free_16(mdata16_2);
1745 pcre2_match_context_free_16(mcontext16);
1746 #endif
1747 #ifdef SUPPORT_PCRE2_32
1748 pcre2_code_free_32(re32);
1749 pcre2_match_data_free_32(mdata32_1);
1750 pcre2_match_data_free_32(mdata32_2);
1751 pcre2_match_context_free_32(mcontext32);
1752 #endif
1753
1754 if (is_successful) {
1755 successful++;
1756 successful_row++;
1757 printf(".");
1758 if (successful_row >= 60) {
1759 successful_row = 0;
1760 printf("\n");
1761 }
1762 } else
1763 successful_row = 0;
1764
1765 fflush(stdout);
1766 current++;
1767 }
1768 #ifdef SUPPORT_PCRE2_8
1769 setstack8(NULL);
1770 #endif
1771 #ifdef SUPPORT_PCRE2_16
1772 setstack16(NULL);
1773 #endif
1774 #ifdef SUPPORT_PCRE2_32
1775 setstack32(NULL);
1776 #endif
1777
1778 if (total == successful) {
1779 printf("\nAll JIT regression tests are successfully passed.\n");
1780 return 0;
1781 } else {
1782 printf("\nSuccessful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
1783 return 1;
1784 }
1785 }
1786
1787 #if defined SUPPORT_UNICODE
1788
/* Compare the outcome of one match attempt with what a test case expects.

  pattern_index  index of the test case, used only in diagnostics
  type           short label of the match mode, used only in diagnostics
  result         return value of the match function
  match_start    expected ovector[0]; negative when no match is expected
  match_end      expected ovector[1]
  ovector        output vector of the match; read only when a match is expected

Returns 0 when the outcome is the expected one, otherwise prints a diagnostic
and returns 1. */
static int check_invalid_utf_result(int pattern_index, const char *type, int result,
	int match_start, int match_end, PCRE2_SIZE *ovector)
{
	int mismatch = 1;

	if (match_start < 0) {
		/* No match is expected: the only acceptable result is -1. */
		if (result == -1)
			mismatch = 0;
		else
			printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
	} else if (result <= 0) {
		printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
	} else if (ovector[0] != (PCRE2_SIZE)match_start) {
		printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[0], match_start);
	} else if (ovector[1] != (PCRE2_SIZE)match_end) {
		printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[1], match_end);
	} else {
		mismatch = 0;
	}

	return mismatch;
}
1819
1820 #endif /* SUPPORT_UNICODE */
1821
1822 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
1823
/* Shorthands for the option combinations used by the invalid UTF-8 test table. */

/* Compile options: PCRE2_UTF, PCRE2_DOTALL and PCRE2_ANCHORED together. */
#define UDA (PCRE2_ANCHORED | PCRE2_DOTALL | PCRE2_UTF)
/* JIT compile options: complete matching with PCRE2_JIT_INVALID_UTF. */
#define CI (PCRE2_JIT_INVALID_UTF | PCRE2_JIT_COMPLETE)
/* JIT compile options: complete and soft partial matching with PCRE2_JIT_INVALID_UTF. */
#define CPI (PCRE2_JIT_INVALID_UTF | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_COMPLETE)
1827
/* One invalid UTF-8 regression test: a pattern (optionally with a second
variant), a subject string, the window of the subject that is shown to the
matcher, and the expected outcome. Field order matters: the test table uses
positional initializers. */
struct invalid_utf8_regression_test_case {
	/* Option bits passed to pcre2_compile_8(). */
	int compile_options;
	/* Option bits passed to pcre2_jit_compile_8(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match modes are run. */
	int jit_compile_options;
	/* Match start offset, counted from the beginning of the whole input. */
	int start_offset;
	/* Number of bytes at the start of input that are not passed to the matcher. */
	int skip_left;
	/* Number of bytes at the end of input that are not passed to the matcher. */
	int skip_right;
	/* Expected ovector[0]; a negative value means the match must return -1. */
	int match_start;
	/* Expected ovector[1]; checked only when match_start is not negative. */
	int match_end;
	/* Pattern variants; a NULL entry means there is nothing to run for that slot. */
	const char *pattern[2];
	/* Subject string, zero terminated. */
	const char *input;
};
1839
/* The address of this object is stored as pattern[1] of one entry in the
invalid UTF-8 test table. NOTE(review): presumably the test driver compares
pattern[1] against this address to select special handling - confirm. */
static const char invalid_utf8_newline_cr = '\0';
1841
/* Table of invalid UTF-8 regression test cases. Each initializer lists, in
struct field order: compile_options, jit_compile_options, start_offset,
skip_left, skip_right, match_start, match_end, { pattern[0], pattern[1] },
input. A match_start of -1 marks a case where no match is expected. */
1842 static const struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
/* A single anchored "." against byte sequences that are expected either to
match as one character or to be rejected. */
1843 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1844 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
1845 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
1846 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1847 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
1848 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
1849 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
1850 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
1851 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
1852 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
1853 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
1854 { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
1855 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
1856 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
1857 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
1858 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
1859 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
1860 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
1861 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
1862 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
1863 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
1864 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
1865 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
1866 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
1867 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
1868 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
1869 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
1870 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
1871 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
1872 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
1873 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
1874 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
1875 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
1876 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
1877 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
1878 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
1879 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
1880 { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
1881
/* Word boundary assertions (\B, and \b as second variant where given),
tested at start_offset 4. */
1882 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
1883 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80\xf4\xa0\x80\x80" },
1884 { UDA, CPI, 4, 1, 1, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf" },
1885 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
1886 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
1887 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
1888 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
1889 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf\xf0\x8f\xbf\xbf" },
1890 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80\xf5\x80\x80\x80" },
1891 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80\xf4\x90\x80\x80" },
1892 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff\xf4\x8f\xbf\xff" },
1893 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf\xf4\x8f\xff\xbf" },
1894 { UDA, CPI, 4, 0, 1, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80\xef\x80\x80" },
1895 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80\x80\x80\x80\x80" },
1896 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf\xe0\x9f\xbf#" },
1897 { UDA, CPI, 4, 2, 2, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80\xe0\xa0\x80#" },
1898 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80\xf0\x80\x80#" },
1899 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80\xed\xa0\x80#" },
1900 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
1901 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
1902 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
1903 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
1904 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf\xc1\xbf##" },
1905 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0\xdf\xc0##" },
1906 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80\xe0\x80##" },
1907
/* The same word boundary assertions at start_offset 3. */
1908 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
1909 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
1910 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf\xe0\x9f\xbf" },
1911 { UDA, CPI, 3, 1, 1, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf\xef\xbf\xbf" },
1912 { UDA, CPI, 3, 0, 1, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80\xdf\x80" },
1913 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff\xef\xbf\xff" },
1914 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf\xef\xff\xbf" },
1915 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf\xed\xbf\xbf" },
1916
/* The same word boundary assertions at start_offset 2. */
1917 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
1918 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
1919 { UDA, CPI, 2, 1, 1, -1, -1, { "\\B", "\\b" }, "\xdf\xbf\xdf\xbf" },
1920 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf\xc1\xbf" },
1921 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80\xe0\x80" },
1922 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff\xdf\xff" },
1923 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf\xff\xbf" },
1924
/* The same word boundary assertions at start_offset 1. */
1925 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
1926 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
1927 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80" },
1928 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\xb0\xb0" },
1929
/* Caseless back reference. */
1930 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
1931 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
1932 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1933 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1934 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
1935 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1936 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1937 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1938 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1939
/* The \X escape. */
1940 { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
1941 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
1942 { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
1943 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
1944 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
1945 { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
1946 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
1947 { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1948 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1949
/* Negated character class. */
1950 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" },
1951 { UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" },
1952 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" },
1953 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" },
1954
/* Start-of-line assertion in multiline mode; these cases do not set
PCRE2_ANCHORED and start at offset 1. */
1955 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
1956 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
1957 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
1958 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
1959 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
1960 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
1961 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
1962 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
1963 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
1964 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \x85#\xc2\x85#"},
1965 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 7, 8, { "^\\W", NULL }, " \xe2\x80\xf8\xe2\x80\xa8#"},
1966
/* Searching for "#" with PCRE2_FIRSTLINE set. */
1967 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xe2\x80\xf8\xe2\x80\xa8#"},
1968 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 3, 4, { "#", NULL }, "\xe2\x80\xf8#\xe2\x80\xa8#"},
1969 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "abcd\xc2\x85#"},
1970 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 1, 2, { "#", NULL }, "\x85#\xc2\x85#"},
1971 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 5, 6, { "#", NULL }, "\xef,\x80,\xf8#\x0a"},
1972 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xef,\x80,\xf8\x0a#"},
1973
/* A literal pattern containing the bytes \xc7\x85, run with and without
PCRE2_NO_START_OPTIMIZE. */
1974 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1975 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1976 { PCRE2_UTF, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1977 { PCRE2_UTF, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1978
/* A character class holding \s, compiled with PCRE2_UCP. */
1979 { PCRE2_UTF | PCRE2_UCP, CI, 0, 0, 0, -1, -1, { "[\\s]", NULL }, "\xed\xa0\x80" },
1980
1981 /* These two are not invalid UTF tests, but this infrastructure fits better for them. */
1982 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\X{2}", NULL }, "\r\n\n" },
1983 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\R{2}", NULL }, "\r\n\n" },
1984
/* Here pattern[1] is the address of invalid_utf8_newline_cr, not a second
pattern string. NOTE(review): presumably the test driver recognizes this
address and handles the case specially - confirm against the driver. */
1985 { PCRE2_UTF | PCRE2_MULTILINE, CI, 0, 0, 0, -1, -1, { "^.a", &invalid_utf8_newline_cr }, "\xc3\xa7#a" },
1986
/* Entry with NULL patterns and NULL input. NOTE(review): presumably the
end-of-table marker - confirm against the loop that walks this array. */
1987 { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
1988 };
1989
/* The option shorthands are only meant for the test table; drop them here. */
#undef CPI
#undef CI
#undef UDA
1993
/* Run one pattern variant of an invalid UTF-8 test case through the JIT.

The pattern is compiled with pcre2_compile_8(), JIT compiled with the
options of the test case, and then matched with pcre2_jit_match_8() once for
every mode selected in jit_compile_options (complete and/or partial soft).
Each outcome is verified with check_invalid_utf_result().

  current        the test case
  pattern_index  index of the test case, used only in diagnostics
  i              which entry of current->pattern is tested
  ccontext       compile context passed to pcre2_compile_8()
  mdata          match data block that receives the match results

Returns 1 when current->pattern[i] is NULL (nothing to run) or when every
requested match gave the expected outcome. Returns 0 after a diagnostic has
been printed otherwise. */
static int run_invalid_utf8_test(const struct invalid_utf8_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
{
	pcre2_code_8 *code;
	PCRE2_SPTR8 subject;
	int result, errorcode;
	int passed = 0;
	PCRE2_SIZE length, start, erroroffset;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_8((PCRE2_SPTR8)current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Name the variant that was really compiled instead of always printing 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_8(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		goto done;
	}

	/* The matcher only sees the input without its first skip_left and last
	skip_right bytes, so the start offset is shifted by skip_left as well. */
	subject = (PCRE2_SPTR8)(current->input + current->skip_left);
	length = (PCRE2_SIZE)(strlen(current->input) - current->skip_left - current->skip_right);
	start = (PCRE2_SIZE)(current->start_offset - current->skip_left);

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_8(code, subject, length, start, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			goto done;
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_8(code, subject, length, start, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			goto done;
	}

	passed = 1;

done:
	/* Single release point for the compiled pattern on every path after compilation. */
	pcre2_code_free_8(code);
	return passed;
}
2044
invalid_utf8_regression_tests(void)2045 static int invalid_utf8_regression_tests(void)
2046 {
2047 const struct invalid_utf8_regression_test_case *current;
2048 pcre2_compile_context_8 *ccontext;
2049 pcre2_match_data_8 *mdata;
2050 int total = 0, successful = 0;
2051 int result;
2052
2053 printf("\nRunning invalid-utf8 JIT regression tests\n");
2054
2055 ccontext = pcre2_compile_context_create_8(NULL);
2056 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2057 mdata = pcre2_match_data_create_8(4, NULL);
2058
2059 for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
2060 /* printf("\nPattern: %s :\n", current->pattern); */
2061 total++;
2062
2063 result = 1;
2064 if (current->pattern[1] != &invalid_utf8_newline_cr)
2065 {
2066 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2067 result = 0;
2068 if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
2069 result = 0;
2070 } else {
2071 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_CR);
2072 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2073 result = 0;
2074 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2075 }
2076
2077 if (result) {
2078 successful++;
2079 }
2080
2081 printf(".");
2082 if ((total % 60) == 0)
2083 printf("\n");
2084 }
2085
2086 if ((total % 60) != 0)
2087 printf("\n");
2088
2089 pcre2_match_data_free_8(mdata);
2090 pcre2_compile_context_free_8(ccontext);
2091
2092 if (total == successful) {
2093 printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
2094 return 0;
2095 } else {
2096 printf("\nInvalid UTF8 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2097 return 1;
2098 }
2099 }
2100
2101 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_8 */
2102
/* Placeholder for builds without Unicode support or without the 8-bit
library: there are no tests to run, so success (0) is reported. */
static int invalid_utf8_regression_tests(void)
{
	return 0;
}
2107
2108 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_8 */
2109
2110 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_16
2111
/* Shorthands for the two option columns of the 16-bit test table.
   UDA  compile options: UTF, dot matches everything, anchored
   CI   JIT options: complete matching on possibly invalid UTF input
   CPI  CI plus soft partial matching */
#define UDA	(PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI	(PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI	(CI | PCRE2_JIT_PARTIAL_SOFT)
2115
2116 struct invalid_utf16_regression_test_case {
2117 int compile_options;
2118 int jit_compile_options;
2119 int start_offset;
2120 int skip_left;
2121 int skip_right;
2122 int match_start;
2123 int match_end;
2124 const PCRE2_UCHAR16 *pattern[2];
2125 const PCRE2_UCHAR16 *input;
2126 };
2127
2128 static PCRE2_UCHAR16 allany16[] = { '.', 0 };
2129 static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 };
2130 static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 };
2131 static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 };
2132 static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 };
2133 static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 };
2134 static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 };
2135 static PCRE2_UCHAR16 generic16[] = { '#', 0xd800, 0xdc00, '#', 0 };
2136 static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
2137 static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, 0xd800, 0xdc00, 0 };
2138 static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, 0xdbff, 0xdfff, 0 };
2139 static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, 0xd800, 0xdbff, 0 };
2140 static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, 0xdc00, '#', 0 };
2141 static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 };
2142 static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
2143 static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 };
2144 static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 };
2145 static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 };
2146 static PCRE2_UCHAR16 test16_11[] = { 0xdc00, 0xdc00, 0xd800, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2147 static PCRE2_UCHAR16 test16_12[] = { '#', 0xd800, 0xdc00, 0xd800, '#', 0xd800, 0xdc00, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2148
2149 static const struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
2150 { UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 },
2151 { UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 },
2152 { UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 },
2153 { UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 },
2154 { UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 },
2155 { UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_2 },
2156 { UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 },
2157 { UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 },
2158 { UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_3 },
2159 { UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 },
2160
2161 { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 },
2162 { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 },
2163 { UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 },
2164 { UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 },
2165 { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 },
2166 { UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 },
2167 { UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 },
2168 { UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 },
2169 { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 },
2170 { UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 },
2171
2172 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 },
2173 { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 },
2174 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 },
2175 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 },
2176
2177 { UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 },
2178 { UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 },
2179 { UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 },
2180 { UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 },
2181 { UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 },
2182 { UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 },
2183
2184 { UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
2185 { UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 },
2186 { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
2187
2188 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, NULL }, test16_9 },
2189 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 },
2190
2191 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
2192 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },
2193 { PCRE2_UTF, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
2194 { PCRE2_UTF, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },
2195
2196 { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
2197 };
2198
2199 #undef UDA
2200 #undef CI
2201 #undef CPI
2202
/* Runs one pattern of an invalid UTF-16 regression test case through the JIT.

The pattern is compiled with the compile options of the test case, JIT
compiled with its JIT options, and then matched once for every requested mode
(complete and/or soft partial). The subject is current->input with skip_left
code units dropped from the front and skip_right dropped from the end;
start_offset is rebased by skip_left before it is passed to the matcher.

Arguments:
  current        test case to run
  pattern_index  index of the test case, used only in diagnostics
  i              index of the pattern to use within current->pattern[]
  ccontext       compile context passed to pcre2_compile_16()
  mdata          match data block that receives the offset vector

Returns:         1 if current->pattern[i] is NULL (nothing to do) or if
                 check_invalid_utf_result() returned zero for every requested
                 match mode; 0 if the pattern cannot be compiled, cannot be
                 JIT compiled, or check_invalid_utf_result() returned non-zero
*/
static int run_invalid_utf16_test(const struct invalid_utf16_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
{
	pcre2_code_16 *code;
	int result, errorcode;
	int passed = 0;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR16 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);

	/* An empty pattern slot counts as a success. */
	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Print the pattern slot as well, so that a failure of the second
		pattern is not attributed to the first one. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_16(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		goto release;
	}

	/* Count the code units of the zero-terminated subject. */
	input = current->input;
	length = 0;

	while (*input++ != 0)
		length++;

	length -= current->skip_left + current->skip_right;

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_16(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			goto release;
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_16(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			goto release;
	}

	passed = 1;

release:
	/* Single exit point for every path that owns a compiled pattern. */
	pcre2_code_free_16(code);
	return passed;
}
2260
invalid_utf16_regression_tests(void)2261 static int invalid_utf16_regression_tests(void)
2262 {
2263 const struct invalid_utf16_regression_test_case *current;
2264 pcre2_compile_context_16 *ccontext;
2265 pcre2_match_data_16 *mdata;
2266 int total = 0, successful = 0;
2267 int result;
2268
2269 printf("\nRunning invalid-utf16 JIT regression tests\n");
2270
2271 ccontext = pcre2_compile_context_create_16(NULL);
2272 pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
2273 mdata = pcre2_match_data_create_16(4, NULL);
2274
2275 for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
2276 /* printf("\nPattern: %s :\n", current->pattern); */
2277 total++;
2278
2279 result = 1;
2280 if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
2281 result = 0;
2282 if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
2283 result = 0;
2284
2285 if (result) {
2286 successful++;
2287 }
2288
2289 printf(".");
2290 if ((total % 60) == 0)
2291 printf("\n");
2292 }
2293
2294 if ((total % 60) != 0)
2295 printf("\n");
2296
2297 pcre2_match_data_free_16(mdata);
2298 pcre2_compile_context_free_16(ccontext);
2299
2300 if (total == successful) {
2301 printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");
2302 return 0;
2303 } else {
2304 printf("\nInvalid UTF16 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2305 return 1;
2306 }
2307 }
2308
2309 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_16 */
2310
/* Placeholder for builds without Unicode support or without the 16-bit
library: there are no tests to run, so success (0) is reported. */
static int invalid_utf16_regression_tests(void)
{
	return 0;
}
2315
2316 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
2317
2318 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32
2319
/* Shorthands for the two option columns of the 32-bit test table.
   UDA  compile options: UTF, dot matches everything, anchored
   CI   JIT options: complete matching on possibly invalid UTF input
   CPI  CI plus soft partial matching */
#define UDA	(PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI	(PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI	(CI | PCRE2_JIT_PARTIAL_SOFT)
2323
2324 struct invalid_utf32_regression_test_case {
2325 int compile_options;
2326 int jit_compile_options;
2327 int start_offset;
2328 int skip_left;
2329 int skip_right;
2330 int match_start;
2331 int match_end;
2332 const PCRE2_UCHAR32 *pattern[2];
2333 const PCRE2_UCHAR32 *input;
2334 };
2335
2336 static PCRE2_UCHAR32 allany32[] = { '.', 0 };
2337 static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 };
2338 static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 };
2339 static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 };
2340 static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
2341 static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
2342 static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
2343 static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x110000, 0x10ffff, 0 };
2344 static PCRE2_UCHAR32 test32_2[] = { 0xd7ff, 0xe000, 0xd800, 0xdfff, 0xe000, 0xdfff, 0xd800, 0 };
2345 static PCRE2_UCHAR32 test32_3[] = { 'a', 'A', 0x110000, 0 };
2346 static PCRE2_UCHAR32 test32_4[] = { '#', 0x10ffff, 0x110000, 0 };
2347 static PCRE2_UCHAR32 test32_5[] = { ' ', 0x2028, '#', 0 };
2348 static PCRE2_UCHAR32 test32_6[] = { ' ', 0x110000, 0x2028, '#', 0 };
2349
2350 static const struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
2351 { UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
2352 { UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
2353 { UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_2 },
2354 { UDA, CI, 1, 0, 0, 1, 2, { allany32, NULL }, test32_2 },
2355 { UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
2356 { UDA, CI, 3, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
2357
2358 { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
2359 { UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
2360 { UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_2 },
2361 { UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
2362 { UDA, CPI, 6, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
2363
2364 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_3 },
2365 { UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_3 },
2366
2367 { UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
2368 { UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
2369 { UDA, CPI, 1, 0, 0, 1, 2, { grapheme32, NULL }, test32_2 },
2370 { UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
2371 { UDA, CPI, 3, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
2372 { UDA, CPI, 4, 0, 0, 4, 5, { grapheme32, NULL }, test32_2 },
2373
2374 { UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
2375 { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_4 },
2376 { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
2377 { UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_2 },
2378 { UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_2 },
2379
2380 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_5 },
2381 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_6 },
2382
2383 { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
2384 };
2385
2386 #undef UDA
2387 #undef CI
2388 #undef CPI
2389
/* Runs one pattern of an invalid UTF-32 regression test case through the JIT.

The pattern is compiled with the compile options of the test case, JIT
compiled with its JIT options, and then matched once for every requested mode
(complete and/or soft partial). The subject is current->input with skip_left
code units dropped from the front and skip_right dropped from the end;
start_offset is rebased by skip_left before it is passed to the matcher.

Arguments:
  current        test case to run
  pattern_index  index of the test case, used only in diagnostics
  i              index of the pattern to use within current->pattern[]
  ccontext       compile context passed to pcre2_compile_32()
  mdata          match data block that receives the offset vector

Returns:         1 if current->pattern[i] is NULL (nothing to do) or if
                 check_invalid_utf_result() returned zero for every requested
                 match mode; 0 if the pattern cannot be compiled, cannot be
                 JIT compiled, or check_invalid_utf_result() returned non-zero
*/
static int run_invalid_utf32_test(const struct invalid_utf32_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata)
{
	pcre2_code_32 *code;
	int result, errorcode;
	int passed = 0;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR32 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata);

	/* An empty pattern slot counts as a success. */
	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Print the pattern slot as well, so that a failure of the second
		pattern is not attributed to the first one. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		goto release;
	}

	/* Count the code units of the zero-terminated subject. */
	input = current->input;
	length = 0;

	while (*input++ != 0)
		length++;

	length -= current->skip_left + current->skip_right;

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_32(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			goto release;
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_32(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			goto release;
	}

	passed = 1;

release:
	/* Single exit point for every path that owns a compiled pattern. */
	pcre2_code_free_32(code);
	return passed;
}
2447
invalid_utf32_regression_tests(void)2448 static int invalid_utf32_regression_tests(void)
2449 {
2450 const struct invalid_utf32_regression_test_case *current;
2451 pcre2_compile_context_32 *ccontext;
2452 pcre2_match_data_32 *mdata;
2453 int total = 0, successful = 0;
2454 int result;
2455
2456 printf("\nRunning invalid-utf32 JIT regression tests\n");
2457
2458 ccontext = pcre2_compile_context_create_32(NULL);
2459 pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY);
2460 mdata = pcre2_match_data_create_32(4, NULL);
2461
2462 for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) {
2463 /* printf("\nPattern: %s :\n", current->pattern); */
2464 total++;
2465
2466 result = 1;
2467 if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata))
2468 result = 0;
2469 if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata))
2470 result = 0;
2471
2472 if (result) {
2473 successful++;
2474 }
2475
2476 printf(".");
2477 if ((total % 60) == 0)
2478 printf("\n");
2479 }
2480
2481 if ((total % 60) != 0)
2482 printf("\n");
2483
2484 pcre2_match_data_free_32(mdata);
2485 pcre2_compile_context_free_32(ccontext);
2486
2487 if (total == successful) {
2488 printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n");
2489 return 0;
2490 } else {
2491 printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2492 return 1;
2493 }
2494 }
2495
2496 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */
2497
/* Placeholder for builds without Unicode support or without the 32-bit
library: there are no tests to run, so success (0) is reported. */
static int invalid_utf32_regression_tests(void)
{
	return 0;
}
2502
2503 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */
2504
2505 /* End of pcre2_jit_test.c */
2506