1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 #ifdef HAVE_CONFIG_H
42 #include "config.h"
43 #endif
44
#include <stdint.h>
#include <stdio.h>
#include <string.h>
47
48 #define PCRE2_CODE_UNIT_WIDTH 0
49 #include "pcre2.h"
50
51 /*
52 Letter characters:
53 \xe6\x92\xad = 0x64ad = 25773 (kanji)
54 Non-letter characters:
55 \xc2\xa1 = 0xa1 = (Inverted Exclamation Mark)
56 \xf3\xa9\xb7\x80 = 0xe9dc0 = 957888
57 \xed\xa0\x80 = 55296 = 0xd800 (Invalid UTF character)
58 \xed\xb0\x80 = 56320 = 0xdc00 (Invalid UTF character)
59 Newlines:
60 \xc2\x85 = 0x85 = 133 (NExt Line = NEL)
61 \xe2\x80\xa8 = 0x2028 = 8232 (Line Separator)
62 Othercase pairs:
63 \xc3\xa9 = 0xe9 = 233 (e')
64 \xc3\x89 = 0xc9 = 201 (E')
65 \xc3\xa1 = 0xe1 = 225 (a')
66 \xc3\x81 = 0xc1 = 193 (A')
67 \x53 = 0x53 = S
68 \x73 = 0x73 = s
69 \xc5\xbf = 0x17f = 383 (long S)
70 \xc8\xba = 0x23a = 570
71 \xe2\xb1\xa5 = 0x2c65 = 11365
72 \xe1\xbd\xb8 = 0x1f78 = 8056
73 \xe1\xbf\xb8 = 0x1ff8 = 8184
74 \xf0\x90\x90\x80 = 0x10400 = 66560
75 \xf0\x90\x90\xa8 = 0x10428 = 66600
76 \xc7\x84 = 0x1c4 = 452
77 \xc7\x85 = 0x1c5 = 453
78 \xc7\x86 = 0x1c6 = 454
79 Caseless sets:
80 ucp_Armenian - \x{531}-\x{556} -> \x{561}-\x{586}
81 ucp_Coptic - \x{2c80}-\x{2ce3} -> caseless: XOR 0x1
82 ucp_Latin - \x{ff21}-\x{ff3a} -> \x{ff41}-\x{ff5a}
83
84 Mark property:
85 \xcc\x8d = 0x30d = 781
86 Special:
87 \xc2\x80 = 0x80 = 128 (lowest 2 byte character)
88 \xdf\xbf = 0x7ff = 2047 (highest 2 byte character)
89 \xe0\xa0\x80 = 0x800 = 2048 (lowest 3 byte character)
90 \xef\xbf\xbf = 0xffff = 65535 (highest 3 byte character)
91 \xf0\x90\x80\x80 = 0x10000 = 65536 (lowest 4 byte character)
92 \xf4\x8f\xbf\xbf = 0x10ffff = 1114111 (highest allowed utf character)
93 */
94
/* Prototypes for the test suites invoked by main(). Each one takes no
   arguments and returns an int that main() folds into the exit status. */
static int regression_tests(void);
static int invalid_utf8_regression_tests(void);
static int invalid_utf16_regression_tests(void);
static int invalid_utf32_regression_tests(void);
99
/* Entry point of the JIT regression test program.
 *
 * Asks the library whether JIT support is available, using the first
 * code-unit width that was compiled in (8, then 16, then 32). Without JIT
 * the program prints a message and exits with status 1. Otherwise the four
 * regression suites are run and the bitwise OR of their return values
 * becomes the exit status. */
int main(void)
{
	/* PCRE2_CONFIG_JIT stores its answer as a uint32_t, so the output
	   variable must have that type rather than int. */
	uint32_t jit = 0;
	int result;

#if defined SUPPORT_PCRE2_8
	pcre2_config_8(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_16
	pcre2_config_16(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_32
	pcre2_config_32(PCRE2_CONFIG_JIT, &jit);
#endif

	if (jit == 0) {
		printf("JIT must be enabled to run pcre2_jit_test\n");
		return 1;
	}

	/* Run the suites as separate statements: the operands of a single
	   `a() | b() | ...` expression may be evaluated in any order, which
	   would make the order of the suites compiler dependent. Every suite
	   still runs regardless of what the earlier ones returned. */
	result = regression_tests();
	result |= invalid_utf8_regression_tests();
	result |= invalid_utf16_regression_tests();
	result |= invalid_utf32_regression_tests();
	return result;
}
119
120 /* --------------------------------------------------------------------------------------- */
121
122 #if !(defined SUPPORT_PCRE2_8) && !(defined SUPPORT_PCRE2_16) && !(defined SUPPORT_PCRE2_32)
123 #error SUPPORT_PCRE2_8 or SUPPORT_PCRE2_16 or SUPPORT_PCRE2_32 must be defined
124 #endif
125
/* Shorthand names for the compile-option combinations used in the first
   column of the regression test table. Letter key:
     C = PCRE2_CASELESS, M = PCRE2_MULTILINE, U = PCRE2_UTF, P = PCRE2_UCP. */

/* Single options. */
#define M    (PCRE2_MULTILINE)
#define U    (PCRE2_UTF)

/* Multiline combinations. */
#define MP   (PCRE2_MULTILINE | PCRE2_UCP)
#define MU   (PCRE2_MULTILINE | PCRE2_UTF)
#define MUP  (PCRE2_MULTILINE | PCRE2_UTF | PCRE2_UCP)

/* Caseless multiline combinations. */
#define CM   (PCRE2_CASELESS | PCRE2_MULTILINE)
#define CMU  (PCRE2_CASELESS | PCRE2_MULTILINE | PCRE2_UTF)
#define CMUP (PCRE2_CASELESS | PCRE2_MULTILINE | PCRE2_UTF | PCRE2_UCP)
134
/* The second column of a test table entry packs two settings into one int:
   the low 16 bits hold the newline convention and the bits above them hold
   the \R (BSR) convention, e.g. PCRE2_NEWLINE_CRLF | BSR(PCRE2_BSR_UNICODE). */
#define BSR(bsr)            ((bsr) << 16)
#define GET_BSR(packed)     ((packed) >> 16)
#define GET_NEWLINE(packed) ((packed) & 0xffff)

/* Newline convention used by most entries of the test table. */
#define A PCRE2_NEWLINE_ANYCRLF
140
/* Flag bits that the test table ORs into the start_offset column of an
   entry (for example `0 | F_NOMATCH`). OFFSET_MASK covers the low 16 bits,
   which lie below every flag. How each flag is acted upon is decided by
   code outside this section. */
#define OFFSET_MASK 0x0000ffff

/* NOTE(review): F_NO16 and F_NO32 have the same value, so the two flags
   cannot be told apart - confirm against the code that tests them. */
#define F_NO8       0x00010000
#define F_NO16      0x00020000
#define F_NO32      0x00020000
#define F_NOMATCH   0x00040000
#define F_DIFF      0x00080000
#define F_FORCECONV 0x00100000
#define F_PROPERTY  0x00200000
149
/* One entry of the regression test table.
 *
 * Entries are written with positional initializers, so the order of the
 * members below must not change. */
struct regression_test_case {
	/* Compile options, e.g. MU or CMUP | PCRE2_DUPNAMES. */
	int compile_options;
	/* Newline convention, optionally OR-ed with BSR(...) in the upper bits. */
	int newline;
	/* Match-time options such as PCRE2_NOTBOL or PCRE2_NOTEOL. */
	int match_options;
	/* Start offset in the low bits, OR-ed with F_* flag bits. */
	int start_offset;
	/* Regular expression text. */
	const char *pattern;
	/* Subject text. */
	const char *input;
};
158
159 static struct regression_test_case regression_test_cases[] = {
160 /* Constant strings. */
161 { MU, A, 0, 0, "AbC", "AbAbC" },
162 { MU, A, 0, 0, "ACCEPT", "AACACCACCEACCEPACCEPTACCEPTT" },
163 { CMU, A, 0, 0, "aA#\xc3\xa9\xc3\x81", "aA#Aa#\xc3\x89\xc3\xa1" },
164 { M, A, 0, 0, "[^a]", "aAbB" },
165 { CM, A, 0, 0, "[^m]", "mMnN" },
166 { M, A, 0, 0, "a[^b][^#]", "abacd" },
167 { CM, A, 0, 0, "A[^B][^E]", "abacd" },
168 { CMU, A, 0, 0, "[^x][^#]", "XxBll" },
169 { MU, A, 0, 0, "[^a]", "aaa\xc3\xa1#Ab" },
170 { CMU, A, 0, 0, "[^A]", "aA\xe6\x92\xad" },
171 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\n+bc" },
172 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\r+bc" },
173 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\r+bc" },
174 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\n+bc" },
175 { MU, A, 0, 0, "[axd]", "sAXd" },
176 { CMU, A, 0, 0, "[axd]", "sAXd" },
177 { CMU, A, 0, 0 | F_NOMATCH, "[^axd]", "DxA" },
178 { MU, A, 0, 0, "[a-dA-C]", "\xe6\x92\xad\xc3\xa9.B" },
179 { MU, A, 0, 0, "[^a-dA-C]", "\xe6\x92\xad\xc3\xa9" },
180 { CMU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
181 { MU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
182 { MU, A, 0, 0, "[^a]", "\xc2\x80[]" },
183 { CMU, A, 0, 0, "\xf0\x90\x90\xa7", "\xf0\x90\x91\x8f" },
184 { CM, A, 0, 0, "1a2b3c4", "1a2B3c51A2B3C4" },
185 { PCRE2_CASELESS, 0, 0, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
186 { PCRE2_CASELESS, 0, 0, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
187 { PCRE2_CASELESS, 0, 0, 0, "a1", "Aa1" },
188 #ifndef NEVER_BACKSLASH_C
189 { M, A, 0, 0, "\\Ca", "cda" },
190 { CM, A, 0, 0, "\\Ca", "CDA" },
191 { M, A, 0, 0 | F_NOMATCH, "\\Cx", "cda" },
192 { CM, A, 0, 0 | F_NOMATCH, "\\Cx", "CDA" },
193 #endif /* !NEVER_BACKSLASH_C */
194 { CMUP, A, 0, 0, "\xf0\x90\x90\x80\xf0\x90\x90\xa8", "\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
195 { CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
196 { CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
197 { CMUP, A, 0, 0, "\xe1\xbd\xb8\xe1\xbf\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
198 { M, A, 0, 0, "[3-57-9]", "5" },
199 { PCRE2_AUTO_CALLOUT, A, 0, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890",
200 "12345678901234567890123456789012345678901234567890123456789012345678901234567890" },
201
202 /* Assertions. */
203 { MU, A, 0, 0, "\\b[^A]", "A_B#" },
204 { M, A, 0, 0 | F_NOMATCH, "\\b\\W", "\n*" },
205 { MU, A, 0, 0, "\\B[^,]\\b[^s]\\b", "#X" },
206 { MP, A, 0, 0, "\\B", "_\xa1" },
207 { MP, A, 0, 0 | F_PROPERTY, "\\b_\\b[,A]\\B", "_," },
208 { MUP, A, 0, 0, "\\b", "\xe6\x92\xad!" },
209 { MUP, A, 0, 0, "\\B", "_\xc2\xa1\xc3\xa1\xc2\x85" },
210 { MUP, A, 0, 0, "\\b[^A]\\B[^c]\\b[^_]\\B", "_\xc3\xa1\xe2\x80\xa8" },
211 { MUP, A, 0, 0, "\\b\\w+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
212 { MU, A, 0, 0 | F_NOMATCH, "\\b.", "\xcd\xbe" },
213 { CMUP, A, 0, 0, "\\By", "\xf0\x90\x90\xa8y" },
214 { M, A, 0, 0 | F_NOMATCH, "\\R^", "\n" },
215 { M, A, 0, 1 | F_NOMATCH, "^", "\n" },
216 { 0, 0, 0, 0, "^ab", "ab" },
217 { 0, 0, 0, 0 | F_NOMATCH, "^ab", "aab" },
218 { M, PCRE2_NEWLINE_CRLF, 0, 0, "^a", "\r\raa\n\naa\r\naa" },
219 { MU, A, 0, 0, "^-", "\xe2\x80\xa8--\xc2\x85-\r\n-" },
220 { M, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--b--\x85--" },
221 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xe2\x80\xa8--" },
222 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xc2\x85--" },
223 { 0, 0, 0, 0, "ab$", "ab" },
224 { 0, 0, 0, 0 | F_NOMATCH, "ab$", "abab\n\n" },
225 { PCRE2_DOLLAR_ENDONLY, 0, 0, 0 | F_NOMATCH, "ab$", "abab\r\n" },
226 { M, PCRE2_NEWLINE_CRLF, 0, 0, "a$", "\r\raa\n\naa\r\naa" },
227 { M, PCRE2_NEWLINE_ANY, 0, 0, "a$", "aaa" },
228 { MU, PCRE2_NEWLINE_ANYCRLF, 0, 0, "#$", "#\xc2\x85###\r#" },
229 { MU, PCRE2_NEWLINE_ANY, 0, 0, "#$", "#\xe2\x80\xa9" },
230 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0 | F_NOMATCH, "^a", "aa\naa" },
231 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0, "^a", "aa\naa" },
232 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\naa" },
233 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\r\n" },
234 { U | PCRE2_DOLLAR_ENDONLY, PCRE2_NEWLINE_ANY, 0, 0 | F_PROPERTY, "\\p{Any}{2,}$", "aa\r\n" },
235 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0, "a$", "aa\naa" },
236 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa" },
237 { U, PCRE2_NEWLINE_CR, 0, 0, "a\\Z", "aaa\r" },
238 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa\n" },
239 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r" },
240 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\n" },
241 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r\n" },
242 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
243 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
244 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
245 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
246 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
247 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
248 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
249 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
250 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
251 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xc2\x85" },
252 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
253 { M, A, 0, 0, "\\Aa", "aaa" },
254 { M, A, 0, 1 | F_NOMATCH, "\\Aa", "aaa" },
255 { M, A, 0, 1, "\\Ga", "aaa" },
256 { M, A, 0, 1 | F_NOMATCH, "\\Ga", "aba" },
257 { M, A, 0, 0, "a\\z", "aaa" },
258 { M, A, 0, 0 | F_NOMATCH, "a\\z", "aab" },
259
260 /* Brackets and alternatives. */
261 { MU, A, 0, 0, "(ab|bb|cd)", "bacde" },
262 { MU, A, 0, 0, "(?:ab|a)(bc|c)", "ababc" },
263 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|efg))", "abac" },
264 { CMU, A, 0, 0, "((aB|(Cc))|(bB)|(?:cd|EFg))", "AcCe" },
265 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|ebg))", "acebebg" },
266 { MU, A, 0, 0, "(?:(a)|(?:b))(cc|(?:d|e))(a|b)k", "accabdbbccbk" },
267 { MU, A, 0, 0, "\xc7\x82|\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
268 { MU, A, 0, 0, "=\xc7\x82|#\xc6\x82", "\xf1\x83\x82\x82=\xc7\x82\xc7\x83" },
269 { MU, A, 0, 0, "\xc7\x82\xc7\x83|\xc6\x82\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
270 { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" },
271 { U, A, 0, 0, "\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80", "\xdf\xbf\xc2\x80\xe4\x84\x80" },
272 { U, A, 0, 0, "(?:\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80)#", "\xdf\xbf\xc2\x80#\xe4\x84\x80#" },
273 { CM, A, 0, 0, "ab|cd", "CD" },
274 { CM, A, 0, 0, "a1277|a1377|bX487", "bx487" },
275 { CM, A, 0, 0, "a1277|a1377|bx487", "bX487" },
276
277 /* Greedy and non-greedy ? operators. */
278 { MU, A, 0, 0, "(?:a)?a", "laab" },
279 { CMU, A, 0, 0, "(A)?A", "llaab" },
280 { MU, A, 0, 0, "(a)?\?a", "aab" }, /* ?? is the prefix of trygraphs in GCC. */
281 { MU, A, 0, 0, "(a)?a", "manm" },
282 { CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
283 { MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
284 { MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
285
286 /* Greedy and non-greedy + operators */
287 { MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
288 { MU, A, 0, 0, "(aa)+?aa", "aaaaaaa" },
289 { MU, A, 0, 0, "(?:aba|ab|a)+l", "ababamababal" },
290 { MU, A, 0, 0, "(?:aba|ab|a)+?l", "ababamababal" },
291 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+?|ss)+e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
292 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+|ss)+?e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
293 { MU, A, 0, 0, "(?:(b(c)+?)+)?\?(?:(bc)+|(cb)+)+(?:m)+", "bccbcccbcbccbcbPbccbcccbcbccbcbmmn" },
294 { MU, A, 0, 0, "(aa|bb){8,1000}", "abaabbaabbaabbaab_aabbaabbaabbaabbaabbaabb_" },
295
296 /* Greedy and non-greedy * operators */
297 { CMU, A, 0, 0, "(?:AA)*AB", "aaaaaaamaaaaaaab" },
298 { MU, A, 0, 0, "(?:aa)*?ab", "aaaaaaamaaaaaaab" },
299 { MU, A, 0, 0, "(aa|ab)*ab", "aaabaaab" },
300 { CMU, A, 0, 0, "(aa|Ab)*?aB", "aaabaaab" },
301 { MU, A, 0, 0, "(a|b)*(?:a)*(?:b)*m", "abbbaaababanabbbaaababamm" },
302 { MU, A, 0, 0, "(a|b)*?(?:a)*?(?:b)*?m", "abbbaaababanabbbaaababamm" },
303 { M, A, 0, 0, "a(a(\\1*)a|(b)b+){0}a", "aa" },
304 { M, A, 0, 0, "((?:a|)*){0}a", "a" },
305
306 /* Combining ? + * operators */
307 { MU, A, 0, 0, "((bm)+)?\?(?:a)*(bm)+n|((am)+?)?(?:a)+(am)*n", "bmbmabmamaaamambmaman" },
308 { MU, A, 0, 0, "(((ab)?cd)*ef)+g", "abcdcdefcdefefmabcdcdefcdefefgg" },
309 { MU, A, 0, 0, "(((ab)?\?cd)*?ef)+?g", "abcdcdefcdefefmabcdcdefcdefefgg" },
310 { MU, A, 0, 0, "(?:(ab)?c|(?:ab)+?d)*g", "ababcdccababddg" },
311 { MU, A, 0, 0, "(?:(?:ab)?\?c|(ab)+d)*?g", "ababcdccababddg" },
312
313 /* Single character iterators. */
314 { MU, A, 0, 0, "(a+aab)+aaaab", "aaaabcaaaabaabcaabcaaabaaaab" },
315 { MU, A, 0, 0, "(a*a*aab)+x", "aaaaabaabaaabmaabx" },
316 { MU, A, 0, 0, "(a*?(b|ab)a*?)+x", "aaaabcxbbaabaacbaaabaabax" },
317 { MU, A, 0, 0, "(a+(ab|ad)a+)+x", "aaabaaaadaabaaabaaaadaaax" },
318 { MU, A, 0, 0, "(a?(a)a?)+(aaa)", "abaaabaaaaaaaa" },
319 { MU, A, 0, 0, "(a?\?(a)a?\?)+(b)", "aaaacaaacaacacbaaab" },
320 { MU, A, 0, 0, "(a{0,4}(b))+d", "aaaaaabaabcaaaaabaaaaabd" },
321 { MU, A, 0, 0, "(a{0,4}?[^b])+d+(a{0,4}[^b])d+", "aaaaadaaaacaadddaaddd" },
322 { MU, A, 0, 0, "(ba{2})+c", "baabaaabacbaabaac" },
323 { MU, A, 0, 0, "(a*+bc++)+", "aaabbcaaabcccab" },
324 { MU, A, 0, 0, "(a?+[^b])+", "babaacacb" },
325 { MU, A, 0, 0, "(a{0,3}+b)(a{0,3}+b)(a{0,3}+)[^c]", "abaabaaacbaabaaaac" },
326 { CMU, A, 0, 0, "([a-c]+[d-f]+?)+?g", "aBdacdehAbDaFgA" },
327 { CMU, A, 0, 0, "[c-f]+k", "DemmFke" },
328 { MU, A, 0, 0, "([DGH]{0,4}M)+", "GGDGHDGMMHMDHHGHM" },
329 { MU, A, 0, 0, "([a-c]{4,}s)+", "abasabbasbbaabsbba" },
330 { CMU, A, 0, 0, "[ace]{3,7}", "AcbDAcEEcEd" },
331 { CMU, A, 0, 0, "[ace]{3,7}?", "AcbDAcEEcEd" },
332 { CMU, A, 0, 0, "[ace]{3,}", "AcbDAcEEcEd" },
333 { CMU, A, 0, 0, "[ace]{3,}?", "AcbDAcEEcEd" },
334 { MU, A, 0, 0, "[ckl]{2,}?g", "cdkkmlglglkcg" },
335 { CMU, A, 0, 0, "[ace]{5}?", "AcCebDAcEEcEd" },
336 { MU, A, 0, 0, "([AbC]{3,5}?d)+", "BACaAbbAEAACCbdCCbdCCAAbb" },
337 { MU, A, 0, 0, "([^ab]{0,}s){2}", "abaabcdsABamsDDs" },
338 { MU, A, 0, 0, "\\b\\w+\\B", "x,a_cd" },
339 { MUP, A, 0, 0, "\\b[^\xc2\xa1]+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
340 { CMU, A, 0, 0, "[^b]+(a*)([^c]?d{3})", "aaaaddd" },
341 { CMUP, A, 0, 0, "\xe1\xbd\xb8{2}", "\xe1\xbf\xb8#\xe1\xbf\xb8\xe1\xbd\xb8" },
342 { CMU, A, 0, 0, "[^\xf0\x90\x90\x80]{2,4}@", "\xf0\x90\x90\xa8\xf0\x90\x90\x80###\xf0\x90\x90\x80@@@" },
343 { CMU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
344 { MU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
345 { MU, A, 0, 0, "[^\xe1\xbd\xb8]{3,}?", "##\xe1\xbd\xb8#\xe1\xbd\xb8#\xc3\x89#\xe1\xbd\xb8" },
346 { MU, A, 0, 0, "\\d+123", "987654321,01234" },
347 { MU, A, 0, 0, "abcd*|\\w+xy", "aaaaa,abxyz" },
348 { MU, A, 0, 0, "(?:abc|((?:amc|\\b\\w*xy)))", "aaaaa,abxyz" },
349 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.abcd#."},
350 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.mbcd#."},
351 { MU, A, 0, 0, ".[ab]*.", "xx" },
352 { MU, A, 0, 0, ".[ab]*a", "xxa" },
353 { MU, A, 0, 0, ".[ab]?.", "xx" },
354 { MU, A, 0, 0, "_[ab]+_*a", "_aa" },
355 { MU, A, 0, 0, "#(A+)#\\d+", "#A#A#0" },
356 { MU, A, 0, 0, "(?P<size>\\d+)m|M", "4M" },
357 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\n?.+#", "\n,\n,#" },
358
359 /* Bracket repeats with limit. */
360 { MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" },
361 { MU, A, 0, 0, "(?:ab|abab){1,5}M", "abababababababababababM" },
362 { MU, A, 0, 0, "(?>ab|abab){1,5}M", "abababababababababababM" },
363 { MU, A, 0, 0, "(?:ab|abab){1,5}?M", "abababababababababababM" },
364 { MU, A, 0, 0, "(?>ab|abab){1,5}?M", "abababababababababababM" },
365 { MU, A, 0, 0, "(?:(ab){1,4}?){1,3}?M", "abababababababababababababM" },
366 { MU, A, 0, 0, "(?:(ab){1,4}){1,3}abababababababababababM", "ababababababababababababM" },
367 { MU, A, 0, 0 | F_NOMATCH, "(?:(ab){1,4}){1,3}abababababababababababM", "abababababababababababM" },
368 { MU, A, 0, 0, "(ab){4,6}?M", "abababababababM" },
369
370 /* Basic character sets. */
371 { MU, A, 0, 0, "(?:\\s)+(?:\\S)+", "ab \t\xc3\xa9\xe6\x92\xad " },
372 { MU, A, 0, 0, "(\\w)*(k)(\\W)?\?", "abcdef abck11" },
373 { MU, A, 0, 0, "\\((\\d)+\\)\\D", "a() (83 (8)2 (9)ab" },
374 { MU, A, 0, 0, "\\w(\\s|(?:\\d)*,)+\\w\\wb", "a 5, 4,, bb 5, 4,, aab" },
375 { MU, A, 0, 0, "(\\v+)(\\V+)", "\x0e\xc2\x85\xe2\x80\xa8\x0b\x09\xe2\x80\xa9" },
376 { MU, A, 0, 0, "(\\h+)(\\H+)", "\xe2\x80\xa8\xe2\x80\x80\x20\xe2\x80\x8a\xe2\x81\x9f\xe3\x80\x80\x09\x20\xc2\xa0\x0a" },
377 { MU, A, 0, 0, "x[bcef]+", "xaxdxecbfg" },
378 { MU, A, 0, 0, "x[bcdghij]+", "xaxexfxdgbjk" },
379 { MU, A, 0, 0, "x[^befg]+", "xbxexacdhg" },
380 { MU, A, 0, 0, "x[^bcdl]+", "xlxbxaekmd" },
381 { MU, A, 0, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
382 { MU, A, 0, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },
383 { CMU, A, 0, 0, "\\x{e9}+", "#\xf0\x90\x90\xa8\xc3\xa8\xc3\xa9\xc3\x89\xc3\x88" },
384 { CMU, A, 0, 0, "[^\\x{e9}]+", "\xc3\xa9#\xf0\x90\x90\xa8\xc3\xa8\xc3\x88\xc3\x89" },
385 { MU, A, 0, 0, "[\\x02\\x7e]+", "\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x02\x7e\x7f" },
386 { MU, A, 0, 0, "[^\\x02\\x7e]+", "\x02\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x7f\x7e" },
387 { MU, A, 0, 0, "[\\x{81}-\\x{7fe}]+", "#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xc2\x81\xdf\xbe\xdf\xbf" },
388 { MU, A, 0, 0, "[^\\x{81}-\\x{7fe}]+", "\xc2\x81#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xdf\xbf\xdf\xbe" },
389 { MU, A, 0, 0, "[\\x{801}-\\x{fffe}]+", "#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xe0\xa0\x81\xef\xbf\xbe\xef\xbf\xbf" },
390 { MU, A, 0, 0, "[^\\x{801}-\\x{fffe}]+", "\xe0\xa0\x81#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xef\xbf\xbf\xef\xbf\xbe" },
391 { MU, A, 0, 0, "[\\x{10001}-\\x{10fffe}]+", "#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf0\x90\x80\x81\xf4\x8f\xbf\xbe\xf4\x8f\xbf\xbf" },
392 { MU, A, 0, 0, "[^\\x{10001}-\\x{10fffe}]+", "\xf0\x90\x80\x81#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbe" },
393 { CMU, A, 0, 0 | F_NOMATCH, "^[\\x{0100}-\\x{017f}]", " " },
394
395 /* Unicode properties. */
396 { MUP, A, 0, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },
397 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\x81\\p{Ll}]", "A_\xc3\x89\xc3\xa1" },
398 { MUP, A, 0, 0, "[\\Wd-h_x-z]+", "a\xc2\xa1#_yhzdxi" },
399 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}]", "abc" },
400 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}]", "abc" },
401 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}\xc3\xa1-\xc3\xa8]", "abc" },
402 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}\xc3\xa1-\xc3\xa8]", "abc" },
403 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
404 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
405 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
406 { MUP, A, 0, 0 | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
407 { MUP, A, 0, 0, "[b-\xc3\xa9\\s]", "a\xc\xe6\x92\xad" },
408 { CMUP, A, 0, 0, "[\xc2\x85-\xc2\x89\xc3\x89]", "\xc2\x84\xc3\xa9" },
409 { MUP, A, 0, 0, "[^b-d^&\\s]{3,}", "db^ !a\xe2\x80\xa8_ae" },
410 { MUP, A, 0, 0 | F_PROPERTY, "[^\\S\\P{Any}][\\sN]{1,3}[\\P{N}]{4}", "\xe2\x80\xaa\xa N\x9\xc3\xa9_0" },
411 { MU, A, 0, 0 | F_PROPERTY, "[^\\P{L}\x9!D-F\xa]{2,3}", "\x9,.DF\xa.CG\xc3\x81" },
412 { CMUP, A, 0, 0, "[\xc3\xa1-\xc3\xa9_\xe2\x80\xa0-\xe2\x80\xaf]{1,5}[^\xe2\x80\xa0-\xe2\x80\xaf]", "\xc2\xa1\xc3\x89\xc3\x89\xe2\x80\xaf_\xe2\x80\xa0" },
413 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa2-\xc3\xa6\xc3\x81-\xc3\x84\xe2\x80\xa8-\xe2\x80\xa9\xe6\x92\xad\\p{Zs}]{2,}", "\xe2\x80\xa7\xe2\x80\xa9\xe6\x92\xad \xe6\x92\xae" },
414 { MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
415 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
416 { MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
417 { MUP, 0, 0, 0, "[\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
418 { MUP, 0, 0, 0, "[\\x{a92e}\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
419 { CMUP, 0, 0, 0, "[^S]\\B", "\xe2\x80\x8a" },
420
421 /* Possible empty brackets. */
422 { MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
423 { MU, A, 0, 0, "(|ab||bc|a)+d", "abcxabcabd" },
424 { MU, A, 0, 0, "(?:|ab||bc|a)*d", "abcxabcabd" },
425 { MU, A, 0, 0, "(|ab||bc|a)*d", "abcxabcabd" },
426 { MU, A, 0, 0, "(?:|ab||bc|a)+?d", "abcxabcabd" },
427 { MU, A, 0, 0, "(|ab||bc|a)+?d", "abcxabcabd" },
428 { MU, A, 0, 0, "(?:|ab||bc|a)*?d", "abcxabcabd" },
429 { MU, A, 0, 0, "(|ab||bc|a)*?d", "abcxabcabd" },
430 { MU, A, 0, 0, "(((a)*?|(?:ba)+)+?|(?:|c|ca)*)*m", "abaacaccabacabalabaacaccabacabamm" },
431 { MU, A, 0, 0, "(?:((?:a)*|(ba)+?)+|(|c|ca)*?)*?m", "abaacaccabacabalabaacaccabacabamm" },
432
433 /* Start offset. */
434 { MU, A, 0, 3, "(\\d|(?:\\w)*\\w)+", "0ac01Hb" },
435 { MU, A, 0, 4 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
436 { MU, A, 0, 2 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
437 { MU, A, 0, 1, "(\\w\\W\\w)+", "ab#d" },
438
439 /* Newline. */
440 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
441 { M, PCRE2_NEWLINE_CR, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
442 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{1,3}[^#]", "\r\n##...." },
443 { MU, A, PCRE2_NO_UTF_CHECK, 1, "^.a", "\n\x80\nxa" },
444 { MU, A, 0, 1, "^", "\r\n" },
445 { M, PCRE2_NEWLINE_CRLF, 0, 1 | F_NOMATCH, "^", "\r\n" },
446 { M, PCRE2_NEWLINE_CRLF, 0, 1, "^", "\r\na" },
447
448 /* Any character except newline or any newline. */
449 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
450 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".(.).", "a\xc3\xa1\r\n\n\r\r" },
451 { 0, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
452 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
453 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.).", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa9$de" },
454 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0 | F_NOMATCH, ".(.).", "\xe2\x80\xa8\nb\r" },
455 { 0, PCRE2_NEWLINE_ANY, 0, 0, "(.)(.)", "#\x85#\r#\n#\r\n#\x84" },
456 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.+)#", "#\rMn\xc2\x85#\n###" },
457 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\r" },
458 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\x85#\r\n#" },
459 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\xe2\x80\xa8#c" },
460 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\r\nc" },
461 { U, PCRE2_NEWLINE_CRLF | BSR(PCRE2_BSR_UNICODE), 0, 0, "(\\R.)+", "\xc2\x85\r\n#\xe2\x80\xa8\n\r\n\r" },
462 { MU, A, 0, 0 | F_NOMATCH, "\\R+", "ab" },
463 { MU, A, 0, 0, "\\R+", "ab\r\n\r" },
464 { MU, A, 0, 0, "\\R*", "ab\r\n\r" },
465 { MU, A, 0, 0, "\\R*", "\r\n\r" },
466 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\r\r" },
467 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\n\n\n\r\r\r" },
468 { MU, A, 0, 0, "\\R{2,}", "\r\nab\n\n\n\r\r\r" },
469 { MU, A, 0, 0, "\\R{0,3}", "\r\n\r\n\r\n\r\n\r\n" },
470 { MU, A, 0, 0 | F_NOMATCH, "\\R+\\R\\R", "\r\n\r\n" },
471 { MU, A, 0, 0, "\\R+\\R\\R", "\r\r\r" },
472 { MU, A, 0, 0, "\\R*\\R\\R", "\n\r" },
473 { MU, A, 0, 0 | F_NOMATCH, "\\R{2,4}\\R\\R", "\r\r\r" },
474 { MU, A, 0, 0, "\\R{2,4}\\R\\R", "\r\r\r\r" },
475
476 /* Atomic groups (no fallback from "next" direction). */
477 { MU, A, 0, 0 | F_NOMATCH, "(?>ab)ab", "bab" },
478 { MU, A, 0, 0 | F_NOMATCH, "(?>(ab))ab", "bab" },
479 { MU, A, 0, 0, "(?>ab)+abc(?>de)*def(?>gh)?ghe(?>ij)+?k(?>lm)*?n(?>op)?\?op",
480 "bababcdedefgheijijklmlmnop" },
481 { MU, A, 0, 0, "(?>a(b)+a|(ab)?\?(b))an", "abban" },
482 { MU, A, 0, 0, "(?>ab+a|(?:ab)?\?b)an", "abban" },
483 { MU, A, 0, 0, "((?>ab|ad|)*?)(?>|c)*abad", "abababcababad" },
484 { MU, A, 0, 0, "(?>(aa|b|)*+(?>(##)|###)*d|(aa)(?>(baa)?)m)", "aabaa#####da" },
485 { MU, A, 0, 0, "((?>a|)+?)b", "aaacaaab" },
486 { MU, A, 0, 0, "(?>x|)*$", "aaa" },
487 { MU, A, 0, 0, "(?>(x)|)*$", "aaa" },
488 { MU, A, 0, 0, "(?>x|())*$", "aaa" },
489 { MU, A, 0, 0, "((?>[cxy]a|[a-d])*?)b", "aaa+ aaab" },
490 { MU, A, 0, 0, "((?>[cxy](a)|[a-d])*?)b", "aaa+ aaab" },
491 { MU, A, 0, 0, "(?>((?>(a+))))bab|(?>((?>(a+))))bb", "aaaabaaabaabab" },
492 { MU, A, 0, 0, "(?>(?>a+))bab|(?>(?>a+))bb", "aaaabaaabaabab" },
493 { MU, A, 0, 0, "(?>(a)c|(?>(c)|(a))a)b*?bab", "aaaabaaabaabab" },
494 { MU, A, 0, 0, "(?>ac|(?>c|a)a)b*?bab", "aaaabaaabaabab" },
495 { MU, A, 0, 0, "(?>(b)b|(a))*b(?>(c)|d)?x", "ababcaaabdbx" },
496 { MU, A, 0, 0, "(?>bb|a)*b(?>c|d)?x", "ababcaaabdbx" },
497 { MU, A, 0, 0, "(?>(bb)|a)*b(?>c|(d))?x", "ababcaaabdbx" },
498 { MU, A, 0, 0, "(?>(a))*?(?>(a))+?(?>(a))??x", "aaaaaacccaaaaabax" },
499 { MU, A, 0, 0, "(?>a)*?(?>a)+?(?>a)??x", "aaaaaacccaaaaabax" },
500 { MU, A, 0, 0, "(?>(a)|)*?(?>(a)|)+?(?>(a)|)??x", "aaaaaacccaaaaabax" },
501 { MU, A, 0, 0, "(?>a|)*?(?>a|)+?(?>a|)??x", "aaaaaacccaaaaabax" },
502 { MU, A, 0, 0, "(?>a(?>(a{0,2}))*?b|aac)+b", "aaaaaaacaaaabaaaaacaaaabaacaaabb" },
503 { CM, A, 0, 0, "(?>((?>a{32}|b+|(a*))?(?>c+|d*)?\?)+e)+?f", "aaccebbdde bbdaaaccebbdee bbdaaaccebbdeef" },
504 { MU, A, 0, 0, "(?>(?:(?>aa|a||x)+?b|(?>aa|a||(x))+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
505 { MU, A, 0, 0, "(?>(?:(?>aa|a||(x))+?b|(?>aa|a||x)+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
506 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d" },
507 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d#\xcc\x8d\xcc\x8d" },
508 { MU, A, 0, 0 | F_PROPERTY, "\\X+..", "\xcc\x8d#\xcc\x8d#\xcc\x8d\xcc\x8d" },
509 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}", "abcdef" },
510 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}?", "abcdef" },
511 { MU, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d##" },
512 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d#\xcc\x8d##" },
513 { MU, A, 0, 0, "(c(ab)?+ab)+", "cabcababcab" },
514 { MU, A, 0, 0, "(?>(a+)b)+aabab", "aaaabaaabaabab" },
515
516 /* Possessive quantifiers. */
517 { MU, A, 0, 0, "(?:a|b)++m", "mababbaaxababbaam" },
518 { MU, A, 0, 0, "(?:a|b)*+m", "mababbaaxababbaam" },
519 { MU, A, 0, 0, "(?:a|b)*+m", "ababbaaxababbaam" },
520 { MU, A, 0, 0, "(a|b)++m", "mababbaaxababbaam" },
521 { MU, A, 0, 0, "(a|b)*+m", "mababbaaxababbaam" },
522 { MU, A, 0, 0, "(a|b)*+m", "ababbaaxababbaam" },
523 { MU, A, 0, 0, "(a|b(*ACCEPT))++m", "maaxab" },
524 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxm" },
525 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxbbm" },
526 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxm" },
527 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxbbm" },
528 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxm" },
529 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxbbm" },
530 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxm" },
531 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxbbm" },
532 { MU, A, 0, 0, "(?:a|(b))++m", "mababbaaxababbaam" },
533 { MU, A, 0, 0, "(?:(a)|b)*+m", "mababbaaxababbaam" },
534 { MU, A, 0, 0, "(?:(a)|(b))*+m", "ababbaaxababbaam" },
535 { MU, A, 0, 0, "(a|(b))++m", "mababbaaxababbaam" },
536 { MU, A, 0, 0, "((a)|b)*+m", "mababbaaxababbaam" },
537 { MU, A, 0, 0, "((a)|(b))*+m", "ababbaaxababbaam" },
538 { MU, A, 0, 0, "(a|(b)(*ACCEPT))++m", "maaxab" },
539 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxm" },
540 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxbbm" },
541 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxm" },
542 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxbbm" },
543 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxm" },
544 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxbbm" },
545 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxm" },
546 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxbbm" },
547 { MU, A, 0, 0 | F_NOMATCH, "(?>(b{2,4}))(?:(?:(aa|c))++m|(?:(aa|c))+n)", "bbaacaaccaaaacxbbbmbn" },
548 { MU, A, 0, 0, "((?:b)++a)+(cd)*+m", "bbababbacdcdnbbababbacdcdm" },
549 { MU, A, 0, 0, "((?:(b))++a)+((c)d)*+m", "bbababbacdcdnbbababbacdcdm" },
550 { MU, A, 0, 0, "(?:(?:(?:ab)*+k)++(?:n(?:cd)++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
551 { MU, A, 0, 0, "(?:((ab)*+(k))++(n(?:c(d))++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
552
553 /* Back references. */
554 { MU, A, 0, 0, "(aa|bb)(\\1*)(ll|)(\\3*)bbbbbbc", "aaaaaabbbbbbbbc" },
555 { CMU, A, 0, 0, "(aa|bb)(\\1+)(ll|)(\\3+)bbbbbbc", "bBbbBbCbBbbbBbbcbbBbbbBBbbC" },
556 { CM, A, 0, 0, "(a{2,4})\\1", "AaAaaAaA" },
557 { MU, A, 0, 0, "(aa|bb)(\\1?)aa(\\1?)(ll|)(\\4+)bbc", "aaaaaaaabbaabbbbaabbbbc" },
558 { MU, A, 0, 0, "(aa|bb)(\\1{0,5})(ll|)(\\3{0,5})cc", "bbxxbbbbxxaaaaaaaaaaaaaaaacc" },
559 { MU, A, 0, 0, "(aa|bb)(\\1{3,5})(ll|)(\\3{3,5})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
560 { MU, A, 0, 0, "(aa|bb)(\\1{3,})(ll|)(\\3{3,})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
561 { MU, A, 0, 0, "(\\w+)b(\\1+)c", "GabGaGaDbGaDGaDc" },
562 { MU, A, 0, 0, "(?:(aa)|b)\\1?b", "bb" },
563 { CMU, A, 0, 0, "(aa|bb)(\\1*?)aa(\\1+?)", "bBBbaaAAaaAAaa" },
564 { MU, A, 0, 0, "(aa|bb)(\\1*?)(dd|)cc(\\3+?)", "aaaaaccdd" },
565 { CMU, A, 0, 0, "(?:(aa|bb)(\\1?\?)cc){2}(\\1?\?)", "aAaABBbbAAaAcCaAcCaA" },
566 { MU, A, 0, 0, "(?:(aa|bb)(\\1{3,5}?)){2}(dd|)(\\3{3,5}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
567 { CM, A, 0, 0, "(?:(aa|bb)(\\1{3,}?)){2}(dd|)(\\3{3,}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
568 { MU, A, 0, 0, "(?:(aa|bb)(\\1{0,3}?)){2}(dd|)(\\3{0,3}?)b(\\1{0,3}?)(\\1{0,3})", "aaaaaaaaaaaaaaabaaaaa" },
569 { MU, A, 0, 0, "(a(?:\\1|)a){3}b", "aaaaaaaaaaab" },
570 { M, A, 0, 0, "(a?)b(\\1\\1*\\1+\\1?\\1*?\\1+?\\1??\\1*+\\1++\\1?+\\1{4}\\1{3,5}\\1{4,}\\1{0,5}\\1{3,5}?\\1{4,}?\\1{0,5}?\\1{3,5}+\\1{4,}+\\1{0,5}+#){2}d", "bb#b##d" },
571 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
572 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{0,2}", "wwwww." },
573 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwww" },
574 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwwww" },
575 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
576 { CMUP, A, 0, 0, "(\xf0\x90\x90\x80)\\1", "\xf0\x90\x90\xa8\xf0\x90\x90\xa8" },
577 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
578 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
579 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>*(?<A>aa)(?<A>bb)", "aabb" },
580 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{0,3}aaaaaa", "aabbaaaaaa" },
581 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{2,5}bb", "aabbaaaabb" },
582 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}m", "aaaaaaaabbbbaabbbbm" },
583 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
584 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
585 { MU | PCRE2_DUPNAMES, A, 0, 0, "\\k<A>*?(?<A>aa)(?<A>bb)", "aabb" },
586 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
587 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>*?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
588 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
589 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}M", "aaaaaaaabbbbaabbbbm" },
590 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{1,3}M", "aaaaaaaabbbbaabbbbm" },
591 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}?M", "aaaaaabbbbbbaabbbbbbbbbbm" },
592 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
593
594 /* Assertions. */
595 { MU, A, 0, 0, "(?=xx|yy|zz)\\w{4}", "abczzdefg" },
596 { MU, A, 0, 0, "(?=((\\w+)b){3}|ab)", "dbbbb ab" },
597 { MU, A, 0, 0, "(?!ab|bc|cd)[a-z]{2}", "Xabcdef" },
598 { MU, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
599 { MU, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
600 { M, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
601 { M, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
602 { MU, A, 0, 0, "(\\d{2})(?!\\w+c|(((\\w?)m){2}n)+|\\1)", "x5656" },
603 { MU, A, 0, 0, "((?=((\\d{2,6}\\w){2,}))\\w{5,20}K){2,}", "567v09708K12l00M00 567v09708K12l00M00K45K" },
604 { MU, A, 0, 0, "(?=(?:(?=\\S+a)\\w*(b)){3})\\w+\\d", "bba bbab nbbkba nbbkba0kl" },
605 { MU, A, 0, 0, "(?>a(?>(b+))a(?=(..)))*?k", "acabbcabbaabacabaabbakk" },
606 { MU, A, 0, 0, "((?(?=(a))a)+k)", "bbak" },
607 { MU, A, 0, 0, "((?(?=a)a)+k)", "bbak" },
608 { MU, A, 0, 0 | F_NOMATCH, "(?=(?>(a))m)amk", "a k" },
609 { MU, A, 0, 0 | F_NOMATCH, "(?!(?>(a))m)amk", "a k" },
610 { MU, A, 0, 0 | F_NOMATCH, "(?>(?=(a))am)amk", "a k" },
611 { MU, A, 0, 0, "(?=(?>a|(?=(?>(b+))a|c)[a-c]+)*?m)[a-cm]+k", "aaam bbam baaambaam abbabba baaambaamk" },
612 { MU, A, 0, 0, "(?> ?\?\\b(?(?=\\w{1,4}(a))m)\\w{0,8}bc){2,}?", "bca ssbc mabd ssbc mabc" },
613 { MU, A, 0, 0, "(?:(?=ab)?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
614 { MU, A, 0, 0, "(?:(?=a(b))?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
615 { MU, A, 0, 0, "(?:(?=.(.))??\\1.)+m", "aabbbcbacccanaabbbcbacccam" },
616 { MU, A, 0, 0, "(?:(?=.)??[a-c])+m", "abacdcbacacdcaccam" },
617 { MU, A, 0, 0, "((?!a)?(?!([^a]))?)+$", "acbab" },
618 { MU, A, 0, 0, "((?!a)?\?(?!([^a]))?\?)+$", "acbab" },
619 { MU, A, 0, 0, "a(?=(?C)\\B(?C`x`))b", "ab" },
620 { MU, A, 0, 0, "a(?!(?C)\\B(?C`x`))bb|ab", "abb" },
621 { MU, A, 0, 0, "a(?=\\b|(?C)\\B(?C`x`))b", "ab" },
622 { MU, A, 0, 0, "a(?!\\b|(?C)\\B(?C`x`))bb|ab", "abb" },
623 { MU, A, 0, 0, "c(?(?=(?C)\\B(?C`x`))ab|a)", "cab" },
624 { MU, A, 0, 0, "c(?(?!(?C)\\B(?C`x`))ab|a)", "cab" },
625 { MU, A, 0, 0, "c(?(?=\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
626 { MU, A, 0, 0, "c(?(?!\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
627 { MU, A, 0, 0, "a(?=)b", "ab" },
628 { MU, A, 0, 0 | F_NOMATCH, "a(?!)b", "ab" },
629
630 /* Not empty, ACCEPT, FAIL */
631 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*", "bcx" },
632 { MU, A, PCRE2_NOTEMPTY, 0, "a*", "bcaad" },
633 { MU, A, PCRE2_NOTEMPTY, 0, "a*?", "bcaad" },
634 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*", "bcaad" },
635 { MU, A, 0, 0, "a(*ACCEPT)b", "ab" },
636 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*(*ACCEPT)b", "bcx" },
637 { MU, A, PCRE2_NOTEMPTY, 0, "a*(*ACCEPT)b", "bcaad" },
638 { MU, A, PCRE2_NOTEMPTY, 0, "a*?(*ACCEPT)b", "bcaad" },
639 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?:z|a*(*ACCEPT)b)", "bcx" },
640 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*(*ACCEPT)b)", "bcaad" },
641 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*?(*ACCEPT)b)", "bcaad" },
642 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*(*ACCEPT)b", "bcx" },
643 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0 | F_NOMATCH, "a*(*ACCEPT)b", "" },
644 { MU, A, 0, 0, "((a(*ACCEPT)b))", "ab" },
645 { MU, A, 0, 0, "(a(*FAIL)a|a)", "aaa" },
646 { MU, A, 0, 0, "(?=ab(*ACCEPT)b)a", "ab" },
647 { MU, A, 0, 0, "(?=(?:x|ab(*ACCEPT)b))", "ab" },
648 { MU, A, 0, 0, "(?=(a(b(*ACCEPT)b)))a", "ab" },
649 { MU, A, PCRE2_NOTEMPTY, 0, "(?=a*(*ACCEPT))c", "c" },
650 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?=A)", "AB" },
651
652 /* Conditional blocks. */
653 { MU, A, 0, 0, "(?(?=(a))a|b)+k", "ababbalbbadabak" },
654 { MU, A, 0, 0, "(?(?!(b))a|b)+k", "ababbalbbadabak" },
655 { MU, A, 0, 0, "(?(?=a)a|b)+k", "ababbalbbadabak" },
656 { MU, A, 0, 0, "(?(?!b)a|b)+k", "ababbalbbadabak" },
657 { MU, A, 0, 0, "(?(?=(a))a*|b*)+k", "ababbalbbadabak" },
658 { MU, A, 0, 0, "(?(?!(b))a*|b*)+k", "ababbalbbadabak" },
659 { MU, A, 0, 0, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
660 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
661 { MU, A, 0, 0 | F_DIFF, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
662 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
663 { MU, A, 0, 0, "(?(?=a)a*|b*)+k", "ababbalbbadabak" },
664 { MU, A, 0, 0, "(?(?!b)a*|b*)+k", "ababbalbbadabak" },
665 { MU, A, 0, 0, "(?(?=a)ab)", "a" },
666 { MU, A, 0, 0, "(?(?<!b)c)", "b" },
667 { MU, A, 0, 0, "(?(DEFINE)a(b))", "a" },
668 { MU, A, 0, 0, "a(?(DEFINE)(?:b|(?:c?)+)*)", "a" },
669 { MU, A, 0, 0, "(?(?=.[a-c])[k-l]|[A-D])", "kdB" },
670 { MU, A, 0, 0, "(?(?!.{0,4}[cd])(aa|bb)|(cc|dd))+", "aabbccddaa" },
671 { MU, A, 0, 0, "(?(?=[^#@]*@)(aaab|aa|aba)|(aba|aab)){3,}", "aaabaaaba#aaabaaaba#aaabaaaba@" },
672 { MU, A, 0, 0, "((?=\\w{5})\\w(?(?=\\w*k)\\d|[a-f_])*\\w\\s)+", "mol m10kk m088k _f_a_ mbkkl" },
673 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cdcaa" },
674 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cbb" },
675 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))(aaaa|a?))+aak", "aaaaab aaaaak" },
676 { MU, A, 0, 0, "(?(?=a)(aaaa|a?))+aak", "aaaaab aaaaak" },
677 { MU, A, 0, 0, "(?(?!(b))(aaaa|a?))+aak", "aaaaab aaaaak" },
678 { MU, A, 0, 0, "(?(?!b)(aaaa|a?))+aak", "aaaaab aaaaak" },
679 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))a*)+aak", "aaaaab aaaaak" },
680 { MU, A, 0, 0, "(?(?=a)a*)+aak", "aaaaab aaaaak" },
681 { MU, A, 0, 0, "(?(?!(b))a*)+aak", "aaaaab aaaaak" },
682 { MU, A, 0, 0, "(?(?!b)a*)+aak", "aaaaab aaaaak" },
683 { MU, A, 0, 0, "(?(?=(?=(?!(x))a)aa)aaa|(?(?=(?!y)bb)bbb))*k", "abaabbaaabbbaaabbb abaabbaaabbbaaabbbk" },
684 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)*l", "bc ddd abccabccl" },
685 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+?dd", "bcabcacdb bdddd" },
686 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+l", "ababccddabdbccd abcccl" },
687 { MU, A, 0, 0, "((?:a|aa)(?(1)aaa))x", "aax" },
688 { MU, A, 0, 0, "(?(?!)a|b)", "ab" },
689 { MU, A, 0, 0, "(?(?!)a)", "ab" },
690 { MU, A, 0, 0 | F_NOMATCH, "(?(?!)a|b)", "ac" },
691
692 /* Set start of match. */
693 { MU, A, 0, 0, "(?:\\Ka)*aaaab", "aaaaaaaa aaaaaaabb" },
694 { MU, A, 0, 0, "(?>\\Ka\\Ka)*aaaab", "aaaaaaaa aaaaaaaaaabb" },
695 { MU, A, 0, 0, "a+\\K(?<=\\Gaa)a", "aaaaaa" },
696 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a\\K(*ACCEPT)b", "aa" },
697 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a\\K(*ACCEPT)b", "aa" },
698
699 /* First line. */
700 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_PROPERTY, "\\p{Any}a", "bb\naaa" },
701 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}a", "bb\r\naaa" },
702 { MU | PCRE2_FIRSTLINE, A, 0, 0, "(?<=a)", "a" },
703 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[^a][^b]", "ab" },
704 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "a", "\na" },
705 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[abc]", "\na" },
706 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^a", "\na" },
707 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^(?<=\n)", "\na" },
708 { MU | PCRE2_FIRSTLINE, A, 0, 0, "\xf0\x90\x90\x80", "\xf0\x90\x90\x80" },
709 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\xc2\x85#" },
710 { M | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\x85#" },
711 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "^#", "\xe2\x80\xa8#" },
712 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_PROPERTY, "\\p{Any}", "\r\na" },
713 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
714 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, "a", "\ra" },
715 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH, "ba", "bbb\r\nba" },
716 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}{4}|a", "\r\na" },
717 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 1, ".", "\r\n" },
718 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_LF, 0, 0 | F_NOMATCH, "ab.", "ab" },
719 { MU | PCRE2_FIRSTLINE, A, 0, 1 | F_NOMATCH, "^[a-d0-9]", "\nxx\nd" },
720 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_ANY, 0, 0, "....a", "012\n0a" },
721 { MU | PCRE2_FIRSTLINE, A, 0, 0, "[aC]", "a" },
722
723 /* Recurse. */
724 { MU, A, 0, 0, "(a)(?1)", "aa" },
725 { MU, A, 0, 0, "((a))(?1)", "aa" },
726 { MU, A, 0, 0, "(b|a)(?1)", "aa" },
727 { MU, A, 0, 0, "(b|(a))(?1)", "aa" },
728 { MU, A, 0, 0 | F_NOMATCH, "((a)(b)(?:a*))(?1)", "aba" },
729 { MU, A, 0, 0, "((a)(b)(?:a*))(?1)", "abab" },
730 { MU, A, 0, 0, "((a+)c(?2))b(?1)", "aacaabaca" },
731 { MU, A, 0, 0, "((?2)b|(a)){2}(?1)", "aabab" },
732 { MU, A, 0, 0, "(?1)(a)*+(?2)(b(?1))", "aababa" },
733 { MU, A, 0, 0, "(?1)(((a(*ACCEPT)))b)", "axaa" },
734 { MU, A, 0, 0, "(?1)(?(DEFINE) (((ac(*ACCEPT)))b) )", "akaac" },
735 { MU, A, 0, 0, "(a+)b(?1)b\\1", "abaaabaaaaa" },
736 { MU, A, 0, 0, "(?(DEFINE)(aa|a))(?1)ab", "aab" },
737 { MU, A, 0, 0, "(?(DEFINE)(a\\Kb))(?1)+ababc", "abababxabababc" },
738 { MU, A, 0, 0, "(a\\Kb)(?1)+ababc", "abababxababababc" },
739 { MU, A, 0, 0 | F_NOMATCH, "(a\\Kb)(?1)+ababc", "abababxababababxc" },
740 { MU, A, 0, 0, "b|<(?R)*>", "<<b>" },
741 { MU, A, 0, 0, "(a\\K){0}(?:(?1)b|ac)", "ac" },
742 { MU, A, 0, 0, "(?(DEFINE)(a(?2)|b)(b(?1)|(a)))(?:(?1)|(?2))m", "ababababnababababaam" },
743 { MU, A, 0, 0, "(a)((?(R)a|b))(?2)", "aabbabaa" },
744 { MU, A, 0, 0, "(a)((?(R2)a|b))(?2)", "aabbabaa" },
745 { MU, A, 0, 0, "(a)((?(R1)a|b))(?2)", "ababba" },
746 { MU, A, 0, 0, "(?(R0)aa|bb(?R))", "abba aabb bbaa" },
747 { MU, A, 0, 0, "((?(R)(?:aaaa|a)|(?:(aaaa)|(a)))+)(?1)$", "aaaaaaaaaa aaaa" },
748 { MU, A, 0, 0, "(?P<Name>a(?(R&Name)a|b))(?1)", "aab abb abaa" },
749 { MU, A, 0, 0, "((?(R)a|(?1)){3})", "XaaaaaaaaaX" },
750 { MU, A, 0, 0, "((?:(?(R)a|(?1))){3})", "XaaaaaaaaaX" },
751 { MU, A, 0, 0, "((?(R)a|(?1)){1,3})aaaaaa", "aaaaaaaaXaaaaaaaaa" },
752 { MU, A, 0, 0, "((?(R)a|(?1)){1,3}?)M", "aaaM" },
753 { MU, A, 0, 0, "((.)(?:.|\\2(?1))){0}#(?1)#", "#aabbccdde# #aabbccddee#" },
754 { MU, A, 0, 0, "((.)(?:\\2|\\2{4}b)){0}#(?:(?1))+#", "#aaaab# #aaaaab#" },
755 { MU, A, 0, 0 | F_NOMATCH, "(?1)$((.|\\2xx){1,2})", "abc" },
756
757 /* 16 bit specific tests. */
758 { CM, A, 0, 0 | F_FORCECONV, "\xc3\xa1", "\xc3\x81\xc3\xa1" },
759 { CM, A, 0, 0 | F_FORCECONV, "\xe1\xbd\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
760 { CM, A, 0, 0 | F_FORCECONV, "[\xc3\xa1]", "\xc3\x81\xc3\xa1" },
761 { CM, A, 0, 0 | F_FORCECONV, "[\xe1\xbd\xb8]", "\xe1\xbf\xb8\xe1\xbd\xb8" },
762 { CM, A, 0, 0 | F_FORCECONV, "[a-\xed\xb0\x80]", "A" },
763 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[a-\\x{dc00}]", "B" },
764 { CM, A, 0, 0 | F_NO8 | F_NOMATCH | F_FORCECONV, "[b-\\x{dc00}]", "a" },
765 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "\xed\xa0\x80\\x{d800}\xed\xb0\x80\\x{dc00}", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80" },
766 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\xed\xa0\x80\\x{d800}]{1,2}?[\xed\xb0\x80\\x{dc00}]{1,2}?#", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80#" },
767 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80\xed\xb0\x80#]{0,3}(?<=\xed\xb0\x80.)", "\xed\xa0\x80#\xed\xa0\x80##\xed\xb0\x80\xed\xa0\x80" },
768 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\x9f\xbf\xed\xa0\x83" },
769 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\xb4\x80\xed\xb3\xb0" },
770 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\x9f\xbf\xed\xa0\x83" },
771 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\xb4\x80\xed\xb3\xb0" },
772 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xef\xbf\xbf]+[\x1-\xed\xb0\x80]+#", "\xed\xa0\x85\xc3\x81\xed\xa0\x85\xef\xbf\xb0\xc2\x85\xed\xa9\x89#" },
773 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80][\xed\xb0\x80]{2,}", "\xed\xa0\x80\xed\xb0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80\xed\xb0\x80" },
774 { M, A, 0, 0 | F_FORCECONV, "[^\xed\xb0\x80]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
775 { M, A, 0, 0 | F_NO8 | F_FORCECONV, "[^\\x{dc00}]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
776 { CM, A, 0, 0 | F_FORCECONV, ".\\B.", "\xed\xa0\x80\xed\xb0\x80" },
777 { CM, A, 0, 0 | F_FORCECONV, "\\D+(?:\\d+|.)\\S+(?:\\s+|.)\\W+(?:\\w+|.)\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80" },
778 { CM, A, 0, 0 | F_FORCECONV, "\\d*\\s*\\w*\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80" },
779 { CM, A, 0, 0 | F_FORCECONV | F_NOMATCH, "\\d*?\\D*?\\s*?\\S*?\\w*?\\W*?##", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80#" },
780 { CM | PCRE2_EXTENDED, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80 \xed\xb0\x80 !", "\xed\xa0\x80\xed\xb0\x80!" },
781 { CM, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80+#[^#]+\xed\xa0\x80", "\xed\xa0\x80#a\xed\xa0\x80" },
782 { CM, A, 0, 0 | F_FORCECONV, "(\xed\xa0\x80+)#\\1", "\xed\xa0\x80\xed\xa0\x80#\xed\xa0\x80\xed\xa0\x80" },
783 { M, PCRE2_NEWLINE_ANY, 0, 0 | F_NO8 | F_FORCECONV, "^-", "a--\xe2\x80\xa8--" },
784 { 0, BSR(PCRE2_BSR_UNICODE), 0, 0 | F_NO8 | F_FORCECONV, "\\R", "ab\xe2\x80\xa8" },
785 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v", "ab\xe2\x80\xa9" },
786 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h", "ab\xe1\xa0\x8e" },
787 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v+?\\V+?#", "\xe2\x80\xa9\xe2\x80\xa9\xef\xbf\xbf\xef\xbf\xbf#" },
788 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h+?\\H+?#", "\xe1\xa0\x8e\xe1\xa0\x8e\xef\xbf\xbf\xef\xbf\xbf#" },
789
790 /* Partial matching. */
791 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab", "a" },
792 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab|a", "a" },
793 { MU, A, PCRE2_PARTIAL_HARD, 0, "ab|a", "a" },
794 { MU, A, PCRE2_PARTIAL_SOFT, 0, "\\b#", "a" },
795 { MU, A, PCRE2_PARTIAL_SOFT, 0, "(?<=a)b", "a" },
796 { MU, A, PCRE2_PARTIAL_SOFT, 0, "abc|(?<=xxa)bc", "xxab" },
797 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a\\B", "a" },
798 { MU, A, PCRE2_PARTIAL_HARD, 0, "a\\b", "a" },
799
800 /* (*MARK) verb. */
801 { MU, A, 0, 0, "a(*MARK:aa)a", "ababaa" },
802 { MU, A, 0, 0 | F_NOMATCH, "a(*:aa)a", "abab" },
803 { MU, A, 0, 0, "a(*:aa)(b(*:bb)b|bc)", "abc" },
804 { MU, A, 0, 0 | F_NOMATCH, "a(*:1)x|b(*:2)y", "abc" },
805 { MU, A, 0, 0, "(?>a(*:aa))b|ac", "ac" },
806 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))(?1)", "a" },
807 { MU, A, 0, 0 | F_NOMATCH, "(?(DEFINE)((a)(*:aa)))(?1)b", "aa" },
808 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))a(?1)b|aac", "aac" },
809 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
810 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b)+", "babba" },
811 { MU, A, 0, 0 | F_NOMATCH, "(a(*:aa)){0}(?:b(?1)b)+", "ba" },
812 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
813 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b)+", "babba" },
814 { MU, A, 0, 0 | F_NOMATCH, "(a\\K(*:aa)){0}(?:b(?1)b)+", "ba" },
815 { MU, A, 0, 0 | F_NOMATCH, "(*:mark)m", "a" },
816
817 /* (*COMMIT) verb. */
818 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)b", "ac" },
819 { MU, A, 0, 0, "aa(*COMMIT)b", "xaxaab" },
820 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)(*:msg)b|ac", "ac" },
821 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b)++", "abac" },
822 { MU, A, 0, 0 | F_NOMATCH, "((a)(*COMMIT)b)++", "abac" },
823 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*COMMIT)b)ab|ad", "ad" },
824
825 /* (*PRUNE) verb. */
826 { MU, A, 0, 0, "aa\\K(*PRUNE)b", "aaab" },
827 { MU, A, 0, 0, "aa(*PRUNE:bb)b|a", "aa" },
828 { MU, A, 0, 0, "(a)(a)(*PRUNE)b|(a)", "aa" },
829 { MU, A, 0, 0, "(a)(a)(a)(a)(a)(a)(a)(a)(*PRUNE)b|(a)", "aaaaaaaa" },
830 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|", "a" },
831 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|m", "a" },
832 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*PRUNE)b)ab|ad", "ad" },
833 { MU, A, 0, 0, "a(*COMMIT)(*PRUNE)d|bc", "abc" },
834 { MU, A, 0, 0, "(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
835 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
836 { MU, A, 0, 0, "(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
837 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
838 { MU, A, 0, 0, "(a(*COMMIT)b){0}a(?1)(*PRUNE)c|bc", "abc" },
839 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b){0}a(*COMMIT)(?1)(*PRUNE)c|bc", "abc" },
840 { MU, A, 0, 0, "(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
841 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
842 { MU, A, 0, 0, "((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
843 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
844 { MU, A, 0, 0, "(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
845 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
846 { MU, A, 0, 0, "(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
847 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
848 { MU, A, 0, 0, "(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
849 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
850 { MU, A, 0, 0, "(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
851 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
852 { MU, A, 0, 0, "(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
853 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
854 { MU, A, 0, 0, "(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
855 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
856
857 /* (*SKIP) verb. */
858 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*SKIP)b)ab|ad", "ad" },
859 { MU, A, 0, 0, "(\\w+(*SKIP)#)", "abcd,xyz#," },
860 { MU, A, 0, 0, "\\w+(*SKIP)#|mm", "abcd,xyz#," },
861 { MU, A, 0, 0 | F_NOMATCH, "b+(?<=(*SKIP)#c)|b+", "#bbb" },
862
863 /* (*THEN) verb. */
864 { MU, A, 0, 0, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcaabcaabcaabcnacm" },
865 { MU, A, 0, 0 | F_NOMATCH, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcm" },
866 { MU, A, 0, 0, "((?:a(*THEN)|aab)c|a+)+m", "aabcaabcnmaabcaabcm" },
867 { MU, A, 0, 0, "((?:a|aab)(*THEN)c|a+)+m", "aam" },
868 { MU, A, 0, 0, "((?:a(*COMMIT)|aab)(*THEN)c|a+)+m", "aam" },
869 { MU, A, 0, 0, "(?(?=a(*THEN)b)ab|ad)", "ad" },
870 { MU, A, 0, 0, "(?(?!a(*THEN)b)ad|add)", "add" },
871 { MU, A, 0, 0 | F_NOMATCH, "(?(?=a)a(*THEN)b|ad)", "ad" },
872 { MU, A, 0, 0, "(?!(?(?=a)ab|b(*THEN)d))bn|bnn", "bnn" },
873 { MU, A, 0, 0, "(?=(*THEN: ))* ", " " },
874 { MU, A, 0, 0, "a(*THEN)(?R) |", "a" },
875
876 /* Recurse and control verbs. */
877 { MU, A, 0, 0, "(a(*ACCEPT)b){0}a(?1)b", "aacaabb" },
878 { MU, A, 0, 0, "((a)\\2(*ACCEPT)b){0}a(?1)b", "aaacaaabb" },
879 { MU, A, 0, 0, "((ab|a(*ACCEPT)x)+|ababababax){0}_(?1)_", "_ababababax_ _ababababa_" },
880 { MU, A, 0, 0, "((.)(?:A(*ACCEPT)|(?1)\\2)){0}_(?1)_", "_bcdaAdcb_bcdaAdcb_" },
881 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_", "_ab_" },
882 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_|(_aa_)", "_aa_" },
883 { MU, A, 0, 0, "(a(*COMMIT)(?:b|bb)|c(*ACCEPT)d|dd){0}_(?1)+_", "_ax_ _cd_ _abbb_ _abcd_ _abbcdd_" },
884 { MU, A, 0, 0, "((.)(?:.|(*COMMIT)\\2{3}(*ACCEPT).*|.*)){0}_(?1){0,4}_", "_aaaabbbbccccddd_ _aaaabbbbccccdddd_" },
885
886 #ifdef SUPPORT_UNICODE
887 /* Script runs and iterations. */
888 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
889 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
890 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
891 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
892 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
893 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)++#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
894 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)?#", "!ab!abc!ab!ab#" },
895 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)??#", "!ab!abc!ab!ab#" },
896 #endif
897
898 /* Deep recursion. */
899 { MU, A, 0, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " },
900 { MU, A, 0, 0, "(?:((?:(?:(?:\\w*?)+)??|(?>\\w)?|\\w*+)*)+)+?\\s", "aa+ " },
901 { MU, A, 0, 0, "((a?)+)+b", "aaaaaaaaaaaa b" },
902
903 /* Deep recursion: Stack limit reached. */
904 { M, A, 0, 0 | F_NOMATCH, "a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaa" },
905 { M, A, 0, 0 | F_NOMATCH, "(?:a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
906 { M, A, 0, 0 | F_NOMATCH, "(?:a+?)+?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
907 { M, A, 0, 0 | F_NOMATCH, "(?:a*)*b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
908 { M, A, 0, 0 | F_NOMATCH, "(?:a*?)*?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
909
910 { 0, 0, 0, 0, NULL, NULL }
911 };
912
913 #ifdef SUPPORT_PCRE2_8
callback8(void * arg)914 static pcre2_jit_stack_8* callback8(void *arg)
915 {
916 return (pcre2_jit_stack_8 *)arg;
917 }
918 #endif
919
920 #ifdef SUPPORT_PCRE2_16
callback16(void * arg)921 static pcre2_jit_stack_16* callback16(void *arg)
922 {
923 return (pcre2_jit_stack_16 *)arg;
924 }
925 #endif
926
927 #ifdef SUPPORT_PCRE2_32
callback32(void * arg)928 static pcre2_jit_stack_32* callback32(void *arg)
929 {
930 return (pcre2_jit_stack_32 *)arg;
931 }
932 #endif
933
934 #ifdef SUPPORT_PCRE2_8
935 static pcre2_jit_stack_8 *stack8;
936
getstack8(void)937 static pcre2_jit_stack_8 *getstack8(void)
938 {
939 if (!stack8)
940 stack8 = pcre2_jit_stack_create_8(1, 1024 * 1024, NULL);
941 return stack8;
942 }
943
/* With a match context: assigns the 8 bit JIT stack (obtained from
   getstack8()) to it, using callback8 as the stack callback.
   With NULL: frees the current 8 bit JIT stack, if any, and resets stack8. */
static void setstack8(pcre2_match_context_8 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_8(mcontext, callback8, getstack8());
		return;
	}

	if (stack8 != NULL)
		pcre2_jit_stack_free_8(stack8);
	stack8 = NULL;
}
955 #endif /* SUPPORT_PCRE2_8 */
956
957 #ifdef SUPPORT_PCRE2_16
958 static pcre2_jit_stack_16 *stack16;
959
getstack16(void)960 static pcre2_jit_stack_16 *getstack16(void)
961 {
962 if (!stack16)
963 stack16 = pcre2_jit_stack_create_16(1, 1024 * 1024, NULL);
964 return stack16;
965 }
966
/* With a match context: assigns the 16 bit JIT stack (obtained from
   getstack16()) to it, using callback16 as the stack callback.
   With NULL: frees the current 16 bit JIT stack, if any, and resets stack16. */
static void setstack16(pcre2_match_context_16 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_16(mcontext, callback16, getstack16());
		return;
	}

	if (stack16 != NULL)
		pcre2_jit_stack_free_16(stack16);
	stack16 = NULL;
}
978 #endif /* SUPPORT_PCRE2_16 */
979
980 #ifdef SUPPORT_PCRE2_32
981 static pcre2_jit_stack_32 *stack32;
982
getstack32(void)983 static pcre2_jit_stack_32 *getstack32(void)
984 {
985 if (!stack32)
986 stack32 = pcre2_jit_stack_create_32(1, 1024 * 1024, NULL);
987 return stack32;
988 }
989
/* With a match context: assigns the 32 bit JIT stack (obtained from
   getstack32()) to it, using callback32 as the stack callback.
   With NULL: frees the current 32 bit JIT stack, if any, and resets stack32. */
static void setstack32(pcre2_match_context_32 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_32(mcontext, callback32, getstack32());
		return;
	}

	if (stack32 != NULL)
		pcre2_jit_stack_free_32(stack32);
	stack32 = NULL;
}
1001 #endif /* SUPPORT_PCRE2_32 */
1002
1003 #ifdef SUPPORT_PCRE2_16
1004
/* Converts the zero terminated UTF-8 string "input" to UTF-16.

   input       zero terminated byte string to decode
   output      receives the 16 bit code units followed by a zero terminator
   offsetmap   optional (may be NULL). For every decoded character the slot
               matching its first output code unit receives the byte offset
               of that character in "input". The slot of a low surrogate is
               skipped and left untouched. One extra trailing slot receives
               the byte offset at which the conversion stopped.
   max_length  capacity of "output" in code units, terminator included

   Returns the number of code units stored, not counting the terminator.
   When max_length is 0 nothing is written and 0 is returned. Conversion
   stops early when the next character (or surrogate pair) does not fit.

   No UTF-8 validation is done: the payload bits of 2, 3 and 4 byte
   sequences are simply combined. Every byte that cannot start a complete
   sequence is copied as a single code unit. This covers ASCII, stray
   continuation bytes, the invalid lead bytes 0xf8-0xff and lead bytes whose
   continuation bytes are cut off by the end of the string, so the read
   position always advances and never moves past the terminating zero. */
static int convert_utf8_to_utf16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 iptr = input;
	PCRE2_UCHAR16 *optr = output;
	unsigned int c;

	if (max_length == 0)
		return 0;

	while (*iptr && max_length > 1) {
		if (offsetmap)
			*offsetmap++ = (int)(iptr - input);

		/* The continuation bytes are tested from left to right, so a byte
		   is only inspected when all bytes before it are non-zero. */
		if ((*iptr & 0xe0) == 0xc0 && iptr[1]) {
			c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
			iptr += 2;
		} else if ((*iptr & 0xf0) == 0xe0 && iptr[1] && iptr[2]) {
			c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
			iptr += 3;
		} else if ((*iptr & 0xf8) == 0xf0 && iptr[1] && iptr[2] && iptr[3]) {
			c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
			iptr += 4;
		} else {
			/* Single byte, or a byte that cannot be decoded as the start
			   of a complete sequence: pass it through unchanged. */
			c = *iptr++;
		}

		if (c < 65536) {
			*optr++ = (PCRE2_UCHAR16)c;
			max_length--;
		} else if (max_length <= 2) {
			/* No room for a surrogate pair plus the terminator. */
			*optr = '\0';
			return (int)(optr - output);
		} else {
			c -= 0x10000;
			*optr++ = (PCRE2_UCHAR16)(0xd800 | ((c >> 10) & 0x3ff));
			*optr++ = (PCRE2_UCHAR16)(0xdc00 | (c & 0x3ff));
			max_length -= 2;
			/* The low surrogate has no offset of its own. */
			if (offsetmap)
				offsetmap++;
		}
	}
	if (offsetmap)
		*offsetmap = (int)(iptr - input);
	*optr = '\0';
	return (int)(optr - output);
}
1052
/* Widens the zero terminated byte string "input" into "output" one byte per
   16 bit code unit, without any decoding. At most max_length - 1 units are
   copied and a zero terminator is always appended, except that nothing at
   all is written when max_length is 0. Returns the number of units copied. */
static int copy_char8_to_char16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int max_length)
{
	int count = 0;

	if (max_length == 0)
		return 0;

	for (; input[count] && max_length > 1; max_length--) {
		output[count] = input[count];
		count++;
	}
	output[count] = '\0';
	return count;
}
1068
/* Number of elements in each of the two 16 bit scratch arrays below. */
#define REGTEST_MAX_LENGTH16 4096
/* Scratch buffer of 16 bit code units. regression_tests() passes it as the
   output of convert_utf8_to_utf16() / copy_char8_to_char16(). */
static PCRE2_UCHAR16 regtest_buf16[REGTEST_MAX_LENGTH16];
/* Integer table with the same number of slots as regtest_buf16.
   NOTE(review): presumably used as the offsetmap argument of
   convert_utf8_to_utf16(); that use is not visible here - confirm. */
static int regtest_offsetmap16[REGTEST_MAX_LENGTH16];
1072
1073 #endif /* SUPPORT_PCRE2_16 */
1074
1075 #ifdef SUPPORT_PCRE2_32
1076
/* Converts the zero terminated UTF-8 string "input" to UTF-32.

   input       zero terminated byte string to decode
   output      receives the 32 bit code units followed by a zero terminator
   offsetmap   optional (may be NULL). The slot matching each output code
               unit receives the byte offset of that character in "input".
               One extra trailing slot receives the byte offset at which the
               conversion stopped.
   max_length  capacity of "output" in code units, terminator included

   Returns the number of code units stored, not counting the terminator.
   When max_length is 0 nothing is written and 0 is returned.

   No UTF-8 validation is done: the payload bits of 2, 3 and 4 byte
   sequences are simply combined. Every byte that cannot start a complete
   sequence is copied as a single code unit. This covers ASCII, stray
   continuation bytes, the invalid lead bytes 0xf8-0xff and lead bytes whose
   continuation bytes are cut off by the end of the string, so the read
   position always advances and never moves past the terminating zero. */
static int convert_utf8_to_utf32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 iptr = input;
	PCRE2_UCHAR32 *optr = output;
	unsigned int c;

	if (max_length == 0)
		return 0;

	while (*iptr && max_length > 1) {
		if (offsetmap)
			*offsetmap++ = (int)(iptr - input);

		/* The continuation bytes are tested from left to right, so a byte
		   is only inspected when all bytes before it are non-zero. */
		if ((*iptr & 0xe0) == 0xc0 && iptr[1]) {
			c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
			iptr += 2;
		} else if ((*iptr & 0xf0) == 0xe0 && iptr[1] && iptr[2]) {
			c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
			iptr += 3;
		} else if ((*iptr & 0xf8) == 0xf0 && iptr[1] && iptr[2] && iptr[3]) {
			c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
			iptr += 4;
		} else {
			/* Single byte, or a byte that cannot be decoded as the start
			   of a complete sequence: pass it through unchanged. */
			c = *iptr++;
		}

		*optr++ = c;
		max_length--;
	}
	if (offsetmap)
		*offsetmap = (int)(iptr - input);
	*optr = 0;
	return (int)(optr - output);
}
1112
/* Widens the zero terminated byte string "input" into "output" one byte per
   32 bit code unit, without any decoding. At most max_length - 1 units are
   copied and a zero terminator is always appended, except that nothing at
   all is written when max_length is 0. Returns the number of units copied. */
static int copy_char8_to_char32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int max_length)
{
	int count = 0;

	if (max_length == 0)
		return 0;

	for (; input[count] && max_length > 1; max_length--) {
		output[count] = input[count];
		count++;
	}
	output[count] = '\0';
	return count;
}
1128
/* Number of elements in each of the two 32 bit scratch arrays below. */
#define REGTEST_MAX_LENGTH32 4096
/* Scratch buffer of 32 bit code units. regression_tests() passes it as the
   output of convert_utf8_to_utf32() / copy_char8_to_char32(). */
static PCRE2_UCHAR32 regtest_buf32[REGTEST_MAX_LENGTH32];
/* Integer table with the same number of slots as regtest_buf32.
   NOTE(review): presumably used as the offsetmap argument of
   convert_utf8_to_utf32(); that use is not visible here - confirm. */
static int regtest_offsetmap32[REGTEST_MAX_LENGTH32];
1132
1133 #endif /* SUPPORT_PCRE2_32 */
1134
/* Returns 1 when every byte of the zero terminated string "input" is in the
   range 0-127 (an empty string qualifies), and 0 as soon as a byte with a
   value above 127 is found. */
static int check_ascii(const char *input)
{
	const unsigned char *cursor;

	for (cursor = (const unsigned char *)input; *cursor != '\0'; cursor++) {
		if (*cursor & 0x80)
			return 0;
	}
	return 1;
}
1145
1146 #define OVECTOR_SIZE 15
1147
regression_tests(void)1148 static int regression_tests(void)
1149 {
1150 struct regression_test_case *current = regression_test_cases;
1151 int error;
1152 PCRE2_SIZE err_offs;
1153 int is_successful;
1154 int is_ascii;
1155 int total = 0;
1156 int successful = 0;
1157 int successful_row = 0;
1158 int counter = 0;
1159 int jit_compile_mode;
1160 int utf = 0;
1161 int disabled_options = 0;
1162 int i;
1163 #ifdef SUPPORT_PCRE2_8
1164 pcre2_code_8 *re8;
1165 pcre2_compile_context_8 *ccontext8;
1166 pcre2_match_data_8 *mdata8_1;
1167 pcre2_match_data_8 *mdata8_2;
1168 pcre2_match_context_8 *mcontext8;
1169 PCRE2_SIZE *ovector8_1 = NULL;
1170 PCRE2_SIZE *ovector8_2 = NULL;
1171 int return_value8[2];
1172 #endif
1173 #ifdef SUPPORT_PCRE2_16
1174 pcre2_code_16 *re16;
1175 pcre2_compile_context_16 *ccontext16;
1176 pcre2_match_data_16 *mdata16_1;
1177 pcre2_match_data_16 *mdata16_2;
1178 pcre2_match_context_16 *mcontext16;
1179 PCRE2_SIZE *ovector16_1 = NULL;
1180 PCRE2_SIZE *ovector16_2 = NULL;
1181 int return_value16[2];
1182 int length16;
1183 #endif
1184 #ifdef SUPPORT_PCRE2_32
1185 pcre2_code_32 *re32;
1186 pcre2_compile_context_32 *ccontext32;
1187 pcre2_match_data_32 *mdata32_1;
1188 pcre2_match_data_32 *mdata32_2;
1189 pcre2_match_context_32 *mcontext32;
1190 PCRE2_SIZE *ovector32_1 = NULL;
1191 PCRE2_SIZE *ovector32_2 = NULL;
1192 int return_value32[2];
1193 int length32;
1194 #endif
1195
1196 #if defined SUPPORT_PCRE2_8
1197 PCRE2_UCHAR8 cpu_info[128];
1198 #elif defined SUPPORT_PCRE2_16
1199 PCRE2_UCHAR16 cpu_info[128];
1200 #elif defined SUPPORT_PCRE2_32
1201 PCRE2_UCHAR32 cpu_info[128];
1202 #endif
1203 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1204 int return_value;
1205 #endif
1206
1207 /* This test compares the behaviour of interpreter and JIT. Although disabling
1208 utf or ucp may make tests fail, if the pcre2_match result is the SAME, it is
1209 still considered successful from pcre2_jit_test point of view. */
1210
1211 #if defined SUPPORT_PCRE2_8
1212 pcre2_config_8(PCRE2_CONFIG_JITTARGET, &cpu_info);
1213 #elif defined SUPPORT_PCRE2_16
1214 pcre2_config_16(PCRE2_CONFIG_JITTARGET, &cpu_info);
1215 #elif defined SUPPORT_PCRE2_32
1216 pcre2_config_32(PCRE2_CONFIG_JITTARGET, &cpu_info);
1217 #endif
1218
1219 printf("Running JIT regression tests\n");
1220 printf(" target CPU of SLJIT compiler: ");
1221 for (i = 0; cpu_info[i]; i++)
1222 printf("%c", (char)(cpu_info[i]));
1223 printf("\n");
1224
1225 #if defined SUPPORT_PCRE2_8
1226 pcre2_config_8(PCRE2_CONFIG_UNICODE, &utf);
1227 #elif defined SUPPORT_PCRE2_16
1228 pcre2_config_16(PCRE2_CONFIG_UNICODE, &utf);
1229 #elif defined SUPPORT_PCRE2_32
1230 pcre2_config_32(PCRE2_CONFIG_UNICODE, &utf);
1231 #endif
1232
1233 if (!utf)
1234 disabled_options |= PCRE2_UTF;
1235 #ifdef SUPPORT_PCRE2_8
1236 printf(" in 8 bit mode with UTF-8 %s:\n", utf ? "enabled" : "disabled");
1237 #endif
1238 #ifdef SUPPORT_PCRE2_16
1239 printf(" in 16 bit mode with UTF-16 %s:\n", utf ? "enabled" : "disabled");
1240 #endif
1241 #ifdef SUPPORT_PCRE2_32
1242 printf(" in 32 bit mode with UTF-32 %s:\n", utf ? "enabled" : "disabled");
1243 #endif
1244
1245 while (current->pattern) {
1246 /* printf("\nPattern: %s :\n", current->pattern); */
1247 total++;
1248 is_ascii = 0;
1249 if (!(current->start_offset & F_PROPERTY))
1250 is_ascii = check_ascii(current->pattern) && check_ascii(current->input);
1251
1252 if (current->match_options & PCRE2_PARTIAL_SOFT)
1253 jit_compile_mode = PCRE2_JIT_PARTIAL_SOFT;
1254 else if (current->match_options & PCRE2_PARTIAL_HARD)
1255 jit_compile_mode = PCRE2_JIT_PARTIAL_HARD;
1256 else
1257 jit_compile_mode = PCRE2_JIT_COMPLETE;
1258 error = 0;
1259 #ifdef SUPPORT_PCRE2_8
1260 re8 = NULL;
1261 ccontext8 = pcre2_compile_context_create_8(NULL);
1262 if (ccontext8) {
1263 if (GET_NEWLINE(current->newline))
1264 pcre2_set_newline_8(ccontext8, GET_NEWLINE(current->newline));
1265 if (GET_BSR(current->newline))
1266 pcre2_set_bsr_8(ccontext8, GET_BSR(current->newline));
1267
1268 if (!(current->start_offset & F_NO8)) {
1269 re8 = pcre2_compile_8((PCRE2_SPTR8)current->pattern, PCRE2_ZERO_TERMINATED,
1270 current->compile_options & ~disabled_options,
1271 &error, &err_offs, ccontext8);
1272
1273 if (!re8 && (utf || is_ascii))
1274 printf("\n8 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1275 }
1276 pcre2_compile_context_free_8(ccontext8);
1277 }
1278 else
1279 printf("\n8 bit: Cannot allocate compile context\n");
1280 #endif
1281 #ifdef SUPPORT_PCRE2_16
1282 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1283 convert_utf8_to_utf16((PCRE2_SPTR8)current->pattern, regtest_buf16, NULL, REGTEST_MAX_LENGTH16);
1284 else
1285 copy_char8_to_char16((PCRE2_SPTR8)current->pattern, regtest_buf16, REGTEST_MAX_LENGTH16);
1286
1287 re16 = NULL;
1288 ccontext16 = pcre2_compile_context_create_16(NULL);
1289 if (ccontext16) {
1290 if (GET_NEWLINE(current->newline))
1291 pcre2_set_newline_16(ccontext16, GET_NEWLINE(current->newline));
1292 if (GET_BSR(current->newline))
1293 pcre2_set_bsr_16(ccontext16, GET_BSR(current->newline));
1294
1295 if (!(current->start_offset & F_NO16)) {
1296 re16 = pcre2_compile_16(regtest_buf16, PCRE2_ZERO_TERMINATED,
1297 current->compile_options & ~disabled_options,
1298 &error, &err_offs, ccontext16);
1299
1300 if (!re16 && (utf || is_ascii))
1301 printf("\n16 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1302 }
1303 pcre2_compile_context_free_16(ccontext16);
1304 }
1305 else
1306 printf("\n16 bit: Cannot allocate compile context\n");
1307 #endif
1308 #ifdef SUPPORT_PCRE2_32
1309 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1310 convert_utf8_to_utf32((PCRE2_SPTR8)current->pattern, regtest_buf32, NULL, REGTEST_MAX_LENGTH32);
1311 else
1312 copy_char8_to_char32((PCRE2_SPTR8)current->pattern, regtest_buf32, REGTEST_MAX_LENGTH32);
1313
1314 re32 = NULL;
1315 ccontext32 = pcre2_compile_context_create_32(NULL);
1316 if (ccontext32) {
1317 if (GET_NEWLINE(current->newline))
1318 pcre2_set_newline_32(ccontext32, GET_NEWLINE(current->newline));
1319 if (GET_BSR(current->newline))
1320 pcre2_set_bsr_32(ccontext32, GET_BSR(current->newline));
1321
1322 if (!(current->start_offset & F_NO32)) {
1323 re32 = pcre2_compile_32(regtest_buf32, PCRE2_ZERO_TERMINATED,
1324 current->compile_options & ~disabled_options,
1325 &error, &err_offs, ccontext32);
1326
1327 if (!re32 && (utf || is_ascii))
1328 printf("\n32 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1329 }
1330 pcre2_compile_context_free_32(ccontext32);
1331 }
1332 else
1333 printf("\n32 bit: Cannot allocate compile context\n");
1334 #endif
1335
1336 counter++;
1337 if ((counter & 0x3) != 0) {
1338 #ifdef SUPPORT_PCRE2_8
1339 setstack8(NULL);
1340 #endif
1341 #ifdef SUPPORT_PCRE2_16
1342 setstack16(NULL);
1343 #endif
1344 #ifdef SUPPORT_PCRE2_32
1345 setstack32(NULL);
1346 #endif
1347 }
1348
1349 #ifdef SUPPORT_PCRE2_8
1350 return_value8[0] = -1000;
1351 return_value8[1] = -1000;
1352 mdata8_1 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1353 mdata8_2 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1354 mcontext8 = pcre2_match_context_create_8(NULL);
1355 if (!mdata8_1 || !mdata8_2 || !mcontext8) {
1356 printf("\n8 bit: Cannot allocate match data\n");
1357 pcre2_match_data_free_8(mdata8_1);
1358 pcre2_match_data_free_8(mdata8_2);
1359 pcre2_match_context_free_8(mcontext8);
1360 pcre2_code_free_8(re8);
1361 re8 = NULL;
1362 } else {
1363 ovector8_1 = pcre2_get_ovector_pointer_8(mdata8_1);
1364 ovector8_2 = pcre2_get_ovector_pointer_8(mdata8_2);
1365 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1366 ovector8_1[i] = -2;
1367 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1368 ovector8_2[i] = -2;
1369 pcre2_set_match_limit_8(mcontext8, 10000000);
1370 }
1371 if (re8) {
1372 return_value8[1] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1373 current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, mcontext8);
1374
1375 if (pcre2_jit_compile_8(re8, jit_compile_mode)) {
1376 printf("\n8 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1377 } else if ((counter & 0x1) != 0) {
1378 setstack8(mcontext8);
1379 return_value8[0] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1380 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1381 } else {
1382 pcre2_jit_stack_assign_8(mcontext8, NULL, getstack8());
1383 return_value8[0] = pcre2_jit_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1384 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1385 }
1386 }
1387 #endif
1388
1389 #ifdef SUPPORT_PCRE2_16
1390 return_value16[0] = -1000;
1391 return_value16[1] = -1000;
1392 mdata16_1 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1393 mdata16_2 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1394 mcontext16 = pcre2_match_context_create_16(NULL);
1395 if (!mdata16_1 || !mdata16_2 || !mcontext16) {
1396 printf("\n16 bit: Cannot allocate match data\n");
1397 pcre2_match_data_free_16(mdata16_1);
1398 pcre2_match_data_free_16(mdata16_2);
1399 pcre2_match_context_free_16(mcontext16);
1400 pcre2_code_free_16(re16);
1401 re16 = NULL;
1402 } else {
1403 ovector16_1 = pcre2_get_ovector_pointer_16(mdata16_1);
1404 ovector16_2 = pcre2_get_ovector_pointer_16(mdata16_2);
1405 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1406 ovector16_1[i] = -2;
1407 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1408 ovector16_2[i] = -2;
1409 pcre2_set_match_limit_16(mcontext16, 10000000);
1410 }
1411 if (re16) {
1412 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1413 length16 = convert_utf8_to_utf16((PCRE2_SPTR8)current->input, regtest_buf16, regtest_offsetmap16, REGTEST_MAX_LENGTH16);
1414 else
1415 length16 = copy_char8_to_char16((PCRE2_SPTR8)current->input, regtest_buf16, REGTEST_MAX_LENGTH16);
1416
1417 return_value16[1] = pcre2_match_16(re16, regtest_buf16, length16,
1418 current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, mcontext16);
1419
1420 if (pcre2_jit_compile_16(re16, jit_compile_mode)) {
1421 printf("\n16 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1422 } else if ((counter & 0x1) != 0) {
1423 setstack16(mcontext16);
1424 return_value16[0] = pcre2_match_16(re16, regtest_buf16, length16,
1425 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1426 } else {
1427 pcre2_jit_stack_assign_16(mcontext16, NULL, getstack16());
1428 return_value16[0] = pcre2_jit_match_16(re16, regtest_buf16, length16,
1429 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1430 }
1431 }
1432 #endif
1433
1434 #ifdef SUPPORT_PCRE2_32
1435 return_value32[0] = -1000;
1436 return_value32[1] = -1000;
1437 mdata32_1 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1438 mdata32_2 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1439 mcontext32 = pcre2_match_context_create_32(NULL);
1440 if (!mdata32_1 || !mdata32_2 || !mcontext32) {
1441 printf("\n32 bit: Cannot allocate match data\n");
1442 pcre2_match_data_free_32(mdata32_1);
1443 pcre2_match_data_free_32(mdata32_2);
1444 pcre2_match_context_free_32(mcontext32);
1445 pcre2_code_free_32(re32);
1446 re32 = NULL;
1447 } else {
1448 ovector32_1 = pcre2_get_ovector_pointer_32(mdata32_1);
1449 ovector32_2 = pcre2_get_ovector_pointer_32(mdata32_2);
1450 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1451 ovector32_1[i] = -2;
1452 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1453 ovector32_2[i] = -2;
1454 pcre2_set_match_limit_32(mcontext32, 10000000);
1455 }
1456 if (re32) {
1457 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1458 length32 = convert_utf8_to_utf32((PCRE2_SPTR8)current->input, regtest_buf32, regtest_offsetmap32, REGTEST_MAX_LENGTH32);
1459 else
1460 length32 = copy_char8_to_char32((PCRE2_SPTR8)current->input, regtest_buf32, REGTEST_MAX_LENGTH32);
1461
1462 return_value32[1] = pcre2_match_32(re32, regtest_buf32, length32,
1463 current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, mcontext32);
1464
1465 if (pcre2_jit_compile_32(re32, jit_compile_mode)) {
1466 printf("\n32 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1467 } else if ((counter & 0x1) != 0) {
1468 setstack32(mcontext32);
1469 return_value32[0] = pcre2_match_32(re32, regtest_buf32, length32,
1470 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1471 } else {
1472 pcre2_jit_stack_assign_32(mcontext32, NULL, getstack32());
1473 return_value32[0] = pcre2_jit_match_32(re32, regtest_buf32, length32,
1474 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1475 }
1476 }
1477 #endif
1478
1479 /* printf("[%d-%d-%d|%d-%d|%d-%d|%d-%d]%s",
1480 return_value8[0], return_value16[0], return_value32[0],
1481 (int)ovector8_1[0], (int)ovector8_1[1],
1482 (int)ovector16_1[0], (int)ovector16_1[1],
1483 (int)ovector32_1[0], (int)ovector32_1[1],
1484 (current->compile_options & PCRE2_CASELESS) ? "C" : ""); */
1485
1486 /* If F_DIFF is set, just run the test, but do not compare the results.
1487 Segfaults can still be captured. */
1488
1489 is_successful = 1;
1490 if (!(current->start_offset & F_DIFF)) {
1491 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1492 if (!(current->start_offset & F_FORCECONV)) {
1493
1494 /* All results must be the same. */
1495 #ifdef SUPPORT_PCRE2_8
1496 if ((return_value = return_value8[0]) != return_value8[1]) {
1497 printf("\n8 bit: Return value differs(J8:%d,I8:%d): [%d] '%s' @ '%s'\n",
1498 return_value8[0], return_value8[1], total, current->pattern, current->input);
1499 is_successful = 0;
1500 } else
1501 #endif
1502 #ifdef SUPPORT_PCRE2_16
1503 if ((return_value = return_value16[0]) != return_value16[1]) {
1504 printf("\n16 bit: Return value differs(J16:%d,I16:%d): [%d] '%s' @ '%s'\n",
1505 return_value16[0], return_value16[1], total, current->pattern, current->input);
1506 is_successful = 0;
1507 } else
1508 #endif
1509 #ifdef SUPPORT_PCRE2_32
1510 if ((return_value = return_value32[0]) != return_value32[1]) {
1511 printf("\n32 bit: Return value differs(J32:%d,I32:%d): [%d] '%s' @ '%s'\n",
1512 return_value32[0], return_value32[1], total, current->pattern, current->input);
1513 is_successful = 0;
1514 } else
1515 #endif
1516 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1517 if (return_value8[0] != return_value16[0]) {
1518 printf("\n8 and 16 bit: Return value differs(J8:%d,J16:%d): [%d] '%s' @ '%s'\n",
1519 return_value8[0], return_value16[0],
1520 total, current->pattern, current->input);
1521 is_successful = 0;
1522 } else
1523 #endif
1524 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1525 if (return_value8[0] != return_value32[0]) {
1526 printf("\n8 and 32 bit: Return value differs(J8:%d,J32:%d): [%d] '%s' @ '%s'\n",
1527 return_value8[0], return_value32[0],
1528 total, current->pattern, current->input);
1529 is_successful = 0;
1530 } else
1531 #endif
1532 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1533 if (return_value16[0] != return_value32[0]) {
1534 printf("\n16 and 32 bit: Return value differs(J16:%d,J32:%d): [%d] '%s' @ '%s'\n",
1535 return_value16[0], return_value32[0],
1536 total, current->pattern, current->input);
1537 is_successful = 0;
1538 } else
1539 #endif
1540 if (return_value >= 0 || return_value == PCRE2_ERROR_PARTIAL) {
1541 if (return_value == PCRE2_ERROR_PARTIAL) {
1542 return_value = 2;
1543 } else {
1544 return_value *= 2;
1545 }
1546 #ifdef SUPPORT_PCRE2_8
1547 return_value8[0] = return_value;
1548 #endif
1549 #ifdef SUPPORT_PCRE2_16
1550 return_value16[0] = return_value;
1551 #endif
1552 #ifdef SUPPORT_PCRE2_32
1553 return_value32[0] = return_value;
1554 #endif
1555 /* Transform back the results. */
1556 if (current->compile_options & PCRE2_UTF) {
1557 #ifdef SUPPORT_PCRE2_16
1558 for (i = 0; i < return_value; ++i) {
1559 if (ovector16_1[i] != PCRE2_UNSET)
1560 ovector16_1[i] = regtest_offsetmap16[ovector16_1[i]];
1561 if (ovector16_2[i] != PCRE2_UNSET)
1562 ovector16_2[i] = regtest_offsetmap16[ovector16_2[i]];
1563 }
1564 #endif
1565 #ifdef SUPPORT_PCRE2_32
1566 for (i = 0; i < return_value; ++i) {
1567 if (ovector32_1[i] != PCRE2_UNSET)
1568 ovector32_1[i] = regtest_offsetmap32[ovector32_1[i]];
1569 if (ovector32_2[i] != PCRE2_UNSET)
1570 ovector32_2[i] = regtest_offsetmap32[ovector32_2[i]];
1571 }
1572 #endif
1573 }
1574
1575 for (i = 0; i < return_value; ++i) {
1576 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1577 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector16_1[i] || ovector8_1[i] != ovector16_2[i]) {
1578 printf("\n8 and 16 bit: Ovector[%d] value differs(J8:%d,I8:%d,J16:%d,I16:%d): [%d] '%s' @ '%s' \n",
1579 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector16_1[i], (int)ovector16_2[i],
1580 total, current->pattern, current->input);
1581 is_successful = 0;
1582 }
1583 #endif
1584 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1585 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector32_1[i] || ovector8_1[i] != ovector32_2[i]) {
1586 printf("\n8 and 32 bit: Ovector[%d] value differs(J8:%d,I8:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1587 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1588 total, current->pattern, current->input);
1589 is_successful = 0;
1590 }
1591 #endif
1592 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1593 if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) {
1594 printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1595 i, (int)ovector16_1[i], (int)ovector16_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1596 total, current->pattern, current->input);
1597 is_successful = 0;
1598 }
1599 #endif
1600 }
1601 }
1602 } else
1603 #endif /* more than one of SUPPORT_PCRE2_8, SUPPORT_PCRE2_16 and SUPPORT_PCRE2_32 */
1604 {
1605 #ifdef SUPPORT_PCRE2_8
1606 if (return_value8[0] != return_value8[1]) {
1607 printf("\n8 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1608 return_value8[0], return_value8[1], total, current->pattern, current->input);
1609 is_successful = 0;
1610 } else if (return_value8[0] >= 0 || return_value8[0] == PCRE2_ERROR_PARTIAL) {
1611 if (return_value8[0] == PCRE2_ERROR_PARTIAL)
1612 return_value8[0] = 2;
1613 else
1614 return_value8[0] *= 2;
1615
1616 for (i = 0; i < return_value8[0]; ++i)
1617 if (ovector8_1[i] != ovector8_2[i]) {
1618 printf("\n8 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1619 i, (int)ovector8_1[i], (int)ovector8_2[i], total, current->pattern, current->input);
1620 is_successful = 0;
1621 }
1622 }
1623 #endif
1624
1625 #ifdef SUPPORT_PCRE2_16
1626 if (return_value16[0] != return_value16[1]) {
1627 printf("\n16 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1628 return_value16[0], return_value16[1], total, current->pattern, current->input);
1629 is_successful = 0;
1630 } else if (return_value16[0] >= 0 || return_value16[0] == PCRE2_ERROR_PARTIAL) {
1631 if (return_value16[0] == PCRE2_ERROR_PARTIAL)
1632 return_value16[0] = 2;
1633 else
1634 return_value16[0] *= 2;
1635
1636 for (i = 0; i < return_value16[0]; ++i)
1637 if (ovector16_1[i] != ovector16_2[i]) {
1638 printf("\n16 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1639 i, (int)ovector16_1[i], (int)ovector16_2[i], total, current->pattern, current->input);
1640 is_successful = 0;
1641 }
1642 }
1643 #endif
1644
1645 #ifdef SUPPORT_PCRE2_32
1646 if (return_value32[0] != return_value32[1]) {
1647 printf("\n32 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1648 return_value32[0], return_value32[1], total, current->pattern, current->input);
1649 is_successful = 0;
1650 } else if (return_value32[0] >= 0 || return_value32[0] == PCRE2_ERROR_PARTIAL) {
1651 if (return_value32[0] == PCRE2_ERROR_PARTIAL)
1652 return_value32[0] = 2;
1653 else
1654 return_value32[0] *= 2;
1655
1656 for (i = 0; i < return_value32[0]; ++i)
1657 if (ovector32_1[i] != ovector32_2[i]) {
1658 printf("\n32 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1659 i, (int)ovector32_1[i], (int)ovector32_2[i], total, current->pattern, current->input);
1660 is_successful = 0;
1661 }
1662 }
1663 #endif
1664 }
1665 }
1666
1667 if (is_successful) {
1668 #ifdef SUPPORT_PCRE2_8
1669 if (!(current->start_offset & F_NO8) && (utf || is_ascii)) {
1670 if (return_value8[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1671 printf("8 bit: Test should match: [%d] '%s' @ '%s'\n",
1672 total, current->pattern, current->input);
1673 is_successful = 0;
1674 }
1675
1676 if (return_value8[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1677 printf("8 bit: Test should not match: [%d] '%s' @ '%s'\n",
1678 total, current->pattern, current->input);
1679 is_successful = 0;
1680 }
1681 }
1682 #endif
1683 #ifdef SUPPORT_PCRE2_16
1684 if (!(current->start_offset & F_NO16) && (utf || is_ascii)) {
1685 if (return_value16[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1686 printf("16 bit: Test should match: [%d] '%s' @ '%s'\n",
1687 total, current->pattern, current->input);
1688 is_successful = 0;
1689 }
1690
1691 if (return_value16[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1692 printf("16 bit: Test should not match: [%d] '%s' @ '%s'\n",
1693 total, current->pattern, current->input);
1694 is_successful = 0;
1695 }
1696 }
1697 #endif
1698 #ifdef SUPPORT_PCRE2_32
1699 if (!(current->start_offset & F_NO32) && (utf || is_ascii)) {
1700 if (return_value32[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1701 printf("32 bit: Test should match: [%d] '%s' @ '%s'\n",
1702 total, current->pattern, current->input);
1703 is_successful = 0;
1704 }
1705
1706 if (return_value32[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1707 printf("32 bit: Test should not match: [%d] '%s' @ '%s'\n",
1708 total, current->pattern, current->input);
1709 is_successful = 0;
1710 }
1711 }
1712 #endif
1713 }
1714
1715 if (is_successful) {
1716 #ifdef SUPPORT_PCRE2_8
1717 if (re8 && !(current->start_offset & F_NO8) && pcre2_get_mark_8(mdata8_1) != pcre2_get_mark_8(mdata8_2)) {
1718 printf("8 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1719 total, current->pattern, current->input);
1720 is_successful = 0;
1721 }
1722 #endif
1723 #ifdef SUPPORT_PCRE2_16
1724 if (re16 && !(current->start_offset & F_NO16) && pcre2_get_mark_16(mdata16_1) != pcre2_get_mark_16(mdata16_2)) {
1725 printf("16 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1726 total, current->pattern, current->input);
1727 is_successful = 0;
1728 }
1729 #endif
1730 #ifdef SUPPORT_PCRE2_32
1731 if (re32 && !(current->start_offset & F_NO32) && pcre2_get_mark_32(mdata32_1) != pcre2_get_mark_32(mdata32_2)) {
1732 printf("32 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1733 total, current->pattern, current->input);
1734 is_successful = 0;
1735 }
1736 #endif
1737 }
1738
1739 #ifdef SUPPORT_PCRE2_8
1740 pcre2_code_free_8(re8);
1741 pcre2_match_data_free_8(mdata8_1);
1742 pcre2_match_data_free_8(mdata8_2);
1743 pcre2_match_context_free_8(mcontext8);
1744 #endif
1745 #ifdef SUPPORT_PCRE2_16
1746 pcre2_code_free_16(re16);
1747 pcre2_match_data_free_16(mdata16_1);
1748 pcre2_match_data_free_16(mdata16_2);
1749 pcre2_match_context_free_16(mcontext16);
1750 #endif
1751 #ifdef SUPPORT_PCRE2_32
1752 pcre2_code_free_32(re32);
1753 pcre2_match_data_free_32(mdata32_1);
1754 pcre2_match_data_free_32(mdata32_2);
1755 pcre2_match_context_free_32(mcontext32);
1756 #endif
1757
1758 if (is_successful) {
1759 successful++;
1760 successful_row++;
1761 printf(".");
1762 if (successful_row >= 60) {
1763 successful_row = 0;
1764 printf("\n");
1765 }
1766 } else
1767 successful_row = 0;
1768
1769 fflush(stdout);
1770 current++;
1771 }
1772 #ifdef SUPPORT_PCRE2_8
1773 setstack8(NULL);
1774 #endif
1775 #ifdef SUPPORT_PCRE2_16
1776 setstack16(NULL);
1777 #endif
1778 #ifdef SUPPORT_PCRE2_32
1779 setstack32(NULL);
1780 #endif
1781
1782 if (total == successful) {
1783 printf("\nAll JIT regression tests are successfully passed.\n");
1784 return 0;
1785 } else {
1786 printf("\nSuccessful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
1787 return 1;
1788 }
1789 }
1790
1791 #if defined SUPPORT_UNICODE
1792
/* Compares the outcome of one match against the expectation recorded for a
test case and prints a diagnostic on stdout when they disagree.

  pattern_index  index of the test case; used only in the diagnostics
  type           label of the kind of match that was run; used only in the
                 diagnostics
  result         return value of the match call
  match_start    expected ovector[0]; a negative value means that the test
                 case must not match
  match_end      expected ovector[1]; not looked at when match_start is
                 negative
  ovector        output vector of the match; only read when a match is
                 expected and result is positive

Returns 0 when the outcome is the expected one, 1 otherwise. */

static int check_invalid_utf_result(int pattern_index, const char *type, int result,
	int match_start, int match_end, PCRE2_SIZE *ovector)
{
	int mismatch = 0;

	if (match_start < 0) {
		/* No match is expected: -1 is the only acceptable return value. */
		if (result != -1) {
			printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
			mismatch = 1;
		}
	} else if (result <= 0) {
		/* A match is expected, so the return value has to be positive. */
		printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
		mismatch = 1;
	} else if (ovector[0] != (PCRE2_SIZE)match_start) {
		/* Only the first discrepancy is reported. */
		printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[0], match_start);
		mismatch = 1;
	} else if (ovector[1] != (PCRE2_SIZE)match_end) {
		printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[1], match_end);
		mismatch = 1;
	}

	return mismatch;
}
1823
1824 #endif /* SUPPORT_UNICODE */
1825
1826 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
1827
/* Shorthand option sets for the entries of invalid_utf8_regression_test_cases.
   UDA appears in the compile_options column, CI and CPI in the
   jit_compile_options column. All three are #undef'd again after the table. */
1828 #define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
1829 #define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
1830 #define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
1831
/* One entry of the invalid UTF-8 JIT regression table. The table fills this
   structure with positional initializers, so the member order is significant. */
1832 struct invalid_utf8_regression_test_case {
1833 int compile_options; /* Options passed to pcre2_compile_8(). */
1834 int jit_compile_options; /* Options passed to pcre2_jit_compile_8(); its PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_SOFT bits also decide which matches are run. */
1835 int start_offset; /* Start offset within the full input; skip_left is subtracted from it before matching. */
1836 int skip_left; /* Number of bytes dropped from the front of input before matching. */
1837 int skip_right; /* Number of bytes dropped from the end of input before matching. */
1838 int match_start; /* Expected ovector[0] (relative to input + skip_left); negative means that no match is expected. */
1839 int match_end; /* Expected ovector[1] (relative to input + skip_left); unused when match_start is negative. */
1840 const char *pattern[2]; /* Pattern strings; run_invalid_utf8_test() does nothing for a NULL slot. */
1841 const char *input; /* Zero terminated subject; its length is taken with strlen(). */
1842 };
1843
/* Marker object: in the code visible here only its address is used, stored in
   pattern[1] of one entry of the table below.
   NOTE(review): the name suggests that the test driver treats it as a request
   for CR newline handling; the driver is not part of this excerpt - confirm. */
1844 static const char invalid_utf8_newline_cr;
1845
/* Test vectors for the invalid UTF-8 JIT tests. Columns, in order:

     compile_options, jit_compile_options, start_offset, skip_left, skip_right,
     match_start, match_end, { pattern[0], pattern[1] }, input

   A match_start / match_end pair of -1, -1 marks an entry that must not match.
   The inputs are raw byte strings, many of them deliberately not valid UTF-8;
   do not "tidy up" the escape sequences. */
1846 static const struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
/* A single '.' at offset 0, against 4, 3, 2 and 1 byte sequences. */
1847 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1848 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
1849 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
1850 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1851 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
1852 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
1853 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
1854 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
1855 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
1856 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
1857 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
1858 { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
1859 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
1860 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
1861 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
1862 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
1863 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
1864 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
1865 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
1866 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
1867 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
1868 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
1869 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
1870 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
1871 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
1872 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
1873 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
1874 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
1875 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
1876 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
1877 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
1878 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
1879 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
1880 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
1881 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
1882 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
1883 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
1884 { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
1885
/* \B with start_offset 4; the entries that must not match also carry \b as
   pattern[1]. */
1886 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
1887 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80\xf4\xa0\x80\x80" },
1888 { UDA, CPI, 4, 1, 1, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf" },
1889 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
1890 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
1891 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
1892 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
1893 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf\xf0\x8f\xbf\xbf" },
1894 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80\xf5\x80\x80\x80" },
1895 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80\xf4\x90\x80\x80" },
1896 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff\xf4\x8f\xbf\xff" },
1897 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf\xf4\x8f\xff\xbf" },
1898 { UDA, CPI, 4, 0, 1, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80\xef\x80\x80" },
1899 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80\x80\x80\x80\x80" },
1900 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf\xe0\x9f\xbf#" },
1901 { UDA, CPI, 4, 2, 2, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80\xe0\xa0\x80#" },
1902 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80\xf0\x80\x80#" },
1903 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80\xed\xa0\x80#" },
1904 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
1905 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
1906 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
1907 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
1908 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf\xc1\xbf##" },
1909 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0\xdf\xc0##" },
1910 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80\xe0\x80##" },
1911
/* The same kind of \B / \b entries with start_offset 3. */
1912 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
1913 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
1914 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf\xe0\x9f\xbf" },
1915 { UDA, CPI, 3, 1, 1, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf\xef\xbf\xbf" },
1916 { UDA, CPI, 3, 0, 1, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80\xdf\x80" },
1917 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff\xef\xbf\xff" },
1918 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf\xef\xff\xbf" },
1919 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf\xed\xbf\xbf" },
1920
/* The same kind of \B / \b entries with start_offset 2. */
1921 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
1922 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
1923 { UDA, CPI, 2, 1, 1, -1, -1, { "\\B", "\\b" }, "\xdf\xbf\xdf\xbf" },
1924 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf\xc1\xbf" },
1925 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80\xe0\x80" },
1926 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff\xdf\xff" },
1927 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf\xff\xbf" },
1928
/* The same kind of \B / \b entries with start_offset 1. */
1929 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
1930 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
1931 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80" },
1932 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\xb0\xb0" },
1933
/* Caseless back reference; the skip_right 1 variants cut the last byte off
   the input. */
1934 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
1935 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
1936 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1937 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1938 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
1939 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1940 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1941 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1942 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1943
/* \X against complete and truncated sequences. */
1944 { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
1945 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
1946 { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
1947 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
1948 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
1949 { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
1950 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
1951 { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1952 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1953
/* Negated character class. */
1954 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" },
1955 { UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" },
1956 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" },
1957 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" },
1958
/* Unanchored, PCRE2_MULTILINE: ^\W with start_offset 1. */
1959 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
1960 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
1961 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
1962 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
1963 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
1964 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
1965 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
1966 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
1967 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
1968 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \x85#\xc2\x85#"},
1969 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 7, 8, { "^\\W", NULL }, " \xe2\x80\xf8\xe2\x80\xa8#"},
1970
/* Unanchored, PCRE2_FIRSTLINE: a literal '#'. */
1971 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xe2\x80\xf8\xe2\x80\xa8#"},
1972 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 3, 4, { "#", NULL }, "\xe2\x80\xf8#\xe2\x80\xa8#"},
1973 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "abcd\xc2\x85#"},
1974 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 1, 2, { "#", NULL }, "\x85#\xc2\x85#"},
1975 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 5, 6, { "#", NULL }, "\xef,\x80,\xf8#\x0a"},
1976 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xef,\x80,\xf8\x0a#"},
1977
/* The same literal pattern and inputs, with and without
   PCRE2_NO_START_OPTIMIZE. */
1978 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1979 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1980 { PCRE2_UTF, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1981 { PCRE2_UTF, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1982
/* Character class with PCRE2_UCP. */
1983 { PCRE2_UTF | PCRE2_UCP, CI, 0, 0, 0, -1, -1, { "[\\s]", NULL }, "\xed\xa0\x80" },
1984
1985 /* The next two entries are not invalid UTF tests (they set neither PCRE2_UTF nor PCRE2_JIT_INVALID_UTF), but this infrastructure fits better for them. */
1986 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\X{2}", NULL }, "\r\n\n" },
1987 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\R{2}", NULL }, "\r\n\n" },
1988
/* The only entry whose pattern[1] is the address of invalid_utf8_newline_cr
   rather than a pattern string or NULL. */
1989 { PCRE2_UTF | PCRE2_MULTILINE, CI, 0, 0, 0, -1, -1, { "^.a", &invalid_utf8_newline_cr }, "\xc3\xa7#a" },
1990
/* All zero / NULL entry; presumably the end-of-table marker for the test
   driver, which is not part of this excerpt. */
1991 { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
1992 };
1993
1994 #undef UDA
1995 #undef CI
1996 #undef CPI
1997
/* Runs one pattern of an invalid-UTF-8 test case through the JIT matcher.

The selected pattern is compiled with the case's compile options, JIT compiled
with its JIT options, and then matched once for every mode requested in
jit_compile_options (complete and/or soft partial). Each result is handed to
check_invalid_utf_result() together with the expected match offsets; a non-zero
return from that helper is treated as a failure.

Arguments:
  current        the test case to run
  pattern_index  position of the case in the table (used in diagnostics only)
  i              which entry of current->pattern to compile (0 or 1)
  ccontext       compile context passed to pcre2_compile_8()
  mdata          match data block whose ovector receives the offsets

Returns:         1 if pattern i is NULL (nothing to do) or every requested
                 match was accepted by the checker, 0 otherwise
*/

static int run_invalid_utf8_test(const struct invalid_utf8_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
{
	pcre2_code_8 *code;
	int result, errorcode;
	int passed = 1;
	PCRE2_SIZE length, erroroffset;
	PCRE2_UCHAR8 *subject;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);

	/* An unused pattern slot is not an error. */
	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_8((PCRE2_UCHAR8*)current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	/* Report the real sub-pattern index so a failure of the second pattern
	is not attributed to the first one. */
	if (!code) {
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_8(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_8(code);
		return 0;
	}

	/* The subject is the input with skip_left units removed from its start
	and skip_right units removed from its end; start_offset is given relative
	to the unshortened input, hence the adjustment in the calls below. */
	subject = (PCRE2_UCHAR8*)(current->input + current->skip_left);
	length = (PCRE2_SIZE)(strlen(current->input) - current->skip_left - current->skip_right);

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_8(code, subject, length,
			current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	/* The partial run is skipped once the complete run has failed. */
	if (passed && (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT)) {
		result = pcre2_jit_match_8(code, subject, length,
			current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	pcre2_code_free_8(code);
	return passed;
}
2048
invalid_utf8_regression_tests(void)2049 static int invalid_utf8_regression_tests(void)
2050 {
2051 const struct invalid_utf8_regression_test_case *current;
2052 pcre2_compile_context_8 *ccontext;
2053 pcre2_match_data_8 *mdata;
2054 int total = 0, successful = 0;
2055 int result;
2056
2057 printf("\nRunning invalid-utf8 JIT regression tests\n");
2058
2059 ccontext = pcre2_compile_context_create_8(NULL);
2060 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2061 mdata = pcre2_match_data_create_8(4, NULL);
2062
2063 for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
2064 /* printf("\nPattern: %s :\n", current->pattern); */
2065 total++;
2066
2067 result = 1;
2068 if (current->pattern[1] != &invalid_utf8_newline_cr)
2069 {
2070 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2071 result = 0;
2072 if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
2073 result = 0;
2074 } else {
2075 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_CR);
2076 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2077 result = 0;
2078 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2079 }
2080
2081 if (result) {
2082 successful++;
2083 }
2084
2085 printf(".");
2086 if ((total % 60) == 0)
2087 printf("\n");
2088 }
2089
2090 if ((total % 60) != 0)
2091 printf("\n");
2092
2093 pcre2_match_data_free_8(mdata);
2094 pcre2_compile_context_free_8(ccontext);
2095
2096 if (total == successful) {
2097 printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
2098 return 0;
2099 } else {
2100 printf("\nInvalid UTF8 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2101 return 1;
2102 }
2103 }
2104
2105 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_8 */
2106
/* Placeholder used when Unicode or the 8-bit library is not compiled in:
there is nothing to run, so it returns 0, the same value the real
implementation uses when every test passed. */

static int invalid_utf8_regression_tests(void)
{
	int failed = 0;

	return failed;
}
2111
2112 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_8 */
2113
2114 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_16
2115
/* Shorthands for the option columns of the 16-bit test table; they are
undefined again right after the table. */

/* Compile options: UTF mode, DOTALL and ANCHORED combined. */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
/* JIT options: complete matching with invalid UTF support. */
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
/* JIT options: as CI, plus soft partial matching. */
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2119
/* One row of the invalid UTF-16 JIT test table. The table uses positional
initializers, so the order of the fields must not change. */

struct invalid_utf16_regression_test_case {
	/* Options passed to pcre2_compile_16(). */
	int compile_options;
	/* Options passed to pcre2_jit_compile_16(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match runs are performed. */
	int jit_compile_options;
	/* Start offset in code units, relative to the unshortened input. */
	int start_offset;
	/* Number of code units dropped from the start of the input. */
	int skip_left;
	/* Number of code units dropped from the end of the input. */
	int skip_right;
	/* Expected match offsets, forwarded to check_invalid_utf_result().
	NOTE(review): -1 presumably means "no match expected" - confirm against
	that helper. */
	int match_start;
	int match_end;
	/* Up to two zero-terminated patterns; a NULL slot is skipped, and a NULL
	first slot terminates the table. */
	const PCRE2_UCHAR16 *pattern[2];
	/* Zero-terminated subject string. */
	const PCRE2_UCHAR16 *input;
};
2131
2132 static PCRE2_UCHAR16 allany16[] = { '.', 0 };
2133 static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 };
2134 static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 };
2135 static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 };
2136 static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 };
2137 static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 };
2138 static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 };
2139 static PCRE2_UCHAR16 generic16[] = { '#', 0xd800, 0xdc00, '#', 0 };
2140 static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
2141 static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, 0xd800, 0xdc00, 0 };
2142 static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, 0xdbff, 0xdfff, 0 };
2143 static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, 0xd800, 0xdbff, 0 };
2144 static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, 0xdc00, '#', 0 };
2145 static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 };
2146 static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
2147 static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 };
2148 static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 };
2149 static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 };
2150 static PCRE2_UCHAR16 test16_11[] = { 0xdc00, 0xdc00, 0xd800, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2151 static PCRE2_UCHAR16 test16_12[] = { '#', 0xd800, 0xdc00, 0xd800, '#', 0xd800, 0xdc00, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2152
/* Invalid UTF-16 JIT test table. The columns follow the field order of
struct invalid_utf16_regression_test_case:

  compile options, JIT options, start offset, skip left, skip right,
  expected match start, expected match end, { pattern, second pattern },
  subject

When the second pattern is not NULL it is run against the same subject with
the same expected offsets. The row whose first pattern is NULL ends the
table. */

static const struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
	{ UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 },
	{ UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 },
	{ UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 },
	{ UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_3 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 },

	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 },
	{ UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 },
	{ UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 },

	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 },

	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 },

	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },

	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, NULL }, test16_9 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 },

	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },
	{ PCRE2_UTF, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },

	/* Terminator: a NULL first pattern stops the test loop. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2202
2203 #undef UDA
2204 #undef CI
2205 #undef CPI
2206
/* Runs one pattern of an invalid-UTF-16 test case through the JIT matcher.

The selected pattern is compiled with the case's compile options, JIT compiled
with its JIT options, and then matched once for every mode requested in
jit_compile_options (complete and/or soft partial). Each result is handed to
check_invalid_utf_result() together with the expected match offsets; a non-zero
return from that helper is treated as a failure.

Arguments:
  current        the test case to run
  pattern_index  position of the case in the table (used in diagnostics only)
  i              which entry of current->pattern to compile (0 or 1)
  ccontext       compile context passed to pcre2_compile_16()
  mdata          match data block whose ovector receives the offsets

Returns:         1 if pattern i is NULL (nothing to do) or every requested
                 match was accepted by the checker, 0 otherwise
*/

static int run_invalid_utf16_test(const struct invalid_utf16_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
{
	pcre2_code_16 *code;
	int result, errorcode;
	int passed = 1;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR16 *input;
	const PCRE2_UCHAR16 *subject;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);

	/* An unused pattern slot is not an error. */
	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	/* Report the real sub-pattern index so a failure of the second pattern
	is not attributed to the first one. */
	if (!code) {
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_16(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_16(code);
		return 0;
	}

	/* Count the code units of the zero-terminated input. */
	length = 0;
	for (input = current->input; *input != 0; input++)
		length++;

	/* The subject is the input with skip_left units removed from its start
	and skip_right units removed from its end; start_offset is given relative
	to the unshortened input, hence the adjustment in the calls below. */
	length -= current->skip_left + current->skip_right;
	subject = current->input + current->skip_left;

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_16(code, subject, length,
			current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	/* The partial run is skipped once the complete run has failed. */
	if (passed && (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT)) {
		result = pcre2_jit_match_16(code, subject, length,
			current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	pcre2_code_free_16(code);
	return passed;
}
2264
invalid_utf16_regression_tests(void)2265 static int invalid_utf16_regression_tests(void)
2266 {
2267 const struct invalid_utf16_regression_test_case *current;
2268 pcre2_compile_context_16 *ccontext;
2269 pcre2_match_data_16 *mdata;
2270 int total = 0, successful = 0;
2271 int result;
2272
2273 printf("\nRunning invalid-utf16 JIT regression tests\n");
2274
2275 ccontext = pcre2_compile_context_create_16(NULL);
2276 pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
2277 mdata = pcre2_match_data_create_16(4, NULL);
2278
2279 for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
2280 /* printf("\nPattern: %s :\n", current->pattern); */
2281 total++;
2282
2283 result = 1;
2284 if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
2285 result = 0;
2286 if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
2287 result = 0;
2288
2289 if (result) {
2290 successful++;
2291 }
2292
2293 printf(".");
2294 if ((total % 60) == 0)
2295 printf("\n");
2296 }
2297
2298 if ((total % 60) != 0)
2299 printf("\n");
2300
2301 pcre2_match_data_free_16(mdata);
2302 pcre2_compile_context_free_16(ccontext);
2303
2304 if (total == successful) {
2305 printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");
2306 return 0;
2307 } else {
2308 printf("\nInvalid UTF16 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2309 return 1;
2310 }
2311 }
2312
2313 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_16 */
2314
/* Placeholder used when Unicode or the 16-bit library is not compiled in:
there is nothing to run, so it returns 0, the same value the real
implementation uses when every test passed. */

static int invalid_utf16_regression_tests(void)
{
	int failed = 0;

	return failed;
}
2319
2320 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
2321
2322 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32
2323
/* Shorthands for the option columns of the 32-bit test table; they are
undefined again right after the table. */

/* Compile options: UTF mode, DOTALL and ANCHORED combined. */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
/* JIT options: complete matching with invalid UTF support. */
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
/* JIT options: as CI, plus soft partial matching. */
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2327
/* One row of the invalid UTF-32 JIT test table. The table uses positional
initializers, so the order of the fields must not change. */

struct invalid_utf32_regression_test_case {
	/* Options passed to pcre2_compile_32(). */
	int compile_options;
	/* Options passed to pcre2_jit_compile_32(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match runs are performed. */
	int jit_compile_options;
	/* Start offset in code units, relative to the unshortened input. */
	int start_offset;
	/* Number of code units dropped from the start of the input. */
	int skip_left;
	/* Number of code units dropped from the end of the input. */
	int skip_right;
	/* Expected match offsets, forwarded to check_invalid_utf_result().
	NOTE(review): -1 presumably means "no match expected" - confirm against
	that helper. */
	int match_start;
	int match_end;
	/* Up to two zero-terminated patterns; a NULL slot is skipped, and a NULL
	first slot terminates the table. */
	const PCRE2_UCHAR32 *pattern[2];
	/* Zero-terminated subject string. */
	const PCRE2_UCHAR32 *input;
};
2339
2340 static PCRE2_UCHAR32 allany32[] = { '.', 0 };
2341 static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 };
2342 static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 };
2343 static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 };
2344 static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
2345 static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
2346 static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
2347 static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x110000, 0x10ffff, 0 };
2348 static PCRE2_UCHAR32 test32_2[] = { 0xd7ff, 0xe000, 0xd800, 0xdfff, 0xe000, 0xdfff, 0xd800, 0 };
2349 static PCRE2_UCHAR32 test32_3[] = { 'a', 'A', 0x110000, 0 };
2350 static PCRE2_UCHAR32 test32_4[] = { '#', 0x10ffff, 0x110000, 0 };
2351 static PCRE2_UCHAR32 test32_5[] = { ' ', 0x2028, '#', 0 };
2352 static PCRE2_UCHAR32 test32_6[] = { ' ', 0x110000, 0x2028, '#', 0 };
2353
/* Invalid UTF-32 JIT test table. The columns follow the field order of
struct invalid_utf32_regression_test_case:

  compile options, JIT options, start offset, skip left, skip right,
  expected match start, expected match end, { pattern, second pattern },
  subject

When the second pattern is not NULL it is run against the same subject with
the same expected offsets. The row whose first pattern is NULL ends the
table. */

static const struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_2 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany32, NULL }, test32_2 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
	{ UDA, CI, 3, 0, 0, -1, -1, { allany32, NULL }, test32_2 },

	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_2 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
	{ UDA, CPI, 6, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },

	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_3 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_3 },

	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 4, 0, 0, 4, 5, { grapheme32, NULL }, test32_2 },

	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_2 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_2 },

	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_5 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_6 },

	/* Terminator: a NULL first pattern stops the test loop. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2389
2390 #undef UDA
2391 #undef CI
2392 #undef CPI
2393
/* Runs one pattern of an invalid-UTF-32 test case through the JIT matcher.

The selected pattern is compiled with the case's compile options, JIT compiled
with its JIT options, and then matched once for every mode requested in
jit_compile_options (complete and/or soft partial). Each result is handed to
check_invalid_utf_result() together with the expected match offsets; a non-zero
return from that helper is treated as a failure.

Arguments:
  current        the test case to run
  pattern_index  position of the case in the table (used in diagnostics only)
  i              which entry of current->pattern to compile (0 or 1)
  ccontext       compile context passed to pcre2_compile_32()
  mdata          match data block whose ovector receives the offsets

Returns:         1 if pattern i is NULL (nothing to do) or every requested
                 match was accepted by the checker, 0 otherwise
*/

static int run_invalid_utf32_test(const struct invalid_utf32_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata)
{
	pcre2_code_32 *code;
	int result, errorcode;
	int passed = 1;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR32 *input;
	const PCRE2_UCHAR32 *subject;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata);

	/* An unused pattern slot is not an error. */
	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	/* Report the real sub-pattern index so a failure of the second pattern
	is not attributed to the first one. */
	if (!code) {
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_32(code);
		return 0;
	}

	/* Count the code units of the zero-terminated input. */
	length = 0;
	for (input = current->input; *input != 0; input++)
		length++;

	/* The subject is the input with skip_left units removed from its start
	and skip_right units removed from its end; start_offset is given relative
	to the unshortened input, hence the adjustment in the calls below. */
	length -= current->skip_left + current->skip_right;
	subject = current->input + current->skip_left;

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_32(code, subject, length,
			current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	/* The partial run is skipped once the complete run has failed. */
	if (passed && (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT)) {
		result = pcre2_jit_match_32(code, subject, length,
			current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	pcre2_code_free_32(code);
	return passed;
}
2451
invalid_utf32_regression_tests(void)2452 static int invalid_utf32_regression_tests(void)
2453 {
2454 const struct invalid_utf32_regression_test_case *current;
2455 pcre2_compile_context_32 *ccontext;
2456 pcre2_match_data_32 *mdata;
2457 int total = 0, successful = 0;
2458 int result;
2459
2460 printf("\nRunning invalid-utf32 JIT regression tests\n");
2461
2462 ccontext = pcre2_compile_context_create_32(NULL);
2463 pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY);
2464 mdata = pcre2_match_data_create_32(4, NULL);
2465
2466 for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) {
2467 /* printf("\nPattern: %s :\n", current->pattern); */
2468 total++;
2469
2470 result = 1;
2471 if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata))
2472 result = 0;
2473 if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata))
2474 result = 0;
2475
2476 if (result) {
2477 successful++;
2478 }
2479
2480 printf(".");
2481 if ((total % 60) == 0)
2482 printf("\n");
2483 }
2484
2485 if ((total % 60) != 0)
2486 printf("\n");
2487
2488 pcre2_match_data_free_32(mdata);
2489 pcre2_compile_context_free_32(ccontext);
2490
2491 if (total == successful) {
2492 printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n");
2493 return 0;
2494 } else {
2495 printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2496 return 1;
2497 }
2498 }
2499
2500 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */
2501
/* Placeholder used when Unicode or the 32-bit library is not compiled in:
there is nothing to run, so it returns 0, the same value the real
implementation uses when every test passed. */

static int invalid_utf32_regression_tests(void)
{
	int failed = 0;

	return failed;
}
2506
2507 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */
2508
2509 /* End of pcre2_jit_test.c */
2510