1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 #ifdef HAVE_CONFIG_H
42 #include "config.h"
43 #endif
44
45 #include <stdio.h>
46 #include <string.h>
47
48 #define PCRE2_CODE_UNIT_WIDTH 0
49 #include "pcre2.h"
50
51 /*
52 Letter characters:
53 \xe6\x92\xad = 0x64ad = 25773 (kanji)
54 Non-letter characters:
55 \xc2\xa1 = 0xa1 = 161 (Inverted Exclamation Mark)
56 \xf3\xa9\xb7\x80 = 0xe9dc0 = 957888
57 \xed\xa0\x80 = 55296 = 0xd800 (Invalid UTF character)
58 \xed\xb0\x80 = 56320 = 0xdc00 (Invalid UTF character)
59 Newlines:
60 \xc2\x85 = 0x85 = 133 (NExt Line = NEL)
61 \xe2\x80\xa8 = 0x2028 = 8232 (Line Separator)
62 Othercase pairs:
63 \xc3\xa9 = 0xe9 = 233 (e')
64 \xc3\x89 = 0xc9 = 201 (E')
65 \xc3\xa1 = 0xe1 = 225 (a')
66 \xc3\x81 = 0xc1 = 193 (A')
67 \x53 = 0x53 = S
68 \x73 = 0x73 = s
69 \xc5\xbf = 0x17f = 383 (long S)
70 \xc8\xba = 0x23a = 570
71 \xe2\xb1\xa5 = 0x2c65 = 11365
72 \xe1\xbd\xb8 = 0x1f78 = 8056
73 \xe1\xbf\xb8 = 0x1ff8 = 8184
74 \xf0\x90\x90\x80 = 0x10400 = 66560
75 \xf0\x90\x90\xa8 = 0x10428 = 66600
76 \xc7\x84 = 0x1c4 = 452
77 \xc7\x85 = 0x1c5 = 453
78 \xc7\x86 = 0x1c6 = 454
79 Caseless sets:
80 ucp_Armenian - \x{531}-\x{556} -> \x{561}-\x{586}
81 ucp_Coptic - \x{2c80}-\x{2ce3} -> caseless: XOR 0x1
82 ucp_Latin - \x{ff21}-\x{ff3a} -> \x{ff41}-\x{ff5a}
83
84 Mark property:
85 \xcc\x8d = 0x30d = 781
86 Special:
87 \xc2\x80 = 0x80 = 128 (lowest 2 byte character)
88 \xdf\xbf = 0x7ff = 2047 (highest 2 byte character)
89 \xe0\xa0\x80 = 0x800 = 2048 (lowest 3 byte character)
90 \xef\xbf\xbf = 0xffff = 65535 (highest 3 byte character)
91 \xf0\x90\x80\x80 = 0x10000 = 65536 (lowest 4 byte character)
92 \xf4\x8f\xbf\xbf = 0x10ffff = 1114111 (highest allowed utf character)
93 */
94
/* Forward declarations of the four test-suite entry points that main()
   calls. Each takes no arguments and returns an int; main() ORs the four
   results together to form its return value. Their definitions are not
   shown in this part of the file. */
95 static int regression_tests(void);
96 static int invalid_utf8_regression_tests(void);
97 static int invalid_utf16_regression_tests(void);
98 static int invalid_utf32_regression_tests(void);
99
/*
 * Entry point of the JIT regression test program.
 *
 * Asks the library whether JIT support is available, using the config
 * function of the first enabled code unit width (8, then 16, then 32).
 * If JIT is not reported as available, a message is printed and 1 is
 * returned. Otherwise all four test suites are run and their results are
 * OR-ed together, so the return value is non-zero when any suite returned
 * non-zero.
 */
int main(void)
{
    int jit = 0;
    int result;

#if defined SUPPORT_PCRE2_8
    pcre2_config_8(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_16
    pcre2_config_16(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_32
    pcre2_config_32(PCRE2_CONFIG_JIT, &jit);
#endif

    if (!jit) {
        printf("JIT must be enabled to run pcre_jit_test\n");
        return 1;
    }

    /* Run the suites as separate statements. The operands of a single
       `a | b | c | d` expression may be evaluated in any order, which
       would let the order of the test output differ between compilers.
       Every suite is still always executed; there is no short-circuit. */
    result = regression_tests();
    result |= invalid_utf8_regression_tests();
    result |= invalid_utf16_regression_tests();
    result |= invalid_utf32_regression_tests();

    return result;
}
119
120 /* --------------------------------------------------------------------------------------- */
121
/* Stop the build with an error unless at least one of the three code unit
   width macros (8, 16 or 32 bit) is defined. */
122 #if !(defined SUPPORT_PCRE2_8) && !(defined SUPPORT_PCRE2_16) && !(defined SUPPORT_PCRE2_32)
123 #error SUPPORT_PCRE2_8 or SUPPORT_PCRE2_16 or SUPPORT_PCRE2_32 must be defined
124 #endif
125
/* Short names for the compile option combinations used in the first column
   of the test table. Letters: C = PCRE2_CASELESS, M = PCRE2_MULTILINE,
   U = PCRE2_UTF, P = PCRE2_UCP. */
126 #define MU (PCRE2_MULTILINE | PCRE2_UTF)
127 #define MUP (PCRE2_MULTILINE | PCRE2_UTF | PCRE2_UCP)
128 #define CMU (PCRE2_CASELESS | PCRE2_MULTILINE | PCRE2_UTF)
129 #define CMUP (PCRE2_CASELESS | PCRE2_MULTILINE | PCRE2_UTF | PCRE2_UCP)
130 #define M (PCRE2_MULTILINE)
131 #define MP (PCRE2_MULTILINE | PCRE2_UCP)
132 #define U (PCRE2_UTF)
133 #define CM (PCRE2_CASELESS | PCRE2_MULTILINE)
134
/* BSR(x) shifts a PCRE2_BSR_* value up by 16 bits so that it can be OR-ed
   with a PCRE2_NEWLINE_* value in the same table column. A is the short
   alias for PCRE2_NEWLINE_ANYCRLF used in that column. */
135 #define BSR(x) ((x) << 16)
136 #define A PCRE2_NEWLINE_ANYCRLF
137
/* Split a packed newline/BSR value again: the low 16 bits are the newline
   part, the bits above them are the BSR part. */
138 #define GET_NEWLINE(x) ((x) & 0xffff)
139 #define GET_BSR(x) ((x) >> 16)
140
/* The start offset column of the test table carries the offset in its low
   16 bits (OFFSET_MASK) with F_* flag bits OR-ed in above it.
   The individual flag meanings are not established by the code shown here;
   presumably F_NO8/F_NO16/F_NO32 exclude a case for a given code unit width
   and F_NOMATCH marks a case that is expected not to match -- TODO confirm
   against the code that reads these flags.
   NOTE(review): F_NO16 and F_NO32 have the same value (0x020000), so the
   two cannot be told apart -- confirm that this is intended. */
141 #define OFFSET_MASK 0x00ffff
142 #define F_NO8 0x010000
143 #define F_NO16 0x020000
144 #define F_NO32 0x020000
145 #define F_NOMATCH 0x040000
146 #define F_DIFF 0x080000
147 #define F_FORCECONV 0x100000
148 #define F_PROPERTY 0x200000
149
/* One row of the regression test table: the options, a pattern and an
   input string. The field notes below describe what the table initializers
   put into each field; the code that consumes the fields is outside this
   view. */
150 struct regression_test_case {
151 int compile_options; /* PCRE2 compile option bits, e.g. MU, CMUP or 0 */
152 int newline; /* PCRE2_NEWLINE_* value and/or BSR(PCRE2_BSR_*), or 0 */
153 int match_options; /* match option bits such as PCRE2_NOTBOL, or 0 */
154 int start_offset; /* small offset value with F_* flag bits OR-ed in */
155 const char *pattern; /* regular expression, as a C string */
156 const char *input; /* text the pattern is applied to, as a C string */
157 };
158
159 static struct regression_test_case regression_test_cases[] = {
160 /* Constant strings. */
161 { MU, A, 0, 0, "AbC", "AbAbC" },
162 { MU, A, 0, 0, "ACCEPT", "AACACCACCEACCEPACCEPTACCEPTT" },
163 { CMU, A, 0, 0, "aA#\xc3\xa9\xc3\x81", "aA#Aa#\xc3\x89\xc3\xa1" },
164 { M, A, 0, 0, "[^a]", "aAbB" },
165 { CM, A, 0, 0, "[^m]", "mMnN" },
166 { M, A, 0, 0, "a[^b][^#]", "abacd" },
167 { CM, A, 0, 0, "A[^B][^E]", "abacd" },
168 { CMU, A, 0, 0, "[^x][^#]", "XxBll" },
169 { MU, A, 0, 0, "[^a]", "aaa\xc3\xa1#Ab" },
170 { CMU, A, 0, 0, "[^A]", "aA\xe6\x92\xad" },
171 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\n+bc" },
172 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\r+bc" },
173 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\r+bc" },
174 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\n+bc" },
175 { MU, A, 0, 0, "[axd]", "sAXd" },
176 { CMU, A, 0, 0, "[axd]", "sAXd" },
177 { CMU, A, 0, 0 | F_NOMATCH, "[^axd]", "DxA" },
178 { MU, A, 0, 0, "[a-dA-C]", "\xe6\x92\xad\xc3\xa9.B" },
179 { MU, A, 0, 0, "[^a-dA-C]", "\xe6\x92\xad\xc3\xa9" },
180 { CMU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
181 { MU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
182 { MU, A, 0, 0, "[^a]", "\xc2\x80[]" },
183 { CMU, A, 0, 0, "\xf0\x90\x90\xa7", "\xf0\x90\x91\x8f" },
184 { CM, A, 0, 0, "1a2b3c4", "1a2B3c51A2B3C4" },
185 { PCRE2_CASELESS, 0, 0, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
186 { PCRE2_CASELESS, 0, 0, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
187 { PCRE2_CASELESS, 0, 0, 0, "a1", "Aa1" },
188 #ifndef NEVER_BACKSLASH_C
189 { M, A, 0, 0, "\\Ca", "cda" },
190 { CM, A, 0, 0, "\\Ca", "CDA" },
191 { M, A, 0, 0 | F_NOMATCH, "\\Cx", "cda" },
192 { CM, A, 0, 0 | F_NOMATCH, "\\Cx", "CDA" },
193 #endif /* !NEVER_BACKSLASH_C */
194 { CMUP, A, 0, 0, "\xf0\x90\x90\x80\xf0\x90\x90\xa8", "\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
195 { CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
196 { CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
197 { CMUP, A, 0, 0, "\xe1\xbd\xb8\xe1\xbf\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
198 { M, A, 0, 0, "[3-57-9]", "5" },
199
200 /* Assertions. */
201 { MU, A, 0, 0, "\\b[^A]", "A_B#" },
202 { M, A, 0, 0 | F_NOMATCH, "\\b\\W", "\n*" },
203 { MU, A, 0, 0, "\\B[^,]\\b[^s]\\b", "#X" },
204 { MP, A, 0, 0, "\\B", "_\xa1" },
205 { MP, A, 0, 0 | F_PROPERTY, "\\b_\\b[,A]\\B", "_," },
206 { MUP, A, 0, 0, "\\b", "\xe6\x92\xad!" },
207 { MUP, A, 0, 0, "\\B", "_\xc2\xa1\xc3\xa1\xc2\x85" },
208 { MUP, A, 0, 0, "\\b[^A]\\B[^c]\\b[^_]\\B", "_\xc3\xa1\xe2\x80\xa8" },
209 { MUP, A, 0, 0, "\\b\\w+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
210 { MU, A, 0, 0 | F_NOMATCH, "\\b.", "\xcd\xbe" },
211 { CMUP, A, 0, 0, "\\By", "\xf0\x90\x90\xa8y" },
212 { M, A, 0, 0 | F_NOMATCH, "\\R^", "\n" },
213 { M, A, 0, 1 | F_NOMATCH, "^", "\n" },
214 { 0, 0, 0, 0, "^ab", "ab" },
215 { 0, 0, 0, 0 | F_NOMATCH, "^ab", "aab" },
216 { M, PCRE2_NEWLINE_CRLF, 0, 0, "^a", "\r\raa\n\naa\r\naa" },
217 { MU, A, 0, 0, "^-", "\xe2\x80\xa8--\xc2\x85-\r\n-" },
218 { M, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--b--\x85--" },
219 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xe2\x80\xa8--" },
220 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xc2\x85--" },
221 { 0, 0, 0, 0, "ab$", "ab" },
222 { 0, 0, 0, 0 | F_NOMATCH, "ab$", "abab\n\n" },
223 { PCRE2_DOLLAR_ENDONLY, 0, 0, 0 | F_NOMATCH, "ab$", "abab\r\n" },
224 { M, PCRE2_NEWLINE_CRLF, 0, 0, "a$", "\r\raa\n\naa\r\naa" },
225 { M, PCRE2_NEWLINE_ANY, 0, 0, "a$", "aaa" },
226 { MU, PCRE2_NEWLINE_ANYCRLF, 0, 0, "#$", "#\xc2\x85###\r#" },
227 { MU, PCRE2_NEWLINE_ANY, 0, 0, "#$", "#\xe2\x80\xa9" },
228 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0 | F_NOMATCH, "^a", "aa\naa" },
229 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0, "^a", "aa\naa" },
230 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\naa" },
231 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\r\n" },
232 { U | PCRE2_DOLLAR_ENDONLY, PCRE2_NEWLINE_ANY, 0, 0 | F_PROPERTY, "\\p{Any}{2,}$", "aa\r\n" },
233 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0, "a$", "aa\naa" },
234 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa" },
235 { U, PCRE2_NEWLINE_CR, 0, 0, "a\\Z", "aaa\r" },
236 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa\n" },
237 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r" },
238 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\n" },
239 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r\n" },
240 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
241 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
242 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
243 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
244 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
245 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
246 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
247 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
248 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
249 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xc2\x85" },
250 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
251 { M, A, 0, 0, "\\Aa", "aaa" },
252 { M, A, 0, 1 | F_NOMATCH, "\\Aa", "aaa" },
253 { M, A, 0, 1, "\\Ga", "aaa" },
254 { M, A, 0, 1 | F_NOMATCH, "\\Ga", "aba" },
255 { M, A, 0, 0, "a\\z", "aaa" },
256 { M, A, 0, 0 | F_NOMATCH, "a\\z", "aab" },
257
258 /* Brackets and alternatives. */
259 { MU, A, 0, 0, "(ab|bb|cd)", "bacde" },
260 { MU, A, 0, 0, "(?:ab|a)(bc|c)", "ababc" },
261 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|efg))", "abac" },
262 { CMU, A, 0, 0, "((aB|(Cc))|(bB)|(?:cd|EFg))", "AcCe" },
263 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|ebg))", "acebebg" },
264 { MU, A, 0, 0, "(?:(a)|(?:b))(cc|(?:d|e))(a|b)k", "accabdbbccbk" },
265 { MU, A, 0, 0, "\xc7\x82|\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
266 { MU, A, 0, 0, "=\xc7\x82|#\xc6\x82", "\xf1\x83\x82\x82=\xc7\x82\xc7\x83" },
267 { MU, A, 0, 0, "\xc7\x82\xc7\x83|\xc6\x82\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
268 { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" },
269 { U, A, 0, 0, "\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80", "\xdf\xbf\xc2\x80\xe4\x84\x80" },
270 { U, A, 0, 0, "(?:\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80)#", "\xdf\xbf\xc2\x80#\xe4\x84\x80#" },
271 { CM, A, 0, 0, "ab|cd", "CD" },
272 { CM, A, 0, 0, "a1277|a1377|bX487", "bx487" },
273 { CM, A, 0, 0, "a1277|a1377|bx487", "bX487" },
274
275 /* Greedy and non-greedy ? operators. */
276 { MU, A, 0, 0, "(?:a)?a", "laab" },
277 { CMU, A, 0, 0, "(A)?A", "llaab" },
278 { MU, A, 0, 0, "(a)?\?a", "aab" }, /* ?? is the prefix of trygraphs in GCC. */
279 { MU, A, 0, 0, "(a)?a", "manm" },
280 { CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
281 { MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
282 { MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
283
284 /* Greedy and non-greedy + operators */
285 { MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
286 { MU, A, 0, 0, "(aa)+?aa", "aaaaaaa" },
287 { MU, A, 0, 0, "(?:aba|ab|a)+l", "ababamababal" },
288 { MU, A, 0, 0, "(?:aba|ab|a)+?l", "ababamababal" },
289 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+?|ss)+e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
290 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+|ss)+?e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
291 { MU, A, 0, 0, "(?:(b(c)+?)+)?\?(?:(bc)+|(cb)+)+(?:m)+", "bccbcccbcbccbcbPbccbcccbcbccbcbmmn" },
292
293 /* Greedy and non-greedy * operators */
294 { CMU, A, 0, 0, "(?:AA)*AB", "aaaaaaamaaaaaaab" },
295 { MU, A, 0, 0, "(?:aa)*?ab", "aaaaaaamaaaaaaab" },
296 { MU, A, 0, 0, "(aa|ab)*ab", "aaabaaab" },
297 { CMU, A, 0, 0, "(aa|Ab)*?aB", "aaabaaab" },
298 { MU, A, 0, 0, "(a|b)*(?:a)*(?:b)*m", "abbbaaababanabbbaaababamm" },
299 { MU, A, 0, 0, "(a|b)*?(?:a)*?(?:b)*?m", "abbbaaababanabbbaaababamm" },
300 { M, A, 0, 0, "a(a(\\1*)a|(b)b+){0}a", "aa" },
301 { M, A, 0, 0, "((?:a|)*){0}a", "a" },
302
303 /* Combining ? + * operators */
304 { MU, A, 0, 0, "((bm)+)?\?(?:a)*(bm)+n|((am)+?)?(?:a)+(am)*n", "bmbmabmamaaamambmaman" },
305 { MU, A, 0, 0, "(((ab)?cd)*ef)+g", "abcdcdefcdefefmabcdcdefcdefefgg" },
306 { MU, A, 0, 0, "(((ab)?\?cd)*?ef)+?g", "abcdcdefcdefefmabcdcdefcdefefgg" },
307 { MU, A, 0, 0, "(?:(ab)?c|(?:ab)+?d)*g", "ababcdccababddg" },
308 { MU, A, 0, 0, "(?:(?:ab)?\?c|(ab)+d)*?g", "ababcdccababddg" },
309
310 /* Single character iterators. */
311 { MU, A, 0, 0, "(a+aab)+aaaab", "aaaabcaaaabaabcaabcaaabaaaab" },
312 { MU, A, 0, 0, "(a*a*aab)+x", "aaaaabaabaaabmaabx" },
313 { MU, A, 0, 0, "(a*?(b|ab)a*?)+x", "aaaabcxbbaabaacbaaabaabax" },
314 { MU, A, 0, 0, "(a+(ab|ad)a+)+x", "aaabaaaadaabaaabaaaadaaax" },
315 { MU, A, 0, 0, "(a?(a)a?)+(aaa)", "abaaabaaaaaaaa" },
316 { MU, A, 0, 0, "(a?\?(a)a?\?)+(b)", "aaaacaaacaacacbaaab" },
317 { MU, A, 0, 0, "(a{0,4}(b))+d", "aaaaaabaabcaaaaabaaaaabd" },
318 { MU, A, 0, 0, "(a{0,4}?[^b])+d+(a{0,4}[^b])d+", "aaaaadaaaacaadddaaddd" },
319 { MU, A, 0, 0, "(ba{2})+c", "baabaaabacbaabaac" },
320 { MU, A, 0, 0, "(a*+bc++)+", "aaabbcaaabcccab" },
321 { MU, A, 0, 0, "(a?+[^b])+", "babaacacb" },
322 { MU, A, 0, 0, "(a{0,3}+b)(a{0,3}+b)(a{0,3}+)[^c]", "abaabaaacbaabaaaac" },
323 { CMU, A, 0, 0, "([a-c]+[d-f]+?)+?g", "aBdacdehAbDaFgA" },
324 { CMU, A, 0, 0, "[c-f]+k", "DemmFke" },
325 { MU, A, 0, 0, "([DGH]{0,4}M)+", "GGDGHDGMMHMDHHGHM" },
326 { MU, A, 0, 0, "([a-c]{4,}s)+", "abasabbasbbaabsbba" },
327 { CMU, A, 0, 0, "[ace]{3,7}", "AcbDAcEEcEd" },
328 { CMU, A, 0, 0, "[ace]{3,7}?", "AcbDAcEEcEd" },
329 { CMU, A, 0, 0, "[ace]{3,}", "AcbDAcEEcEd" },
330 { CMU, A, 0, 0, "[ace]{3,}?", "AcbDAcEEcEd" },
331 { MU, A, 0, 0, "[ckl]{2,}?g", "cdkkmlglglkcg" },
332 { CMU, A, 0, 0, "[ace]{5}?", "AcCebDAcEEcEd" },
333 { MU, A, 0, 0, "([AbC]{3,5}?d)+", "BACaAbbAEAACCbdCCbdCCAAbb" },
334 { MU, A, 0, 0, "([^ab]{0,}s){2}", "abaabcdsABamsDDs" },
335 { MU, A, 0, 0, "\\b\\w+\\B", "x,a_cd" },
336 { MUP, A, 0, 0, "\\b[^\xc2\xa1]+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
337 { CMU, A, 0, 0, "[^b]+(a*)([^c]?d{3})", "aaaaddd" },
338 { CMUP, A, 0, 0, "\xe1\xbd\xb8{2}", "\xe1\xbf\xb8#\xe1\xbf\xb8\xe1\xbd\xb8" },
339 { CMU, A, 0, 0, "[^\xf0\x90\x90\x80]{2,4}@", "\xf0\x90\x90\xa8\xf0\x90\x90\x80###\xf0\x90\x90\x80@@@" },
340 { CMU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
341 { MU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
342 { MU, A, 0, 0, "[^\xe1\xbd\xb8]{3,}?", "##\xe1\xbd\xb8#\xe1\xbd\xb8#\xc3\x89#\xe1\xbd\xb8" },
343 { MU, A, 0, 0, "\\d+123", "987654321,01234" },
344 { MU, A, 0, 0, "abcd*|\\w+xy", "aaaaa,abxyz" },
345 { MU, A, 0, 0, "(?:abc|((?:amc|\\b\\w*xy)))", "aaaaa,abxyz" },
346 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.abcd#."},
347 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.mbcd#."},
348 { MU, A, 0, 0, ".[ab]*.", "xx" },
349 { MU, A, 0, 0, ".[ab]*a", "xxa" },
350 { MU, A, 0, 0, ".[ab]?.", "xx" },
351
352 /* Bracket repeats with limit. */
353 { MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" },
354 { MU, A, 0, 0, "(?:ab|abab){1,5}M", "abababababababababababM" },
355 { MU, A, 0, 0, "(?>ab|abab){1,5}M", "abababababababababababM" },
356 { MU, A, 0, 0, "(?:ab|abab){1,5}?M", "abababababababababababM" },
357 { MU, A, 0, 0, "(?>ab|abab){1,5}?M", "abababababababababababM" },
358 { MU, A, 0, 0, "(?:(ab){1,4}?){1,3}?M", "abababababababababababababM" },
359 { MU, A, 0, 0, "(?:(ab){1,4}){1,3}abababababababababababM", "ababababababababababababM" },
360 { MU, A, 0, 0 | F_NOMATCH, "(?:(ab){1,4}){1,3}abababababababababababM", "abababababababababababM" },
361 { MU, A, 0, 0, "(ab){4,6}?M", "abababababababM" },
362
363 /* Basic character sets. */
364 { MU, A, 0, 0, "(?:\\s)+(?:\\S)+", "ab \t\xc3\xa9\xe6\x92\xad " },
365 { MU, A, 0, 0, "(\\w)*(k)(\\W)?\?", "abcdef abck11" },
366 { MU, A, 0, 0, "\\((\\d)+\\)\\D", "a() (83 (8)2 (9)ab" },
367 { MU, A, 0, 0, "\\w(\\s|(?:\\d)*,)+\\w\\wb", "a 5, 4,, bb 5, 4,, aab" },
368 { MU, A, 0, 0, "(\\v+)(\\V+)", "\x0e\xc2\x85\xe2\x80\xa8\x0b\x09\xe2\x80\xa9" },
369 { MU, A, 0, 0, "(\\h+)(\\H+)", "\xe2\x80\xa8\xe2\x80\x80\x20\xe2\x80\x8a\xe2\x81\x9f\xe3\x80\x80\x09\x20\xc2\xa0\x0a" },
370 { MU, A, 0, 0, "x[bcef]+", "xaxdxecbfg" },
371 { MU, A, 0, 0, "x[bcdghij]+", "xaxexfxdgbjk" },
372 { MU, A, 0, 0, "x[^befg]+", "xbxexacdhg" },
373 { MU, A, 0, 0, "x[^bcdl]+", "xlxbxaekmd" },
374 { MU, A, 0, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
375 { MU, A, 0, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },
376 { CMU, A, 0, 0, "\\x{e9}+", "#\xf0\x90\x90\xa8\xc3\xa8\xc3\xa9\xc3\x89\xc3\x88" },
377 { CMU, A, 0, 0, "[^\\x{e9}]+", "\xc3\xa9#\xf0\x90\x90\xa8\xc3\xa8\xc3\x88\xc3\x89" },
378 { MU, A, 0, 0, "[\\x02\\x7e]+", "\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x02\x7e\x7f" },
379 { MU, A, 0, 0, "[^\\x02\\x7e]+", "\x02\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x7f\x7e" },
380 { MU, A, 0, 0, "[\\x{81}-\\x{7fe}]+", "#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xc2\x81\xdf\xbe\xdf\xbf" },
381 { MU, A, 0, 0, "[^\\x{81}-\\x{7fe}]+", "\xc2\x81#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xdf\xbf\xdf\xbe" },
382 { MU, A, 0, 0, "[\\x{801}-\\x{fffe}]+", "#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xe0\xa0\x81\xef\xbf\xbe\xef\xbf\xbf" },
383 { MU, A, 0, 0, "[^\\x{801}-\\x{fffe}]+", "\xe0\xa0\x81#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xef\xbf\xbf\xef\xbf\xbe" },
384 { MU, A, 0, 0, "[\\x{10001}-\\x{10fffe}]+", "#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf0\x90\x80\x81\xf4\x8f\xbf\xbe\xf4\x8f\xbf\xbf" },
385 { MU, A, 0, 0, "[^\\x{10001}-\\x{10fffe}]+", "\xf0\x90\x80\x81#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbe" },
386 { CMU, A, 0, 0 | F_NOMATCH, "^[\\x{0100}-\\x{017f}]", " " },
387
388 /* Unicode properties. */
389 { MUP, A, 0, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },
390 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\x81\\p{Ll}]", "A_\xc3\x89\xc3\xa1" },
391 { MUP, A, 0, 0, "[\\Wd-h_x-z]+", "a\xc2\xa1#_yhzdxi" },
392 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}]", "abc" },
393 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}]", "abc" },
394 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}\xc3\xa1-\xc3\xa8]", "abc" },
395 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}\xc3\xa1-\xc3\xa8]", "abc" },
396 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
397 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
398 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
399 { MUP, A, 0, 0 | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
400 { MUP, A, 0, 0, "[b-\xc3\xa9\\s]", "a\xc\xe6\x92\xad" },
401 { CMUP, A, 0, 0, "[\xc2\x85-\xc2\x89\xc3\x89]", "\xc2\x84\xc3\xa9" },
402 { MUP, A, 0, 0, "[^b-d^&\\s]{3,}", "db^ !a\xe2\x80\xa8_ae" },
403 { MUP, A, 0, 0 | F_PROPERTY, "[^\\S\\P{Any}][\\sN]{1,3}[\\P{N}]{4}", "\xe2\x80\xaa\xa N\x9\xc3\xa9_0" },
404 { MU, A, 0, 0 | F_PROPERTY, "[^\\P{L}\x9!D-F\xa]{2,3}", "\x9,.DF\xa.CG\xc3\x81" },
405 { CMUP, A, 0, 0, "[\xc3\xa1-\xc3\xa9_\xe2\x80\xa0-\xe2\x80\xaf]{1,5}[^\xe2\x80\xa0-\xe2\x80\xaf]", "\xc2\xa1\xc3\x89\xc3\x89\xe2\x80\xaf_\xe2\x80\xa0" },
406 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa2-\xc3\xa6\xc3\x81-\xc3\x84\xe2\x80\xa8-\xe2\x80\xa9\xe6\x92\xad\\p{Zs}]{2,}", "\xe2\x80\xa7\xe2\x80\xa9\xe6\x92\xad \xe6\x92\xae" },
407 { MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
408 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
409
410 /* Possible empty brackets. */
411 { MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
412 { MU, A, 0, 0, "(|ab||bc|a)+d", "abcxabcabd" },
413 { MU, A, 0, 0, "(?:|ab||bc|a)*d", "abcxabcabd" },
414 { MU, A, 0, 0, "(|ab||bc|a)*d", "abcxabcabd" },
415 { MU, A, 0, 0, "(?:|ab||bc|a)+?d", "abcxabcabd" },
416 { MU, A, 0, 0, "(|ab||bc|a)+?d", "abcxabcabd" },
417 { MU, A, 0, 0, "(?:|ab||bc|a)*?d", "abcxabcabd" },
418 { MU, A, 0, 0, "(|ab||bc|a)*?d", "abcxabcabd" },
419 { MU, A, 0, 0, "(((a)*?|(?:ba)+)+?|(?:|c|ca)*)*m", "abaacaccabacabalabaacaccabacabamm" },
420 { MU, A, 0, 0, "(?:((?:a)*|(ba)+?)+|(|c|ca)*?)*?m", "abaacaccabacabalabaacaccabacabamm" },
421
422 /* Start offset. */
423 { MU, A, 0, 3, "(\\d|(?:\\w)*\\w)+", "0ac01Hb" },
424 { MU, A, 0, 4 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
425 { MU, A, 0, 2 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
426 { MU, A, 0, 1, "(\\w\\W\\w)+", "ab#d" },
427
428 /* Newline. */
429 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
430 { M, PCRE2_NEWLINE_CR, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
431 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{1,3}[^#]", "\r\n##...." },
432 { MU, A, PCRE2_NO_UTF_CHECK, 1, "^.a", "\n\x80\nxa" },
433 { MU, A, 0, 1, "^", "\r\n" },
434 { M, PCRE2_NEWLINE_CRLF, 0, 1 | F_NOMATCH, "^", "\r\n" },
435 { M, PCRE2_NEWLINE_CRLF, 0, 1, "^", "\r\na" },
436
437 /* Any character except newline or any newline. */
438 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
439 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".(.).", "a\xc3\xa1\r\n\n\r\r" },
440 { 0, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
441 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
442 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.).", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa9$de" },
443 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0 | F_NOMATCH, ".(.).", "\xe2\x80\xa8\nb\r" },
444 { 0, PCRE2_NEWLINE_ANY, 0, 0, "(.)(.)", "#\x85#\r#\n#\r\n#\x84" },
445 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.+)#", "#\rMn\xc2\x85#\n###" },
446 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\r" },
447 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\x85#\r\n#" },
448 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\xe2\x80\xa8#c" },
449 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\r\nc" },
450 { U, PCRE2_NEWLINE_CRLF | BSR(PCRE2_BSR_UNICODE), 0, 0, "(\\R.)+", "\xc2\x85\r\n#\xe2\x80\xa8\n\r\n\r" },
451 { MU, A, 0, 0 | F_NOMATCH, "\\R+", "ab" },
452 { MU, A, 0, 0, "\\R+", "ab\r\n\r" },
453 { MU, A, 0, 0, "\\R*", "ab\r\n\r" },
454 { MU, A, 0, 0, "\\R*", "\r\n\r" },
455 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\r\r" },
456 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\n\n\n\r\r\r" },
457 { MU, A, 0, 0, "\\R{2,}", "\r\nab\n\n\n\r\r\r" },
458 { MU, A, 0, 0, "\\R{0,3}", "\r\n\r\n\r\n\r\n\r\n" },
459 { MU, A, 0, 0 | F_NOMATCH, "\\R+\\R\\R", "\r\n\r\n" },
460 { MU, A, 0, 0, "\\R+\\R\\R", "\r\r\r" },
461 { MU, A, 0, 0, "\\R*\\R\\R", "\n\r" },
462 { MU, A, 0, 0 | F_NOMATCH, "\\R{2,4}\\R\\R", "\r\r\r" },
463 { MU, A, 0, 0, "\\R{2,4}\\R\\R", "\r\r\r\r" },
464
465 /* Atomic groups (no fallback from "next" direction). */
466 { MU, A, 0, 0 | F_NOMATCH, "(?>ab)ab", "bab" },
467 { MU, A, 0, 0 | F_NOMATCH, "(?>(ab))ab", "bab" },
468 { MU, A, 0, 0, "(?>ab)+abc(?>de)*def(?>gh)?ghe(?>ij)+?k(?>lm)*?n(?>op)?\?op",
469 "bababcdedefgheijijklmlmnop" },
470 { MU, A, 0, 0, "(?>a(b)+a|(ab)?\?(b))an", "abban" },
471 { MU, A, 0, 0, "(?>ab+a|(?:ab)?\?b)an", "abban" },
472 { MU, A, 0, 0, "((?>ab|ad|)*?)(?>|c)*abad", "abababcababad" },
473 { MU, A, 0, 0, "(?>(aa|b|)*+(?>(##)|###)*d|(aa)(?>(baa)?)m)", "aabaa#####da" },
474 { MU, A, 0, 0, "((?>a|)+?)b", "aaacaaab" },
475 { MU, A, 0, 0, "(?>x|)*$", "aaa" },
476 { MU, A, 0, 0, "(?>(x)|)*$", "aaa" },
477 { MU, A, 0, 0, "(?>x|())*$", "aaa" },
478 { MU, A, 0, 0, "((?>[cxy]a|[a-d])*?)b", "aaa+ aaab" },
479 { MU, A, 0, 0, "((?>[cxy](a)|[a-d])*?)b", "aaa+ aaab" },
480 { MU, A, 0, 0, "(?>((?>(a+))))bab|(?>((?>(a+))))bb", "aaaabaaabaabab" },
481 { MU, A, 0, 0, "(?>(?>a+))bab|(?>(?>a+))bb", "aaaabaaabaabab" },
482 { MU, A, 0, 0, "(?>(a)c|(?>(c)|(a))a)b*?bab", "aaaabaaabaabab" },
483 { MU, A, 0, 0, "(?>ac|(?>c|a)a)b*?bab", "aaaabaaabaabab" },
484 { MU, A, 0, 0, "(?>(b)b|(a))*b(?>(c)|d)?x", "ababcaaabdbx" },
485 { MU, A, 0, 0, "(?>bb|a)*b(?>c|d)?x", "ababcaaabdbx" },
486 { MU, A, 0, 0, "(?>(bb)|a)*b(?>c|(d))?x", "ababcaaabdbx" },
487 { MU, A, 0, 0, "(?>(a))*?(?>(a))+?(?>(a))??x", "aaaaaacccaaaaabax" },
488 { MU, A, 0, 0, "(?>a)*?(?>a)+?(?>a)??x", "aaaaaacccaaaaabax" },
489 { MU, A, 0, 0, "(?>(a)|)*?(?>(a)|)+?(?>(a)|)??x", "aaaaaacccaaaaabax" },
490 { MU, A, 0, 0, "(?>a|)*?(?>a|)+?(?>a|)??x", "aaaaaacccaaaaabax" },
491 { MU, A, 0, 0, "(?>a(?>(a{0,2}))*?b|aac)+b", "aaaaaaacaaaabaaaaacaaaabaacaaabb" },
492 { CM, A, 0, 0, "(?>((?>a{32}|b+|(a*))?(?>c+|d*)?\?)+e)+?f", "aaccebbdde bbdaaaccebbdee bbdaaaccebbdeef" },
493 { MU, A, 0, 0, "(?>(?:(?>aa|a||x)+?b|(?>aa|a||(x))+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
494 { MU, A, 0, 0, "(?>(?:(?>aa|a||(x))+?b|(?>aa|a||x)+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
495 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d" },
496 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d#\xcc\x8d\xcc\x8d" },
497 { MU, A, 0, 0 | F_PROPERTY, "\\X+..", "\xcc\x8d#\xcc\x8d#\xcc\x8d\xcc\x8d" },
498 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}", "abcdef" },
499 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}?", "abcdef" },
500 { MU, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d##" },
501 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d#\xcc\x8d##" },
502 { MU, A, 0, 0, "(c(ab)?+ab)+", "cabcababcab" },
503 { MU, A, 0, 0, "(?>(a+)b)+aabab", "aaaabaaabaabab" },
504
505 /* Possessive quantifiers. */
506 { MU, A, 0, 0, "(?:a|b)++m", "mababbaaxababbaam" },
507 { MU, A, 0, 0, "(?:a|b)*+m", "mababbaaxababbaam" },
508 { MU, A, 0, 0, "(?:a|b)*+m", "ababbaaxababbaam" },
509 { MU, A, 0, 0, "(a|b)++m", "mababbaaxababbaam" },
510 { MU, A, 0, 0, "(a|b)*+m", "mababbaaxababbaam" },
511 { MU, A, 0, 0, "(a|b)*+m", "ababbaaxababbaam" },
512 { MU, A, 0, 0, "(a|b(*ACCEPT))++m", "maaxab" },
513 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxm" },
514 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxbbm" },
515 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxm" },
516 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxbbm" },
517 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxm" },
518 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxbbm" },
519 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxm" },
520 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxbbm" },
521 { MU, A, 0, 0, "(?:a|(b))++m", "mababbaaxababbaam" },
522 { MU, A, 0, 0, "(?:(a)|b)*+m", "mababbaaxababbaam" },
523 { MU, A, 0, 0, "(?:(a)|(b))*+m", "ababbaaxababbaam" },
524 { MU, A, 0, 0, "(a|(b))++m", "mababbaaxababbaam" },
525 { MU, A, 0, 0, "((a)|b)*+m", "mababbaaxababbaam" },
526 { MU, A, 0, 0, "((a)|(b))*+m", "ababbaaxababbaam" },
527 { MU, A, 0, 0, "(a|(b)(*ACCEPT))++m", "maaxab" },
528 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxm" },
529 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxbbm" },
530 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxm" },
531 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxbbm" },
532 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxm" },
533 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxbbm" },
534 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxm" },
535 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxbbm" },
536 { MU, A, 0, 0 | F_NOMATCH, "(?>(b{2,4}))(?:(?:(aa|c))++m|(?:(aa|c))+n)", "bbaacaaccaaaacxbbbmbn" },
537 { MU, A, 0, 0, "((?:b)++a)+(cd)*+m", "bbababbacdcdnbbababbacdcdm" },
538 { MU, A, 0, 0, "((?:(b))++a)+((c)d)*+m", "bbababbacdcdnbbababbacdcdm" },
539 { MU, A, 0, 0, "(?:(?:(?:ab)*+k)++(?:n(?:cd)++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
540 { MU, A, 0, 0, "(?:((ab)*+(k))++(n(?:c(d))++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
541
542 /* Back references. */
543 { MU, A, 0, 0, "(aa|bb)(\\1*)(ll|)(\\3*)bbbbbbc", "aaaaaabbbbbbbbc" },
544 { CMU, A, 0, 0, "(aa|bb)(\\1+)(ll|)(\\3+)bbbbbbc", "bBbbBbCbBbbbBbbcbbBbbbBBbbC" },
545 { CM, A, 0, 0, "(a{2,4})\\1", "AaAaaAaA" },
546 { MU, A, 0, 0, "(aa|bb)(\\1?)aa(\\1?)(ll|)(\\4+)bbc", "aaaaaaaabbaabbbbaabbbbc" },
547 { MU, A, 0, 0, "(aa|bb)(\\1{0,5})(ll|)(\\3{0,5})cc", "bbxxbbbbxxaaaaaaaaaaaaaaaacc" },
548 { MU, A, 0, 0, "(aa|bb)(\\1{3,5})(ll|)(\\3{3,5})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
549 { MU, A, 0, 0, "(aa|bb)(\\1{3,})(ll|)(\\3{3,})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
550 { MU, A, 0, 0, "(\\w+)b(\\1+)c", "GabGaGaDbGaDGaDc" },
551 { MU, A, 0, 0, "(?:(aa)|b)\\1?b", "bb" },
552 { CMU, A, 0, 0, "(aa|bb)(\\1*?)aa(\\1+?)", "bBBbaaAAaaAAaa" },
553 { MU, A, 0, 0, "(aa|bb)(\\1*?)(dd|)cc(\\3+?)", "aaaaaccdd" },
554 { CMU, A, 0, 0, "(?:(aa|bb)(\\1?\?)cc){2}(\\1?\?)", "aAaABBbbAAaAcCaAcCaA" },
555 { MU, A, 0, 0, "(?:(aa|bb)(\\1{3,5}?)){2}(dd|)(\\3{3,5}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
556 { CM, A, 0, 0, "(?:(aa|bb)(\\1{3,}?)){2}(dd|)(\\3{3,}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
557 { MU, A, 0, 0, "(?:(aa|bb)(\\1{0,3}?)){2}(dd|)(\\3{0,3}?)b(\\1{0,3}?)(\\1{0,3})", "aaaaaaaaaaaaaaabaaaaa" },
558 { MU, A, 0, 0, "(a(?:\\1|)a){3}b", "aaaaaaaaaaab" },
559 { M, A, 0, 0, "(a?)b(\\1\\1*\\1+\\1?\\1*?\\1+?\\1??\\1*+\\1++\\1?+\\1{4}\\1{3,5}\\1{4,}\\1{0,5}\\1{3,5}?\\1{4,}?\\1{0,5}?\\1{3,5}+\\1{4,}+\\1{0,5}+#){2}d", "bb#b##d" },
560 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
561 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{0,2}", "wwwww." },
562 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwww" },
563 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwwww" },
564 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
565 { CMUP, A, 0, 0, "(\xf0\x90\x90\x80)\\1", "\xf0\x90\x90\xa8\xf0\x90\x90\xa8" },
566 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
567 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
568 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>*(?<A>aa)(?<A>bb)", "aabb" },
569 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{0,3}aaaaaa", "aabbaaaaaa" },
570 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{2,5}bb", "aabbaaaabb" },
571 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}m", "aaaaaaaabbbbaabbbbm" },
572 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
573 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
574 { MU | PCRE2_DUPNAMES, A, 0, 0, "\\k<A>*?(?<A>aa)(?<A>bb)", "aabb" },
575 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
576 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>*?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
577 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
578 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}M", "aaaaaaaabbbbaabbbbm" },
579 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{1,3}M", "aaaaaaaabbbbaabbbbm" },
580 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}?M", "aaaaaabbbbbbaabbbbbbbbbbm" },
581 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
582
583 /* Assertions. */
584 { MU, A, 0, 0, "(?=xx|yy|zz)\\w{4}", "abczzdefg" },
585 { MU, A, 0, 0, "(?=((\\w+)b){3}|ab)", "dbbbb ab" },
586 { MU, A, 0, 0, "(?!ab|bc|cd)[a-z]{2}", "Xabcdef" },
587 { MU, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
588 { MU, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
589 { M, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
590 { M, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
591 { MU, A, 0, 0, "(\\d{2})(?!\\w+c|(((\\w?)m){2}n)+|\\1)", "x5656" },
592 { MU, A, 0, 0, "((?=((\\d{2,6}\\w){2,}))\\w{5,20}K){2,}", "567v09708K12l00M00 567v09708K12l00M00K45K" },
593 { MU, A, 0, 0, "(?=(?:(?=\\S+a)\\w*(b)){3})\\w+\\d", "bba bbab nbbkba nbbkba0kl" },
594 { MU, A, 0, 0, "(?>a(?>(b+))a(?=(..)))*?k", "acabbcabbaabacabaabbakk" },
595 { MU, A, 0, 0, "((?(?=(a))a)+k)", "bbak" },
596 { MU, A, 0, 0, "((?(?=a)a)+k)", "bbak" },
597 { MU, A, 0, 0 | F_NOMATCH, "(?=(?>(a))m)amk", "a k" },
598 { MU, A, 0, 0 | F_NOMATCH, "(?!(?>(a))m)amk", "a k" },
599 { MU, A, 0, 0 | F_NOMATCH, "(?>(?=(a))am)amk", "a k" },
600 { MU, A, 0, 0, "(?=(?>a|(?=(?>(b+))a|c)[a-c]+)*?m)[a-cm]+k", "aaam bbam baaambaam abbabba baaambaamk" },
601 { MU, A, 0, 0, "(?> ?\?\\b(?(?=\\w{1,4}(a))m)\\w{0,8}bc){2,}?", "bca ssbc mabd ssbc mabc" },
602 { MU, A, 0, 0, "(?:(?=ab)?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
603 { MU, A, 0, 0, "(?:(?=a(b))?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
604 { MU, A, 0, 0, "(?:(?=.(.))??\\1.)+m", "aabbbcbacccanaabbbcbacccam" },
605 { MU, A, 0, 0, "(?:(?=.)??[a-c])+m", "abacdcbacacdcaccam" },
606 { MU, A, 0, 0, "((?!a)?(?!([^a]))?)+$", "acbab" },
607 { MU, A, 0, 0, "((?!a)?\?(?!([^a]))?\?)+$", "acbab" },
608 { MU, A, 0, 0, "a(?=(?C)\\B(?C`x`))b", "ab" },
609 { MU, A, 0, 0, "a(?!(?C)\\B(?C`x`))bb|ab", "abb" },
610 { MU, A, 0, 0, "a(?=\\b|(?C)\\B(?C`x`))b", "ab" },
611 { MU, A, 0, 0, "a(?!\\b|(?C)\\B(?C`x`))bb|ab", "abb" },
612 { MU, A, 0, 0, "c(?(?=(?C)\\B(?C`x`))ab|a)", "cab" },
613 { MU, A, 0, 0, "c(?(?!(?C)\\B(?C`x`))ab|a)", "cab" },
614 { MU, A, 0, 0, "c(?(?=\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
615 { MU, A, 0, 0, "c(?(?!\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
616 { MU, A, 0, 0, "a(?=)b", "ab" },
617 { MU, A, 0, 0 | F_NOMATCH, "a(?!)b", "ab" },
618
619 /* Not empty, ACCEPT, FAIL */
620 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*", "bcx" },
621 { MU, A, PCRE2_NOTEMPTY, 0, "a*", "bcaad" },
622 { MU, A, PCRE2_NOTEMPTY, 0, "a*?", "bcaad" },
623 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*", "bcaad" },
624 { MU, A, 0, 0, "a(*ACCEPT)b", "ab" },
625 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*(*ACCEPT)b", "bcx" },
626 { MU, A, PCRE2_NOTEMPTY, 0, "a*(*ACCEPT)b", "bcaad" },
627 { MU, A, PCRE2_NOTEMPTY, 0, "a*?(*ACCEPT)b", "bcaad" },
628 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?:z|a*(*ACCEPT)b)", "bcx" },
629 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*(*ACCEPT)b)", "bcaad" },
630 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*?(*ACCEPT)b)", "bcaad" },
631 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*(*ACCEPT)b", "bcx" },
632 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0 | F_NOMATCH, "a*(*ACCEPT)b", "" },
633 { MU, A, 0, 0, "((a(*ACCEPT)b))", "ab" },
634 { MU, A, 0, 0, "(a(*FAIL)a|a)", "aaa" },
635 { MU, A, 0, 0, "(?=ab(*ACCEPT)b)a", "ab" },
636 { MU, A, 0, 0, "(?=(?:x|ab(*ACCEPT)b))", "ab" },
637 { MU, A, 0, 0, "(?=(a(b(*ACCEPT)b)))a", "ab" },
638 { MU, A, PCRE2_NOTEMPTY, 0, "(?=a*(*ACCEPT))c", "c" },
639
640 /* Conditional blocks. */
641 { MU, A, 0, 0, "(?(?=(a))a|b)+k", "ababbalbbadabak" },
642 { MU, A, 0, 0, "(?(?!(b))a|b)+k", "ababbalbbadabak" },
643 { MU, A, 0, 0, "(?(?=a)a|b)+k", "ababbalbbadabak" },
644 { MU, A, 0, 0, "(?(?!b)a|b)+k", "ababbalbbadabak" },
645 { MU, A, 0, 0, "(?(?=(a))a*|b*)+k", "ababbalbbadabak" },
646 { MU, A, 0, 0, "(?(?!(b))a*|b*)+k", "ababbalbbadabak" },
647 { MU, A, 0, 0, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
648 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
649 { MU, A, 0, 0 | F_DIFF, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
650 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
651 { MU, A, 0, 0, "(?(?=a)a*|b*)+k", "ababbalbbadabak" },
652 { MU, A, 0, 0, "(?(?!b)a*|b*)+k", "ababbalbbadabak" },
653 { MU, A, 0, 0, "(?(?=a)ab)", "a" },
654 { MU, A, 0, 0, "(?(?<!b)c)", "b" },
655 { MU, A, 0, 0, "(?(DEFINE)a(b))", "a" },
656 { MU, A, 0, 0, "a(?(DEFINE)(?:b|(?:c?)+)*)", "a" },
657 { MU, A, 0, 0, "(?(?=.[a-c])[k-l]|[A-D])", "kdB" },
658 { MU, A, 0, 0, "(?(?!.{0,4}[cd])(aa|bb)|(cc|dd))+", "aabbccddaa" },
659 { MU, A, 0, 0, "(?(?=[^#@]*@)(aaab|aa|aba)|(aba|aab)){3,}", "aaabaaaba#aaabaaaba#aaabaaaba@" },
660 { MU, A, 0, 0, "((?=\\w{5})\\w(?(?=\\w*k)\\d|[a-f_])*\\w\\s)+", "mol m10kk m088k _f_a_ mbkkl" },
661 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cdcaa" },
662 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cbb" },
663 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))(aaaa|a?))+aak", "aaaaab aaaaak" },
664 { MU, A, 0, 0, "(?(?=a)(aaaa|a?))+aak", "aaaaab aaaaak" },
665 { MU, A, 0, 0, "(?(?!(b))(aaaa|a?))+aak", "aaaaab aaaaak" },
666 { MU, A, 0, 0, "(?(?!b)(aaaa|a?))+aak", "aaaaab aaaaak" },
667 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))a*)+aak", "aaaaab aaaaak" },
668 { MU, A, 0, 0, "(?(?=a)a*)+aak", "aaaaab aaaaak" },
669 { MU, A, 0, 0, "(?(?!(b))a*)+aak", "aaaaab aaaaak" },
670 { MU, A, 0, 0, "(?(?!b)a*)+aak", "aaaaab aaaaak" },
671 { MU, A, 0, 0, "(?(?=(?=(?!(x))a)aa)aaa|(?(?=(?!y)bb)bbb))*k", "abaabbaaabbbaaabbb abaabbaaabbbaaabbbk" },
672 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)*l", "bc ddd abccabccl" },
673 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+?dd", "bcabcacdb bdddd" },
674 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+l", "ababccddabdbccd abcccl" },
675 { MU, A, 0, 0, "((?:a|aa)(?(1)aaa))x", "aax" },
676 { MU, A, 0, 0, "(?(?!)a|b)", "ab" },
677 { MU, A, 0, 0, "(?(?!)a)", "ab" },
678 { MU, A, 0, 0 | F_NOMATCH, "(?(?!)a|b)", "ac" },
679
680 /* Set start of match. */
681 { MU, A, 0, 0, "(?:\\Ka)*aaaab", "aaaaaaaa aaaaaaabb" },
682 { MU, A, 0, 0, "(?>\\Ka\\Ka)*aaaab", "aaaaaaaa aaaaaaaaaabb" },
683 { MU, A, 0, 0, "a+\\K(?<=\\Gaa)a", "aaaaaa" },
684 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a\\K(*ACCEPT)b", "aa" },
685 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a\\K(*ACCEPT)b", "aa" },
686
687 /* First line. */
688 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_PROPERTY, "\\p{Any}a", "bb\naaa" },
689 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}a", "bb\r\naaa" },
690 { MU | PCRE2_FIRSTLINE, A, 0, 0, "(?<=a)", "a" },
691 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[^a][^b]", "ab" },
692 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "a", "\na" },
693 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[abc]", "\na" },
694 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^a", "\na" },
695 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^(?<=\n)", "\na" },
696 { MU | PCRE2_FIRSTLINE, A, 0, 0, "\xf0\x90\x90\x80", "\xf0\x90\x90\x80" },
697 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\xc2\x85#" },
698 { M | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\x85#" },
699 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "^#", "\xe2\x80\xa8#" },
700 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_PROPERTY, "\\p{Any}", "\r\na" },
701 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
702 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, "a", "\ra" },
703 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH, "ba", "bbb\r\nba" },
704 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}{4}|a", "\r\na" },
705 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 1, ".", "\r\n" },
706 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_LF, 0, 0 | F_NOMATCH, "ab.", "ab" },
707 { MU | PCRE2_FIRSTLINE, A, 0, 1 | F_NOMATCH, "^[a-d0-9]", "\nxx\nd" },
708 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_ANY, 0, 0, "....a", "012\n0a" },
709 { MU | PCRE2_FIRSTLINE, A, 0, 0, "[aC]", "a" },
710
711 /* Recurse. */
712 { MU, A, 0, 0, "(a)(?1)", "aa" },
713 { MU, A, 0, 0, "((a))(?1)", "aa" },
714 { MU, A, 0, 0, "(b|a)(?1)", "aa" },
715 { MU, A, 0, 0, "(b|(a))(?1)", "aa" },
716 { MU, A, 0, 0 | F_NOMATCH, "((a)(b)(?:a*))(?1)", "aba" },
717 { MU, A, 0, 0, "((a)(b)(?:a*))(?1)", "abab" },
718 { MU, A, 0, 0, "((a+)c(?2))b(?1)", "aacaabaca" },
719 { MU, A, 0, 0, "((?2)b|(a)){2}(?1)", "aabab" },
720 { MU, A, 0, 0, "(?1)(a)*+(?2)(b(?1))", "aababa" },
721 { MU, A, 0, 0, "(?1)(((a(*ACCEPT)))b)", "axaa" },
722 { MU, A, 0, 0, "(?1)(?(DEFINE) (((ac(*ACCEPT)))b) )", "akaac" },
723 { MU, A, 0, 0, "(a+)b(?1)b\\1", "abaaabaaaaa" },
724 { MU, A, 0, 0, "(?(DEFINE)(aa|a))(?1)ab", "aab" },
725 { MU, A, 0, 0, "(?(DEFINE)(a\\Kb))(?1)+ababc", "abababxabababc" },
726 { MU, A, 0, 0, "(a\\Kb)(?1)+ababc", "abababxababababc" },
727 { MU, A, 0, 0 | F_NOMATCH, "(a\\Kb)(?1)+ababc", "abababxababababxc" },
728 { MU, A, 0, 0, "b|<(?R)*>", "<<b>" },
729 { MU, A, 0, 0, "(a\\K){0}(?:(?1)b|ac)", "ac" },
730 { MU, A, 0, 0, "(?(DEFINE)(a(?2)|b)(b(?1)|(a)))(?:(?1)|(?2))m", "ababababnababababaam" },
731 { MU, A, 0, 0, "(a)((?(R)a|b))(?2)", "aabbabaa" },
732 { MU, A, 0, 0, "(a)((?(R2)a|b))(?2)", "aabbabaa" },
733 { MU, A, 0, 0, "(a)((?(R1)a|b))(?2)", "ababba" },
734 { MU, A, 0, 0, "(?(R0)aa|bb(?R))", "abba aabb bbaa" },
735 { MU, A, 0, 0, "((?(R)(?:aaaa|a)|(?:(aaaa)|(a)))+)(?1)$", "aaaaaaaaaa aaaa" },
736 { MU, A, 0, 0, "(?P<Name>a(?(R&Name)a|b))(?1)", "aab abb abaa" },
737 { MU, A, 0, 0, "((?(R)a|(?1)){3})", "XaaaaaaaaaX" },
738 { MU, A, 0, 0, "((?:(?(R)a|(?1))){3})", "XaaaaaaaaaX" },
739 { MU, A, 0, 0, "((?(R)a|(?1)){1,3})aaaaaa", "aaaaaaaaXaaaaaaaaa" },
740 { MU, A, 0, 0, "((?(R)a|(?1)){1,3}?)M", "aaaM" },
741 { MU, A, 0, 0, "((.)(?:.|\\2(?1))){0}#(?1)#", "#aabbccdde# #aabbccddee#" },
742 { MU, A, 0, 0, "((.)(?:\\2|\\2{4}b)){0}#(?:(?1))+#", "#aaaab# #aaaaab#" },
743
744 /* 16 bit specific tests. */
745 { CM, A, 0, 0 | F_FORCECONV, "\xc3\xa1", "\xc3\x81\xc3\xa1" },
746 { CM, A, 0, 0 | F_FORCECONV, "\xe1\xbd\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
747 { CM, A, 0, 0 | F_FORCECONV, "[\xc3\xa1]", "\xc3\x81\xc3\xa1" },
748 { CM, A, 0, 0 | F_FORCECONV, "[\xe1\xbd\xb8]", "\xe1\xbf\xb8\xe1\xbd\xb8" },
749 { CM, A, 0, 0 | F_FORCECONV, "[a-\xed\xb0\x80]", "A" },
750 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[a-\\x{dc00}]", "B" },
751 { CM, A, 0, 0 | F_NO8 | F_NOMATCH | F_FORCECONV, "[b-\\x{dc00}]", "a" },
752 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "\xed\xa0\x80\\x{d800}\xed\xb0\x80\\x{dc00}", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80" },
753 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\xed\xa0\x80\\x{d800}]{1,2}?[\xed\xb0\x80\\x{dc00}]{1,2}?#", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80#" },
754 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80\xed\xb0\x80#]{0,3}(?<=\xed\xb0\x80.)", "\xed\xa0\x80#\xed\xa0\x80##\xed\xb0\x80\xed\xa0\x80" },
755 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\x9f\xbf\xed\xa0\x83" },
756 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\xb4\x80\xed\xb3\xb0" },
757 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\x9f\xbf\xed\xa0\x83" },
758 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\xb4\x80\xed\xb3\xb0" },
759 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xef\xbf\xbf]+[\x1-\xed\xb0\x80]+#", "\xed\xa0\x85\xc3\x81\xed\xa0\x85\xef\xbf\xb0\xc2\x85\xed\xa9\x89#" },
760 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80][\xed\xb0\x80]{2,}", "\xed\xa0\x80\xed\xb0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80\xed\xb0\x80" },
761 { M, A, 0, 0 | F_FORCECONV, "[^\xed\xb0\x80]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
762 { M, A, 0, 0 | F_NO8 | F_FORCECONV, "[^\\x{dc00}]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
763 { CM, A, 0, 0 | F_FORCECONV, ".\\B.", "\xed\xa0\x80\xed\xb0\x80" },
764 { CM, A, 0, 0 | F_FORCECONV, "\\D+(?:\\d+|.)\\S+(?:\\s+|.)\\W+(?:\\w+|.)\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80" },
765 { CM, A, 0, 0 | F_FORCECONV, "\\d*\\s*\\w*\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80" },
766 { CM, A, 0, 0 | F_FORCECONV | F_NOMATCH, "\\d*?\\D*?\\s*?\\S*?\\w*?\\W*?##", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80#" },
767 { CM | PCRE2_EXTENDED, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80 \xed\xb0\x80 !", "\xed\xa0\x80\xed\xb0\x80!" },
768 { CM, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80+#[^#]+\xed\xa0\x80", "\xed\xa0\x80#a\xed\xa0\x80" },
769 { CM, A, 0, 0 | F_FORCECONV, "(\xed\xa0\x80+)#\\1", "\xed\xa0\x80\xed\xa0\x80#\xed\xa0\x80\xed\xa0\x80" },
770 { M, PCRE2_NEWLINE_ANY, 0, 0 | F_NO8 | F_FORCECONV, "^-", "a--\xe2\x80\xa8--" },
771 { 0, BSR(PCRE2_BSR_UNICODE), 0, 0 | F_NO8 | F_FORCECONV, "\\R", "ab\xe2\x80\xa8" },
772 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v", "ab\xe2\x80\xa9" },
773 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h", "ab\xe1\xa0\x8e" },
774 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v+?\\V+?#", "\xe2\x80\xa9\xe2\x80\xa9\xef\xbf\xbf\xef\xbf\xbf#" },
775 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h+?\\H+?#", "\xe1\xa0\x8e\xe1\xa0\x8e\xef\xbf\xbf\xef\xbf\xbf#" },
776
777 /* Partial matching. */
778 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab", "a" },
779 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab|a", "a" },
780 { MU, A, PCRE2_PARTIAL_HARD, 0, "ab|a", "a" },
781 { MU, A, PCRE2_PARTIAL_SOFT, 0, "\\b#", "a" },
782 { MU, A, PCRE2_PARTIAL_SOFT, 0, "(?<=a)b", "a" },
783 { MU, A, PCRE2_PARTIAL_SOFT, 0, "abc|(?<=xxa)bc", "xxab" },
784 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a\\B", "a" },
785 { MU, A, PCRE2_PARTIAL_HARD, 0, "a\\b", "a" },
786
787 /* (*MARK) verb. */
788 { MU, A, 0, 0, "a(*MARK:aa)a", "ababaa" },
789 { MU, A, 0, 0 | F_NOMATCH, "a(*:aa)a", "abab" },
790 { MU, A, 0, 0, "a(*:aa)(b(*:bb)b|bc)", "abc" },
791 { MU, A, 0, 0 | F_NOMATCH, "a(*:1)x|b(*:2)y", "abc" },
792 { MU, A, 0, 0, "(?>a(*:aa))b|ac", "ac" },
793 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))(?1)", "a" },
794 { MU, A, 0, 0 | F_NOMATCH, "(?(DEFINE)((a)(*:aa)))(?1)b", "aa" },
795 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))a(?1)b|aac", "aac" },
796 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
797 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b)+", "babba" },
798 { MU, A, 0, 0 | F_NOMATCH, "(a(*:aa)){0}(?:b(?1)b)+", "ba" },
799 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
800 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b)+", "babba" },
801 { MU, A, 0, 0 | F_NOMATCH, "(a\\K(*:aa)){0}(?:b(?1)b)+", "ba" },
802 { MU, A, 0, 0 | F_NOMATCH, "(*:mark)m", "a" },
803
804 /* (*COMMIT) verb. */
805 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)b", "ac" },
806 { MU, A, 0, 0, "aa(*COMMIT)b", "xaxaab" },
807 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)(*:msg)b|ac", "ac" },
808 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b)++", "abac" },
809 { MU, A, 0, 0 | F_NOMATCH, "((a)(*COMMIT)b)++", "abac" },
810 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*COMMIT)b)ab|ad", "ad" },
811
812 /* (*PRUNE) verb. */
813 { MU, A, 0, 0, "aa\\K(*PRUNE)b", "aaab" },
814 { MU, A, 0, 0, "aa(*PRUNE:bb)b|a", "aa" },
815 { MU, A, 0, 0, "(a)(a)(*PRUNE)b|(a)", "aa" },
816 { MU, A, 0, 0, "(a)(a)(a)(a)(a)(a)(a)(a)(*PRUNE)b|(a)", "aaaaaaaa" },
817 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|", "a" },
818 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|m", "a" },
819 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*PRUNE)b)ab|ad", "ad" },
820 { MU, A, 0, 0, "a(*COMMIT)(*PRUNE)d|bc", "abc" },
821 { MU, A, 0, 0, "(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
822 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
823 { MU, A, 0, 0, "(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
824 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
825 { MU, A, 0, 0, "(a(*COMMIT)b){0}a(?1)(*PRUNE)c|bc", "abc" },
826 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b){0}a(*COMMIT)(?1)(*PRUNE)c|bc", "abc" },
827 { MU, A, 0, 0, "(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
828 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
829 { MU, A, 0, 0, "((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
830 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
831 { MU, A, 0, 0, "(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
832 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
833 { MU, A, 0, 0, "(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
834 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
835 { MU, A, 0, 0, "(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
836 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
837 { MU, A, 0, 0, "(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
838 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
839 { MU, A, 0, 0, "(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
840 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
841 { MU, A, 0, 0, "(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
842 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
843
844 /* (*SKIP) verb. */
845 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*SKIP)b)ab|ad", "ad" },
846 { MU, A, 0, 0, "(\\w+(*SKIP)#)", "abcd,xyz#," },
847 { MU, A, 0, 0, "\\w+(*SKIP)#|mm", "abcd,xyz#," },
848 { MU, A, 0, 0 | F_NOMATCH, "b+(?<=(*SKIP)#c)|b+", "#bbb" },
849
850 /* (*THEN) verb. */
851 { MU, A, 0, 0, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcaabcaabcaabcnacm" },
852 { MU, A, 0, 0 | F_NOMATCH, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcm" },
853 { MU, A, 0, 0, "((?:a(*THEN)|aab)c|a+)+m", "aabcaabcnmaabcaabcm" },
854 { MU, A, 0, 0, "((?:a|aab)(*THEN)c|a+)+m", "aam" },
855 { MU, A, 0, 0, "((?:a(*COMMIT)|aab)(*THEN)c|a+)+m", "aam" },
856 { MU, A, 0, 0, "(?(?=a(*THEN)b)ab|ad)", "ad" },
857 { MU, A, 0, 0, "(?(?!a(*THEN)b)ad|add)", "add" },
858 { MU, A, 0, 0 | F_NOMATCH, "(?(?=a)a(*THEN)b|ad)", "ad" },
859 { MU, A, 0, 0, "(?!(?(?=a)ab|b(*THEN)d))bn|bnn", "bnn" },
860
861 /* Recurse and control verbs. */
862 { MU, A, 0, 0, "(a(*ACCEPT)b){0}a(?1)b", "aacaabb" },
863 { MU, A, 0, 0, "((a)\\2(*ACCEPT)b){0}a(?1)b", "aaacaaabb" },
864 { MU, A, 0, 0, "((ab|a(*ACCEPT)x)+|ababababax){0}_(?1)_", "_ababababax_ _ababababa_" },
865 { MU, A, 0, 0, "((.)(?:A(*ACCEPT)|(?1)\\2)){0}_(?1)_", "_bcdaAdcb_bcdaAdcb_" },
866 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_", "_ab_" },
867 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_|(_aa_)", "_aa_" },
868 { MU, A, 0, 0, "(a(*COMMIT)(?:b|bb)|c(*ACCEPT)d|dd){0}_(?1)+_", "_ax_ _cd_ _abbb_ _abcd_ _abbcdd_" },
869 { MU, A, 0, 0, "((.)(?:.|(*COMMIT)\\2{3}(*ACCEPT).*|.*)){0}_(?1){0,4}_", "_aaaabbbbccccddd_ _aaaabbbbccccdddd_" },
870
871 #ifdef SUPPORT_UNICODE
872 /* Script runs and iterations. */
873 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
874 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
875 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
876 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
877 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
878 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)++#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
879 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)?#", "!ab!abc!ab!ab#" },
880 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)??#", "!ab!abc!ab!ab#" },
881 #endif
882
883 /* Deep recursion. */
884 { MU, A, 0, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " },
885 { MU, A, 0, 0, "(?:((?:(?:(?:\\w*?)+)??|(?>\\w)?|\\w*+)*)+)+?\\s", "aa+ " },
886 { MU, A, 0, 0, "((a?)+)+b", "aaaaaaaaaaaa b" },
887
888 /* Deep recursion: Stack limit reached. */
889 { M, A, 0, 0 | F_NOMATCH, "a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaa" },
890 { M, A, 0, 0 | F_NOMATCH, "(?:a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
891 { M, A, 0, 0 | F_NOMATCH, "(?:a+?)+?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
892 { M, A, 0, 0 | F_NOMATCH, "(?:a*)*b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
893 { M, A, 0, 0 | F_NOMATCH, "(?:a*?)*?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
894
895 { 0, 0, 0, 0, NULL, NULL }
896 };
897
898 #ifdef SUPPORT_PCRE2_8
callback8(void * arg)899 static pcre2_jit_stack_8* callback8(void *arg)
900 {
901 return (pcre2_jit_stack_8 *)arg;
902 }
903 #endif
904
905 #ifdef SUPPORT_PCRE2_16
callback16(void * arg)906 static pcre2_jit_stack_16* callback16(void *arg)
907 {
908 return (pcre2_jit_stack_16 *)arg;
909 }
910 #endif
911
912 #ifdef SUPPORT_PCRE2_32
callback32(void * arg)913 static pcre2_jit_stack_32* callback32(void *arg)
914 {
915 return (pcre2_jit_stack_32 *)arg;
916 }
917 #endif
918
919 #ifdef SUPPORT_PCRE2_8
920 static pcre2_jit_stack_8 *stack8;
921
getstack8(void)922 static pcre2_jit_stack_8 *getstack8(void)
923 {
924 if (!stack8)
925 stack8 = pcre2_jit_stack_create_8(1, 1024 * 1024, NULL);
926 return stack8;
927 }
928
/* Attach the shared 8-bit JIT stack to a match context, or release it.

   mcontext != NULL: registers callback8 on mcontext, passing the shared
                     stack (created on demand by getstack8) as its argument.
   mcontext == NULL: frees the shared stack if one exists and resets the
                     pointer, so the next non-NULL call starts afresh. */
static void setstack8(pcre2_match_context_8 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_8(mcontext, callback8, getstack8());
		return;
	}

	if (stack8 != NULL)
		pcre2_jit_stack_free_8(stack8);
	stack8 = NULL;
}
940 #endif /* SUPPORT_PCRE2_8 */
941
942 #ifdef SUPPORT_PCRE2_16
943 static pcre2_jit_stack_16 *stack16;
944
getstack16(void)945 static pcre2_jit_stack_16 *getstack16(void)
946 {
947 if (!stack16)
948 stack16 = pcre2_jit_stack_create_16(1, 1024 * 1024, NULL);
949 return stack16;
950 }
951
/* Attach the shared 16-bit JIT stack to a match context, or release it.

   mcontext != NULL: registers callback16 on mcontext, passing the shared
                     stack (created on demand by getstack16) as its argument.
   mcontext == NULL: frees the shared stack if one exists and resets the
                     pointer, so the next non-NULL call starts afresh. */
static void setstack16(pcre2_match_context_16 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_16(mcontext, callback16, getstack16());
		return;
	}

	if (stack16 != NULL)
		pcre2_jit_stack_free_16(stack16);
	stack16 = NULL;
}
963 #endif /* SUPPORT_PCRE2_16 */
964
965 #ifdef SUPPORT_PCRE2_32
966 static pcre2_jit_stack_32 *stack32;
967
getstack32(void)968 static pcre2_jit_stack_32 *getstack32(void)
969 {
970 if (!stack32)
971 stack32 = pcre2_jit_stack_create_32(1, 1024 * 1024, NULL);
972 return stack32;
973 }
974
/* Attach the shared 32-bit JIT stack to a match context, or release it.

   mcontext != NULL: registers callback32 on mcontext, passing the shared
                     stack (created on demand by getstack32) as its argument.
   mcontext == NULL: frees the shared stack if one exists and resets the
                     pointer, so the next non-NULL call starts afresh. */
static void setstack32(pcre2_match_context_32 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_32(mcontext, callback32, getstack32());
		return;
	}

	if (stack32 != NULL)
		pcre2_jit_stack_free_32(stack32);
	stack32 = NULL;
}
986 #endif /* SUPPORT_PCRE2_32 */
987
988 #ifdef SUPPORT_PCRE2_16
989
/* Convert a NUL-terminated UTF-8 byte string to UTF-16 code units.

   input       NUL-terminated source bytes.
   output      receives the code units followed by a terminating 0.
   offsetmap   optional (may be NULL). Entry i receives the byte offset in
               input of the character that produced output unit i; both
               halves of a surrogate pair map to the start of their
               character. The entry after the last unit receives the byte
               offset at which conversion stopped.
   max_length  capacity of output in code units, terminator included.

   Returns the number of code units stored, not counting the terminator.
   Conversion stops at the end of input or when output is full; a character
   that needs a surrogate pair is never split across the end of the buffer.

   The input is decoded leniently, not validated (the test table contains
   encoded surrogates on purpose). Bytes that cannot start a sequence
   (0x80..0xBF, 0xF8..0xFF) and lead bytes whose sequence is cut short by
   the terminator are passed through as single code units, so every
   iteration consumes input and nothing beyond the terminator is read. */
static int convert_utf8_to_utf16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 iptr = input;
	PCRE2_UCHAR16 *optr = output;
	unsigned int c;
	int start;
	int len;
	int i;

	if (max_length == 0)
		return 0;

	while (*iptr && max_length > 1) {
		start = (int)(iptr - input);
		if (offsetmap)
			*offsetmap++ = start;

		/* Sequence length announced by the lead byte. */
		if (*iptr < 0xc0)
			len = 1;
		else if (!(*iptr & 0x20))
			len = 2;
		else if (!(*iptr & 0x10))
			len = 3;
		else if (!(*iptr & 0x08))
			len = 4;
		else
			len = 1; /* 0xF8..0xFF: not a lead byte, pass it through. */

		/* A NUL inside the sequence means the input ends early; fall back to
		   a single byte instead of stepping over the terminator. The scan
		   stops at the first NUL, so it never reads past it. */
		for (i = 1; i < len; i++) {
			if (iptr[i] == 0) {
				len = 1;
				break;
			}
		}

		switch (len) {
		case 2:
			c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
			break;
		case 3:
			c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
			break;
		case 4:
			c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
			break;
		default:
			c = iptr[0];
			break;
		}
		iptr += len;

		if (c < 65536) {
			*optr++ = (PCRE2_UCHAR16)c;
			max_length--;
		} else if (max_length <= 2) {
			/* No room for both surrogates plus the terminator. The offsetmap
			   entry written above already marks where conversion stopped. */
			*optr = '\0';
			return (int)(optr - output);
		} else {
			c -= 0x10000;
			*optr++ = (PCRE2_UCHAR16)(0xd800 | ((c >> 10) & 0x3ff));
			*optr++ = (PCRE2_UCHAR16)(0xdc00 | (c & 0x3ff));
			max_length -= 2;
			/* Give the low surrogate a defined mapping rather than leaving
			   whatever a previous conversion stored in that slot. */
			if (offsetmap)
				*offsetmap++ = start;
		}
	}

	if (offsetmap)
		*offsetmap = (int)(iptr - input);
	*optr = '\0';
	return (int)(optr - output);
}
1037
/* Widen a NUL-terminated 8-bit string into 16-bit code units, one unit per
   byte and without any decoding.

   max_length is the capacity of output in code units, terminator included.
   Returns the number of units stored, not counting the terminating 0.
   A max_length of 0 stores nothing at all and returns 0. */
static int copy_char8_to_char16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int max_length)
{
	int count = 0;

	if (max_length == 0)
		return 0;

	/* One slot is always kept back for the terminator. */
	for (; input[count] != 0 && count + 1 < max_length; count++)
		output[count] = input[count];

	output[count] = '\0';
	return count;
}
1053
/* Capacity, in 16-bit code units (terminator included), passed as max_length
   when a test string is converted or copied into regtest_buf16. */
#define REGTEST_MAX_LENGTH16 4096
/* Scratch buffer holding the 16-bit form of the current pattern. */
static PCRE2_UCHAR16 regtest_buf16[REGTEST_MAX_LENGTH16];
/* One int per buffer slot; presumably the offsetmap target when the subject
   string is converted -- that use is outside this view, confirm. */
static int regtest_offsetmap16[REGTEST_MAX_LENGTH16];
1057
1058 #endif /* SUPPORT_PCRE2_16 */
1059
1060 #ifdef SUPPORT_PCRE2_32
1061
/* Convert a NUL-terminated UTF-8 byte string to UTF-32 code units.

   input       NUL-terminated source bytes.
   output      receives one code unit per decoded character, followed by a
               terminating 0.
   offsetmap   optional (may be NULL). Entry i receives the byte offset in
               input of the character that produced output unit i; the entry
               after the last unit receives the byte offset at which
               conversion stopped.
   max_length  capacity of output in code units, terminator included.

   Returns the number of code units stored, not counting the terminator.
   Conversion stops at the end of input or when output is full.

   The input is decoded leniently, not validated (the test table contains
   encoded surrogates on purpose). Bytes that cannot start a sequence
   (0x80..0xBF, 0xF8..0xFF) and lead bytes whose sequence is cut short by
   the terminator are passed through as single code units, so every
   iteration consumes input and nothing beyond the terminator is read. */
static int convert_utf8_to_utf32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 iptr = input;
	PCRE2_UCHAR32 *optr = output;
	unsigned int c;
	int len;
	int i;

	if (max_length == 0)
		return 0;

	while (*iptr && max_length > 1) {
		if (offsetmap)
			*offsetmap++ = (int)(iptr - input);

		/* Sequence length announced by the lead byte. */
		if (*iptr < 0xc0)
			len = 1;
		else if (!(*iptr & 0x20))
			len = 2;
		else if (!(*iptr & 0x10))
			len = 3;
		else if (!(*iptr & 0x08))
			len = 4;
		else
			len = 1; /* 0xF8..0xFF: not a lead byte, pass it through. */

		/* A NUL inside the sequence means the input ends early; fall back to
		   a single byte instead of stepping over the terminator. The scan
		   stops at the first NUL, so it never reads past it. */
		for (i = 1; i < len; i++) {
			if (iptr[i] == 0) {
				len = 1;
				break;
			}
		}

		switch (len) {
		case 2:
			c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
			break;
		case 3:
			c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
			break;
		case 4:
			c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
			break;
		default:
			c = iptr[0];
			break;
		}
		iptr += len;

		*optr++ = c;
		max_length--;
	}

	if (offsetmap)
		*offsetmap = (int)(iptr - input);
	*optr = 0;
	return (int)(optr - output);
}
1097
/* Widen a NUL-terminated 8-bit string into 32-bit code units, one unit per
   byte and without any decoding.

   max_length is the capacity of output in code units, terminator included.
   Returns the number of units stored, not counting the terminating 0.
   A max_length of 0 stores nothing at all and returns 0. */
static int copy_char8_to_char32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int max_length)
{
	int count = 0;

	if (max_length == 0)
		return 0;

	/* One slot is always kept back for the terminator. */
	for (; input[count] != 0 && count + 1 < max_length; count++)
		output[count] = input[count];

	output[count] = '\0';
	return count;
}
1113
/* Capacity, in 32-bit code units (terminator included), passed as max_length
   when a test string is converted or copied into regtest_buf32. */
#define REGTEST_MAX_LENGTH32 4096
/* Scratch buffer holding the 32-bit form of the current pattern. */
static PCRE2_UCHAR32 regtest_buf32[REGTEST_MAX_LENGTH32];
/* One int per buffer slot; presumably the offsetmap target when the subject
   string is converted -- that use is outside this view, confirm. */
static int regtest_offsetmap32[REGTEST_MAX_LENGTH32];
1117
1118 #endif /* SUPPORT_PCRE2_32 */
1119
/* Report whether a NUL-terminated string is pure 7-bit ASCII.
   Returns 1 when every byte before the terminator is <= 127 (the empty
   string included), 0 as soon as a byte with the top bit set is found. */
static int check_ascii(const char *input)
{
	const unsigned char *cursor;

	for (cursor = (const unsigned char *)input; *cursor != '\0'; cursor++) {
		if (*cursor & 0x80)
			return 0;
	}
	return 1;
}
1130
/* Size passed to pcre2_match_data_create_*() for every match data block
   created by regression_tests(). */
#define OVECTOR_SIZE 15
1132
regression_tests(void)1133 static int regression_tests(void)
1134 {
1135 struct regression_test_case *current = regression_test_cases;
1136 int error;
1137 PCRE2_SIZE err_offs;
1138 int is_successful;
1139 int is_ascii;
1140 int total = 0;
1141 int successful = 0;
1142 int successful_row = 0;
1143 int counter = 0;
1144 int jit_compile_mode;
1145 int utf = 0;
1146 int disabled_options = 0;
1147 int i;
1148 #ifdef SUPPORT_PCRE2_8
1149 pcre2_code_8 *re8;
1150 pcre2_compile_context_8 *ccontext8;
1151 pcre2_match_data_8 *mdata8_1;
1152 pcre2_match_data_8 *mdata8_2;
1153 pcre2_match_context_8 *mcontext8;
1154 PCRE2_SIZE *ovector8_1 = NULL;
1155 PCRE2_SIZE *ovector8_2 = NULL;
1156 int return_value8[2];
1157 #endif
1158 #ifdef SUPPORT_PCRE2_16
1159 pcre2_code_16 *re16;
1160 pcre2_compile_context_16 *ccontext16;
1161 pcre2_match_data_16 *mdata16_1;
1162 pcre2_match_data_16 *mdata16_2;
1163 pcre2_match_context_16 *mcontext16;
1164 PCRE2_SIZE *ovector16_1 = NULL;
1165 PCRE2_SIZE *ovector16_2 = NULL;
1166 int return_value16[2];
1167 int length16;
1168 #endif
1169 #ifdef SUPPORT_PCRE2_32
1170 pcre2_code_32 *re32;
1171 pcre2_compile_context_32 *ccontext32;
1172 pcre2_match_data_32 *mdata32_1;
1173 pcre2_match_data_32 *mdata32_2;
1174 pcre2_match_context_32 *mcontext32;
1175 PCRE2_SIZE *ovector32_1 = NULL;
1176 PCRE2_SIZE *ovector32_2 = NULL;
1177 int return_value32[2];
1178 int length32;
1179 #endif
1180
1181 #if defined SUPPORT_PCRE2_8
1182 PCRE2_UCHAR8 cpu_info[128];
1183 #elif defined SUPPORT_PCRE2_16
1184 PCRE2_UCHAR16 cpu_info[128];
1185 #elif defined SUPPORT_PCRE2_32
1186 PCRE2_UCHAR32 cpu_info[128];
1187 #endif
1188 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1189 int return_value;
1190 #endif
1191
/* This test compares the behaviour of the interpreter and the JIT. Although
disabling utf or ucp may make tests fail, if the pcre2_match result is the
SAME, it is still considered successful from the pcre2_jit_test point of view. */
1195
1196 #if defined SUPPORT_PCRE2_8
1197 pcre2_config_8(PCRE2_CONFIG_JITTARGET, &cpu_info);
1198 #elif defined SUPPORT_PCRE2_16
1199 pcre2_config_16(PCRE2_CONFIG_JITTARGET, &cpu_info);
1200 #elif defined SUPPORT_PCRE2_32
1201 pcre2_config_32(PCRE2_CONFIG_JITTARGET, &cpu_info);
1202 #endif
1203
1204 printf("Running JIT regression tests\n");
1205 printf(" target CPU of SLJIT compiler: ");
1206 for (i = 0; cpu_info[i]; i++)
1207 printf("%c", (char)(cpu_info[i]));
1208 printf("\n");
1209
1210 #if defined SUPPORT_PCRE2_8
1211 pcre2_config_8(PCRE2_CONFIG_UNICODE, &utf);
1212 #elif defined SUPPORT_PCRE2_16
1213 pcre2_config_16(PCRE2_CONFIG_UNICODE, &utf);
1214 #elif defined SUPPORT_PCRE2_32
1215 pcre2_config_32(PCRE2_CONFIG_UNICODE, &utf);
1216 #endif
1217
1218 if (!utf)
1219 disabled_options |= PCRE2_UTF;
1220 #ifdef SUPPORT_PCRE2_8
1221 printf(" in 8 bit mode with UTF-8 %s:\n", utf ? "enabled" : "disabled");
1222 #endif
1223 #ifdef SUPPORT_PCRE2_16
1224 printf(" in 16 bit mode with UTF-16 %s:\n", utf ? "enabled" : "disabled");
1225 #endif
1226 #ifdef SUPPORT_PCRE2_32
1227 printf(" in 32 bit mode with UTF-32 %s:\n", utf ? "enabled" : "disabled");
1228 #endif
1229
1230 while (current->pattern) {
1231 /* printf("\nPattern: %s :\n", current->pattern); */
1232 total++;
1233 is_ascii = 0;
1234 if (!(current->start_offset & F_PROPERTY))
1235 is_ascii = check_ascii(current->pattern) && check_ascii(current->input);
1236
1237 if (current->match_options & PCRE2_PARTIAL_SOFT)
1238 jit_compile_mode = PCRE2_JIT_PARTIAL_SOFT;
1239 else if (current->match_options & PCRE2_PARTIAL_HARD)
1240 jit_compile_mode = PCRE2_JIT_PARTIAL_HARD;
1241 else
1242 jit_compile_mode = PCRE2_JIT_COMPLETE;
1243 error = 0;
1244 #ifdef SUPPORT_PCRE2_8
1245 re8 = NULL;
1246 ccontext8 = pcre2_compile_context_create_8(NULL);
1247 if (ccontext8) {
1248 if (GET_NEWLINE(current->newline))
1249 pcre2_set_newline_8(ccontext8, GET_NEWLINE(current->newline));
1250 if (GET_BSR(current->newline))
1251 pcre2_set_bsr_8(ccontext8, GET_BSR(current->newline));
1252
1253 if (!(current->start_offset & F_NO8)) {
1254 re8 = pcre2_compile_8((PCRE2_SPTR8)current->pattern, PCRE2_ZERO_TERMINATED,
1255 current->compile_options & ~disabled_options,
1256 &error, &err_offs, ccontext8);
1257
1258 if (!re8 && (utf || is_ascii))
1259 printf("\n8 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1260 }
1261 pcre2_compile_context_free_8(ccontext8);
1262 }
1263 else
1264 printf("\n8 bit: Cannot allocate compile context\n");
1265 #endif
1266 #ifdef SUPPORT_PCRE2_16
1267 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1268 convert_utf8_to_utf16((PCRE2_SPTR8)current->pattern, regtest_buf16, NULL, REGTEST_MAX_LENGTH16);
1269 else
1270 copy_char8_to_char16((PCRE2_SPTR8)current->pattern, regtest_buf16, REGTEST_MAX_LENGTH16);
1271
1272 re16 = NULL;
1273 ccontext16 = pcre2_compile_context_create_16(NULL);
1274 if (ccontext16) {
1275 if (GET_NEWLINE(current->newline))
1276 pcre2_set_newline_16(ccontext16, GET_NEWLINE(current->newline));
1277 if (GET_BSR(current->newline))
1278 pcre2_set_bsr_16(ccontext16, GET_BSR(current->newline));
1279
1280 if (!(current->start_offset & F_NO16)) {
1281 re16 = pcre2_compile_16(regtest_buf16, PCRE2_ZERO_TERMINATED,
1282 current->compile_options & ~disabled_options,
1283 &error, &err_offs, ccontext16);
1284
1285 if (!re16 && (utf || is_ascii))
1286 printf("\n16 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1287 }
1288 pcre2_compile_context_free_16(ccontext16);
1289 }
1290 else
1291 printf("\n16 bit: Cannot allocate compile context\n");
1292 #endif
1293 #ifdef SUPPORT_PCRE2_32
1294 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1295 convert_utf8_to_utf32((PCRE2_SPTR8)current->pattern, regtest_buf32, NULL, REGTEST_MAX_LENGTH32);
1296 else
1297 copy_char8_to_char32((PCRE2_SPTR8)current->pattern, regtest_buf32, REGTEST_MAX_LENGTH32);
1298
1299 re32 = NULL;
1300 ccontext32 = pcre2_compile_context_create_32(NULL);
1301 if (ccontext32) {
1302 if (GET_NEWLINE(current->newline))
1303 pcre2_set_newline_32(ccontext32, GET_NEWLINE(current->newline));
1304 if (GET_BSR(current->newline))
1305 pcre2_set_bsr_32(ccontext32, GET_BSR(current->newline));
1306
1307 if (!(current->start_offset & F_NO32)) {
1308 re32 = pcre2_compile_32(regtest_buf32, PCRE2_ZERO_TERMINATED,
1309 current->compile_options & ~disabled_options,
1310 &error, &err_offs, ccontext32);
1311
1312 if (!re32 && (utf || is_ascii))
1313 printf("\n32 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1314 }
1315 pcre2_compile_context_free_32(ccontext32);
1316 }
1317 else
1318 printf("\n32 bit: Cannot allocate compile context\n");
1319 #endif
1320
1321 counter++;
1322 if ((counter & 0x3) != 0) {
1323 #ifdef SUPPORT_PCRE2_8
1324 setstack8(NULL);
1325 #endif
1326 #ifdef SUPPORT_PCRE2_16
1327 setstack16(NULL);
1328 #endif
1329 #ifdef SUPPORT_PCRE2_32
1330 setstack32(NULL);
1331 #endif
1332 }
1333
1334 #ifdef SUPPORT_PCRE2_8
1335 return_value8[0] = -1000;
1336 return_value8[1] = -1000;
1337 mdata8_1 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1338 mdata8_2 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1339 mcontext8 = pcre2_match_context_create_8(NULL);
1340 if (!mdata8_1 || !mdata8_2 || !mcontext8) {
1341 printf("\n8 bit: Cannot allocate match data\n");
1342 pcre2_match_data_free_8(mdata8_1);
1343 pcre2_match_data_free_8(mdata8_2);
1344 pcre2_match_context_free_8(mcontext8);
1345 pcre2_code_free_8(re8);
1346 re8 = NULL;
1347 } else {
1348 ovector8_1 = pcre2_get_ovector_pointer_8(mdata8_1);
1349 ovector8_2 = pcre2_get_ovector_pointer_8(mdata8_2);
1350 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1351 ovector8_1[i] = -2;
1352 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1353 ovector8_2[i] = -2;
1354 }
1355 if (re8) {
1356 return_value8[1] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1357 current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, NULL);
1358
1359 if (pcre2_jit_compile_8(re8, jit_compile_mode)) {
1360 printf("\n8 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1361 } else if ((counter & 0x1) != 0) {
1362 setstack8(mcontext8);
1363 return_value8[0] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1364 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1365 } else {
1366 pcre2_jit_stack_assign_8(mcontext8, NULL, getstack8());
1367 return_value8[0] = pcre2_jit_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1368 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1369 }
1370 }
1371 #endif
1372
1373 #ifdef SUPPORT_PCRE2_16
1374 return_value16[0] = -1000;
1375 return_value16[1] = -1000;
1376 mdata16_1 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1377 mdata16_2 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1378 mcontext16 = pcre2_match_context_create_16(NULL);
1379 if (!mdata16_1 || !mdata16_2 || !mcontext16) {
1380 printf("\n16 bit: Cannot allocate match data\n");
1381 pcre2_match_data_free_16(mdata16_1);
1382 pcre2_match_data_free_16(mdata16_2);
1383 pcre2_match_context_free_16(mcontext16);
1384 pcre2_code_free_16(re16);
1385 re16 = NULL;
1386 } else {
1387 ovector16_1 = pcre2_get_ovector_pointer_16(mdata16_1);
1388 ovector16_2 = pcre2_get_ovector_pointer_16(mdata16_2);
1389 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1390 ovector16_1[i] = -2;
1391 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1392 ovector16_2[i] = -2;
1393 }
1394 if (re16) {
1395 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1396 length16 = convert_utf8_to_utf16((PCRE2_SPTR8)current->input, regtest_buf16, regtest_offsetmap16, REGTEST_MAX_LENGTH16);
1397 else
1398 length16 = copy_char8_to_char16((PCRE2_SPTR8)current->input, regtest_buf16, REGTEST_MAX_LENGTH16);
1399
1400 return_value16[1] = pcre2_match_16(re16, regtest_buf16, length16,
1401 current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, NULL);
1402
1403 if (pcre2_jit_compile_16(re16, jit_compile_mode)) {
1404 printf("\n16 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1405 } else if ((counter & 0x1) != 0) {
1406 setstack16(mcontext16);
1407 return_value16[0] = pcre2_match_16(re16, regtest_buf16, length16,
1408 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1409 } else {
1410 pcre2_jit_stack_assign_16(mcontext16, NULL, getstack16());
1411 return_value16[0] = pcre2_jit_match_16(re16, regtest_buf16, length16,
1412 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1413 }
1414 }
1415 #endif
1416
1417 #ifdef SUPPORT_PCRE2_32
1418 return_value32[0] = -1000;
1419 return_value32[1] = -1000;
1420 mdata32_1 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1421 mdata32_2 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1422 mcontext32 = pcre2_match_context_create_32(NULL);
1423 if (!mdata32_1 || !mdata32_2 || !mcontext32) {
1424 printf("\n32 bit: Cannot allocate match data\n");
1425 pcre2_match_data_free_32(mdata32_1);
1426 pcre2_match_data_free_32(mdata32_2);
1427 pcre2_match_context_free_32(mcontext32);
1428 pcre2_code_free_32(re32);
1429 re32 = NULL;
1430 } else {
1431 ovector32_1 = pcre2_get_ovector_pointer_32(mdata32_1);
1432 ovector32_2 = pcre2_get_ovector_pointer_32(mdata32_2);
1433 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1434 ovector32_1[i] = -2;
1435 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1436 ovector32_2[i] = -2;
1437 }
1438 if (re32) {
1439 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1440 length32 = convert_utf8_to_utf32((PCRE2_SPTR8)current->input, regtest_buf32, regtest_offsetmap32, REGTEST_MAX_LENGTH32);
1441 else
1442 length32 = copy_char8_to_char32((PCRE2_SPTR8)current->input, regtest_buf32, REGTEST_MAX_LENGTH32);
1443
1444 return_value32[1] = pcre2_match_32(re32, regtest_buf32, length32,
1445 current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, NULL);
1446
1447 if (pcre2_jit_compile_32(re32, jit_compile_mode)) {
1448 printf("\n32 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1449 } else if ((counter & 0x1) != 0) {
1450 setstack32(mcontext32);
1451 return_value32[0] = pcre2_match_32(re32, regtest_buf32, length32,
1452 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1453 } else {
1454 pcre2_jit_stack_assign_32(mcontext32, NULL, getstack32());
1455 return_value32[0] = pcre2_jit_match_32(re32, regtest_buf32, length32,
1456 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1457 }
1458 }
1459 #endif
1460
1461 /* printf("[%d-%d-%d|%d-%d|%d-%d|%d-%d]%s",
1462 return_value8[0], return_value16[0], return_value32[0],
1463 (int)ovector8_1[0], (int)ovector8_1[1],
1464 (int)ovector16_1[0], (int)ovector16_1[1],
1465 (int)ovector32_1[0], (int)ovector32_1[1],
1466 (current->compile_options & PCRE2_CASELESS) ? "C" : ""); */
1467
1468 /* If F_DIFF is set, just run the test, but do not compare the results.
1469 Segfaults can still be captured. */
1470
1471 is_successful = 1;
1472 if (!(current->start_offset & F_DIFF)) {
1473 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1474 if (!(current->start_offset & F_FORCECONV)) {
1475
1476 /* All results must be the same. */
1477 #ifdef SUPPORT_PCRE2_8
1478 if ((return_value = return_value8[0]) != return_value8[1]) {
1479 printf("\n8 bit: Return value differs(J8:%d,I8:%d): [%d] '%s' @ '%s'\n",
1480 return_value8[0], return_value8[1], total, current->pattern, current->input);
1481 is_successful = 0;
1482 } else
1483 #endif
1484 #ifdef SUPPORT_PCRE2_16
1485 if ((return_value = return_value16[0]) != return_value16[1]) {
1486 printf("\n16 bit: Return value differs(J16:%d,I16:%d): [%d] '%s' @ '%s'\n",
1487 return_value16[0], return_value16[1], total, current->pattern, current->input);
1488 is_successful = 0;
1489 } else
1490 #endif
1491 #ifdef SUPPORT_PCRE2_32
1492 if ((return_value = return_value32[0]) != return_value32[1]) {
1493 printf("\n32 bit: Return value differs(J32:%d,I32:%d): [%d] '%s' @ '%s'\n",
1494 return_value32[0], return_value32[1], total, current->pattern, current->input);
1495 is_successful = 0;
1496 } else
1497 #endif
1498 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1499 if (return_value8[0] != return_value16[0]) {
1500 printf("\n8 and 16 bit: Return value differs(J8:%d,J16:%d): [%d] '%s' @ '%s'\n",
1501 return_value8[0], return_value16[0],
1502 total, current->pattern, current->input);
1503 is_successful = 0;
1504 } else
1505 #endif
1506 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1507 if (return_value8[0] != return_value32[0]) {
1508 printf("\n8 and 32 bit: Return value differs(J8:%d,J32:%d): [%d] '%s' @ '%s'\n",
1509 return_value8[0], return_value32[0],
1510 total, current->pattern, current->input);
1511 is_successful = 0;
1512 } else
1513 #endif
1514 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1515 if (return_value16[0] != return_value32[0]) {
1516 printf("\n16 and 32 bit: Return value differs(J16:%d,J32:%d): [%d] '%s' @ '%s'\n",
1517 return_value16[0], return_value32[0],
1518 total, current->pattern, current->input);
1519 is_successful = 0;
1520 } else
1521 #endif
1522 if (return_value >= 0 || return_value == PCRE2_ERROR_PARTIAL) {
1523 if (return_value == PCRE2_ERROR_PARTIAL) {
1524 return_value = 2;
1525 } else {
1526 return_value *= 2;
1527 }
1528 #ifdef SUPPORT_PCRE2_8
1529 return_value8[0] = return_value;
1530 #endif
1531 #ifdef SUPPORT_PCRE2_16
1532 return_value16[0] = return_value;
1533 #endif
1534 #ifdef SUPPORT_PCRE2_32
1535 return_value32[0] = return_value;
1536 #endif
1537 /* Transform back the results. */
1538 if (current->compile_options & PCRE2_UTF) {
1539 #ifdef SUPPORT_PCRE2_16
1540 for (i = 0; i < return_value; ++i) {
1541 if (ovector16_1[i] != PCRE2_UNSET)
1542 ovector16_1[i] = regtest_offsetmap16[ovector16_1[i]];
1543 if (ovector16_2[i] != PCRE2_UNSET)
1544 ovector16_2[i] = regtest_offsetmap16[ovector16_2[i]];
1545 }
1546 #endif
1547 #ifdef SUPPORT_PCRE2_32
1548 for (i = 0; i < return_value; ++i) {
1549 if (ovector32_1[i] != PCRE2_UNSET)
1550 ovector32_1[i] = regtest_offsetmap32[ovector32_1[i]];
1551 if (ovector32_2[i] != PCRE2_UNSET)
1552 ovector32_2[i] = regtest_offsetmap32[ovector32_2[i]];
1553 }
1554 #endif
1555 }
1556
1557 for (i = 0; i < return_value; ++i) {
1558 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1559 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector16_1[i] || ovector8_1[i] != ovector16_2[i]) {
1560 printf("\n8 and 16 bit: Ovector[%d] value differs(J8:%d,I8:%d,J16:%d,I16:%d): [%d] '%s' @ '%s' \n",
1561 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector16_1[i], (int)ovector16_2[i],
1562 total, current->pattern, current->input);
1563 is_successful = 0;
1564 }
1565 #endif
1566 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1567 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector32_1[i] || ovector8_1[i] != ovector32_2[i]) {
1568 printf("\n8 and 32 bit: Ovector[%d] value differs(J8:%d,I8:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1569 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1570 total, current->pattern, current->input);
1571 is_successful = 0;
1572 }
1573 #endif
1574 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1575 if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) {
1576 printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1577 i, (int)ovector16_1[i], (int)ovector16_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1578 total, current->pattern, current->input);
1579 is_successful = 0;
1580 }
1581 #endif
1582 }
1583 }
1584 } else
1585 #endif /* more than one of SUPPORT_PCRE2_8, SUPPORT_PCRE2_16 and SUPPORT_PCRE2_32 */
1586 {
1587 #ifdef SUPPORT_PCRE2_8
1588 if (return_value8[0] != return_value8[1]) {
1589 printf("\n8 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1590 return_value8[0], return_value8[1], total, current->pattern, current->input);
1591 is_successful = 0;
1592 } else if (return_value8[0] >= 0 || return_value8[0] == PCRE2_ERROR_PARTIAL) {
1593 if (return_value8[0] == PCRE2_ERROR_PARTIAL)
1594 return_value8[0] = 2;
1595 else
1596 return_value8[0] *= 2;
1597
1598 for (i = 0; i < return_value8[0]; ++i)
1599 if (ovector8_1[i] != ovector8_2[i]) {
1600 printf("\n8 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1601 i, (int)ovector8_1[i], (int)ovector8_2[i], total, current->pattern, current->input);
1602 is_successful = 0;
1603 }
1604 }
1605 #endif
1606
1607 #ifdef SUPPORT_PCRE2_16
1608 if (return_value16[0] != return_value16[1]) {
1609 printf("\n16 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1610 return_value16[0], return_value16[1], total, current->pattern, current->input);
1611 is_successful = 0;
1612 } else if (return_value16[0] >= 0 || return_value16[0] == PCRE2_ERROR_PARTIAL) {
1613 if (return_value16[0] == PCRE2_ERROR_PARTIAL)
1614 return_value16[0] = 2;
1615 else
1616 return_value16[0] *= 2;
1617
1618 for (i = 0; i < return_value16[0]; ++i)
1619 if (ovector16_1[i] != ovector16_2[i]) {
1620 printf("\n16 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1621 i, (int)ovector16_1[i], (int)ovector16_2[i], total, current->pattern, current->input);
1622 is_successful = 0;
1623 }
1624 }
1625 #endif
1626
1627 #ifdef SUPPORT_PCRE2_32
1628 if (return_value32[0] != return_value32[1]) {
1629 printf("\n32 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1630 return_value32[0], return_value32[1], total, current->pattern, current->input);
1631 is_successful = 0;
1632 } else if (return_value32[0] >= 0 || return_value32[0] == PCRE2_ERROR_PARTIAL) {
1633 if (return_value32[0] == PCRE2_ERROR_PARTIAL)
1634 return_value32[0] = 2;
1635 else
1636 return_value32[0] *= 2;
1637
1638 for (i = 0; i < return_value32[0]; ++i)
1639 if (ovector32_1[i] != ovector32_2[i]) {
1640 printf("\n32 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1641 i, (int)ovector32_1[i], (int)ovector32_2[i], total, current->pattern, current->input);
1642 is_successful = 0;
1643 }
1644 }
1645 #endif
1646 }
1647 }
1648
1649 if (is_successful) {
1650 #ifdef SUPPORT_PCRE2_8
1651 if (!(current->start_offset & F_NO8) && (utf || is_ascii)) {
1652 if (return_value8[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1653 printf("8 bit: Test should match: [%d] '%s' @ '%s'\n",
1654 total, current->pattern, current->input);
1655 is_successful = 0;
1656 }
1657
1658 if (return_value8[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1659 printf("8 bit: Test should not match: [%d] '%s' @ '%s'\n",
1660 total, current->pattern, current->input);
1661 is_successful = 0;
1662 }
1663 }
1664 #endif
1665 #ifdef SUPPORT_PCRE2_16
1666 if (!(current->start_offset & F_NO16) && (utf || is_ascii)) {
1667 if (return_value16[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1668 printf("16 bit: Test should match: [%d] '%s' @ '%s'\n",
1669 total, current->pattern, current->input);
1670 is_successful = 0;
1671 }
1672
1673 if (return_value16[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1674 printf("16 bit: Test should not match: [%d] '%s' @ '%s'\n",
1675 total, current->pattern, current->input);
1676 is_successful = 0;
1677 }
1678 }
1679 #endif
1680 #ifdef SUPPORT_PCRE2_32
1681 if (!(current->start_offset & F_NO32) && (utf || is_ascii)) {
1682 if (return_value32[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1683 printf("32 bit: Test should match: [%d] '%s' @ '%s'\n",
1684 total, current->pattern, current->input);
1685 is_successful = 0;
1686 }
1687
1688 if (return_value32[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1689 printf("32 bit: Test should not match: [%d] '%s' @ '%s'\n",
1690 total, current->pattern, current->input);
1691 is_successful = 0;
1692 }
1693 }
1694 #endif
1695 }
1696
1697 if (is_successful) {
1698 #ifdef SUPPORT_PCRE2_8
1699 if (re8 && !(current->start_offset & F_NO8) && pcre2_get_mark_8(mdata8_1) != pcre2_get_mark_8(mdata8_2)) {
1700 printf("8 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1701 total, current->pattern, current->input);
1702 is_successful = 0;
1703 }
1704 #endif
1705 #ifdef SUPPORT_PCRE2_16
1706 if (re16 && !(current->start_offset & F_NO16) && pcre2_get_mark_16(mdata16_1) != pcre2_get_mark_16(mdata16_2)) {
1707 printf("16 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1708 total, current->pattern, current->input);
1709 is_successful = 0;
1710 }
1711 #endif
1712 #ifdef SUPPORT_PCRE2_32
1713 if (re32 && !(current->start_offset & F_NO32) && pcre2_get_mark_32(mdata32_1) != pcre2_get_mark_32(mdata32_2)) {
1714 printf("32 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1715 total, current->pattern, current->input);
1716 is_successful = 0;
1717 }
1718 #endif
1719 }
1720
1721 #ifdef SUPPORT_PCRE2_8
1722 pcre2_code_free_8(re8);
1723 pcre2_match_data_free_8(mdata8_1);
1724 pcre2_match_data_free_8(mdata8_2);
1725 pcre2_match_context_free_8(mcontext8);
1726 #endif
1727 #ifdef SUPPORT_PCRE2_16
1728 pcre2_code_free_16(re16);
1729 pcre2_match_data_free_16(mdata16_1);
1730 pcre2_match_data_free_16(mdata16_2);
1731 pcre2_match_context_free_16(mcontext16);
1732 #endif
1733 #ifdef SUPPORT_PCRE2_32
1734 pcre2_code_free_32(re32);
1735 pcre2_match_data_free_32(mdata32_1);
1736 pcre2_match_data_free_32(mdata32_2);
1737 pcre2_match_context_free_32(mcontext32);
1738 #endif
1739
1740 if (is_successful) {
1741 successful++;
1742 successful_row++;
1743 printf(".");
1744 if (successful_row >= 60) {
1745 successful_row = 0;
1746 printf("\n");
1747 }
1748 } else
1749 successful_row = 0;
1750
1751 fflush(stdout);
1752 current++;
1753 }
1754 #ifdef SUPPORT_PCRE2_8
1755 setstack8(NULL);
1756 #endif
1757 #ifdef SUPPORT_PCRE2_16
1758 setstack16(NULL);
1759 #endif
1760 #ifdef SUPPORT_PCRE2_32
1761 setstack32(NULL);
1762 #endif
1763
1764 if (total == successful) {
1765 printf("\nAll JIT regression tests are successfully passed.\n");
1766 return 0;
1767 } else {
1768 printf("\nSuccessful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
1769 return 1;
1770 }
1771 }
1772
1773 #if defined SUPPORT_UNICODE && (defined SUPPORT_PCRE2_8 || defined SUPPORT_PCRE2_16 || defined SUPPORT_PCRE2_32)
1774
/* Compares the outcome of one match with the expectation stored in a test case.

   pattern_index  index of the test case, used only in the messages
   type           name of the match kind, used only in the messages
   result         value returned by the match function
   match_start    expected ovector[0]; a negative value means that the match
                  is expected to return -1 and the ovector is not inspected
   match_end      expected ovector[1]
   ovector        offset vector filled in by the match

Returns 0 when the outcome is the expected one. Otherwise a message describing
the first difference is printed and 1 is returned. */

static int check_invalid_utf_result(int pattern_index, const char *type, int result,
	int match_start, int match_end, PCRE2_SIZE *ovector)
{
	int failed = 1;

	if (match_start < 0) {
		/* No match is expected: only the return value matters. */
		if (result == -1)
			failed = 0;
		else
			printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
	} else if (result <= 0) {
		printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
	} else if (ovector[0] != (PCRE2_SIZE)match_start) {
		printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[0], match_start);
	} else if (ovector[1] != (PCRE2_SIZE)match_end) {
		printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[1], match_end);
	} else {
		failed = 0;
	}

	return failed;
}
1805
1806 #endif /* SUPPORT_UNICODE && (SUPPORT_PCRE2_8 || SUPPORT_PCRE2_16 || SUPPORT_PCRE2_32) */
1807
1808 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
1809
/* Shorthands used by the test table that follows. */

/* Compile options: UTF, DOTALL and ANCHORED. */
#define UDA (PCRE2_ANCHORED | PCRE2_DOTALL | PCRE2_UTF)
/* JIT compile options: complete matching with invalid UTF support. */
#define CI (PCRE2_JIT_INVALID_UTF | PCRE2_JIT_COMPLETE)
/* JIT compile options: the same as CI, with soft partial matching added. */
#define CPI (CI | PCRE2_JIT_PARTIAL_SOFT)
1813
/* One entry of the invalid UTF-8 test table. */

struct invalid_utf8_regression_test_case {
	/* Options passed to pcre2_compile_8() and to pcre2_jit_compile_8().
	The JIT options also select which match modes are run. */
	int compile_options, jit_compile_options;
	/* Start offset of the match, counted from the first byte of input
	(skip_left is subtracted from it before matching). */
	int start_offset;
	/* Number of bytes removed from the start and from the end of input
	before it is passed to the matcher. */
	int skip_left, skip_right;
	/* Expected values of ovector[0] and ovector[1]. A negative match_start
	means that the match must return -1. */
	int match_start, match_end;
	/* Patterns to run against input. The second one may be NULL, a NULL
	first pattern terminates the table. */
	const char *pattern[2];
	/* Subject string. */
	const char *input;
};
1825
1826 static struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
1827 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1828 { UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
1829 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
1830 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
1831 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
1832 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
1833 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
1834 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
1835 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
1836 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
1837 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
1838 { UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
1839 { UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
1840 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
1841 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
1842 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
1843 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
1844 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
1845 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
1846 { UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
1847 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
1848 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
1849 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
1850 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
1851 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
1852 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
1853 { UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
1854 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
1855 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
1856 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
1857 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
1858 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
1859 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
1860 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
1861 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
1862 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
1863 { UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
1864 { UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },
1865
1866 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
1867 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80#" },
1868 { UDA, CPI, 4, 1, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf#" },
1869 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
1870 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
1871 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
1872 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
1873 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf#" },
1874 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80#" },
1875 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80#" },
1876 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff#" },
1877 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf#" },
1878 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80#" },
1879 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80#" },
1880 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf#" },
1881 { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80#" },
1882 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80#" },
1883 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80#" },
1884 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
1885 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
1886 { UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
1887 { UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
1888 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf#" },
1889 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0#" },
1890 { UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
1891 { UDA, CPI, 4, 2, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80#" },
1892
1893 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
1894 { UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
1895 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf#" },
1896 { UDA, CPI, 3, 1, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf#" },
1897 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80#" },
1898 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff#" },
1899 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf#" },
1900 { UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf#" },
1901
1902 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
1903 { UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
1904 { UDA, CPI, 2, 1, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xbf#" },
1905 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf#" },
1906 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80#" },
1907 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff#" },
1908 { UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf#" },
1909
1910 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
1911 { UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
1912 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" },
1913 { UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80#" },
1914
1915 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
1916 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
1917 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1918 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
1919 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
1920 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1921 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
1922 { UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1923 { UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
1924
1925 { UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
1926 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
1927 { UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
1928 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
1929 { UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
1930 { UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
1931 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
1932 { UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1933 { UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },
1934
1935 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" },
1936 { UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" },
1937 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" },
1938 { UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" },
1939
1940 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
1941 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
1942 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
1943 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
1944 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
1945 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
1946 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
1947 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
1948 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
1949 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \x85#\xc2\x85#"},
1950 { PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 7, 8, { "^\\W", NULL }, " \xe2\x80\xf8\xe2\x80\xa8#"},
1951
1952 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xe2\x80\xf8\xe2\x80\xa8#"},
1953 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 3, 4, { "#", NULL }, "\xe2\x80\xf8#\xe2\x80\xa8#"},
1954 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "abcd\xc2\x85#"},
1955 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 1, 2, { "#", NULL }, "\x85#\xc2\x85#"},
1956 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 5, 6, { "#", NULL }, "\xef,\x80,\xf8#\x0a"},
1957 { PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xef,\x80,\xf8\x0a#"},
1958
1959 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1960 { PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1961 { PCRE2_UTF, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
1962 { PCRE2_UTF, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
1963
1964 /* These two are not invalid UTF tests, but this infrastructure fits better for them. */
1965 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\X{2}", NULL }, "\r\n\n" },
1966 { 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\R{2}", NULL }, "\r\n\n" },
1967
1968 { 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
1969 };
1970
1971 #undef UDA
1972 #undef CI
1973 #undef CPI
1974
/* Runs one pattern of an invalid UTF-8 test case through the JIT compiler.

   current        the test case (patterns, input, options, expected range)
   pattern_index  index of the test case, used only in the messages
   i              selects current->pattern[i]; a NULL pattern is skipped
   ccontext       compile context passed to pcre2_compile_8()
   mdata          match data block which receives the match results

The pattern is compiled, JIT compiled with current->jit_compile_options, and
then matched once for each of PCRE2_JIT_COMPLETE and PCRE2_JIT_PARTIAL_SOFT
that is present in those options. Before matching, skip_left bytes are removed
from the start and skip_right bytes from the end of the input; start_offset is
relative to the whole input, so skip_left is subtracted from it.

Returns 1 if the pattern is NULL or if all checks pass. Otherwise a message is
printed and 0 is returned. */

static int run_invalid_utf8_test(struct invalid_utf8_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
{
	pcre2_code_8 *code;
	int result, errorcode;
	int success = 0;
	PCRE2_SIZE length, start, erroroffset;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);
	PCRE2_SPTR8 subject;

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_8((PCRE2_SPTR8)current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	/* The messages name the pattern slot as well, since a test case may
	carry two patterns. */
	if (!code) {
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_8(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		goto done;
	}

	/* Both match modes use the same trimmed subject and start offset. */
	subject = (PCRE2_SPTR8)(current->input + current->skip_left);
	length = (PCRE2_SIZE)(strlen(current->input) - current->skip_left - current->skip_right);
	start = (PCRE2_SIZE)(current->start_offset - current->skip_left);

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_8(code, subject, length, start, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			goto done;
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_8(code, subject, length, start, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			goto done;
	}

	success = 1;

done:
	/* Single exit for every path that owns the compiled pattern. */
	pcre2_code_free_8(code);
	return success;
}
2025
invalid_utf8_regression_tests(void)2026 static int invalid_utf8_regression_tests(void)
2027 {
2028 struct invalid_utf8_regression_test_case *current;
2029 pcre2_compile_context_8 *ccontext;
2030 pcre2_match_data_8 *mdata;
2031 int total = 0, successful = 0;
2032 int result;
2033
2034 printf("\nRunning invalid-utf8 JIT regression tests\n");
2035
2036 ccontext = pcre2_compile_context_create_8(NULL);
2037 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2038 mdata = pcre2_match_data_create_8(4, NULL);
2039
2040 for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
2041 /* printf("\nPattern: %s :\n", current->pattern); */
2042 total++;
2043
2044 result = 1;
2045 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2046 result = 0;
2047 if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
2048 result = 0;
2049
2050 if (result) {
2051 successful++;
2052 }
2053
2054 printf(".");
2055 if ((total % 60) == 0)
2056 printf("\n");
2057 }
2058
2059 if ((total % 60) != 0)
2060 printf("\n");
2061
2062 pcre2_match_data_free_8(mdata);
2063 pcre2_compile_context_free_8(ccontext);
2064
2065 if (total == successful) {
2066 printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
2067 return 0;
2068 } else {
2069 printf("\nInvalid UTF8 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2070 return 1;
2071 }
2072 }
2073
2074 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_8 */
2075
/* Replacement used when Unicode support or the 8-bit library is not compiled
in: no test is run, so no failure is reported. */
static int invalid_utf8_regression_tests(void)
{
	int failures = 0;

	return failures;
}
2080
2081 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_8 */
2082
2083 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_16
2084
/* Shorthands for the option combinations used in the UTF-16 test table.
UDA is a set of pattern compile options; CI and CPI are JIT compile options
that differ only in whether soft partial matching is included. */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2088
/* One invalid UTF-16 JIT regression test: the options, the visible window of
the subject, the expected match offsets, and the pattern(s) and subject. */
struct invalid_utf16_regression_test_case {
	/* Options passed to pcre2_compile_16(). */
	int compile_options;
	/* Options passed to pcre2_jit_compile_16(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match calls are made. */
	int jit_compile_options;
	/* Start offset counted from the beginning of input; skip_left is
	subtracted from it before it is passed to the matcher. */
	int start_offset;
	/* Number of code units hidden from the matcher at the start of input. */
	int skip_left;
	/* Number of code units hidden from the matcher at the end of input. */
	int skip_right;
	/* Expected offsets handed to check_invalid_utf_result(). Entries that
	use -1 for both presumably expect no match - confirm against that
	helper. */
	int match_start;
	int match_end;
	/* Up to two zero-terminated patterns; a NULL entry is skipped. */
	const PCRE2_UCHAR16 *pattern[2];
	/* Zero-terminated subject string. */
	const PCRE2_UCHAR16 *input;
};
2100
/* Patterns for the UTF-16 tests, spelled out as zero-terminated arrays of
16-bit code units. */
static PCRE2_UCHAR16 allany16[] = { '.', 0 };
static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 };
static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 };
static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 };
static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 };
static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 };
static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 };
static PCRE2_UCHAR16 generic16[] = { '#', 0xd800, 0xdc00, '#', 0 };
/* Subject strings, also zero-terminated. Most of them contain code units in
the 0xd800-0xdfff (UTF-16 surrogate) range, several of which are not part of
a high/low pair. */
static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, '#', 0 };
static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, '#', 0 };
static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, '#', 0 };
static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, '#', 0 };
static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 };
static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 };
static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 };
static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 };
static PCRE2_UCHAR16 test16_11[] = { 0xdc00, 0xdc00, 0xd800, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
static PCRE2_UCHAR16 test16_12[] = { '#', 0xd800, 0xdc00, 0xd800, '#', 0xd800, 0xdc00, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2121
/* The UTF-16 test table. Columns, in order: compile_options,
jit_compile_options, start_offset, skip_left, skip_right, match_start,
match_end, { pattern[0], pattern[1] }, input. The table is terminated by an
entry whose pattern[0] is NULL. */
static struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
	/* allany16: a single '.' */
	{ UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 },
	{ UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 },
	{ UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 2, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 },
	{ UDA, CI, 0, 0, 2, -1, -1, { allany16, NULL }, test16_3 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 },

	/* Word boundary assertions; entries with two patterns run both against
	the same expected result. */
	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 },
	{ UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 },
	{ UDA, CPI, 2, 1, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 },

	/* Caseless back reference. */
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 },

	/* grapheme16: a single \X */
	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 },

	/* Negated single character class. */
	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },

	/* Multiline, not anchored. */
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, NULL }, test16_9 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 },

	/* The same two subjects with and without PCRE2_NO_START_OPTIMIZE. */
	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },
	{ PCRE2_UTF, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },

	/* Terminator. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2171
2172 #undef UDA
2173 #undef CI
2174 #undef CPI
2175
/* Run one pattern of an invalid UTF-16 regression test case through the JIT.

The selected pattern is compiled, JIT compiled, and then matched once for
every matching mode (complete and/or soft partial) whose bit is set in the
JIT compile options of the test case. Every match result is verified by
check_invalid_utf_result(); a non-zero return from it is treated as failure.

Arguments:
  current        the test case
  pattern_index  index of the test case, only used in diagnostics
  i              selects current->pattern[i]
  ccontext       compile context passed to pcre2_compile_16()
  mdata          match data block used for every match

Returns:         1 when current->pattern[i] is NULL or all checks pass,
                 0 otherwise
*/
static int run_invalid_utf16_test(struct invalid_utf16_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
{
	pcre2_code_16 *code;
	int result, errorcode;
	int passed = 0;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR16 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);

	/* An unused pattern slot is not a failure. */
	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (code == NULL) {
		/* Report which of the two patterns failed, not always slot 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_16(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		goto done;
	}

	/* Count the code units of the zero-terminated input, then hide
	skip_left units at the start and skip_right units at the end. */
	length = 0;
	for (input = current->input; *input != 0; input++)
		length++;

	length -= current->skip_left + current->skip_right;

	/* start_offset is given relative to the whole input, so skip_left is
	subtracted from it in the match calls below. */
	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_16(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			goto done;
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_16(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			goto done;
	}

	passed = 1;

done:
	/* Single exit so the compiled pattern is released on every path. */
	pcre2_code_free_16(code);
	return passed;
}
2233
invalid_utf16_regression_tests(void)2234 static int invalid_utf16_regression_tests(void)
2235 {
2236 struct invalid_utf16_regression_test_case *current;
2237 pcre2_compile_context_16 *ccontext;
2238 pcre2_match_data_16 *mdata;
2239 int total = 0, successful = 0;
2240 int result;
2241
2242 printf("\nRunning invalid-utf16 JIT regression tests\n");
2243
2244 ccontext = pcre2_compile_context_create_16(NULL);
2245 pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
2246 mdata = pcre2_match_data_create_16(4, NULL);
2247
2248 for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
2249 /* printf("\nPattern: %s :\n", current->pattern); */
2250 total++;
2251
2252 result = 1;
2253 if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
2254 result = 0;
2255 if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
2256 result = 0;
2257
2258 if (result) {
2259 successful++;
2260 }
2261
2262 printf(".");
2263 if ((total % 60) == 0)
2264 printf("\n");
2265 }
2266
2267 if ((total % 60) != 0)
2268 printf("\n");
2269
2270 pcre2_match_data_free_16(mdata);
2271 pcre2_compile_context_free_16(ccontext);
2272
2273 if (total == successful) {
2274 printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");
2275 return 0;
2276 } else {
2277 printf("\nInvalid UTF16 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2278 return 1;
2279 }
2280 }
2281
2282 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_16 */
2283
/* Replacement used when Unicode support or the 16-bit library is not compiled
in: no test is run, so no failure is reported. */
static int invalid_utf16_regression_tests(void)
{
	int failures = 0;

	return failures;
}
2288
2289 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
2290
2291 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32
2292
/* Shorthands for the option combinations used in the UTF-32 test table.
UDA is a set of pattern compile options; CI and CPI are JIT compile options
that differ only in whether soft partial matching is included. */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2296
/* One invalid UTF-32 JIT regression test: the options, the visible window of
the subject, the expected match offsets, and the pattern(s) and subject. */
struct invalid_utf32_regression_test_case {
	/* Options passed to pcre2_compile_32(). */
	int compile_options;
	/* Options passed to pcre2_jit_compile_32(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which match calls are made. */
	int jit_compile_options;
	/* Start offset counted from the beginning of input; skip_left is
	subtracted from it before it is passed to the matcher. */
	int start_offset;
	/* Number of code units hidden from the matcher at the start of input. */
	int skip_left;
	/* Number of code units hidden from the matcher at the end of input. */
	int skip_right;
	/* Expected offsets handed to check_invalid_utf_result(). Entries that
	use -1 for both presumably expect no match - confirm against that
	helper. */
	int match_start;
	int match_end;
	/* Up to two zero-terminated patterns; a NULL entry is skipped. */
	const PCRE2_UCHAR32 *pattern[2];
	/* Zero-terminated subject string. */
	const PCRE2_UCHAR32 *input;
};
2308
/* Patterns for the UTF-32 tests, spelled out as zero-terminated arrays of
32-bit code units. */
static PCRE2_UCHAR32 allany32[] = { '.', 0 };
static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 };
static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 };
static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 };
static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
/* Subject strings, also zero-terminated. All but test32_4 contain the value
0x110000, which is one above 0x10ffff, the largest Unicode code point. */
static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x10ffff, 0 };
static PCRE2_UCHAR32 test32_2[] = { 'a', 'A', 0x110000, 0 };
static PCRE2_UCHAR32 test32_3[] = { '#', 0x10ffff, 0x110000, 0 };
static PCRE2_UCHAR32 test32_4[] = { ' ', 0x2028, '#', 0 };
static PCRE2_UCHAR32 test32_5[] = { ' ', 0x110000, 0x2028, '#', 0 };
2321
/* The UTF-32 test table. Columns, in order: compile_options,
jit_compile_options, start_offset, skip_left, skip_right, match_start,
match_end, { pattern[0], pattern[1] }, input. The table is terminated by an
entry whose pattern[0] is NULL. */
static struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
	/* allany32: a single '.' */
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },

	/* Word boundary assertions; entries with two patterns run both against
	the same expected result. */
	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },

	/* Caseless back reference. */
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_2 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_2 },

	/* grapheme32: a single \X */
	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },

	/* Negated single character class. */
	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_3 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_3 },

	/* Multiline, not anchored. */
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_4 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_5 },

	/* Terminator. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2345
2346 #undef UDA
2347 #undef CI
2348 #undef CPI
2349
/* Run one pattern of an invalid UTF-32 regression test case through the JIT.

The selected pattern is compiled, JIT compiled, and then matched once for
every matching mode (complete and/or soft partial) whose bit is set in the
JIT compile options of the test case. Every match result is verified by
check_invalid_utf_result(); a non-zero return from it is treated as failure.

Arguments:
  current        the test case
  pattern_index  index of the test case, only used in diagnostics
  i              selects current->pattern[i]
  ccontext       compile context passed to pcre2_compile_32()
  mdata          match data block used for every match

Returns:         1 when current->pattern[i] is NULL or all checks pass,
                 0 otherwise
*/
static int run_invalid_utf32_test(struct invalid_utf32_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata)
{
	pcre2_code_32 *code;
	int result, errorcode;
	int passed = 0;
	PCRE2_SIZE length, erroroffset;
	const PCRE2_UCHAR32 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata);

	/* An unused pattern slot is not a failure. */
	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (code == NULL) {
		/* Report which of the two patterns failed, not always slot 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		goto done;
	}

	/* Count the code units of the zero-terminated input, then hide
	skip_left units at the start and skip_right units at the end. */
	length = 0;
	for (input = current->input; *input != 0; input++)
		length++;

	length -= current->skip_left + current->skip_right;

	/* start_offset is given relative to the whole input, so skip_left is
	subtracted from it in the match calls below. */
	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_32(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			goto done;
	}

	if (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT) {
		result = pcre2_jit_match_32(code, (current->input + current->skip_left),
			length, current->start_offset - current->skip_left, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			goto done;
	}

	passed = 1;

done:
	/* Single exit so the compiled pattern is released on every path. */
	pcre2_code_free_32(code);
	return passed;
}
2407
invalid_utf32_regression_tests(void)2408 static int invalid_utf32_regression_tests(void)
2409 {
2410 struct invalid_utf32_regression_test_case *current;
2411 pcre2_compile_context_32 *ccontext;
2412 pcre2_match_data_32 *mdata;
2413 int total = 0, successful = 0;
2414 int result;
2415
2416 printf("\nRunning invalid-utf32 JIT regression tests\n");
2417
2418 ccontext = pcre2_compile_context_create_32(NULL);
2419 pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY);
2420 mdata = pcre2_match_data_create_32(4, NULL);
2421
2422 for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) {
2423 /* printf("\nPattern: %s :\n", current->pattern); */
2424 total++;
2425
2426 result = 1;
2427 if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata))
2428 result = 0;
2429 if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata))
2430 result = 0;
2431
2432 if (result) {
2433 successful++;
2434 }
2435
2436 printf(".");
2437 if ((total % 60) == 0)
2438 printf("\n");
2439 }
2440
2441 if ((total % 60) != 0)
2442 printf("\n");
2443
2444 pcre2_match_data_free_32(mdata);
2445 pcre2_compile_context_free_32(ccontext);
2446
2447 if (total == successful) {
2448 printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n");
2449 return 0;
2450 } else {
2451 printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2452 return 1;
2453 }
2454 }
2455
2456 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */
2457
/* Replacement used when Unicode support or the 32-bit library is not compiled
in: no test is run, so no failure is reported. */
static int invalid_utf32_regression_tests(void)
{
	int failures = 0;

	return failures;
}
2462
2463 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */
2464
2465 /* End of pcre2_jit_test.c */
2466