/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2018 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/


/* This module contains mode-dependent macro and structure definitions. The
file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
These mode-dependent items are kept in a separate file so that they can also be
#included multiple times for different code unit widths by pcre2test in order
to have access to the hidden structures at all supported widths.

Some of the mode-dependent macros are required at different widths for
different parts of the pcre2test code (in particular, the included
pcre2_printint.c file). We undefine them here so that they can be re-defined for
multiple inclusions. Not all of these are used in pcre2test, but it's easier
just to undefine them all.
*/ 53 54 #undef ACROSSCHAR 55 #undef BACKCHAR 56 #undef BYTES2CU 57 #undef CHMAX_255 58 #undef CU2BYTES 59 #undef FORWARDCHAR 60 #undef FORWARDCHARTEST 61 #undef GET 62 #undef GET2 63 #undef GETCHAR 64 #undef GETCHARINC 65 #undef GETCHARINCTEST 66 #undef GETCHARLEN 67 #undef GETCHARLENTEST 68 #undef GETCHARTEST 69 #undef GET_EXTRALEN 70 #undef HAS_EXTRALEN 71 #undef IMM2_SIZE 72 #undef MAX_255 73 #undef MAX_MARK 74 #undef MAX_PATTERN_SIZE 75 #undef MAX_UTF_SINGLE_CU 76 #undef NOT_FIRSTCU 77 #undef PUT 78 #undef PUT2 79 #undef PUT2INC 80 #undef PUTCHAR 81 #undef PUTINC 82 #undef TABLE_GET 83 84 85 86 /* -------------------------- MACROS ----------------------------- */ 87 88 /* PCRE keeps offsets in its compiled code as at least 16-bit quantities 89 (always stored in big-endian order in 8-bit mode) by default. These are used, 90 for example, to link from the start of a subpattern to its alternatives and its 91 end. The use of 16 bits per offset limits the size of an 8-bit compiled regex 92 to around 64K, which is big enough for almost everybody. However, I received a 93 request for an even bigger limit. For this reason, and also to make the code 94 easier to maintain, the storing and loading of offsets from the compiled code 95 unit string is now handled by the macros that are defined here. 96 97 The macros are controlled by the value of LINK_SIZE. This defaults to 2, but 98 values of 3 or 4 are also supported. 
*/ 99 100 /* ------------------- 8-bit support ------------------ */ 101 102 #if PCRE2_CODE_UNIT_WIDTH == 8 103 104 #if LINK_SIZE == 2 105 #define PUT(a,n,d) \ 106 (a[n] = (PCRE2_UCHAR)((d) >> 8)), \ 107 (a[(n)+1] = (PCRE2_UCHAR)((d) & 255)) 108 #define GET(a,n) \ 109 (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) 110 #define MAX_PATTERN_SIZE (1 << 16) 111 112 #elif LINK_SIZE == 3 113 #define PUT(a,n,d) \ 114 (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ 115 (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \ 116 (a[(n)+2] = (PCRE2_UCHAR)((d) & 255)) 117 #define GET(a,n) \ 118 (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) 119 #define MAX_PATTERN_SIZE (1 << 24) 120 121 #elif LINK_SIZE == 4 122 #define PUT(a,n,d) \ 123 (a[n] = (PCRE2_UCHAR)((d) >> 24)), \ 124 (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \ 125 (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \ 126 (a[(n)+3] = (PCRE2_UCHAR)((d) & 255)) 127 #define GET(a,n) \ 128 (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) 129 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ 130 131 #else 132 #error LINK_SIZE must be 2, 3, or 4 133 #endif 134 135 136 /* ------------------- 16-bit support ------------------ */ 137 138 #elif PCRE2_CODE_UNIT_WIDTH == 16 139 140 #if LINK_SIZE == 2 141 #undef LINK_SIZE 142 #define LINK_SIZE 1 143 #define PUT(a,n,d) \ 144 (a[n] = (PCRE2_UCHAR)(d)) 145 #define GET(a,n) \ 146 (a[n]) 147 #define MAX_PATTERN_SIZE (1 << 16) 148 149 #elif LINK_SIZE == 3 || LINK_SIZE == 4 150 #undef LINK_SIZE 151 #define LINK_SIZE 2 152 #define PUT(a,n,d) \ 153 (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ 154 (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535)) 155 #define GET(a,n) \ 156 (unsigned int)(((a)[n] << 16) | (a)[(n)+1]) 157 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ 158 159 #else 160 #error LINK_SIZE must be 2, 3, or 4 161 #endif 162 163 164 /* ------------------- 32-bit support ------------------ */ 165 166 #elif PCRE2_CODE_UNIT_WIDTH == 32 167 #undef LINK_SIZE 168 #define LINK_SIZE 1 
169 #define PUT(a,n,d) \ 170 (a[n] = (d)) 171 #define GET(a,n) \ 172 (a[n]) 173 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ 174 175 #else 176 #error Unsupported compiling mode 177 #endif 178 179 180 /* --------------- Other mode-specific macros ----------------- */ 181 182 /* PCRE uses some other (at least) 16-bit quantities that do not change when 183 the size of offsets changes. There are used for repeat counts and for other 184 things such as capturing parenthesis numbers in back references. 185 186 Define the number of code units required to hold a 16-bit count/offset, and 187 macros to load and store such a value. For reasons that I do not understand, 188 the expression in the 8-bit GET2 macro is treated by gcc as a signed 189 expression, even when a is declared as unsigned. It seems that any kind of 190 arithmetic results in a signed value. Hence the cast. */ 191 192 #if PCRE2_CODE_UNIT_WIDTH == 8 193 #define IMM2_SIZE 2 194 #define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) 195 #define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255 196 197 #else /* Code units are 16 or 32 bits */ 198 #define IMM2_SIZE 1 199 #define GET2(a,n) a[n] 200 #define PUT2(a,n,d) a[n] = d 201 #endif 202 203 /* Other macros that are different for 8-bit mode. The MAX_255 macro checks 204 whether its argument, which is assumed to be one code unit, is less than 256. 205 The CHMAX_255 macro does not assume one code unit. The maximum length of a MARK 206 name must fit in one code unit; currently it is set to 255 or 65535. The 207 TABLE_GET macro is used to access elements of tables containing exactly 256 208 items. When code points can be greater than 255, a check is needed before 209 accessing these tables. 
*/ 210 211 #if PCRE2_CODE_UNIT_WIDTH == 8 212 #define MAX_255(c) TRUE 213 #define MAX_MARK ((1u << 8) - 1) 214 #ifdef SUPPORT_UNICODE 215 #define SUPPORT_WIDE_CHARS 216 #define CHMAX_255(c) ((c) <= 255u) 217 #else 218 #define CHMAX_255(c) TRUE 219 #endif /* SUPPORT_UNICODE */ 220 #define TABLE_GET(c, table, default) ((table)[c]) 221 222 #else /* Code units are 16 or 32 bits */ 223 #define CHMAX_255(c) ((c) <= 255u) 224 #define MAX_255(c) ((c) <= 255u) 225 #define MAX_MARK ((1u << 16) - 1) 226 #define SUPPORT_WIDE_CHARS 227 #define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default)) 228 #endif 229 230 231 232 /* ----------------- Character-handling macros ----------------- */ 233 234 /* There is a proposed future special "UTF-21" mode, in which only the lowest 235 21 bits of a 32-bit character are interpreted as UTF, with the remaining 11 236 high-order bits available to the application for other uses. In preparation for 237 the future implementation of this mode, there are macros that load a data item 238 and, if in this special mode, mask it to 21 bits. These macros all have names 239 starting with UCHAR21. In all other modes, including the normal 32-bit 240 library, the macros all have the same simple definitions. When the new mode is 241 implemented, it is expected that these definitions will be varied appropriately 242 using #ifdef when compiling the library that supports the special mode. */ 243 244 #define UCHAR21(eptr) (*(eptr)) 245 #define UCHAR21TEST(eptr) (*(eptr)) 246 #define UCHAR21INC(eptr) (*(eptr)++) 247 #define UCHAR21INCTEST(eptr) (*(eptr)++) 248 249 /* When UTF encoding is being used, a character is no longer just a single 250 byte in 8-bit mode or a single short in 16-bit mode. The macros for character 251 handling generate simple sequences when used in the basic mode, and more 252 complicated ones for UTF characters. GETCHARLENTEST and other macros are not 253 used when UTF is not supported. 
To make sure they can never even appear when 254 UTF support is omitted, we don't even define them. */ 255 256 #ifndef SUPPORT_UNICODE 257 258 /* #define MAX_UTF_SINGLE_CU */ 259 /* #define HAS_EXTRALEN(c) */ 260 /* #define GET_EXTRALEN(c) */ 261 /* #define NOT_FIRSTCU(c) */ 262 #define GETCHAR(c, eptr) c = *eptr; 263 #define GETCHARTEST(c, eptr) c = *eptr; 264 #define GETCHARINC(c, eptr) c = *eptr++; 265 #define GETCHARINCTEST(c, eptr) c = *eptr++; 266 #define GETCHARLEN(c, eptr, len) c = *eptr; 267 #define PUTCHAR(c, p) (*p = c, 1) 268 /* #define GETCHARLENTEST(c, eptr, len) */ 269 /* #define BACKCHAR(eptr) */ 270 /* #define FORWARDCHAR(eptr) */ 271 /* #define FORWARCCHARTEST(eptr,end) */ 272 /* #define ACROSSCHAR(condition, eptr, action) */ 273 274 #else /* SUPPORT_UNICODE */ 275 276 /* ------------------- 8-bit support ------------------ */ 277 278 #if PCRE2_CODE_UNIT_WIDTH == 8 279 #define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */ 280 281 /* The largest UTF code point that can be encoded as a single code unit. */ 282 283 #define MAX_UTF_SINGLE_CU 127 284 285 /* Tests whether the code point needs extra characters to decode. */ 286 287 #define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c) 288 289 /* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE. 290 Otherwise it has an undefined behaviour. */ 291 292 #define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu]) 293 294 /* Returns TRUE, if the given value is not the first code unit of a UTF 295 sequence. */ 296 297 #define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u) 298 299 /* Get the next UTF-8 character, not advancing the pointer. This is called when 300 we know we are in UTF-8 mode. */ 301 302 #define GETCHAR(c, eptr) \ 303 c = *eptr; \ 304 if (c >= 0xc0u) GETUTF8(c, eptr); 305 306 /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the 307 pointer. 
*/ 308 309 #define GETCHARTEST(c, eptr) \ 310 c = *eptr; \ 311 if (utf && c >= 0xc0u) GETUTF8(c, eptr); 312 313 /* Get the next UTF-8 character, advancing the pointer. This is called when we 314 know we are in UTF-8 mode. */ 315 316 #define GETCHARINC(c, eptr) \ 317 c = *eptr++; \ 318 if (c >= 0xc0u) GETUTF8INC(c, eptr); 319 320 /* Get the next character, testing for UTF-8 mode, and advancing the pointer. 321 This is called when we don't know if we are in UTF-8 mode. */ 322 323 #define GETCHARINCTEST(c, eptr) \ 324 c = *eptr++; \ 325 if (utf && c >= 0xc0u) GETUTF8INC(c, eptr); 326 327 /* Get the next UTF-8 character, not advancing the pointer, incrementing length 328 if there are extra bytes. This is called when we know we are in UTF-8 mode. */ 329 330 #define GETCHARLEN(c, eptr, len) \ 331 c = *eptr; \ 332 if (c >= 0xc0u) GETUTF8LEN(c, eptr, len); 333 334 /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the 335 pointer, incrementing length if there are extra bytes. This is called when we 336 do not know if we are in UTF-8 mode. */ 337 338 #define GETCHARLENTEST(c, eptr, len) \ 339 c = *eptr; \ 340 if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len); 341 342 /* If the pointer is not at the start of a character, move it back until 343 it is. This is called only in UTF-8 mode - we don't put a test within the macro 344 because almost all calls are already within a block of UTF-8 only code. */ 345 346 #define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr-- 347 348 /* Same as above, just in the other direction. */ 349 #define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++ 350 #define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++ 351 352 /* Same as above, but it allows a fully customizable form. */ 353 #define ACROSSCHAR(condition, eptr, action) \ 354 while((condition) && ((*eptr) & 0xc0u) == 0x80u) action 355 356 /* Deposit a character into memory, returning the number of code units. 
*/ 357 358 #define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \ 359 PRIV(ord2utf)(c,p) : (*p = c, 1)) 360 361 362 /* ------------------- 16-bit support ------------------ */ 363 364 #elif PCRE2_CODE_UNIT_WIDTH == 16 365 #define MAYBE_UTF_MULTI /* UTF chars may use multiple code units */ 366 367 /* The largest UTF code point that can be encoded as a single code unit. */ 368 369 #define MAX_UTF_SINGLE_CU 65535 370 371 /* Tests whether the code point needs extra characters to decode. */ 372 373 #define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u) 374 375 /* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE. 376 Otherwise it has an undefined behaviour. */ 377 378 #define GET_EXTRALEN(c) 1 379 380 /* Returns TRUE, if the given value is not the first code unit of a UTF 381 sequence. */ 382 383 #define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u) 384 385 /* Base macro to pick up the low surrogate of a UTF-16 character, not 386 advancing the pointer. */ 387 388 #define GETUTF16(c, eptr) \ 389 { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; } 390 391 /* Get the next UTF-16 character, not advancing the pointer. This is called when 392 we know we are in UTF-16 mode. */ 393 394 #define GETCHAR(c, eptr) \ 395 c = *eptr; \ 396 if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr); 397 398 /* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the 399 pointer. */ 400 401 #define GETCHARTEST(c, eptr) \ 402 c = *eptr; \ 403 if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr); 404 405 /* Base macro to pick up the low surrogate of a UTF-16 character, advancing 406 the pointer. */ 407 408 #define GETUTF16INC(c, eptr) \ 409 { c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; } 410 411 /* Get the next UTF-16 character, advancing the pointer. This is called when we 412 know we are in UTF-16 mode. 
*/ 413 414 #define GETCHARINC(c, eptr) \ 415 c = *eptr++; \ 416 if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr); 417 418 /* Get the next character, testing for UTF-16 mode, and advancing the pointer. 419 This is called when we don't know if we are in UTF-16 mode. */ 420 421 #define GETCHARINCTEST(c, eptr) \ 422 c = *eptr++; \ 423 if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr); 424 425 /* Base macro to pick up the low surrogate of a UTF-16 character, not 426 advancing the pointer, incrementing the length. */ 427 428 #define GETUTF16LEN(c, eptr, len) \ 429 { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; } 430 431 /* Get the next UTF-16 character, not advancing the pointer, incrementing 432 length if there is a low surrogate. This is called when we know we are in 433 UTF-16 mode. */ 434 435 #define GETCHARLEN(c, eptr, len) \ 436 c = *eptr; \ 437 if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); 438 439 /* Get the next UTF-816character, testing for UTF-16 mode, not advancing the 440 pointer, incrementing length if there is a low surrogate. This is called when 441 we do not know if we are in UTF-16 mode. */ 442 443 #define GETCHARLENTEST(c, eptr, len) \ 444 c = *eptr; \ 445 if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len); 446 447 /* If the pointer is not at the start of a character, move it back until 448 it is. This is called only in UTF-16 mode - we don't put a test within the 449 macro because almost all calls are already within a block of UTF-16 only 450 code. */ 451 452 #define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr-- 453 454 /* Same as above, just in the other direction. */ 455 #define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++ 456 #define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++ 457 458 /* Same as above, but it allows a fully customizable form. 
*/ 459 #define ACROSSCHAR(condition, eptr, action) \ 460 if ((condition) && ((*eptr) & 0xfc00u) == 0xdc00u) action 461 462 /* Deposit a character into memory, returning the number of code units. */ 463 464 #define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \ 465 PRIV(ord2utf)(c,p) : (*p = c, 1)) 466 467 468 /* ------------------- 32-bit support ------------------ */ 469 470 #else 471 472 /* These are trivial for the 32-bit library, since all UTF-32 characters fit 473 into one PCRE2_UCHAR unit. */ 474 475 #define MAX_UTF_SINGLE_CU (0x10ffffu) 476 #define HAS_EXTRALEN(c) (0) 477 #define GET_EXTRALEN(c) (0) 478 #define NOT_FIRSTCU(c) (0) 479 480 /* Get the next UTF-32 character, not advancing the pointer. This is called when 481 we know we are in UTF-32 mode. */ 482 483 #define GETCHAR(c, eptr) \ 484 c = *(eptr); 485 486 /* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the 487 pointer. */ 488 489 #define GETCHARTEST(c, eptr) \ 490 c = *(eptr); 491 492 /* Get the next UTF-32 character, advancing the pointer. This is called when we 493 know we are in UTF-32 mode. */ 494 495 #define GETCHARINC(c, eptr) \ 496 c = *((eptr)++); 497 498 /* Get the next character, testing for UTF-32 mode, and advancing the pointer. 499 This is called when we don't know if we are in UTF-32 mode. */ 500 501 #define GETCHARINCTEST(c, eptr) \ 502 c = *((eptr)++); 503 504 /* Get the next UTF-32 character, not advancing the pointer, not incrementing 505 length (since all UTF-32 is of length 1). This is called when we know we are in 506 UTF-32 mode. */ 507 508 #define GETCHARLEN(c, eptr, len) \ 509 GETCHAR(c, eptr) 510 511 /* Get the next UTF-32character, testing for UTF-32 mode, not advancing the 512 pointer, not incrementing the length (since all UTF-32 is of length 1). 513 This is called when we do not know if we are in UTF-32 mode. 
*/ 514 515 #define GETCHARLENTEST(c, eptr, len) \ 516 GETCHARTEST(c, eptr) 517 518 /* If the pointer is not at the start of a character, move it back until 519 it is. This is called only in UTF-32 mode - we don't put a test within the 520 macro because almost all calls are already within a block of UTF-32 only 521 code. 522 523 These are all no-ops since all UTF-32 characters fit into one pcre_uchar. */ 524 525 #define BACKCHAR(eptr) do { } while (0) 526 527 /* Same as above, just in the other direction. */ 528 529 #define FORWARDCHAR(eptr) do { } while (0) 530 #define FORWARDCHARTEST(eptr,end) do { } while (0) 531 532 /* Same as above, but it allows a fully customizable form. */ 533 534 #define ACROSSCHAR(condition, eptr, action) do { } while (0) 535 536 /* Deposit a character into memory, returning the number of code units. */ 537 538 #define PUTCHAR(c, p) (*p = c, 1) 539 540 #endif /* UTF-32 character handling */ 541 #endif /* SUPPORT_UNICODE */ 542 543 544 /* Mode-dependent macros that have the same definition in all modes. */ 545 546 #define CU2BYTES(x) ((x)*((PCRE2_CODE_UNIT_WIDTH/8))) 547 #define BYTES2CU(x) ((x)/((PCRE2_CODE_UNIT_WIDTH/8))) 548 #define PUTINC(a,n,d) PUT(a,n,d), a += LINK_SIZE 549 #define PUT2INC(a,n,d) PUT2(a,n,d), a += IMM2_SIZE 550 551 552 /* ----------------------- HIDDEN STRUCTURES ----------------------------- */ 553 554 /* NOTE: All these structures *must* start with a pcre2_memctl structure. The 555 code that uses them is simpler because it assumes this. */ 556 557 /* The real general context structure. At present it holds only data for custom 558 memory control. 
*/ 559 560 typedef struct pcre2_real_general_context { 561 pcre2_memctl memctl; 562 } pcre2_real_general_context; 563 564 /* The real compile context structure */ 565 566 typedef struct pcre2_real_compile_context { 567 pcre2_memctl memctl; 568 int (*stack_guard)(uint32_t, void *); 569 void *stack_guard_data; 570 const uint8_t *tables; 571 PCRE2_SIZE max_pattern_length; 572 uint16_t bsr_convention; 573 uint16_t newline_convention; 574 uint32_t parens_nest_limit; 575 uint32_t extra_options; 576 } pcre2_real_compile_context; 577 578 /* The real match context structure. */ 579 580 typedef struct pcre2_real_match_context { 581 pcre2_memctl memctl; 582 #ifdef SUPPORT_JIT 583 pcre2_jit_callback jit_callback; 584 void *jit_callback_data; 585 #endif 586 int (*callout)(pcre2_callout_block *, void *); 587 void *callout_data; 588 int (*substitute_callout)(pcre2_substitute_callout_block *, void *); 589 void *substitute_callout_data; 590 PCRE2_SIZE offset_limit; 591 uint32_t heap_limit; 592 uint32_t match_limit; 593 uint32_t depth_limit; 594 } pcre2_real_match_context; 595 596 /* The real convert context structure. */ 597 598 typedef struct pcre2_real_convert_context { 599 pcre2_memctl memctl; 600 uint32_t glob_separator; 601 uint32_t glob_escape; 602 } pcre2_real_convert_context; 603 604 /* The real compiled code structure. The type for the blocksize field is 605 defined specially because it is required in pcre2_serialize_decode() when 606 copying the size from possibly unaligned memory into a variable of the same 607 type. Use a macro rather than a typedef to avoid compiler warnings when this 608 file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the 609 largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit 610 argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field 611 here.) 
*/ 612 613 #undef CODE_BLOCKSIZE_TYPE 614 #define CODE_BLOCKSIZE_TYPE size_t 615 616 #undef LOOKBEHIND_MAX 617 #define LOOKBEHIND_MAX UINT16_MAX 618 619 typedef struct pcre2_real_code { 620 pcre2_memctl memctl; /* Memory control fields */ 621 const uint8_t *tables; /* The character tables */ 622 void *executable_jit; /* Pointer to JIT code */ 623 uint8_t start_bitmap[32]; /* Bitmap for starting code unit < 256 */ 624 CODE_BLOCKSIZE_TYPE blocksize; /* Total (bytes) that was malloc-ed */ 625 uint32_t magic_number; /* Paranoid and endianness check */ 626 uint32_t compile_options; /* Options passed to pcre2_compile() */ 627 uint32_t overall_options; /* Options after processing the pattern */ 628 uint32_t extra_options; /* Taken from compile_context */ 629 uint32_t flags; /* Various state flags */ 630 uint32_t limit_heap; /* Limit set in the pattern */ 631 uint32_t limit_match; /* Limit set in the pattern */ 632 uint32_t limit_depth; /* Limit set in the pattern */ 633 uint32_t first_codeunit; /* Starting code unit */ 634 uint32_t last_codeunit; /* This codeunit must be seen */ 635 uint16_t bsr_convention; /* What \R matches */ 636 uint16_t newline_convention; /* What is a newline? */ 637 uint16_t max_lookbehind; /* Longest lookbehind (characters) */ 638 uint16_t minlength; /* Minimum length of match */ 639 uint16_t top_bracket; /* Highest numbered group */ 640 uint16_t top_backref; /* Highest numbered back reference */ 641 uint16_t name_entry_size; /* Size (code units) of table entries */ 642 uint16_t name_count; /* Number of name entries in the table */ 643 } pcre2_real_code; 644 645 /* The real match data structure. Define ovector as large as it can ever 646 actually be so that array bound checkers don't grumble. Memory for this 647 structure is obtained by calling pcre2_match_data_create(), which sets the size 648 as the offset of ovector plus a pair of elements for each capturable string, so 649 the size varies from call to call. 
As the maximum number of capturing 650 subpatterns is 65535 we must allow for 65536 strings to include the overall 651 match. (See also the heapframe structure below.) */ 652 653 typedef struct pcre2_real_match_data { 654 pcre2_memctl memctl; 655 const pcre2_real_code *code; /* The pattern used for the match */ 656 PCRE2_SPTR subject; /* The subject that was matched */ 657 PCRE2_SPTR mark; /* Pointer to last mark */ 658 PCRE2_SIZE leftchar; /* Offset to leftmost code unit */ 659 PCRE2_SIZE rightchar; /* Offset to rightmost code unit */ 660 PCRE2_SIZE startchar; /* Offset to starting code unit */ 661 uint8_t matchedby; /* Type of match (normal, JIT, DFA) */ 662 uint8_t flags; /* Various flags */ 663 uint16_t oveccount; /* Number of pairs */ 664 int rc; /* The return code from the match */ 665 PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ 666 } pcre2_real_match_data; 667 668 669 /* ----------------------- PRIVATE STRUCTURES ----------------------------- */ 670 671 /* These structures are not needed for pcre2test. */ 672 673 #ifndef PCRE2_PCRE2TEST 674 675 /* Structures for checking for mutual recursion when scanning compiled or 676 parsed code. */ 677 678 typedef struct recurse_check { 679 struct recurse_check *prev; 680 PCRE2_SPTR group; 681 } recurse_check; 682 683 typedef struct parsed_recurse_check { 684 struct parsed_recurse_check *prev; 685 uint32_t *groupptr; 686 } parsed_recurse_check; 687 688 /* Structure for building a cache when filling in recursion offsets. */ 689 690 typedef struct recurse_cache { 691 PCRE2_SPTR group; 692 int groupnumber; 693 } recurse_cache; 694 695 /* Structure for maintaining a chain of pointers to the currently incomplete 696 branches, for testing for left recursion while compiling. */ 697 698 typedef struct branch_chain { 699 struct branch_chain *outer; 700 PCRE2_UCHAR *current_branch; 701 } branch_chain; 702 703 /* Structure for building a list of named groups during the first pass of 704 compiling. 
*/ 705 706 typedef struct named_group { 707 PCRE2_SPTR name; /* Points to the name in the pattern */ 708 uint32_t number; /* Group number */ 709 uint16_t length; /* Length of the name */ 710 uint16_t isdup; /* TRUE if a duplicate */ 711 } named_group; 712 713 /* Structure for passing "static" information around between the functions 714 doing the compiling, so that they are thread-safe. */ 715 716 typedef struct compile_block { 717 pcre2_real_compile_context *cx; /* Points to the compile context */ 718 const uint8_t *lcc; /* Points to lower casing table */ 719 const uint8_t *fcc; /* Points to case-flipping table */ 720 const uint8_t *cbits; /* Points to character type table */ 721 const uint8_t *ctypes; /* Points to table of type maps */ 722 PCRE2_SPTR start_workspace; /* The start of working space */ 723 PCRE2_SPTR start_code; /* The start of the compiled code */ 724 PCRE2_SPTR start_pattern; /* The start of the pattern */ 725 PCRE2_SPTR end_pattern; /* The end of the pattern */ 726 PCRE2_UCHAR *name_table; /* The name/number table */ 727 PCRE2_SIZE workspace_size; /* Size of workspace */ 728 PCRE2_SIZE small_ref_offset[10]; /* Offsets for \1 to \9 */ 729 PCRE2_SIZE erroroffset; /* Offset of error in pattern */ 730 uint16_t names_found; /* Number of entries so far */ 731 uint16_t name_entry_size; /* Size of each entry */ 732 uint16_t parens_depth; /* Depth of nested parentheses */ 733 uint16_t assert_depth; /* Depth of nested assertions */ 734 open_capitem *open_caps; /* Chain of open capture items */ 735 named_group *named_groups; /* Points to vector in pre-compile */ 736 uint32_t named_group_list_size; /* Number of entries in the list */ 737 uint32_t external_options; /* External (initial) options */ 738 uint32_t external_flags; /* External flag bits to be set */ 739 uint32_t bracount; /* Count of capturing parentheses */ 740 uint32_t lastcapture; /* Last capture encountered */ 741 uint32_t *parsed_pattern; /* Parsed pattern buffer */ 742 uint32_t 
*parsed_pattern_end; /* Parsed pattern should not get here */ 743 uint32_t *groupinfo; /* Group info vector */ 744 uint32_t top_backref; /* Maximum back reference */ 745 uint32_t backref_map; /* Bitmap of low back refs */ 746 uint32_t nltype; /* Newline type */ 747 uint32_t nllen; /* Newline string length */ 748 uint32_t class_range_start; /* Overall class range start */ 749 uint32_t class_range_end; /* Overall class range end */ 750 PCRE2_UCHAR nl[4]; /* Newline string when fixed length */ 751 int max_lookbehind; /* Maximum lookbehind (characters) */ 752 int req_varyopt; /* "After variable item" flag for reqbyte */ 753 BOOL had_accept; /* (*ACCEPT) encountered */ 754 BOOL had_pruneorskip; /* (*PRUNE) or (*SKIP) encountered */ 755 BOOL had_recurse; /* Had a recursion or subroutine call */ 756 BOOL dupnames; /* Duplicate names exist */ 757 } compile_block; 758 759 /* Structure for keeping the properties of the in-memory stack used 760 by the JIT matcher. */ 761 762 typedef struct pcre2_real_jit_stack { 763 pcre2_memctl memctl; 764 void* stack; 765 } pcre2_real_jit_stack; 766 767 /* Structure for items in a linked list that represents an explicit recursive 768 call within the pattern when running pcre_dfa_match(). */ 769 770 typedef struct dfa_recursion_info { 771 struct dfa_recursion_info *prevrec; 772 PCRE2_SPTR subject_position; 773 uint32_t group_num; 774 } dfa_recursion_info; 775 776 /* Structure for "stack" frames that are used for remembering backtracking 777 positions during matching. As these are used in a vector, with the ovector item 778 being extended, the size of the structure must be a multiple of PCRE2_SIZE. The 779 only way to check this at compile time is to force an error by generating an 780 array with a negative size. By putting this in a typedef (which is never used), 781 we don't generate any code when all is well. 
*/ 782 783 typedef struct heapframe { 784 785 /* The first set of fields are variables that have to be preserved over calls 786 to RRMATCH(), but which do not need to be copied to new frames. */ 787 788 PCRE2_SPTR ecode; /* The current position in the pattern */ 789 PCRE2_SPTR temp_sptr[2]; /* Used for short-term PCRE_SPTR values */ 790 PCRE2_SIZE length; /* Used for character, string, or code lengths */ 791 PCRE2_SIZE back_frame; /* Amount to subtract on RRETURN */ 792 PCRE2_SIZE temp_size; /* Used for short-term PCRE2_SIZE values */ 793 uint32_t rdepth; /* "Recursion" depth */ 794 uint32_t group_frame_type; /* Type information for group frames */ 795 uint32_t temp_32[4]; /* Used for short-term 32-bit or BOOL values */ 796 uint8_t return_id; /* Where to go on in internal "return" */ 797 uint8_t op; /* Processing opcode */ 798 799 /* At this point, the structure is 16-bit aligned. On most architectures 800 the alignment requirement for a pointer will ensure that the eptr field below 801 is 32-bit or 64-bit aligned. However, on m68k it is fine to have a pointer 802 that is 16-bit aligned. We must therefore ensure that what comes between here 803 and eptr is an odd multiple of 16 bits so as to get back into 32-bit 804 alignment. This happens naturally when PCRE2_UCHAR is 8 bits wide, but needs 805 fudges in the other cases. In the 32-bit case the padding comes first so that 806 the occu field itself is 32-bit aligned. Without the padding, this structure 807 is no longer a multiple of PCRE2_SIZE on m68k, and the check below fails. 
*/ 808 809 #if PCRE2_CODE_UNIT_WIDTH == 8 810 PCRE2_UCHAR occu[6]; /* Used for other case code units */ 811 #elif PCRE2_CODE_UNIT_WIDTH == 16 812 PCRE2_UCHAR occu[2]; /* Used for other case code units */ 813 uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ 814 #else 815 uint8_t unused[2]; /* Ensure 32-bit alignment (see above) */ 816 PCRE2_UCHAR occu[1]; /* Used for other case code units */ 817 #endif 818 819 /* The rest have to be copied from the previous frame whenever a new frame 820 becomes current. The final field is specified as a large vector so that 821 runtime array bound checks don't catch references to it. However, for any 822 specific call to pcre2_match() the memory allocated for each frame structure 823 allows for exactly the right size ovector for the number of capturing 824 parentheses. (See also the comment for pcre2_real_match_data above.) */ 825 826 PCRE2_SPTR eptr; /* MUST BE FIRST */ 827 PCRE2_SPTR start_match; /* Can be adjusted by \K */ 828 PCRE2_SPTR mark; /* Most recent mark on the success path */ 829 uint32_t current_recurse; /* Current (deepest) recursion number */ 830 uint32_t capture_last; /* Most recent capture */ 831 PCRE2_SIZE last_group_offset; /* Saved offset to most recent group frame */ 832 PCRE2_SIZE offset_top; /* Offset after highest capture */ 833 PCRE2_SIZE ovector[131072]; /* Must be last in the structure */ 834 } heapframe; 835 836 /* This typedef is a check that the size of the heapframe structure is a 837 multiple of PCRE2_SIZE. See various comments above. */ 838 839 typedef char check_heapframe_size[ 840 ((sizeof(heapframe) % sizeof(PCRE2_SIZE)) == 0)? (+1):(-1)]; 841 842 /* Structure for passing "static" information around between the functions 843 doing traditional NFA matching (pcre2_match() and friends). 
*/ 844 845 typedef struct match_block { 846 pcre2_memctl memctl; /* For general use */ 847 PCRE2_SIZE frame_vector_size; /* Size of a backtracking frame */ 848 heapframe *match_frames; /* Points to vector of frames */ 849 heapframe *match_frames_top; /* Points after the end of the vector */ 850 heapframe *stack_frames; /* The original vector on the stack */ 851 PCRE2_SIZE heap_limit; /* As it says */ 852 uint32_t match_limit; /* As it says */ 853 uint32_t match_limit_depth; /* As it says */ 854 uint32_t match_call_count; /* Number of times a new frame is created */ 855 BOOL hitend; /* Hit the end of the subject at some point */ 856 BOOL hasthen; /* Pattern contains (*THEN) */ 857 const uint8_t *lcc; /* Points to lower casing table */ 858 const uint8_t *fcc; /* Points to case-flipping table */ 859 const uint8_t *ctypes; /* Points to table of type maps */ 860 PCRE2_SIZE start_offset; /* The start offset value */ 861 PCRE2_SIZE end_offset_top; /* Highwater mark at end of match */ 862 uint16_t partial; /* PARTIAL options */ 863 uint16_t bsr_convention; /* \R interpretation */ 864 uint16_t name_count; /* Number of names in name table */ 865 uint16_t name_entry_size; /* Size of entry in names table */ 866 PCRE2_SPTR name_table; /* Table of group names */ 867 PCRE2_SPTR start_code; /* For use when recursing */ 868 PCRE2_SPTR start_subject; /* Start of the subject string */ 869 PCRE2_SPTR end_subject; /* End of the subject string */ 870 PCRE2_SPTR end_match_ptr; /* Subject position at end match */ 871 PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ 872 PCRE2_SPTR last_used_ptr; /* Latest consulted character */ 873 PCRE2_SPTR mark; /* Mark pointer to pass back on success */ 874 PCRE2_SPTR nomatch_mark; /* Mark pointer to pass back on failure */ 875 PCRE2_SPTR verb_ecode_ptr; /* For passing back info */ 876 PCRE2_SPTR verb_skip_ptr; /* For passing back a (*SKIP) name */ 877 uint32_t verb_current_recurse; /* Current recurse when (*VERB) happens */ 878 uint32_t 
moptions; /* Match options */ 879 uint32_t poptions; /* Pattern options */ 880 uint32_t skip_arg_count; /* For counting SKIP_ARGs */ 881 uint32_t ignore_skip_arg; /* For re-run when SKIP arg name not found */ 882 uint32_t nltype; /* Newline type */ 883 uint32_t nllen; /* Newline string length */ 884 PCRE2_UCHAR nl[4]; /* Newline string when fixed */ 885 pcre2_callout_block *cb; /* Points to a callout block */ 886 void *callout_data; /* To pass back to callouts */ 887 int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ 888 } match_block; 889 890 /* A similar structure is used for the same purpose by the DFA matching 891 functions. */ 892 893 typedef struct dfa_match_block { 894 pcre2_memctl memctl; /* For general use */ 895 PCRE2_SPTR start_code; /* Start of the compiled pattern */ 896 PCRE2_SPTR start_subject ; /* Start of the subject string */ 897 PCRE2_SPTR end_subject; /* End of subject string */ 898 PCRE2_SPTR start_used_ptr; /* Earliest consulted character */ 899 PCRE2_SPTR last_used_ptr; /* Latest consulted character */ 900 const uint8_t *tables; /* Character tables */ 901 PCRE2_SIZE start_offset; /* The start offset value */ 902 PCRE2_SIZE heap_limit; /* As it says */ 903 PCRE2_SIZE heap_used; /* As it says */ 904 uint32_t match_limit; /* As it says */ 905 uint32_t match_limit_depth; /* As it says */ 906 uint32_t match_call_count; /* Number of calls of internal function */ 907 uint32_t moptions; /* Match options */ 908 uint32_t poptions; /* Pattern options */ 909 uint32_t nltype; /* Newline type */ 910 uint32_t nllen; /* Newline string length */ 911 PCRE2_UCHAR nl[4]; /* Newline string when fixed */ 912 uint16_t bsr_convention; /* \R interpretation */ 913 pcre2_callout_block *cb; /* Points to a callout block */ 914 void *callout_data; /* To pass back to callouts */ 915 int (*callout)(pcre2_callout_block *,void *); /* Callout function or NULL */ 916 dfa_recursion_info *recursive; /* Linked list of recursion data */ 917 } 
dfa_match_block; 918 919 #endif /* PCRE2_PCRE2TEST */ 920 921 /* End of pcre2_intmodedep.h */ 922