1 /************************************************* 2 * Perl-Compatible Regular Expressions * 3 *************************************************/ 4 5 /* PCRE is a library of functions to support regular expressions whose syntax 6 and semantics are as close as possible to those of the Perl 5 language. 7 8 Written by Philip Hazel 9 Original API code Copyright (c) 1997-2012 University of Cambridge 10 New API code Copyright (c) 2016 University of Cambridge 11 12 ----------------------------------------------------------------------------- 13 Redistribution and use in source and binary forms, with or without 14 modification, are permitted provided that the following conditions are met: 15 16 * Redistributions of source code must retain the above copyright notice, 17 this list of conditions and the following disclaimer. 18 19 * Redistributions in binary form must reproduce the above copyright 20 notice, this list of conditions and the following disclaimer in the 21 documentation and/or other materials provided with the distribution. 22 23 * Neither the name of the University of Cambridge nor the names of its 24 contributors may be used to endorse or promote products derived from 25 this software without specific prior written permission. 26 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 POSSIBILITY OF SUCH DAMAGE. 38 ----------------------------------------------------------------------------- 39 */ 40 41 42 /* This module contains mode-dependent macro and structure definitions. The 43 file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined. 44 These mode-dependent items are kept in a separate file so that they can also be 45 #included multiple times for different code unit widths by pcre2test in order 46 to have access to the hidden structures at all supported widths. 47 48 Some of the mode-dependent macros are required at different widths for 49 different parts of the pcre2test code (in particular, the included 50 pcre_printint.c file). We undefine them here so that they can be re-defined for 51 multiple inclusions. Not all of these are used in pcre2test, but it's easier 52 just to undefine them all. 
*/

/* Remove any definitions left over from a previous inclusion of this file at
a different code unit width. Macros that are defined in this file but do not
appear in this list (for example the UCHAR21 family and SUPPORT_WIDE_CHARS)
are given the same replacement text by every branch that defines them. */

#undef ACROSSCHAR
#undef BACKCHAR
#undef BYTES2CU
#undef CU2BYTES
#undef FORWARDCHAR
#undef FORWARDCHARTEST
#undef GET
#undef GET2
#undef GETCHAR
#undef GETCHARINC
#undef GETCHARINCTEST
#undef GETCHARLEN
#undef GETCHARLENTEST
#undef GETCHARTEST
#undef GET_EXTRALEN
#undef HAS_EXTRALEN
#undef IMM2_SIZE
#undef MAX_255
#undef MAX_MARK
#undef MAX_PATTERN_SIZE
#undef MAX_UTF_SINGLE_CU
#undef NOT_FIRSTCU
#undef PUT
#undef PUT2
#undef PUT2INC
#undef PUTCHAR
#undef PUTINC
#undef TABLE_GET



/* -------------------------- MACROS ----------------------------- */

/* PCRE keeps offsets in its compiled code as at least 16-bit quantities
(always stored in big-endian order in 8-bit mode) by default. These are used,
for example, to link from the start of a subpattern to its alternatives and its
end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
to around 64K, which is big enough for almost everybody. However, I received a
request for an even bigger limit. For this reason, and also to make the code
easier to maintain, the storing and loading of offsets from the compiled code
unit string is now handled by the macros that are defined here.

The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
values of 3 or 4 are also supported.
*/ 98 99 /* ------------------- 8-bit support ------------------ */ 100 101 #if PCRE2_CODE_UNIT_WIDTH == 8 102 103 #if LINK_SIZE == 2 104 #define PUT(a,n,d) \ 105 (a[n] = (PCRE2_UCHAR)((d) >> 8)), \ 106 (a[(n)+1] = (PCRE2_UCHAR)((d) & 255)) 107 #define GET(a,n) \ 108 (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) 109 #define MAX_PATTERN_SIZE (1 << 16) 110 111 #elif LINK_SIZE == 3 112 #define PUT(a,n,d) \ 113 (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ 114 (a[(n)+1] = (PCRE2_UCHAR)((d) >> 8)), \ 115 (a[(n)+2] = (PCRE2_UCHAR)((d) & 255)) 116 #define GET(a,n) \ 117 (unsigned int)(((a)[n] << 16) | ((a)[(n)+1] << 8) | (a)[(n)+2]) 118 #define MAX_PATTERN_SIZE (1 << 24) 119 120 #elif LINK_SIZE == 4 121 #define PUT(a,n,d) \ 122 (a[n] = (PCRE2_UCHAR)((d) >> 24)), \ 123 (a[(n)+1] = (PCRE2_UCHAR)((d) >> 16)), \ 124 (a[(n)+2] = (PCRE2_UCHAR)((d) >> 8)), \ 125 (a[(n)+3] = (PCRE2_UCHAR)((d) & 255)) 126 #define GET(a,n) \ 127 (unsigned int)(((a)[n] << 24) | ((a)[(n)+1] << 16) | ((a)[(n)+2] << 8) | (a)[(n)+3]) 128 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ 129 130 #else 131 #error LINK_SIZE must be 2, 3, or 4 132 #endif 133 134 135 /* ------------------- 16-bit support ------------------ */ 136 137 #elif PCRE2_CODE_UNIT_WIDTH == 16 138 139 #if LINK_SIZE == 2 140 #undef LINK_SIZE 141 #define LINK_SIZE 1 142 #define PUT(a,n,d) \ 143 (a[n] = (d)) 144 #define GET(a,n) \ 145 (a[n]) 146 #define MAX_PATTERN_SIZE (1 << 16) 147 148 #elif LINK_SIZE == 3 || LINK_SIZE == 4 149 #undef LINK_SIZE 150 #define LINK_SIZE 2 151 #define PUT(a,n,d) \ 152 (a[n] = (PCRE2_UCHAR)((d) >> 16)), \ 153 (a[(n)+1] = (PCRE2_UCHAR)((d) & 65535)) 154 #define GET(a,n) \ 155 (unsigned int)(((a)[n] << 16) | (a)[(n)+1]) 156 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ 157 158 #else 159 #error LINK_SIZE must be 2, 3, or 4 160 #endif 161 162 163 /* ------------------- 32-bit support ------------------ */ 164 165 #elif PCRE2_CODE_UNIT_WIDTH == 32 166 #undef LINK_SIZE 167 #define LINK_SIZE 1 168 #define 
PUT(a,n,d) \ 169 (a[n] = (d)) 170 #define GET(a,n) \ 171 (a[n]) 172 #define MAX_PATTERN_SIZE (1 << 30) /* Keep it positive */ 173 174 #else 175 #error Unsupported compiling mode 176 #endif 177 178 179 /* --------------- Other mode-specific macros ----------------- */ 180 181 /* PCRE uses some other (at least) 16-bit quantities that do not change when 182 the size of offsets changes. There are used for repeat counts and for other 183 things such as capturing parenthesis numbers in back references. 184 185 Define the number of code units required to hold a 16-bit count/offset, and 186 macros to load and store such a value. For reasons that I do not understand, 187 the expression in the 8-bit GET2 macro is treated by gcc as a signed 188 expression, even when a is declared as unsigned. It seems that any kind of 189 arithmetic results in a signed value. Hence the cast. */ 190 191 #if PCRE2_CODE_UNIT_WIDTH == 8 192 #define IMM2_SIZE 2 193 #define GET2(a,n) (unsigned int)(((a)[n] << 8) | (a)[(n)+1]) 194 #define PUT2(a,n,d) a[n] = (d) >> 8, a[(n)+1] = (d) & 255 195 196 #else /* Code units are 16 or 32 bits */ 197 #define IMM2_SIZE 1 198 #define GET2(a,n) a[n] 199 #define PUT2(a,n,d) a[n] = d 200 #endif 201 202 /* Other macros that are different for 8-bit mode. The MAX_255 macro checks 203 whether its argument is less than 256. The maximum length of a MARK name must 204 fit in one code unit; currently it is set to 255 or 65535. The TABLE_GET macro 205 is used to access elements of tables containing exactly 256 items. When code 206 points can be greater than 255, a check is needed before accessing these 207 tables. 
*/

/* In 8-bit mode MAX_255 is the constant TRUE and TABLE_GET indexes the table
unconditionally, so its "default" argument is not used. SUPPORT_WIDE_CHARS is
defined in 8-bit mode only when SUPPORT_UNICODE is defined. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define MAX_255(c) TRUE
#define MAX_MARK ((1u << 8) - 1)
#ifdef SUPPORT_UNICODE
#define SUPPORT_WIDE_CHARS
#endif /* SUPPORT_UNICODE */
#define TABLE_GET(c, table, default) ((table)[c])

/* In the wider modes a code unit can exceed 255, so TABLE_GET yields "default"
instead of indexing the table when c is greater than 255. Note that c is
expanded twice by this form of TABLE_GET. SUPPORT_WIDE_CHARS is always
defined here. */

#else  /* Code units are 16 or 32 bits */
#define MAX_255(c) ((c) <= 255u)
#define MAX_MARK ((1u << 16) - 1)
#define SUPPORT_WIDE_CHARS
#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))
#endif



/* ----------------- Character-handling macros ----------------- */

/* There is a proposed future special "UTF-21" mode, in which only the lowest
21 bits of a 32-bit character are interpreted as UTF, with the remaining 11
high-order bits available to the application for other uses. In preparation for
the future implementation of this mode, there are macros that load a data item
and, if in this special mode, mask it to 21 bits. These macros all have names
starting with UCHAR21. In all other modes, including the normal 32-bit
library, the macros all have the same simple definitions. When the new mode is
implemented, it is expected that these definitions will be varied appropriately
using #ifdef when compiling the library that supports the special mode. */

/* As currently defined, the plain and TEST forms are identical: each just
reads the code unit that eptr points to. The INC forms also post-increment
eptr, so eptr must be a modifiable lvalue for those. */

#define UCHAR21(eptr)        (*(eptr))
#define UCHAR21TEST(eptr)    (*(eptr))
#define UCHAR21INC(eptr)     (*(eptr)++)
#define UCHAR21INCTEST(eptr) (*(eptr)++)

/* When UTF encoding is being used, a character is no longer just a single
byte in 8-bit mode or a single short in 16-bit mode. The macros for character
handling generate simple sequences when used in the basic mode, and more
complicated ones for UTF characters. GETCHARLENTEST and other macros are not
used when UTF is not supported. To make sure they can never even appear when
UTF support is omitted, we don't even define them.
*/

#ifndef SUPPORT_UNICODE

/* Without Unicode support every character is exactly one code unit, so each
GETCHARxxx macro is a plain load (with a post-increment of eptr in the INC
forms) and PUTCHAR stores one code unit and yields 1. GETCHARLEN leaves len
untouched. The GETCHARxxx expansions already end in a semicolon. The
commented-out names are the macros that are deliberately left undefined in
this configuration. */

/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCU(c) */
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARTEST(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define PUTCHAR(c, p) (*p = c, 1)
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define FORWARDCHARTEST(eptr,end) */
/* #define ACROSSCHAR(condition, eptr, action) */

#else   /* SUPPORT_UNICODE */

/* ------------------- 8-bit support  ------------------ */

/* Notes that apply to the UTF-8 macros in this section:

  . GETUTF8, HASUTF8EXTRALEN and PRIV are not defined in this file; they are
    expected to be provided by the including header.
  . The GETCHARxxx macros expand to two statements (an assignment followed by
    an "if"), so they must not be used as the unbraced body of a conditional
    or loop.
  . The xxxTEST variants refer to a variable called "utf", which must be in
    scope wherever they are used.
  . The test "c >= 0xc0u" selects the code units that start a multi-unit
    UTF-8 sequence; smaller values are taken as complete characters. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 127

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)

/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
Otherwise it has an undefined behaviour. The result is looked up in
PRIV(utf8_table4), indexed by the low six bits of c. */

#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence, that is, if its top two bits are binary 10. */

#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if (c >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *eptr; \
  if (utf && c >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, advancing the pointer.
This is called when we
know we are in UTF-8 mode. GETUTF8INC and GETUTF8LEN, used below, are not
defined in this file. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (c >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *eptr; \
  if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. The
expansion is a bare "while" statement with no trailing semicolon and no lower
bound on the pointer. */

#define BACKCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr--

/* Same as above, just in the other direction. FORWARDCHAR has no upper bound;
FORWARDCHARTEST stops when eptr reaches end. */
#define FORWARDCHAR(eptr) while((*eptr & 0xc0u) == 0x80u) eptr++
#define FORWARDCHARTEST(eptr,end) while(eptr < end && (*eptr & 0xc0u) == 0x80u) eptr++

/* Same as above, but it allows a fully customizable form: "action" is repeated
while "condition" holds and the tested value is a continuation code unit.
NOTE(review): unlike BACKCHAR and FORWARDCHAR, the eptr argument is masked
directly and is not dereferenced here, so the caller apparently has to pass
the code unit value itself (for example *ptr) rather than the pointer -
confirm against the call sites. */
#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && ((eptr) & 0xc0u) == 0x80u) action

/* Deposit a character into memory, returning the number of code units. When
"utf" is set and c is greater than MAX_UTF_SINGLE_CU the work is handed to
PRIV(ord2utf), which is not defined in this file; otherwise one code unit is
stored and the result is 1. The arguments c and p are not parenthesized in the
expansion, and "utf" must be in scope at the point of use. */

#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*p = c, 1))


/* ------------------- 16-bit support  ------------------ */

/* As in the 8-bit section, the GETCHARxxx macros here expand to an assignment
followed by an "if", and the xxxTEST variants need a variable called "utf" in
scope. */

#elif PCRE2_CODE_UNIT_WIDTH == 16
#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 65535

/* Tests whether the code point needs extra characters to decode. This is true
for values in the range 0xd800 to 0xdbff (high surrogates). */

#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)

/* Returns with the additional number of characters if IS_MULTICHAR(c) is TRUE.
Otherwise it has an undefined behaviour. */

#define GET_EXTRALEN(c) 1

/* Returns TRUE, if the given value is not the first code unit of a UTF
sequence. This is true for values in the range 0xdc00 to 0xdfff (low
surrogates). */

#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. On entry c holds the high surrogate; its low ten bits
are combined with the low ten bits of eptr[1] and 0x10000 is added. */

#define GETUTF16(c, eptr) \
   { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *eptr; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. The low surrogate is read through *eptr++, so eptr must already
point past the high surrogate when this is used. */

#define GETUTF16INC(c, eptr) \
   { c = (((c & 0x3ffu) << 10) | (*eptr++ & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode.
*/

/* The first statement steps eptr past the first code unit; if that unit is a
high surrogate, GETUTF16INC then consumes the following unit as well. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. A variable called
"utf" must be in scope. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */

#define GETUTF16LEN(c, eptr, len) \
   { c = (((c & 0x3ffu) << 10) | (eptr[1] & 0x3ffu)) + 0x10000u; len++; }

/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *eptr; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code.

A UTF-16 character has at most two code units, so a single step suffices and
the expansion is a bare "if" statement with no trailing semicolon. An "else"
written directly after a use of one of these macros would therefore attach to
the macro's own "if". */

#define BACKCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr--

/* Same as above, just in the other direction. */
#define FORWARDCHAR(eptr) if ((*eptr & 0xfc00u) == 0xdc00u) eptr++
#define FORWARDCHARTEST(eptr,end) if (eptr < end && (*eptr & 0xfc00u) == 0xdc00u) eptr++

/* Same as above, but it allows a fully customizable form.
*/
/* "action" is performed at most once, when "condition" holds and the tested
value is a low surrogate. NOTE(review): the eptr argument is masked directly
and is not dereferenced here, so the caller apparently has to pass the code
unit value itself rather than the pointer - confirm against the call sites. */
#define ACROSSCHAR(condition, eptr, action) \
  if ((condition) && ((eptr) & 0xfc00u) == 0xdc00u) action

/* Deposit a character into memory, returning the number of code units. When
"utf" is set and c is greater than MAX_UTF_SINGLE_CU the work is handed to
PRIV(ord2utf), which is not defined in this file; otherwise one code unit is
stored and the result is 1. */

#define PUTCHAR(c, p) ((utf && c > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)(c,p) : (*p = c, 1))


/* ------------------- 32-bit support  ------------------ */

#else

/* These are trivial for the 32-bit library, since all UTF-32 characters fit
into one PCRE2_UCHAR unit. Consequently the three tests below are the constant
0, and none of the GETCHARxxx macros in this section refers to "utf". */

#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCU(c) (0)

/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr);

/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr);

/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */

#define GETCHARINC(c, eptr) \
  c = *((eptr)++);

/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *((eptr)++);

/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */

#define GETCHARLEN(c, eptr, len) \
  GETCHAR(c, eptr)

/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode.
*/

#define GETCHARLENTEST(c, eptr, len) \
  GETCHARTEST(c, eptr)

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-32 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-32 only
code.

These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */

#define BACKCHAR(eptr) do { } while (0)

/* Same as above, just in the other direction. */

#define FORWARDCHAR(eptr) do { } while (0)
#define FORWARDCHARTEST(eptr,end) do { } while (0)

/* Same as above, but it allows a fully customizable form. In this mode neither
"condition" nor "action" appears in the expansion, so neither is evaluated. */

#define ACROSSCHAR(condition, eptr, action) do { } while (0)

/* Deposit a character into memory, returning the number of code units, which
is always 1 in this mode. */

#define PUTCHAR(c, p) (*p = c, 1)

#endif  /* UTF-32 character handling */
#endif  /* SUPPORT_UNICODE */


/* Mode-dependent macros that have the same definition in all modes.

CU2BYTES converts a count of code units to a count of bytes, and BYTES2CU does
the reverse using integer division, so a byte count that is not a multiple of
the code unit size is rounded down. PUTINC and PUT2INC store a value with PUT
or PUT2 and then advance the pointer a by LINK_SIZE or IMM2_SIZE code units;
both expand to an unparenthesized comma expression and a must be a modifiable
pointer. */

#define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
#define BYTES2CU(x)     ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))
#define PUTINC(a,n,d)   PUT(a,n,d), a += LINK_SIZE
#define PUT2INC(a,n,d)  PUT2(a,n,d), a += IMM2_SIZE


/* ----------------------- HIDDEN STRUCTURES ----------------------------- */

/* NOTE: All these structures *must* start with a pcre2_memctl structure. The
code that uses them is simpler because it assumes this. */

/* The real general context structure. At present it holds only data for custom
memory control.
*/

typedef struct pcre2_real_general_context {
  pcre2_memctl memctl;               /* Memory control fields; must be first */
} pcre2_real_general_context;

/* The real compile context structure */

typedef struct pcre2_real_compile_context {
  pcre2_memctl memctl;               /* Memory control fields; must be first */
  /* NOTE(review): stack_guard looks like an optional user callback that is
  passed stack_guard_data as its second argument - confirm in the compiler. */
  int (*stack_guard)(uint32_t, void *);
  void *stack_guard_data;
  const uint8_t *tables;             /* The character tables */
  /* NOTE(review): presumably an upper limit on the length of a pattern -
  confirm the units and the "no limit" value. */
  PCRE2_SIZE max_pattern_length;
  uint16_t bsr_convention;           /* What \R matches */
  uint16_t newline_convention;       /* What is a newline? */
  uint32_t parens_nest_limit;        /* Presumably the parentheses nesting limit */
} pcre2_real_compile_context;

/* The real match context structure. The stack_memctl field exists only when
HEAP_MATCH_RECURSE is defined, and the two JIT callback fields exist only when
SUPPORT_JIT is defined, so the layout depends on the build configuration. */

typedef struct pcre2_real_match_context {
  pcre2_memctl memctl;               /* Memory control fields; must be first */
#ifdef HEAP_MATCH_RECURSE
  pcre2_memctl stack_memctl;
#endif
#ifdef SUPPORT_JIT
  pcre2_jit_callback jit_callback;
  void *jit_callback_data;
#endif
  int    (*callout)(pcre2_callout_block *, void *);  /* Callout function */
  void    *callout_data;             /* Passed as the callout's second argument */
  PCRE2_SIZE offset_limit;           /* NOTE(review): semantics not visible here */
  uint32_t match_limit;
  uint32_t recursion_limit;
} pcre2_real_match_context;

/* The real compiled code structure. The type for the blocksize field is
defined specially because it is required in pcre2_serialize_decode() when
copying the size from possibly unaligned memory into a variable of the same
type. Use a macro rather than a typedef to avoid compiler warnings when this
file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
here.)
*/

/* Each macro is undefined before it is defined so that including this file
more than once does not produce a redefinition. */

#undef  CODE_BLOCKSIZE_TYPE
#define CODE_BLOCKSIZE_TYPE size_t

/* LOOKBEHIND_MAX is the largest value that fits in the uint16_t
max_lookbehind field of the structure below. */

#undef  LOOKBEHIND_MAX
#define LOOKBEHIND_MAX UINT16_MAX

typedef struct pcre2_real_code {
  pcre2_memctl memctl;            /* Memory control fields */
  const uint8_t *tables;          /* The character tables */
  void    *executable_jit;        /* Pointer to JIT code */
  uint8_t  start_bitmap[32];      /* Bitmap for starting code unit < 256 */
  CODE_BLOCKSIZE_TYPE blocksize;  /* Total (bytes) that was malloc-ed */
  uint32_t magic_number;          /* Paranoid and endianness check */
  uint32_t compile_options;       /* Options passed to pcre2_compile() */
  uint32_t overall_options;       /* Options after processing the pattern */
  uint32_t flags;                 /* Various state flags */
  uint32_t limit_match;           /* Limit set in the pattern */
  uint32_t limit_recursion;       /* Limit set in the pattern */
  uint32_t first_codeunit;        /* Starting code unit */
  uint32_t last_codeunit;         /* This codeunit must be seen */
  uint16_t bsr_convention;        /* What \R matches */
  uint16_t newline_convention;    /* What is a newline? */
  uint16_t max_lookbehind;        /* Longest lookbehind (characters) */
  uint16_t minlength;             /* Minimum length of match */
  uint16_t top_bracket;           /* Highest numbered group */
  uint16_t top_backref;           /* Highest numbered back reference */
  uint16_t name_entry_size;       /* Size (code units) of table entries */
  uint16_t name_count;            /* Number of name entries in the table */
} pcre2_real_code;

/* The real match data structure.
*/

/* NOTE(review): ovector is declared with a single element and is the last
member; presumably the block is allocated large enough to hold oveccount
pairs - confirm against the code that creates match data blocks. */

typedef struct pcre2_real_match_data {
  pcre2_memctl     memctl;        /* Memory control fields; must be first */
  const pcre2_real_code *code;    /* The pattern used for the match */
  PCRE2_SPTR       subject;       /* The subject that was matched */
  PCRE2_SPTR       mark;          /* Pointer to last mark */
  PCRE2_SIZE       leftchar;      /* Offset to leftmost code unit */
  PCRE2_SIZE       rightchar;     /* Offset to rightmost code unit */
  PCRE2_SIZE       startchar;     /* Offset to starting code unit */
  uint16_t         matchedby;     /* Type of match (normal, JIT, DFA) */
  uint16_t         oveccount;     /* Number of pairs */
  int              rc;            /* The return code from the match */
  PCRE2_SIZE       ovector[1];    /* The first element of the offset vector */
} pcre2_real_match_data;


/* ----------------------- PRIVATE STRUCTURES ----------------------------- */

/* These structures are not needed for pcre2test. */

#ifndef PCRE2_PCRE2TEST

/* Structure for checking for mutual recursion when scanning compiled code. */

typedef struct recurse_check {
  struct recurse_check *prev;     /* Link to the previous record in the chain */
  PCRE2_SPTR group;               /* Pointer identifying the group */
} recurse_check;

/* Structure for building a cache when filling in recursion offsets. */

typedef struct recurse_cache {
  PCRE2_SPTR group;               /* Pointer identifying the group */
  int recno;                      /* Presumably the group's number - confirm */
} recurse_cache;

/* Structure for maintaining a chain of pointers to the currently incomplete
branches, for testing for left recursion while compiling. */

typedef struct branch_chain {
  struct branch_chain *outer;     /* Link to the enclosing chain entry */
  PCRE2_UCHAR *current_branch;    /* The incomplete branch for this entry */
} branch_chain;

/* Structure for building a list of named groups during the first pass of
compiling. */

typedef struct named_group {
  PCRE2_SPTR   name;          /* Points to the name in the pattern */
  uint32_t     number;        /* Group number */
  uint16_t     length;        /* Length of the name */
  uint16_t     isdup;         /* TRUE if a duplicate */
} named_group;

/* Structure for passing "static" information around between the functions
doing the compiling, so that they are thread-safe.
*/

/* The open_capitem type used below is not defined in this file; it has to be
declared before this file is included. */

typedef struct compile_block {
  pcre2_real_compile_context *cx;  /* Points to the compile context */
  const uint8_t *lcc;              /* Points to lower casing table */
  const uint8_t *fcc;              /* Points to case-flipping table */
  const uint8_t *cbits;            /* Points to character type table */
  const uint8_t *ctypes;           /* Points to table of type maps */
  PCRE2_SPTR start_workspace;      /* The start of working space */
  PCRE2_SPTR start_code;           /* The start of the compiled code */
  PCRE2_SPTR start_pattern;        /* The start of the pattern */
  PCRE2_SPTR end_pattern;          /* The end of the pattern */
  PCRE2_SPTR nestptr[2];           /* Pointer(s) saved for string substitution */
  PCRE2_UCHAR *name_table;         /* The name/number table */
  size_t workspace_size;           /* Size of workspace */
  uint16_t names_found;            /* Number of entries so far */
  uint16_t name_entry_size;        /* Size of each entry */
  open_capitem *open_caps;         /* Chain of open capture items */
  named_group *named_groups;       /* Points to vector in pre-compile */
  uint32_t named_group_list_size;  /* Number of entries in the list */
  uint32_t external_options;       /* External (initial) options */
  uint32_t external_flags;         /* External flag bits to be set */
  uint32_t bracount;               /* Count of capturing parens as we compile */
  uint32_t final_bracount;         /* Saved value after first pass */
  uint32_t *groupinfo;             /* Group info vector */
  uint32_t top_backref;            /* Maximum back reference */
  uint32_t backref_map;            /* Bitmap of low back refs */
  uint32_t nltype;                 /* Newline type */
  uint32_t nllen;                  /* Newline string length */
  PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
  int  max_lookbehind;             /* Maximum lookbehind (characters) */
  int  parens_depth;               /* Depth of nested parentheses */
  int  assert_depth;               /* Depth of nested assertions */
  int  req_varyopt;                /* "After variable item" flag for reqbyte */
  BOOL had_accept;                 /* (*ACCEPT) encountered */
  BOOL had_pruneorskip;            /* (*PRUNE) or (*SKIP) encountered */
  BOOL had_recurse;                /* Had a recursion or subroutine call */
  BOOL check_lookbehind;           /* Lookbehinds need later checking */
  BOOL dupnames;                   /* Duplicate names exist */
  BOOL iscondassert;               /* Next assert is a condition */
} compile_block;

/* Structure for keeping the properties of the in-memory stack used
by the JIT matcher. */

typedef struct pcre2_real_jit_stack {
  pcre2_memctl memctl;             /* Memory control fields */
  void* stack;                     /* Opaque pointer to the stack itself */
} pcre2_real_jit_stack;

/* Structure for keeping a chain of heap blocks used for saving ovectors
during pattern recursion when the ovector is larger than can be saved on
the system stack. NOTE(review): saved_ovec is declared with one element;
presumably each block is allocated with room for the whole saved vector -
confirm against the allocating code. */

typedef struct ovecsave_frame {
  struct ovecsave_frame *next;     /* Next frame on free chain */
  PCRE2_SIZE saved_ovec[1];        /* First vector element */
} ovecsave_frame;

/* Structure for items in a linked list that represents an explicit recursive
call within the pattern; used by pcre2_match(). */

typedef struct recursion_info {
  struct recursion_info *prevrec;  /* Previous recursion record (or NULL) */
  unsigned int group_num;          /* Number of group that was called */
  PCRE2_SIZE *ovec_save;           /* Pointer to saved ovector frame */
  uint32_t saved_capture_last;     /* Last capture number */
  PCRE2_SPTR subject_position;     /* Position at start of recursion */
} recursion_info;

/* A similar structure for pcre2_dfa_match(). The fields have the same names
as the corresponding ones in recursion_info. */

typedef struct dfa_recursion_info {
  struct dfa_recursion_info *prevrec;
  PCRE2_SPTR subject_position;
  uint32_t group_num;
} dfa_recursion_info;

/* Structure for building a chain of data for holding the values of the subject
pointer at the start of each subpattern, so as to detect when an empty string
has been matched by a subpattern - to break infinite loops; used by
pcre2_match().
*/

typedef struct eptrblock {
  struct eptrblock *epb_prev;      /* Link to the previous block in the chain */
  PCRE2_SPTR epb_saved_eptr;       /* The saved subject pointer value */
} eptrblock;

/* Structure for passing "static" information around between the functions
doing traditional NFA matching (pcre2_match() and friends). The stack_memctl
and match_frames_base fields exist only when HEAP_MATCH_RECURSE is defined, so
the layout depends on the build configuration. */

typedef struct match_block {
  pcre2_memctl memctl;            /* For general use */
#ifdef HEAP_MATCH_RECURSE
  pcre2_memctl stack_memctl;      /* For "stack" frames */
#endif
  /* NOTE(review): the next three fields are presumably a running count of
  match() calls and the two limits it and the recursion depth are checked
  against - confirm in the matcher. */
  uint32_t match_call_count;      /* As it says */
  uint32_t match_limit;           /* As it says */
  uint32_t match_limit_recursion; /* As it says */
  BOOL hitend;                    /* Hit the end of the subject at some point */
  BOOL hasthen;                   /* Pattern contains (*THEN) */
  const uint8_t *lcc;             /* Points to lower casing table */
  const uint8_t *fcc;             /* Points to case-flipping table */
  const uint8_t *ctypes;          /* Points to table of type maps */
  PCRE2_SIZE *ovector;            /* Pointer to the offset vector */
  PCRE2_SIZE offset_end;          /* One past the end */
  PCRE2_SIZE offset_max;          /* The maximum usable for return data */
  PCRE2_SIZE start_offset;        /* The start offset value */
  PCRE2_SIZE end_offset_top;      /* Highwater mark at end of match */
  uint16_t partial;               /* PARTIAL options */
  uint16_t bsr_convention;        /* \R interpretation */
  uint16_t name_count;            /* Number of names in name table */
  uint16_t name_entry_size;       /* Size of entry in names table */
  PCRE2_SPTR name_table;          /* Table of group names */
  PCRE2_SPTR start_code;          /* For use when recursing */
  PCRE2_SPTR start_subject;       /* Start of the subject string */
  PCRE2_SPTR end_subject;         /* End of the subject string */
  PCRE2_SPTR start_match_ptr;     /* Start of matched string */
  PCRE2_SPTR end_match_ptr;       /* Subject position at end match */
  PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
  PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
  PCRE2_SPTR mark;                /* Mark pointer to pass back on success */
  PCRE2_SPTR nomatch_mark;        /* Mark pointer to pass back on failure */
  PCRE2_SPTR once_target;         /* Where to back up to for atomic groups */
  uint32_t moptions;              /* Match options */
  uint32_t poptions;              /* Pattern options */
  uint32_t capture_last;          /* Most recent capture number + overflow flag */
  uint32_t skip_arg_count;        /* For counting SKIP_ARGs */
  uint32_t ignore_skip_arg;       /* For re-run when SKIP arg name not found */
  uint32_t match_function_type;   /* Set for certain special calls of match() */
  uint32_t nltype;                /* Newline type */
  uint32_t nllen;                 /* Newline string length */
  PCRE2_UCHAR nl[4];              /* Newline string when fixed */
  eptrblock *eptrchain;           /* Chain of eptrblocks for tail recursions */
  recursion_info *recursive;      /* Linked list of recursion data */
  ovecsave_frame *ovecsave_chain; /* Linked list of free ovecsave blocks */
  void  *callout_data;            /* To pass back to callouts */
  int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
#ifdef HEAP_MATCH_RECURSE
  void  *match_frames_base;       /* For remembering malloc'd frames */
#endif
} match_block;

/* A similar structure is used for the same purpose by the DFA matching
functions.
*/

/* Unlike match_block, this structure has a single "tables" pointer instead of
separate lcc, fcc and ctypes pointers, and it has no conditionally compiled
fields. */

typedef struct dfa_match_block {
  pcre2_memctl memctl;            /* For general use */
  PCRE2_SPTR start_code;          /* Start of the compiled pattern */
  PCRE2_SPTR start_subject ;      /* Start of the subject string */
  PCRE2_SPTR end_subject;         /* End of subject string */
  PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
  PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
  const uint8_t *tables;          /* Character tables */
  PCRE2_SIZE start_offset;        /* The start offset value */
  uint32_t moptions;              /* Match options */
  uint32_t poptions;              /* Pattern options */
  uint32_t nltype;                /* Newline type */
  uint32_t nllen;                 /* Newline string length */
  PCRE2_UCHAR nl[4];              /* Newline string when fixed */
  uint16_t bsr_convention;        /* \R interpretation */
  void *callout_data;             /* To pass back to callouts */
  int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
  dfa_recursion_info *recursive;  /* Linked list of recursion data */
} dfa_match_block;

#endif  /* PCRE2_PCRE2TEST */

/* End of pcre2_intmodedep.h */