# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#*****************************************************************************
#
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  regexcst.txt
#  ICU Regular Expression Parser State Table
#
#     This state table is used when reading and parsing a regular expression pattern.
#     The pattern parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the regex pattern grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
#     that are then built with the pattern parser.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RegexCompile::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.
#            If the current character being scanned matches, perform the actions
#            and go to the state specified on this line.  The input character is
#            tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.
#            The first match wins.
#




#
#  start         Initial state; the scan position is at the beginning of the pattern.
#
start:
    default          term                                      doPatStart




#
#  term          At a position where the start of most pattern items can be accepted.
#
term:
    quoted        n  expr-quant                                doLiteralChar
    rule_char     n  expr-quant                                doLiteralChar
    '['           n  set-open                ^set-finish       doSetBegin
    '('           n  open-paren
    '.'           n  expr-quant                                doDotAny
    '^'           n  expr-quant                                doCaret
    '$'           n  expr-quant                                doDollar
    '\'           n  backslash
    '|'           n  term                                      doOrOperator
    ')'           n  pop                                       doCloseParen
    eof              term                                      doPatFinish
    default          errorDeath                                doRuleError



#
#  expr-quant    A term has just been scanned; look for the optional trailing
#                quantifier - *, +, ?, *?, etc.
#
expr-quant:
    '*'           n  quant-star
    '+'           n  quant-plus
    '?'           n  quant-opt
    '{'           n  interval-open                             doIntervalInit
    '('           n  open-paren-quant
    default          expr-cont


#
#  expr-cont     Expression, continuation.  At a point where additional terms are
#                allowed, but not required.  No quantifiers are accepted here.
#
expr-cont:
    '|'           n  term                                      doOrOperator
    ')'           n  pop                                       doCloseParen
    default          term


#
#  open-paren-quant   Special case handling for comments appearing before a quantifier,
#                     e.g.   x(?#comment )*
#                     Open parens seen in expr-quant come here; anything other than a
#                     (?# comment branches into the normal parenthesis sequence as
#                     quickly as possible.
#
open-paren-quant:
    '?'           n  open-paren-quant2                         doSuppressComments
    default          open-paren

open-paren-quant2:
    '#'           n  paren-comment           ^expr-quant
    default          open-paren-extended


#
#  open-paren    We've got an open paren.
#                We need to scan further to determine what kind of group it is -
#                plain (, (?:, (?>, or whatever.
#
open-paren:
    '?'           n  open-paren-extended                       doSuppressComments
    default          term                    ^expr-quant       doOpenCaptureParen

#
#  open-paren-extended   The two characters "(?" have been scanned; the next character
#                        selects which extended construct this is.  The flag letters
#                        are not consumed here (no 'n'); paren-flag consumes them.
#
open-paren-extended:
    ':'           n  term                    ^expr-quant       doOpenNonCaptureParen   #  (?:
    '>'           n  term                    ^expr-quant       doOpenAtomicParen       #  (?>
    '='           n  term                    ^expr-cont        doOpenLookAhead         #  (?=
    '!'           n  term                    ^expr-cont        doOpenLookAheadNeg      #  (?!
    '<'           n  open-paren-lookbehind
    '#'           n  paren-comment           ^term
    'i'              paren-flag                                doBeginMatchMode
    'd'              paren-flag                                doBeginMatchMode
    'm'              paren-flag                                doBeginMatchMode
    's'              paren-flag                                doBeginMatchMode
    'u'              paren-flag                                doBeginMatchMode
    'w'              paren-flag                                doBeginMatchMode
    'x'              paren-flag                                doBeginMatchMode
    '-'              paren-flag                                doBeginMatchMode
    '('           n  errorDeath                                doConditionalExpr
    '{'           n  errorDeath                                doPerlInline
    default          errorDeath                                doBadOpenParenType

#
#  open-paren-lookbehind   The three characters "(?<" have been scanned.  This is either
#                          a look-behind or the start of a named capture group.
#
open-paren-lookbehind:
    '='           n  term                    ^expr-cont        doOpenLookBehind        #  (?<=
    '!'           n  term                    ^expr-cont        doOpenLookBehindNeg     #  (?<!
    ascii_letter     named-capture                             doBeginNamedCapture     #  (?<name
    default          errorDeath                                doBadOpenParenType


#
#  paren-comment   We've got a (?# ... ) style comment.  Eat pattern text until we
#                  get to the ')'.
#
paren-comment:
    ')'           n  pop
    eof              errorDeath                                doMismatchedParenErr
    default       n  paren-comment

#
#  paren-flag    Scanning a (?ismx-ismx flag setting.
#
paren-flag:
    'i'           n  paren-flag                                doMatchMode
    'd'           n  paren-flag                                doMatchMode
    'm'           n  paren-flag                                doMatchMode
    's'           n  paren-flag                                doMatchMode
    'u'           n  paren-flag                                doMatchMode
    'w'           n  paren-flag                                doMatchMode
    'x'           n  paren-flag                                doMatchMode
    '-'           n  paren-flag                                doMatchMode
    ')'           n  term                                      doSetMatchMode
    ':'           n  term                    ^expr-quant       doMatchModeParen
    default          errorDeath                                doBadModeFlag

#
#  named-capture    (?<name> ... ), position currently on the name.
#
named-capture:
    ascii_letter  n  named-capture                             doContinueNamedCapture
    digit_char    n  named-capture                             doContinueNamedCapture
    '>'           n  term                    ^expr-quant       doOpenCaptureParen      # common w non-named capture.
    default          errorDeath                                doBadNamedCapture

#
#  quant-star    Scanning a '*' quantifier.  Need to look ahead to decide
#                between plain '*', '*?', '*+'
#
quant-star:
    '?'           n  expr-cont                                 doNGStar                #  *?
    '+'           n  expr-cont                                 doPossessiveStar        #  *+
    default          expr-cont                                 doStar                  #  *


#
#  quant-plus    Scanning a '+' quantifier.  Need to look ahead to decide
#                between plain '+', '+?', '++'
#
quant-plus:
    '?'           n  expr-cont                                 doNGPlus                #  +?
    '+'           n  expr-cont                                 doPossessivePlus        #  ++
    default          expr-cont                                 doPlus                  #  +


#
#  quant-opt     Scanning a '?' quantifier.  Need to look ahead to decide
#                between plain '?', '??', '?+'
#
quant-opt:
    '?'           n  expr-cont                                 doNGOpt                 #  ??
    '+'           n  expr-cont                                 doPossessiveOpt         #  ?+
    default          expr-cont                                 doOpt                   #  ?


#
#  Interval      Scanning a '{', the opening delimiter for an interval specification
#                {number} or {min, max} or {min,}
#
#                interval-open only checks that a digit follows the '{'; the digit
#                itself is consumed by interval-lower.
#
interval-open:
    digit_char       interval-lower
    default          errorDeath                                doIntervalError

interval-lower:
    digit_char    n  interval-lower                            doIntevalLowerDigit
    ','           n  interval-upper
    '}'           n  interval-type                             doIntervalSame          # {n}
    default          errorDeath                                doIntervalError

interval-upper:
    digit_char    n  interval-upper                            doIntervalUpperDigit
    '}'           n  interval-type
    default          errorDeath                                doIntervalError

interval-type:
    '?'           n  expr-cont                                 doNGInterval            # {n,m}?
    '+'           n  expr-cont                                 doPossessiveInterval    # {n,m}+
    default          expr-cont                                 doInterval              # {n,m}


#
#  backslash     Backslash.  Figure out which of the \thingies we have encountered.
#                The low level next-char function will have preprocessed
#                some of them already; those won't come here.
backslash:
    'A'           n  term                                      doBackslashA
    'B'           n  term                                      doBackslashB
    'b'           n  term                                      doBackslashb
    'd'           n  expr-quant                                doBackslashd
    'D'           n  expr-quant                                doBackslashD
    'G'           n  term                                      doBackslashG
    'h'           n  expr-quant                                doBackslashh
    'H'           n  expr-quant                                doBackslashH
    'k'           n  named-backref
    'N'              expr-quant                                doNamedChar             #   \N{NAME}  named char
    'p'              expr-quant                                doProperty              #   \p{Lu}  style property
    'P'              expr-quant                                doProperty
    'R'           n  expr-quant                                doBackslashR
    'Q'           n  term                                      doEnterQuoteMode
    'S'           n  expr-quant                                doBackslashS
    's'           n  expr-quant                                doBackslashs
    'v'           n  expr-quant                                doBackslashv
    'V'           n  expr-quant                                doBackslashV
    'W'           n  expr-quant                                doBackslashW
    'w'           n  expr-quant                                doBackslashw
    'X'           n  expr-quant                                doBackslashX
    'Z'           n  term                                      doBackslashZ
    'z'           n  term                                      doBackslashz
    digit_char    n  expr-quant                                doBackRef               #  Will scan multiple digits
    eof              errorDeath                                doEscapeError
    default       n  expr-quant                                doEscapedLiteralChar


#
#  named-backref   Scanned \k
#                  Leading to \k<captureName>
#                  Failure to get the full sequence is an error.
#
named-backref:
    '<'           n  named-backref-2                           doBeginNamedBackRef
    default          errorDeath                                doBadNamedCapture

#  named-backref-2   Scanned \k<   The first character of the name must be a letter.
named-backref-2:
    ascii_letter  n  named-backref-3                           doContinueNamedBackRef
    default          errorDeath                                doBadNamedCapture

#  named-backref-3   Scanning the rest of the name (letters or digits), up to the '>'.
named-backref-3:
    ascii_letter  n  named-backref-3                           doContinueNamedBackRef
    digit_char    n  named-backref-3                           doContinueNamedBackRef
    '>'           n  expr-quant                                doCompleteNamedBackRef
    default          errorDeath                                doBadNamedCapture


#
#   [set expression] parsing,
#       All states involved in parsing set expressions have names beginning with "set-"
#

set-open:
    '^'           n  set-open2                                 doSetNegate
    ':'              set-posix                                 doSetPosixProp
    default          set-open2

set-open2:
    ']'           n  set-after-lit                             doSetLiteral
    default          set-start

#  set-posix:
#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
#                  moved the scan to the closing ']'.
#                  If it wasn't a property expression, the scan will still be at
#                  the opening ':', which should be interpreted as a normal set
#                  expression.
set-posix:
    ']'           n  pop                                       doSetEnd
    ':'              set-start
    default          errorDeath                                doRuleError             # should not be possible.

#
#   set-start    after the [ and special case leading characters (^ and/or ]) but before
#                everything else.   A '-' is literal at this point.
#
set-start:
    ']'           n  pop                                       doSetEnd
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    '\'           n  set-escape
    '-'           n  set-start-dash
    '&'           n  set-start-amp
    default       n  set-after-lit                             doSetLiteral

#   set-start-dash    Turn "[--" into a syntax error.
#                     "[-x" is good, - and x are literals.
#
set-start-dash:
    '-'              errorDeath                                doRuleError
    default          set-after-lit                             doSetAddDash

#   set-start-amp     Turn "[&&" into a syntax error.
#                     "[&x" is good, & and x are literals.
#
set-start-amp:
    '&'              errorDeath                                doRuleError
    default          set-after-lit                             doSetAddAmp

#
#   set-after-lit     The last thing scanned was a literal character within a set.
#                     Can be followed by anything.  Single '-' or '&' are
#                     literals in this context, not operators.
set-after-lit:
    ']'           n  pop                                       doSetEnd
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    '-'           n  set-lit-dash
    '&'           n  set-lit-amp
    '\'           n  set-escape
    eof              errorDeath                                doSetNoCloseError
    default       n  set-after-lit                             doSetLiteral

#   set-after-set     Same row layout as set-after-lit, except that '-' and '&'
#                     lead to set-set-dash and set-set-amp.
set-after-set:
    ']'           n  pop                                       doSetEnd
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    '-'           n  set-set-dash
    '&'           n  set-set-amp
    '\'           n  set-escape
    eof              errorDeath                                doSetNoCloseError
    default       n  set-after-lit                             doSetLiteral

#   set-after-range   Same row layout as set-after-lit, except that '-' and '&'
#                     lead to set-range-dash and set-range-amp.
set-after-range:
    ']'           n  pop                                       doSetEnd
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    '-'           n  set-range-dash
    '&'           n  set-range-amp
    '\'           n  set-escape
    eof              errorDeath                                doSetNoCloseError
    default       n  set-after-lit                             doSetLiteral


#   set-after-op
#                     After a --  or &&
#                     It is an error to close a set at this point.
#
set-after-op:
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    ']'              errorDeath                                doSetOpError
    '\'           n  set-escape
    default       n  set-after-lit                             doSetLiteral

#
#   set-set-amp
#                     Have scanned [[set]&
#                     Could be a '&' intersection operator, if a set follows.
#                     Could be the start of a '&&' operator.
#                     Otherwise is a literal.
set-set-amp:
    '['           n  set-open                ^set-after-set    doSetBeginIntersection1
    '&'           n  set-after-op                              doSetIntersection2
    default          set-after-lit                             doSetAddAmp


#   set-lit-amp       Have scanned "[literals&"
#                     Could be a start of "&&" operator or a literal
#                     In [abc&[def]],  the '&' is a literal
#
set-lit-amp:
    '&'           n  set-after-op                              doSetIntersection2
    default          set-after-lit                             doSetAddAmp


#
#   set-set-dash
#                     Have scanned [set]-
#                     Could be a '-' difference operator, if a [set] follows.
#                     Could be the start of a '--' operator.
#                     Otherwise is a literal.
set-set-dash:
    '['           n  set-open                ^set-after-set    doSetBeginDifference1
    '-'           n  set-after-op                              doSetDifference2
    default          set-after-lit                             doSetAddDash


#
#   set-range-dash
#                     scanned  a-b-  or \w-
#                     any set or range like item where the trailing single '-' should
#                     be literal, not a set difference operation.
#                     A trailing "--" is still a difference operator.
set-range-dash:
    '-'           n  set-after-op                              doSetDifference2
    default          set-after-lit                             doSetAddDash


#   set-range-amp     Counterpart of set-range-dash for '&': "&&" is handled by
#                     doSetIntersection2, anything else by doSetAddAmp.
set-range-amp:
    '&'           n  set-after-op                              doSetIntersection2
    default          set-after-lit                             doSetAddAmp


#   set-lit-dash
#                     Have scanned "[literals-"   Could be a range or a -- operator or a literal
#                     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
#                        [abc-\p{xx}  the '-' is an error
#                        [abc-]       the '-' is a literal
#                        [ab-xy]      the '-' is a range
#
set-lit-dash:
    '-'           n  set-after-op                              doSetDifference2
    '['              set-after-lit                             doSetAddDash
    ']'              set-after-lit                             doSetAddDash
    '\'           n  set-lit-dash-escape
    default       n  set-after-range                           doSetRange

#   set-lit-dash-escape
#
#                     scanned "[literal-\"
#                     Could be a range, if the \ introduces an escaped literal char or a named char.
#                     Otherwise it is an error.
#
set-lit-dash-escape:
    's'              errorDeath                                doSetOpError
    'S'              errorDeath                                doSetOpError
    'w'              errorDeath                                doSetOpError
    'W'              errorDeath                                doSetOpError
    'd'              errorDeath                                doSetOpError
    'D'              errorDeath                                doSetOpError
    'N'              set-after-range                           doSetNamedRange
    default       n  set-after-range                           doSetRange


#
#   set-escape
#                     Common back-slash escape processing within set expressions.
#                     The 'p', 'P' and 'N' rows do not advance the input (no 'n').
#
set-escape:
    'p'              set-after-set                             doSetProp
    'P'              set-after-set                             doSetProp
    'N'              set-after-lit                             doSetNamedChar
    's'           n  set-after-range                           doSetBackslash_s
    'S'           n  set-after-range                           doSetBackslash_S
    'w'           n  set-after-range                           doSetBackslash_w
    'W'           n  set-after-range                           doSetBackslash_W
    'd'           n  set-after-range                           doSetBackslash_d
    'D'           n  set-after-range                           doSetBackslash_D
    'h'           n  set-after-range                           doSetBackslash_h
    'H'           n  set-after-range                           doSetBackslash_H
    'v'           n  set-after-range                           doSetBackslash_v
    'V'           n  set-after-range                           doSetBackslash_V
    default       n  set-after-lit                             doSetLiteralEscaped

#
#   set-finish
#                     Have just encountered the final ']' that completes a [set], and
#                     arrived here via a pop.  From here, we exit the set parsing world, and go
#                     back to generic regular expression parsing.
#
set-finish:
    default          expr-quant                                doSetFinish


#
#   errorDeath        This state is specified as the next state whenever a syntax error
#                     in the source pattern is detected.  Barring bugs, the state machine will never
#                     actually get here, but will stop because of the action associated with the error.
#                     But, just in case, this state asks the state machine to exit.
errorDeath:
    default       n  errorDeath                                doExit

