#*****************************************************************************
#
#   Copyright (C) 2002-2007, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  regexcst.txt
#  ICU Regular Expression Parser State Table
#
#     This state table is used when reading and parsing a regular expression pattern.
#     The pattern parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the regex pattern grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |                  NOTE(review): that name looks carried over from the
#       |                |   |                      |                  break-iterator rule table; confirm which class owns
#       |                |   |                      |                  doParseActions() for the regex parser.
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#





#
#  start state, scan position is at the beginning of the pattern.
#
start:
    default          term                                      doPatStart




#
#  term.  At a position where we can accept the start of most items in a pattern.
#
term:
    quoted        n  expr-quant                                doLiteralChar
    rule_char     n  expr-quant                                doLiteralChar
    '['           n  set-open                ^set-finish       doSetBegin
    '('           n  open-paren
    '.'           n  expr-quant                                doDotAny
    '^'           n  expr-quant                                doCaret
    '$'           n  expr-quant                                doDollar
    '\'           n  backslash
    '|'           n  term                                      doOrOperator
    ')'           n  pop                                       doCloseParen
    eof              term                                      doPatFinish
    default          errorDeath                                doRuleError



#
#   expr-quant    We've just finished scanning a term, now look for the optional
#                 trailing quantifier - *, +, ?, *?,  etc.
#
expr-quant:
    '*'           n  quant-star
    '+'           n  quant-plus
    '?'           n  quant-opt
    '{'           n  interval-open                             doIntervalInit
    '('           n  open-paren-quant
    default          expr-cont


#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  No Quantifiers
#
expr-cont:
    '|'           n  term                                      doOrOperator
    ')'           n  pop                                       doCloseParen
    default          term


#
#   open-paren-quant   Special case handling for comments appearing before a quantifier,
#                        e.g.   x(?#comment )*
#                      Open parens from expr-quant come here; anything but a (?# comment
#                      branches into the normal parenthesis sequence as quickly as possible.
#
open-paren-quant:
    '?'           n  open-paren-quant2                         doSuppressComments
    default          open-paren

open-paren-quant2:
    '#'           n  paren-comment           ^expr-quant
    default          open-paren-extended


#
#   open-paren    We've got an open paren.  We need to scan further to
#                 determine what kind of parenthesized group it is - plain (, (?:, (?>, or whatever.
#
open-paren:
    '?'           n  open-paren-extended                       doSuppressComments
    default          term                    ^expr-quant       doOpenCaptureParen

open-paren-extended:
    ':'           n  term                    ^expr-quant       doOpenNonCaptureParen    #  (?:
    '>'           n  term                    ^expr-quant       doOpenAtomicParen        #  (?>
    '='           n  term                    ^expr-cont        doOpenLookAhead          #  (?=
    '!'           n  term                    ^expr-cont        doOpenLookAheadNeg       #  (?!
    '<'           n  open-paren-lookbehind
    '#'           n  paren-comment           ^term
    'i'              paren-flag                                doBeginMatchMode
    'd'              paren-flag                                doBeginMatchMode
    'm'              paren-flag                                doBeginMatchMode
    's'              paren-flag                                doBeginMatchMode
    'u'              paren-flag                                doBeginMatchMode
    'w'              paren-flag                                doBeginMatchMode
    'x'              paren-flag                                doBeginMatchMode
    '-'              paren-flag                                doBeginMatchMode
    '('           n  errorDeath                                doConditionalExpr
    '{'           n  errorDeath                                doPerlInline
    default          errorDeath                                doBadOpenParenType

open-paren-lookbehind:
    '='           n  term                    ^expr-cont        doOpenLookBehind         #  (?<=
    '!'           n  term                    ^expr-cont        doOpenLookBehindNeg      #  (?<!
    default          errorDeath                                doBadOpenParenType


#
#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
#
paren-comment:
    ')'           n  pop
    eof              errorDeath                                doMismatchedParenErr
    default       n  paren-comment

#
#  paren-flag    Scanned a (?ismx-ismx  flag setting
#                The flag character that led here is not consumed by open-paren-extended
#                (no 'n' on those rows), so it is seen again, and consumed, by this state.
#
paren-flag:
    'i'           n  paren-flag                                doMatchMode
    'd'           n  paren-flag                                doMatchMode
    'm'           n  paren-flag                                doMatchMode
    's'           n  paren-flag                                doMatchMode
    'u'           n  paren-flag                                doMatchMode
    'w'           n  paren-flag                                doMatchMode
    'x'           n  paren-flag                                doMatchMode
    '-'           n  paren-flag                                doMatchMode
    ')'           n  term                                      doSetMatchMode
    ':'           n  term                    ^expr-quant       doMatchModeParen
    default          errorDeath                                doBadModeFlag


#
#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
#                 between plain '*', '*?', '*+'
#
quant-star:
    '?'           n  expr-cont                                 doNGStar                 #  *?
    '+'           n  expr-cont                                 doPossessiveStar         #  *+
    default          expr-cont                                 doStar                   #  *


#
#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
#                 between plain '+', '+?', '++'
#
quant-plus:
    '?'           n  expr-cont                                 doNGPlus                 #  +?
    '+'           n  expr-cont                                 doPossessivePlus         #  ++
    default          expr-cont                                 doPlus                   #  +


#
#  quant-opt      Scanning a '?' quantifier.  Need to look ahead to decide
#                 between plain '?', '??', '?+'
#
quant-opt:
    '?'           n  expr-cont                                 doNGOpt                  #  ??
    '+'           n  expr-cont                                 doPossessiveOpt          #  ?+
    default          expr-cont                                 doOpt                    #  ?


#
#   Interval      scanning a '{', the opening delimiter for an interval specification
#                 {number}  or  {min, max}  or {min,}
#
#                 interval-open does not consume the first digit; it only checks that one is
#                 present and leaves it for interval-lower.
#
interval-open:
    digit_char       interval-lower
    default          errorDeath                                doIntervalError

#   NOTE(review): the action name "doIntevalLowerDigit" is spelled this way in the table.
#                 It presumably has to match the name used by the code that implements the
#                 actions, so the spelling is left untouched here - confirm before renaming.
interval-lower:
    digit_char    n  interval-lower                            doIntevalLowerDigit
    ','           n  interval-upper
    '}'           n  interval-type                             doIntervalSame           #  {n}
    default          errorDeath                                doIntervalError

interval-upper:
    digit_char    n  interval-upper                            doIntervalUpperDigit
    '}'           n  interval-type
    default          errorDeath                                doIntervalError

interval-type:
    '?'           n  expr-cont                                 doNGInterval             #  {n,m}?
    '+'           n  expr-cont                                 doPossessiveInterval     #  {n,m}+
    default          expr-cont                                 doInterval               #  {n,m}


#
#  backslash      Backslash.  Figure out which of the \thingies we have encountered.
#                 The low level next-char function will have preprocessed
#                 some of them already; those won't come here.
#
#                 \N, \p and \P are not consumed here (no 'n'); their actions are handed
#                 the scan position with the letter still current.
#
backslash:
    'A'           n  term                                      doBackslashA
    'B'           n  term                                      doBackslashB
    'b'           n  term                                      doBackslashb
    'd'           n  expr-quant                                doBackslashd
    'D'           n  expr-quant                                doBackslashD
    'G'           n  term                                      doBackslashG
    'N'              expr-quant                                doNamedChar              #  \N{NAME}  named char
    'p'              expr-quant                                doProperty               #  \p{Lu}  style property
    'P'              expr-quant                                doProperty
    'Q'           n  term                                      doEnterQuoteMode
    'S'           n  expr-quant                                doBackslashS
    's'           n  expr-quant                                doBackslashs
    'W'           n  expr-quant                                doBackslashW
    'w'           n  expr-quant                                doBackslashw
    'X'           n  expr-quant                                doBackslashX
    'Z'           n  term                                      doBackslashZ
    'z'           n  term                                      doBackslashz
    digit_char    n  expr-quant                                doBackRef                #  Will scan multiple digits
    eof              errorDeath                                doEscapeError
    default       n  expr-quant                                doEscapedLiteralChar



#
# [set expression] parsing,
#    All states involved in parsing set expressions have names beginning with "set-"
#

set-open:
    '^'           n  set-open2                                 doSetNegate
    ':'              set-posix                                 doSetPosixProp
    default          set-open2

set-open2:
    ']'           n  set-after-lit                             doSetLiteral
    default          set-start

#  set-posix:
#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
#                  moved the scan to the closing ']'.  If it wasn't a property
#                  expression, the scan will still be at the opening ':', which should
#                  be interpreted as a normal set expression.
set-posix:
    ']'           n  pop                                       doSetEnd
    ':'              set-start
    default          errorDeath                                doRuleError              #  should not be possible.

#
#   set-start   after the [ and special case leading characters (^ and/or ]) but before
#               everything else.   A '-' is literal at this point.
#
#               End of pattern is reported here as an unclosed set, the same way the
#               set-after-* states report it, rather than being handed to doSetLiteral
#               by the default row.
#
set-start:
    ']'           n  pop                                       doSetEnd
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    '\'           n  set-escape
    '-'           n  set-start-dash
    '&'           n  set-start-amp
    eof              errorDeath                                doSetNoCloseError
    default       n  set-after-lit                             doSetLiteral

#  set-start-dash    Turn "[--" into a syntax error.
#                    "[-x" is good, - and x are literals.
#
set-start-dash:
    '-'              errorDeath                                doRuleError
    default          set-after-lit                             doSetAddDash

#  set-start-amp     Turn "[&&" into a syntax error.
#                    "[&x" is good, & and x are literals.
#
set-start-amp:
    '&'              errorDeath                                doRuleError
    default          set-after-lit                             doSetAddAmp

#
#  set-after-lit    The last thing scanned was a literal character within a set.
#                   Can be followed by anything.  Single '-' or '&' are
#                   literals in this context, not operators.
set-after-lit:
    ']'           n  pop                                       doSetEnd
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    '-'           n  set-lit-dash
    '&'           n  set-lit-amp
    '\'           n  set-escape
    eof              errorDeath                                doSetNoCloseError
    default       n  set-after-lit                             doSetLiteral

#
#  set-after-set    The last thing scanned was a nested [set] (this state is what gets pushed
#                   when a nested '[' is opened) or a \p / \P property.
#                   A following '-' or '&' may be a set operator; see set-set-dash, set-set-amp.
set-after-set:
    ']'           n  pop                                       doSetEnd
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    '-'           n  set-set-dash
    '&'           n  set-set-amp
    '\'           n  set-escape
    eof              errorDeath                                doSetNoCloseError
    default       n  set-after-lit                             doSetLiteral

#
#  set-after-range  The last thing scanned was a range (a-b) or a \s \w \d style class escape.
#                   A following single '-' or '&' is a literal; see set-range-dash, set-range-amp.
set-after-range:
    ']'           n  pop                                       doSetEnd
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    '-'           n  set-range-dash
    '&'           n  set-range-amp
    '\'           n  set-escape
    eof              errorDeath                                doSetNoCloseError
    default       n  set-after-lit                             doSetLiteral


#  set-after-op
#     After a --  or &&
#     It is an error to close a set at this point.
#     Running out of pattern here is likewise reported as an unclosed set.
#
set-after-op:
    '['           n  set-open                ^set-after-set    doSetBeginUnion
    ']'              errorDeath                                doSetOpError
    '\'           n  set-escape
    eof              errorDeath                                doSetNoCloseError
    default       n  set-after-lit                             doSetLiteral

#
#   set-set-amp
#      Have scanned [[set]&
#      Could be a '&' intersection operator, if a set follows.
#      Could be the start of a '&&' operator.
#      Otherwise is a literal.
set-set-amp:
    '['           n  set-open                ^set-after-set    doSetBeginIntersection1
    '&'           n  set-after-op                              doSetIntersection2
    default          set-after-lit                             doSetAddAmp


# set-lit-amp   Have scanned "[literals&"
#               Could be a start of "&&" operator or a literal
#               In [abc&[def]],   the '&' is a literal
#
set-lit-amp:
    '&'           n  set-after-op                              doSetIntersection2
    default          set-after-lit                             doSetAddAmp


#
#  set-set-dash
#      Have scanned [set]-
#      Could be a '-' difference operator, if a [set] follows.
#      Could be the start of a '--' operator.
#      Otherwise is a literal.
set-set-dash:
    '['           n  set-open                ^set-after-set    doSetBeginDifference1
    '-'           n  set-after-op                              doSetDifference2
    default          set-after-lit                             doSetAddDash


#
#  set-range-dash
#      scanned  a-b-  or \w-
#         any set or range like item where the trailing single '-' should
#         be literal, not a set difference operation.
#         A trailing "--" is still a difference operator.
set-range-dash:
    '-'           n  set-after-op                              doSetDifference2
    default          set-after-lit                             doSetAddDash


#
#  set-range-amp
#      scanned  a-b&  or \w&
#         The trailing single '&' is a literal; "&&" is still an intersection operator.
set-range-amp:
    '&'           n  set-after-op                              doSetIntersection2
    default          set-after-lit                             doSetAddAmp


#  set-lit-dash
#     Have scanned "[literals-"  Could be a range or a -- operator or a literal
#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
#        [abc-\p{xx}  the '-' is an error
#        [abc-]       the '-' is a literal
#        [ab-xy]      the '-' is a range
#
set-lit-dash:
    '-'           n  set-after-op                              doSetDifference2
    '['              set-after-lit                             doSetAddDash
    ']'              set-after-lit                             doSetAddDash
    '\'           n  set-lit-dash-escape
    default       n  set-after-range                           doSetRange

# set-lit-dash-escape
#
#    scanned "[literal-\"
#    Could be a range, if the \ introduces an escaped literal char or a named char.
#    Otherwise it is an error:  a class escape (\s \w \d and their negations) or a
#    property (\p \P) stands for a set of characters and cannot be the end of a range.
#    \p and \P must be listed explicitly; otherwise they would match the default row and
#    the letter 'p' itself would be taken as the end of the range.
#
set-lit-dash-escape:
    's'              errorDeath                                doSetOpError
    'S'              errorDeath                                doSetOpError
    'w'              errorDeath                                doSetOpError
    'W'              errorDeath                                doSetOpError
    'd'              errorDeath                                doSetOpError
    'D'              errorDeath                                doSetOpError
    'p'              errorDeath                                doSetOpError
    'P'              errorDeath                                doSetOpError
    'N'              set-after-range                           doSetNamedRange
    default       n  set-after-range                           doSetRange


#
#  set-escape
#       Common back-slash escape processing within set expressions
#
set-escape:
    'p'              set-after-set                             doSetProp
    'P'              set-after-set                             doSetProp
    'N'              set-after-lit                             doSetNamedChar
    's'           n  set-after-range                           doSetBackslash_s
    'S'           n  set-after-range                           doSetBackslash_S
    'w'           n  set-after-range                           doSetBackslash_w
    'W'           n  set-after-range                           doSetBackslash_W
    'd'           n  set-after-range                           doSetBackslash_d
    'D'           n  set-after-range                           doSetBackslash_D
    default       n  set-after-lit                             doSetLiteralEscaped

#
#  set-finish
#     Have just encountered the final ']' that completes a [set], and
#     arrived here via a pop.  From here, we exit the set parsing world, and go
#     back to generic regular expression parsing.
#
set-finish:
    default          expr-quant                                doSetFinish


#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    default       n  errorDeath                                doExit

