# Scanner produces tokens of the following types:
# STREAM-START
# STREAM-END
# DIRECTIVE(name, value)
# DOCUMENT-START
# DOCUMENT-END
# BLOCK-SEQUENCE-START
# BLOCK-MAPPING-START
# BLOCK-END
# FLOW-SEQUENCE-START
# FLOW-MAPPING-START
# FLOW-SEQUENCE-END
# FLOW-MAPPING-END
# BLOCK-ENTRY
# FLOW-ENTRY
# KEY
# VALUE
# ALIAS(value)
# ANCHOR(value)
# TAG(value)
# SCALAR(value, plain, style)
#
# Read comments in the Scanner code for more details.
#

__all__ = ['Scanner', 'ScannerError']

from .error import MarkedYAMLError
from .tokens import *

class ScannerError(MarkedYAMLError):
    # Raised for any lexical error found while producing tokens.
    pass

class SimpleKey:
    # A record describing a potential simple key.
    # See the simple keys treatment below.

    def __init__(self, token_number, required, index, line, column, mark):
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.mark = mark

class Scanner:

    def __init__(self):
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode. It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)    # peek the next i-th character
        #   self.prefix(l=1)  # peek the next l characters
        #   self.forward(l=1) # read the next l characters and move the pointer.

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}

    # Public methods.

    def check_token(self, *choices):
        # Check if the next token is one of the given types.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek_token(self):
        # Return the next token, but do not delete it from the queue.
        # Return None if no more tokens.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            return self.tokens[0]
        else:
            return None

    def get_token(self):
        # Return the next token.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            self.tokens_taken += 1
            return self.tokens.pop(0)

    # Private methods.

    def need_more_tokens(self):
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True

    def fetch_more_tokens(self):

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.column)

        # Peek the next character.
        ch = self.peek()

        # Is it the end of stream?
        if ch == '\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == '%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == '-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == '.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == '\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == '[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == '{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == ']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == '}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == ',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == '-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == '?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == ':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == '*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == '&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == '!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == '|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == '>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == '\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == '\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token" % ch,
                self.get_mark())

    # Simple keys treatment.

    def next_possible_simple_key(self):
        # Return the number of the nearest possible simple key. Actually we
        # don't need to loop through the whole dictionary. We may replace it
        # with the following code:
        #   if not self.possible_simple_keys:
        #       return None
        #   return self.possible_simple_keys[
        #           min(self.possible_simple_keys.keys())].token_number
        min_token_number = None
        for level in self.possible_simple_keys:
            key = self.possible_simple_keys[level]
            if min_token_number is None or key.token_number < min_token_number:
                min_token_number = key.token_number
        return min_token_number

    def stale_possible_simple_keys(self):
        # Remove entries that are no longer possible simple keys. According to
        # the YAML specification, simple keys
        # - should be limited to a single line,
        # - should be no longer than 1024 characters.
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in list(self.possible_simple_keys):
            key = self.possible_simple_keys[level]
            if key.line != self.line  \
                    or self.index-key.index > 1024:
                if key.required:
                    raise ScannerError("while scanning a simple key", key.mark,
                            "could not find expected ':'", self.get_mark())
                del self.possible_simple_keys[level]

    def save_possible_simple_key(self):
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.column

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            key = SimpleKey(token_number, required,
                    self.index, self.line, self.column, self.get_mark())
            self.possible_simple_keys[self.flow_level] = key

    def remove_possible_simple_key(self):
        # Remove the saved possible key position at the current flow level.
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]

            if key.required:
                raise ScannerError("while scanning a simple key", key.mark,
                        "could not find expected ':'", self.get_mark())

            del self.possible_simple_keys[self.flow_level]

    # Indentation functions.

    def unwind_indent(self, column):

        ## In flow context, tokens should respect indentation.
        ## Actually the condition should be `self.indent >= column` according to
        ## the spec. But this condition will prohibit intuitively correct
        ## constructions such as
        ## key : {
        ## }
        #if self.flow_level and self.indent > column:
        #    raise ScannerError(None, None,
        #            "invalid indentation or unclosed '[' or '{'",
        #            self.get_mark())

        # In the flow context, indentation is ignored. We make the scanner less
        # restrictive than the specification requires.
        if self.flow_level:
            return

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))

    def add_indent(self, column):
        # Check if we need to increase indentation.
        if self.indent < column:
            self.indents.append(self.indent)
            self.indent = column
            return True
        return False

    # Fetchers.

    def fetch_stream_start(self):
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-START.
        self.tokens.append(StreamStartToken(mark, mark,
            encoding=self.encoding))


    def fetch_stream_end(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))

        # The stream is finished.
        self.done = True

    def fetch_directive(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())

    def fetch_document_start(self):
        self.fetch_document_indicator(DocumentStartToken)

    def fetch_document_end(self):
        self.fetch_document_indicator(DocumentEndToken)

    def fetch_document_indicator(self, TokenClass):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.get_mark()
        self.forward(3)
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))

    def fetch_flow_sequence_start(self):
        self.fetch_flow_collection_start(FlowSequenceStartToken)

    def fetch_flow_mapping_start(self):
        self.fetch_flow_collection_start(FlowMappingStartToken)

    def fetch_flow_collection_start(self, TokenClass):

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))

    def fetch_flow_sequence_end(self):
        self.fetch_flow_collection_end(FlowSequenceEndToken)

    def fetch_flow_mapping_end(self):
        self.fetch_flow_collection_end(FlowMappingEndToken)

    def fetch_flow_collection_end(self, TokenClass):

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))

    def fetch_flow_entry(self):

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))

    def fetch_block_entry(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))

    def fetch_key(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))

    def fetch_value(self):

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.get_mark())

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.column):
                    mark = self.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))

    def fetch_alias(self):

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))

    def fetch_anchor(self):

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))

    def fetch_tag(self):

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())

    def fetch_literal(self):
        self.fetch_block_scalar(style='|')

    def fetch_folded(self):
        self.fetch_block_scalar(style='>')

    def fetch_block_scalar(self, style):

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(style))

    def fetch_single(self):
        self.fetch_flow_scalar(style='\'')

    def fetch_double(self):
        self.fetch_flow_scalar(style='"')

    def fetch_flow_scalar(self, style):

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(style))

    def fetch_plain(self):

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())

    # Checkers.

    def check_directive(self):

        # DIRECTIVE:        ^ '%' ...
        # The '%' indicator is already checked.
        if self.column == 0:
            return True

    def check_document_start(self):

        # DOCUMENT-START:   ^ '---' (' '|'\n')
        if self.column == 0:
            if self.prefix(3) == '---'  \
                    and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_document_end(self):

        # DOCUMENT-END:     ^ '...' (' '|'\n')
        if self.column == 0:
            if self.prefix(3) == '...'  \
                    and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_block_entry(self):

        # BLOCK-ENTRY:      '-' (' '|'\n')
        return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

    def check_key(self):

        # KEY(flow context):    '?'
        if self.flow_level:
            return True

        # KEY(block context):   '?' (' '|'\n')
        else:
            return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

    def check_value(self):

        # VALUE(flow context):  ':'
        if self.flow_level:
            return True

        # VALUE(block context): ':' (' '|'\n')
        else:
            return self.peek(1) in '\0 \t\r\n\x85\u2028\u2029'

    def check_plain(self):

        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.peek()
        return ch not in '\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.peek(1) not in '\0 \t\r\n\x85\u2028\u2029'
                        and (ch == '-' or (not self.flow_level and ch in '?:')))

    # Scanners.

    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        if self.index == 0 and self.peek() == '\uFEFF':
            self.forward()
        found = False
        while not found:
            while self.peek() == ' ':
                self.forward()
            if self.peek() == '#':
                while self.peek() not in '\0\r\n\x85\u2028\u2029':
                    self.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True

    def scan_directive(self):
        # See the specification for details.
        start_mark = self.get_mark()
        self.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == 'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.get_mark()
        elif name == 'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.get_mark()
        else:
            end_mark = self.get_mark()
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)

    def scan_directive_name(self, start_mark):
        # See the specification for details.
        length = 0
        ch = self.peek(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'    \
                or ch in '-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        return value

    def scan_yaml_directive_value(self, start_mark):
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        major = self.scan_yaml_directive_number(start_mark)
        if self.peek() != '.':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or '.', but found %r" % self.peek(),
                    self.get_mark())
        self.forward()
        minor = self.scan_yaml_directive_number(start_mark)
        if self.peek() not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or ' ', but found %r" % self.peek(),
                    self.get_mark())
        return (major, minor)

    def scan_yaml_directive_number(self, start_mark):
        # See the specification for details.
        ch = self.peek()
        if not ('0' <= ch <= '9'):
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit, but found %r" % ch, self.get_mark())
        length = 0
        while '0' <= self.peek(length) <= '9':
            length += 1
        value = int(self.prefix(length))
        self.forward(length)
        return value

    def scan_tag_directive_value(self, start_mark):
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        handle = self.scan_tag_directive_handle(start_mark)
        while self.peek() == ' ':
            self.forward()
        prefix = self.scan_tag_directive_prefix(start_mark)
        return (handle, prefix)

    def scan_tag_directive_handle(self, start_mark):
        # See the specification for details.
        value = self.scan_tag_handle('directive', start_mark)
        ch = self.peek()
        if ch != ' ':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        return value

    def scan_tag_directive_prefix(self, start_mark):
        # See the specification for details.
        value = self.scan_tag_uri('directive', start_mark)
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        return value

    def scan_directive_ignored_line(self, start_mark):
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        if self.peek() == '#':
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        ch = self.peek()
        if ch not in '\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a comment or a line break, but found %r"
                    % ch, self.get_mark())
        self.scan_line_break()

    def scan_anchor(self, TokenClass):
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_mark = self.get_mark()
        indicator = self.peek()
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.forward()
        length = 0
        ch = self.peek(length)
        while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z'    \
                or ch in '-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch not in '\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch, self.get_mark())
        end_mark = self.get_mark()
        return TokenClass(value, start_mark, end_mark)

    def scan_tag(self):
        # See the specification for details.
        start_mark = self.get_mark()
        ch = self.peek(1)
        if ch == '<':
            handle = None
            self.forward(2)
            suffix = self.scan_tag_uri('tag', start_mark)
            if self.peek() != '>':
                raise ScannerError("while parsing a tag", start_mark,
                        "expected '>', but found %r" % self.peek(),
                        self.get_mark())
            self.forward()
        elif ch in '\0 \t\r\n\x85\u2028\u2029':
            handle = None
            suffix = '!'
            self.forward()
        else:
            length = 1
            use_handle = False
            while ch not in '\0 \r\n\x85\u2028\u2029':
                if ch == '!':
                    use_handle = True
                    break
                length += 1
                ch = self.peek(length)
            handle = '!'
            if use_handle:
                handle = self.scan_tag_handle('tag', start_mark)
            else:
                handle = '!'
                self.forward()
            suffix = self.scan_tag_uri('tag', start_mark)
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a tag", start_mark,
                    "expected ' ', but found %r" % ch, self.get_mark())
        value = (handle, suffix)
        end_mark = self.get_mark()
        return TagToken(value, start_mark, end_mark)

    def scan_block_scalar(self, style):
        # See the specification for details.

        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []
        start_mark = self.get_mark()

        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = ''

        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != '\0':
            chunks.extend(breaks)
            leading_non_space = self.peek() not in ' \t'
            length = 0
            while self.peek(length) not in '\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != '\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == '\n' \
                        and leading_non_space and self.peek() not in ' \t':
                    if not breaks:
                        chunks.append(' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == '\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(''.join(chunks), False, start_mark, end_mark,
                style)

    def scan_block_scalar_indicators(self, start_mark):
        # See the specification for details.
        chomping = None
        increment = None
        ch = self.peek()
        if ch in '+-':
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.forward()
            ch = self.peek()
            if ch in '0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError("while scanning a block scalar", start_mark,
                            "expected indentation indicator in the range 1-9, but found 0",
                            self.get_mark())
                self.forward()
        elif ch in '0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_mark,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.get_mark())
            self.forward()
            ch = self.peek()
            if ch in '+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.forward()
        ch = self.peek()
        if ch not in '\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected chomping or indentation indicators, but found %r"
                    % ch, self.get_mark())
        return chomping, increment

    def scan_block_scalar_ignored_line(self, start_mark):
        # See the specification for details.
        while self.peek() == ' ':
            self.forward()
        if self.peek() == '#':
            while self.peek() not in '\0\r\n\x85\u2028\u2029':
                self.forward()
        ch = self.peek()
        if ch not in '\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected a comment or a line break, but found %r" % ch,
                    self.get_mark())
        self.scan_line_break()

    def scan_block_scalar_indentation(self):
        # See the specification for details.
        chunks = []
        max_indent = 0
        end_mark = self.get_mark()
        while self.peek() in ' \r\n\x85\u2028\u2029':
            if self.peek() != ' ':
                chunks.append(self.scan_line_break())
                end_mark = self.get_mark()
            else:
                self.forward()
                if self.column > max_indent:
                    max_indent = self.column
        return chunks, max_indent, end_mark

    def scan_block_scalar_breaks(self, indent):
        # See the specification for details.
        chunks = []
        end_mark = self.get_mark()
        while self.column < indent and self.peek() == ' ':
            self.forward()
        while self.peek() in '\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
            end_mark = self.get_mark()
            while self.column < indent and self.peek() == ' ':
                self.forward()
        return chunks, end_mark

    def scan_flow_scalar(self, style):
        # See the specification for details.
        # Note that we relax indentation rules for quoted scalars. Quoted
        # scalars don't need to adhere indentation because " and ' clearly
        # mark the beginning and the end of them. Therefore we are less
        # restrictive than the specification requires. We only need to check
        # that document separators are not included in scalars.
        if style == '"':
            double = True
        else:
            double = False
        chunks = []
        start_mark = self.get_mark()
        quote = self.peek()
        self.forward()
        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        while self.peek() != quote:
            chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
            chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        self.forward()
        end_mark = self.get_mark()
        return ScalarToken(''.join(chunks), False, start_mark, end_mark,
                style)

    # Map of escape characters to their replacements in double-quoted scalars.
    ESCAPE_REPLACEMENTS = {
        '0':    '\0',
        'a':    '\x07',
        'b':    '\x08',
        't':    '\x09',
        '\t':   '\x09',
        'n':    '\x0A',
        'v':    '\x0B',
        'f':    '\x0C',
        'r':    '\x0D',
        'e':    '\x1B',
        ' ':    '\x20',
        '\"':   '\"',
        '\\':   '\\',
        '/':    '/',
        'N':    '\x85',
        '_':    '\xA0',
        'L':    '\u2028',
        'P':    '\u2029',
    }

    # Map of numeric escape indicators to the number of hex digits they take.
    ESCAPE_CODES = {
        'x':    2,
        'u':    4,
        'U':    8,
    }

    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # See the specification for details.
1187 chunks = [] 1188 while True: 1189 length = 0 1190 while self.peek(length) not in '\'\"\\\0 \t\r\n\x85\u2028\u2029': 1191 length += 1 1192 if length: 1193 chunks.append(self.prefix(length)) 1194 self.forward(length) 1195 ch = self.peek() 1196 if not double and ch == '\'' and self.peek(1) == '\'': 1197 chunks.append('\'') 1198 self.forward(2) 1199 elif (double and ch == '\'') or (not double and ch in '\"\\'): 1200 chunks.append(ch) 1201 self.forward() 1202 elif double and ch == '\\': 1203 self.forward() 1204 ch = self.peek() 1205 if ch in self.ESCAPE_REPLACEMENTS: 1206 chunks.append(self.ESCAPE_REPLACEMENTS[ch]) 1207 self.forward() 1208 elif ch in self.ESCAPE_CODES: 1209 length = self.ESCAPE_CODES[ch] 1210 self.forward() 1211 for k in range(length): 1212 if self.peek(k) not in '0123456789ABCDEFabcdef': 1213 raise ScannerError("while scanning a double-quoted scalar", start_mark, 1214 "expected escape sequence of %d hexadecimal numbers, but found %r" % 1215 (length, self.peek(k)), self.get_mark()) 1216 code = int(self.prefix(length), 16) 1217 chunks.append(chr(code)) 1218 self.forward(length) 1219 elif ch in '\r\n\x85\u2028\u2029': 1220 self.scan_line_break() 1221 chunks.extend(self.scan_flow_scalar_breaks(double, start_mark)) 1222 else: 1223 raise ScannerError("while scanning a double-quoted scalar", start_mark, 1224 "found unknown escape character %r" % ch, self.get_mark()) 1225 else: 1226 return chunks 1227 1228 def scan_flow_scalar_spaces(self, double, start_mark): 1229 # See the specification for details. 
1230 chunks = [] 1231 length = 0 1232 while self.peek(length) in ' \t': 1233 length += 1 1234 whitespaces = self.prefix(length) 1235 self.forward(length) 1236 ch = self.peek() 1237 if ch == '\0': 1238 raise ScannerError("while scanning a quoted scalar", start_mark, 1239 "found unexpected end of stream", self.get_mark()) 1240 elif ch in '\r\n\x85\u2028\u2029': 1241 line_break = self.scan_line_break() 1242 breaks = self.scan_flow_scalar_breaks(double, start_mark) 1243 if line_break != '\n': 1244 chunks.append(line_break) 1245 elif not breaks: 1246 chunks.append(' ') 1247 chunks.extend(breaks) 1248 else: 1249 chunks.append(whitespaces) 1250 return chunks 1251 1252 def scan_flow_scalar_breaks(self, double, start_mark): 1253 # See the specification for details. 1254 chunks = [] 1255 while True: 1256 # Instead of checking indentation, we check for document 1257 # separators. 1258 prefix = self.prefix(3) 1259 if (prefix == '---' or prefix == '...') \ 1260 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': 1261 raise ScannerError("while scanning a quoted scalar", start_mark, 1262 "found unexpected document separator", self.get_mark()) 1263 while self.peek() in ' \t': 1264 self.forward() 1265 if self.peek() in '\r\n\x85\u2028\u2029': 1266 chunks.append(self.scan_line_break()) 1267 else: 1268 return chunks 1269 1270 def scan_plain(self): 1271 # See the specification for details. 1272 # We add an additional restriction for the flow context: 1273 # plain scalars in the flow context cannot contain ',' or '?'. 1274 # We also keep track of the `allow_simple_key` flag here. 1275 # Indentation rules are loosed for the flow context. 1276 chunks = [] 1277 start_mark = self.get_mark() 1278 end_mark = start_mark 1279 indent = self.indent+1 1280 # We allow zero indentation for scalars, but then we need to check for 1281 # document separators at the beginning of the line. 
1282 #if indent == 0: 1283 # indent = 1 1284 spaces = [] 1285 while True: 1286 length = 0 1287 if self.peek() == '#': 1288 break 1289 while True: 1290 ch = self.peek(length) 1291 if ch in '\0 \t\r\n\x85\u2028\u2029' \ 1292 or (ch == ':' and 1293 self.peek(length+1) in '\0 \t\r\n\x85\u2028\u2029' 1294 + (u',[]{}' if self.flow_level else u''))\ 1295 or (self.flow_level and ch in ',?[]{}'): 1296 break 1297 length += 1 1298 if length == 0: 1299 break 1300 self.allow_simple_key = False 1301 chunks.extend(spaces) 1302 chunks.append(self.prefix(length)) 1303 self.forward(length) 1304 end_mark = self.get_mark() 1305 spaces = self.scan_plain_spaces(indent, start_mark) 1306 if not spaces or self.peek() == '#' \ 1307 or (not self.flow_level and self.column < indent): 1308 break 1309 return ScalarToken(''.join(chunks), True, start_mark, end_mark) 1310 1311 def scan_plain_spaces(self, indent, start_mark): 1312 # See the specification for details. 1313 # The specification is really confusing about tabs in plain scalars. 1314 # We just forbid them completely. Do not use tabs in YAML! 
1315 chunks = [] 1316 length = 0 1317 while self.peek(length) in ' ': 1318 length += 1 1319 whitespaces = self.prefix(length) 1320 self.forward(length) 1321 ch = self.peek() 1322 if ch in '\r\n\x85\u2028\u2029': 1323 line_break = self.scan_line_break() 1324 self.allow_simple_key = True 1325 prefix = self.prefix(3) 1326 if (prefix == '---' or prefix == '...') \ 1327 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': 1328 return 1329 breaks = [] 1330 while self.peek() in ' \r\n\x85\u2028\u2029': 1331 if self.peek() == ' ': 1332 self.forward() 1333 else: 1334 breaks.append(self.scan_line_break()) 1335 prefix = self.prefix(3) 1336 if (prefix == '---' or prefix == '...') \ 1337 and self.peek(3) in '\0 \t\r\n\x85\u2028\u2029': 1338 return 1339 if line_break != '\n': 1340 chunks.append(line_break) 1341 elif not breaks: 1342 chunks.append(' ') 1343 chunks.extend(breaks) 1344 elif whitespaces: 1345 chunks.append(whitespaces) 1346 return chunks 1347 1348 def scan_tag_handle(self, name, start_mark): 1349 # See the specification for details. 1350 # For some strange reasons, the specification does not allow '_' in 1351 # tag handles. I have allowed it anyway. 1352 ch = self.peek() 1353 if ch != '!': 1354 raise ScannerError("while scanning a %s" % name, start_mark, 1355 "expected '!', but found %r" % ch, self.get_mark()) 1356 length = 1 1357 ch = self.peek(length) 1358 if ch != ' ': 1359 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ 1360 or ch in '-_': 1361 length += 1 1362 ch = self.peek(length) 1363 if ch != '!': 1364 self.forward(length) 1365 raise ScannerError("while scanning a %s" % name, start_mark, 1366 "expected '!', but found %r" % ch, self.get_mark()) 1367 length += 1 1368 value = self.prefix(length) 1369 self.forward(length) 1370 return value 1371 1372 def scan_tag_uri(self, name, start_mark): 1373 # See the specification for details. 1374 # Note: we do not check if URI is well-formed. 
1375 chunks = [] 1376 length = 0 1377 ch = self.peek(length) 1378 while '0' <= ch <= '9' or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z' \ 1379 or ch in '-;/?:@&=+$,_.!~*\'()[]%': 1380 if ch == '%': 1381 chunks.append(self.prefix(length)) 1382 self.forward(length) 1383 length = 0 1384 chunks.append(self.scan_uri_escapes(name, start_mark)) 1385 else: 1386 length += 1 1387 ch = self.peek(length) 1388 if length: 1389 chunks.append(self.prefix(length)) 1390 self.forward(length) 1391 length = 0 1392 if not chunks: 1393 raise ScannerError("while parsing a %s" % name, start_mark, 1394 "expected URI, but found %r" % ch, self.get_mark()) 1395 return ''.join(chunks) 1396 1397 def scan_uri_escapes(self, name, start_mark): 1398 # See the specification for details. 1399 codes = [] 1400 mark = self.get_mark() 1401 while self.peek() == '%': 1402 self.forward() 1403 for k in range(2): 1404 if self.peek(k) not in '0123456789ABCDEFabcdef': 1405 raise ScannerError("while scanning a %s" % name, start_mark, 1406 "expected URI escape sequence of 2 hexadecimal numbers, but found %r" 1407 % self.peek(k), self.get_mark()) 1408 codes.append(int(self.prefix(2), 16)) 1409 self.forward(2) 1410 try: 1411 value = bytes(codes).decode('utf-8') 1412 except UnicodeDecodeError as exc: 1413 raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark) 1414 return value 1415 1416 def scan_line_break(self): 1417 # Transforms: 1418 # '\r\n' : '\n' 1419 # '\r' : '\n' 1420 # '\n' : '\n' 1421 # '\x85' : '\n' 1422 # '\u2028' : '\u2028' 1423 # '\u2029 : '\u2029' 1424 # default : '' 1425 ch = self.peek() 1426 if ch in '\r\n\x85': 1427 if self.prefix(2) == '\r\n': 1428 self.forward(2) 1429 else: 1430 self.forward() 1431 return '\n' 1432 elif ch in '\u2028\u2029': 1433 self.forward() 1434 return ch 1435 return '' 1436