• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2008, http://www.snakeyaml.org
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package org.yaml.snakeyaml.scanner;
17 
18 import java.nio.ByteBuffer;
19 import java.nio.charset.CharacterCodingException;
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.Iterator;
23 import java.util.LinkedHashMap;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.regex.Pattern;
27 
28 import org.yaml.snakeyaml.error.Mark;
29 import org.yaml.snakeyaml.error.YAMLException;
30 import org.yaml.snakeyaml.reader.StreamReader;
31 import org.yaml.snakeyaml.tokens.AliasToken;
32 import org.yaml.snakeyaml.tokens.AnchorToken;
33 import org.yaml.snakeyaml.tokens.BlockEndToken;
34 import org.yaml.snakeyaml.tokens.BlockEntryToken;
35 import org.yaml.snakeyaml.tokens.BlockMappingStartToken;
36 import org.yaml.snakeyaml.tokens.BlockSequenceStartToken;
37 import org.yaml.snakeyaml.tokens.DirectiveToken;
38 import org.yaml.snakeyaml.tokens.DocumentEndToken;
39 import org.yaml.snakeyaml.tokens.DocumentStartToken;
40 import org.yaml.snakeyaml.tokens.FlowEntryToken;
41 import org.yaml.snakeyaml.tokens.FlowMappingEndToken;
42 import org.yaml.snakeyaml.tokens.FlowMappingStartToken;
43 import org.yaml.snakeyaml.tokens.FlowSequenceEndToken;
44 import org.yaml.snakeyaml.tokens.FlowSequenceStartToken;
45 import org.yaml.snakeyaml.tokens.KeyToken;
46 import org.yaml.snakeyaml.tokens.ScalarToken;
47 import org.yaml.snakeyaml.tokens.StreamEndToken;
48 import org.yaml.snakeyaml.tokens.StreamStartToken;
49 import org.yaml.snakeyaml.tokens.TagToken;
50 import org.yaml.snakeyaml.tokens.TagTuple;
51 import org.yaml.snakeyaml.tokens.Token;
52 import org.yaml.snakeyaml.tokens.ValueToken;
53 import org.yaml.snakeyaml.util.ArrayStack;
54 import org.yaml.snakeyaml.util.UriEncoder;
55 
56 /**
57  * <pre>
58  * Scanner produces tokens of the following types:
59  * STREAM-START
60  * STREAM-END
61  * DIRECTIVE(name, value)
62  * DOCUMENT-START
63  * DOCUMENT-END
64  * BLOCK-SEQUENCE-START
65  * BLOCK-MAPPING-START
66  * BLOCK-END
67  * FLOW-SEQUENCE-START
68  * FLOW-MAPPING-START
69  * FLOW-SEQUENCE-END
70  * FLOW-MAPPING-END
71  * BLOCK-ENTRY
72  * FLOW-ENTRY
73  * KEY
74  * VALUE
75  * ALIAS(value)
76  * ANCHOR(value)
77  * TAG(value)
78  * SCALAR(value, plain, style)
79  * Read comments in the Scanner code for more details.
80  * </pre>
81  */
82 public final class ScannerImpl implements Scanner {
    /**
     * A regular expression matching any single character that is not a
     * hexadecimal digit (0-9, A-F, a-f).
     */
    private final static Pattern NOT_HEXA = Pattern.compile("[^0-9A-Fa-f]");

    /**
     * A mapping from the character that follows the backslash of an escape
     * sequence in the input stream to the string that the sequence should be
     * replaced with. The table is filled in by the static initializer of this
     * class.
     *
     * YAML defines several common and a few uncommon escape sequences.
     *
     * @see <a href="http://www.yaml.org/spec/current.html#id2517668">4.1.6.
     *      Escape Sequences</a>
     */
    public final static Map<Character, String> ESCAPE_REPLACEMENTS = new HashMap<Character, String>();

    /**
     * A mapping from an escape character to the number of characters to
     * read ahead for that escape sequence (the count of H digits shown below).
     * These escape sequences are used to handle unicode escaping in the
     * following formats, where H is a hexadecimal character:
     *
     * <pre>
     * &#92;xHH         : escaped 8-bit Unicode character
     * &#92;uHHHH       : escaped 16-bit Unicode character
     * &#92;UHHHHHHHH   : escaped 32-bit Unicode character
     * </pre>
     *
     * The table is filled in by the static initializer of this class.
     *
     * @see <a href="http://yaml.org/spec/1.1/current.html#id872840">5.6. Escape
     *      Sequences</a>
     */
    public final static Map<Character, Integer> ESCAPE_CODES = new HashMap<Character, Integer>();
115 
116     static {
117         // ASCII null
118         ESCAPE_REPLACEMENTS.put(Character.valueOf('0'), "\0");
119         // ASCII bell
120         ESCAPE_REPLACEMENTS.put(Character.valueOf('a'), "\u0007");
121         // ASCII backspace
122         ESCAPE_REPLACEMENTS.put(Character.valueOf('b'), "\u0008");
123         // ASCII horizontal tab
124         ESCAPE_REPLACEMENTS.put(Character.valueOf('t'), "\u0009");
125         // ASCII newline (line feed; &#92;n maps to 0x0A)
126         ESCAPE_REPLACEMENTS.put(Character.valueOf('n'), "\n");
127         // ASCII vertical tab
128         ESCAPE_REPLACEMENTS.put(Character.valueOf('v'), "\u000B");
129         // ASCII form-feed
130         ESCAPE_REPLACEMENTS.put(Character.valueOf('f'), "\u000C");
131         // carriage-return (&#92;r maps to 0x0D)
132         ESCAPE_REPLACEMENTS.put(Character.valueOf('r'), "\r");
133         // ASCII escape character (Esc)
134         ESCAPE_REPLACEMENTS.put(Character.valueOf('e'), "\u001B");
135         // ASCII space
136         ESCAPE_REPLACEMENTS.put(Character.valueOf(' '), "\u0020");
137         // ASCII double-quote
138         ESCAPE_REPLACEMENTS.put(Character.valueOf('"'), "\"");
139         // ASCII backslash
140         ESCAPE_REPLACEMENTS.put(Character.valueOf('\\'), "\\");
141         // Unicode next line
142         ESCAPE_REPLACEMENTS.put(Character.valueOf('N'), "\u0085");
143         // Unicode non-breaking-space
144         ESCAPE_REPLACEMENTS.put(Character.valueOf('_'), "\u00A0");
145         // Unicode line-separator
146         ESCAPE_REPLACEMENTS.put(Character.valueOf('L'), "\u2028");
147         // Unicode paragraph separator
148         ESCAPE_REPLACEMENTS.put(Character.valueOf('P'), "\u2029");
149 
150         // 8-bit Unicode
151         ESCAPE_CODES.put(Character.valueOf('x'), 2);
152         // 16-bit Unicode
153         ESCAPE_CODES.put(Character.valueOf('u'), 4);
154         // 32-bit Unicode (Supplementary characters are supported)
155         ESCAPE_CODES.put(Character.valueOf('U'), 8);
156     }
    // The character source this scanner reads from.
    private final StreamReader reader;
    // Have we reached the end of the stream? Set once STREAM-END is queued.
    private boolean done = false;

    // The number of unclosed '{' and '['. `flowLevel == 0` means block
    // context.
    private int flowLevel = 0;

    // List of processed tokens that are not yet emitted.
    private List<Token> tokens;

    // Number of tokens that were emitted through the `getToken` method.
    private int tokensTaken = 0;

    // The current indentation level; -1 means no block collection is open.
    private int indent = -1;

    // Past indentation levels.
    private ArrayStack<Integer> indents;

    // Variables related to simple keys treatment. See PyYAML.

    /**
     * <pre>
     * A simple key is a key that is not denoted by the '?' indicator.
     * Example of simple keys:
     *   ---
     *   block simple key: value
     *   ? not a simple key:
     *   : { flow simple key: value }
     * We emit the KEY token before all keys, so when we find a potential
     * simple key, we try to locate the corresponding ':' indicator.
     * Simple keys should be limited to a single line and 1024 characters.
     *
     * Can a simple key start at the current position? A simple key may
     * start:
     * - at the beginning of the line, not counting indentation spaces
     *       (in block context),
     * - after '{', '[', ',' (in the flow context),
     * - after '?', ':', '-' (in the block context).
     * In the block context, this flag also signifies if a block collection
     * may start at the current position.
     * </pre>
     */
    private boolean allowSimpleKey = true;

    /*
     * Keep track of possible simple keys. This is a dictionary. The key is
     * `flowLevel`; there can be no more than one possible simple key for each
     * level. The value is a SimpleKey record: (token_number, required, index,
     * line, column, mark). A simple key may start with ALIAS, ANCHOR, TAG,
     * SCALAR(flow), '[', or '{' tokens.
     */
    private Map<Integer, SimpleKey> possibleSimpleKeys;
211 
ScannerImpl(StreamReader reader)212     public ScannerImpl(StreamReader reader) {
213         this.reader = reader;
214         this.tokens = new ArrayList<Token>(100);
215         this.indents = new ArrayStack<Integer>(10);
216         // The order in possibleSimpleKeys is kept for nextPossibleSimpleKey()
217         this.possibleSimpleKeys = new LinkedHashMap<Integer, SimpleKey>();
218         fetchStreamStart();// Add the STREAM-START token.
219     }
220 
221     /**
222      * Check whether the next token is one of the given types.
223      */
checkToken(Token.ID... choices)224     public boolean checkToken(Token.ID... choices) {
225         while (needMoreTokens()) {
226             fetchMoreTokens();
227         }
228         if (!this.tokens.isEmpty()) {
229             if (choices.length == 0) {
230                 return true;
231             }
232             // since profiler puts this method on top (it is used a lot), we
233             // should not use 'foreach' here because of the performance reasons
234             Token.ID first = this.tokens.get(0).getTokenId();
235             for (int i = 0; i < choices.length; i++) {
236                 if (first == choices[i]) {
237                     return true;
238                 }
239             }
240         }
241         return false;
242     }
243 
244     /**
245      * Return the next token, but do not delete it from the queue.
246      */
peekToken()247     public Token peekToken() {
248         while (needMoreTokens()) {
249             fetchMoreTokens();
250         }
251         return this.tokens.get(0);
252     }
253 
254     /**
255      * Return the next token, removing it from the queue.
256      */
getToken()257     public Token getToken() {
258         if (!this.tokens.isEmpty()) {
259             this.tokensTaken++;
260             return this.tokens.remove(0);
261         }
262         return null;
263     }
264 
265     // Private methods.
266     /**
267      * Returns true if more tokens should be scanned.
268      */
needMoreTokens()269     private boolean needMoreTokens() {
270         // If we are done, we do not require more tokens.
271         if (this.done) {
272             return false;
273         }
274         // If we aren't done, but we have no tokens, we need to scan more.
275         if (this.tokens.isEmpty()) {
276             return true;
277         }
278         // The current token may be a potential simple key, so we
279         // need to look further.
280         stalePossibleSimpleKeys();
281         return nextPossibleSimpleKey() == this.tokensTaken;
282     }
283 
284     /**
285      * Fetch one or more tokens from the StreamReader.
286      */
fetchMoreTokens()287     private void fetchMoreTokens() {
288         // Eat whitespaces and comments until we reach the next token.
289         scanToNextToken();
290         // Remove obsolete possible simple keys.
291         stalePossibleSimpleKeys();
292         // Compare the current indentation and column. It may add some tokens
293         // and decrease the current indentation level.
294         unwindIndent(reader.getColumn());
295         // Peek the next character, to decide what the next group of tokens
296         // will look like.
297         char ch = reader.peek();
298         switch (ch) {
299         case '\0':
300             // Is it the end of stream?
301             fetchStreamEnd();
302             return;
303         case '%':
304             // Is it a directive?
305             if (checkDirective()) {
306                 fetchDirective();
307                 return;
308             }
309             break;
310         case '-':
311             // Is it the document start?
312             if (checkDocumentStart()) {
313                 fetchDocumentStart();
314                 return;
315                 // Is it the block entry indicator?
316             } else if (checkBlockEntry()) {
317                 fetchBlockEntry();
318                 return;
319             }
320             break;
321         case '.':
322             // Is it the document end?
323             if (checkDocumentEnd()) {
324                 fetchDocumentEnd();
325                 return;
326             }
327             break;
328         // TODO support for BOM within a stream. (not implemented in PyYAML)
329         case '[':
330             // Is it the flow sequence start indicator?
331             fetchFlowSequenceStart();
332             return;
333         case '{':
334             // Is it the flow mapping start indicator?
335             fetchFlowMappingStart();
336             return;
337         case ']':
338             // Is it the flow sequence end indicator?
339             fetchFlowSequenceEnd();
340             return;
341         case '}':
342             // Is it the flow mapping end indicator?
343             fetchFlowMappingEnd();
344             return;
345         case ',':
346             // Is it the flow entry indicator?
347             fetchFlowEntry();
348             return;
349             // see block entry indicator above
350         case '?':
351             // Is it the key indicator?
352             if (checkKey()) {
353                 fetchKey();
354                 return;
355             }
356             break;
357         case ':':
358             // Is it the value indicator?
359             if (checkValue()) {
360                 fetchValue();
361                 return;
362             }
363             break;
364         case '*':
365             // Is it an alias?
366             fetchAlias();
367             return;
368         case '&':
369             // Is it an anchor?
370             fetchAnchor();
371             return;
372         case '!':
373             // Is it a tag?
374             fetchTag();
375             return;
376         case '|':
377             // Is it a literal scalar?
378             if (this.flowLevel == 0) {
379                 fetchLiteral();
380                 return;
381             }
382             break;
383         case '>':
384             // Is it a folded scalar?
385             if (this.flowLevel == 0) {
386                 fetchFolded();
387                 return;
388             }
389             break;
390         case '\'':
391             // Is it a single quoted scalar?
392             fetchSingle();
393             return;
394         case '"':
395             // Is it a double quoted scalar?
396             fetchDouble();
397             return;
398         }
399         // It must be a plain scalar then.
400         if (checkPlain()) {
401             fetchPlain();
402             return;
403         }
404         // No? It's an error. Let's produce a nice error message.We do this by
405         // converting escaped characters into their escape sequences. This is a
406         // backwards use of the ESCAPE_REPLACEMENTS map.
407         String chRepresentation = String.valueOf(ch);
408         for (Character s : ESCAPE_REPLACEMENTS.keySet()) {
409             String v = ESCAPE_REPLACEMENTS.get(s);
410             if (v.equals(chRepresentation)) {
411                 chRepresentation = "\\" + s;// ' ' -> '\t'
412                 break;
413             }
414         }
415         if (ch == '\t')
416             chRepresentation += "(TAB)";
417         String text = String
418                 .format("found character '%s' that cannot start any token. (Do not use %s for indentation)",
419                         chRepresentation, chRepresentation);
420         throw new ScannerException("while scanning for the next token", null, text,
421                 reader.getMark());
422     }
423 
424     // Simple keys treatment.
425 
426     /**
427      * Return the number of the nearest possible simple key. Actually we don't
428      * need to loop through the whole dictionary.
429      */
nextPossibleSimpleKey()430     private int nextPossibleSimpleKey() {
431         /*
432          * the implementation is not as in PyYAML. Because
433          * this.possibleSimpleKeys is ordered we can simply take the first key
434          */
435         if (!this.possibleSimpleKeys.isEmpty()) {
436             return this.possibleSimpleKeys.values().iterator().next().getTokenNumber();
437         }
438         return -1;
439     }
440 
441     /**
442      * <pre>
443      * Remove entries that are no longer possible simple keys. According to
444      * the YAML specification, simple keys
445      * - should be limited to a single line,
446      * - should be no longer than 1024 characters.
447      * Disabling this procedure will allow simple keys of any length and
448      * height (may cause problems if indentation is broken though).
449      * </pre>
450      */
stalePossibleSimpleKeys()451     private void stalePossibleSimpleKeys() {
452         if (!this.possibleSimpleKeys.isEmpty()) {
453             for (Iterator<SimpleKey> iterator = this.possibleSimpleKeys.values().iterator(); iterator
454                     .hasNext();) {
455                 SimpleKey key = iterator.next();
456                 if ((key.getLine() != reader.getLine())
457                         || (reader.getIndex() - key.getIndex() > 1024)) {
458                     // If the key is not on the same line as the current
459                     // position OR the difference in column between the token
460                     // start and the current position is more than the maximum
461                     // simple key length, then this cannot be a simple key.
462                     if (key.isRequired()) {
463                         // If the key was required, this implies an error
464                         // condition.
465                         throw new ScannerException("while scanning a simple key", key.getMark(),
466                                 "could not find expected ':'", reader.getMark());
467                     }
468                     iterator.remove();
469                 }
470             }
471         }
472     }
473 
474     /**
475      * The next token may start a simple key. We check if it's possible and save
476      * its position. This function is called for ALIAS, ANCHOR, TAG,
477      * SCALAR(flow), '[', and '{'.
478      */
savePossibleSimpleKey()479     private void savePossibleSimpleKey() {
480         // The next token may start a simple key. We check if it's possible
481         // and save its position. This function is called for
482         // ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
483 
484         // Check if a simple key is required at the current position.
485         // A simple key is required if this position is the root flowLevel, AND
486         // the current indentation level is the same as the last indent-level.
487         boolean required = (this.flowLevel == 0) && (this.indent == this.reader.getColumn());
488 
489         if (allowSimpleKey || !required) {
490             // A simple key is required only if it is the first token in the
491             // current line. Therefore it is always allowed.
492         } else {
493             throw new YAMLException(
494                     "A simple key is required only if it is the first token in the current line");
495         }
496 
497         // The next token might be a simple key. Let's save it's number and
498         // position.
499         if (this.allowSimpleKey) {
500             removePossibleSimpleKey();
501             int tokenNumber = this.tokensTaken + this.tokens.size();
502             SimpleKey key = new SimpleKey(tokenNumber, required, reader.getIndex(),
503                     reader.getLine(), this.reader.getColumn(), this.reader.getMark());
504             this.possibleSimpleKeys.put(this.flowLevel, key);
505         }
506     }
507 
508     /**
509      * Remove the saved possible key position at the current flow level.
510      */
removePossibleSimpleKey()511     private void removePossibleSimpleKey() {
512         SimpleKey key = possibleSimpleKeys.remove(flowLevel);
513         if (key != null && key.isRequired()) {
514             throw new ScannerException("while scanning a simple key", key.getMark(),
515                     "could not find expected ':'", reader.getMark());
516         }
517     }
518 
519     // Indentation functions.
520 
521     /**
522      * * Handle implicitly ending multiple levels of block nodes by decreased
523      * indentation. This function becomes important on lines 4 and 7 of this
524      * example:
525      *
526      * <pre>
527      * 1) book one:
528      * 2)   part one:
529      * 3)     chapter one
530      * 4)   part two:
531      * 5)     chapter one
532      * 6)     chapter two
533      * 7) book two:
534      * </pre>
535      *
536      * In flow context, tokens should respect indentation. Actually the
537      * condition should be `self.indent &gt;= column` according to the spec. But
538      * this condition will prohibit intuitively correct constructions such as
539      * key : { } </pre>
540      */
unwindIndent(int col)541     private void unwindIndent(int col) {
542         // In the flow context, indentation is ignored. We make the scanner less
543         // restrictive then specification requires.
544         if (this.flowLevel != 0) {
545             return;
546         }
547 
548         // In block context, we may need to issue the BLOCK-END tokens.
549         while (this.indent > col) {
550             Mark mark = reader.getMark();
551             this.indent = this.indents.pop();
552             this.tokens.add(new BlockEndToken(mark, mark));
553         }
554     }
555 
556     /**
557      * Check if we need to increase indentation.
558      */
addIndent(int column)559     private boolean addIndent(int column) {
560         if (this.indent < column) {
561             this.indents.push(this.indent);
562             this.indent = column;
563             return true;
564         }
565         return false;
566     }
567 
568     // Fetchers.
569 
570     /**
571      * We always add STREAM-START as the first token and STREAM-END as the last
572      * token.
573      */
fetchStreamStart()574     private void fetchStreamStart() {
575         // Read the token.
576         Mark mark = reader.getMark();
577 
578         // Add STREAM-START.
579         Token token = new StreamStartToken(mark, mark);
580         this.tokens.add(token);
581     }
582 
fetchStreamEnd()583     private void fetchStreamEnd() {
584         // Set the current intendation to -1.
585         unwindIndent(-1);
586 
587         // Reset simple keys.
588         removePossibleSimpleKey();
589         this.allowSimpleKey = false;
590         this.possibleSimpleKeys.clear();
591 
592         // Read the token.
593         Mark mark = reader.getMark();
594 
595         // Add STREAM-END.
596         Token token = new StreamEndToken(mark, mark);
597         this.tokens.add(token);
598 
599         // The stream is finished.
600         this.done = true;
601     }
602 
603     /**
604      * Fetch a YAML directive. Directives are presentation details that are
605      * interpreted as instructions to the processor. YAML defines two kinds of
606      * directives, YAML and TAG; all other types are reserved for future use.
607      *
608      * @see <a href="http://www.yaml.org/spec/1.1/#id864824"></a>
609      */
fetchDirective()610     private void fetchDirective() {
611         // Set the current intendation to -1.
612         unwindIndent(-1);
613 
614         // Reset simple keys.
615         removePossibleSimpleKey();
616         this.allowSimpleKey = false;
617 
618         // Scan and add DIRECTIVE.
619         Token tok = scanDirective();
620         this.tokens.add(tok);
621     }
622 
623     /**
624      * Fetch a document-start token ("---").
625      */
fetchDocumentStart()626     private void fetchDocumentStart() {
627         fetchDocumentIndicator(true);
628     }
629 
630     /**
631      * Fetch a document-end token ("...").
632      */
fetchDocumentEnd()633     private void fetchDocumentEnd() {
634         fetchDocumentIndicator(false);
635     }
636 
637     /**
638      * Fetch a document indicator, either "---" for "document-start", or else
639      * "..." for "document-end. The type is chosen by the given boolean.
640      */
fetchDocumentIndicator(boolean isDocumentStart)641     private void fetchDocumentIndicator(boolean isDocumentStart) {
642         // Set the current intendation to -1.
643         unwindIndent(-1);
644 
645         // Reset simple keys. Note that there could not be a block collection
646         // after '---'.
647         removePossibleSimpleKey();
648         this.allowSimpleKey = false;
649 
650         // Add DOCUMENT-START or DOCUMENT-END.
651         Mark startMark = reader.getMark();
652         reader.forward(3);
653         Mark endMark = reader.getMark();
654         Token token;
655         if (isDocumentStart) {
656             token = new DocumentStartToken(startMark, endMark);
657         } else {
658             token = new DocumentEndToken(startMark, endMark);
659         }
660         this.tokens.add(token);
661     }
662 
fetchFlowSequenceStart()663     private void fetchFlowSequenceStart() {
664         fetchFlowCollectionStart(false);
665     }
666 
fetchFlowMappingStart()667     private void fetchFlowMappingStart() {
668         fetchFlowCollectionStart(true);
669     }
670 
671     /**
672      * Fetch a flow-style collection start, which is either a sequence or a
673      * mapping. The type is determined by the given boolean.
674      *
675      * A flow-style collection is in a format similar to JSON. Sequences are
676      * started by '[' and ended by ']'; mappings are started by '{' and ended by
677      * '}'.
678      *
679      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
680      *
681      * @param isMappingStart
682      */
fetchFlowCollectionStart(boolean isMappingStart)683     private void fetchFlowCollectionStart(boolean isMappingStart) {
684         // '[' and '{' may start a simple key.
685         savePossibleSimpleKey();
686 
687         // Increase the flow level.
688         this.flowLevel++;
689 
690         // Simple keys are allowed after '[' and '{'.
691         this.allowSimpleKey = true;
692 
693         // Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
694         Mark startMark = reader.getMark();
695         reader.forward(1);
696         Mark endMark = reader.getMark();
697         Token token;
698         if (isMappingStart) {
699             token = new FlowMappingStartToken(startMark, endMark);
700         } else {
701             token = new FlowSequenceStartToken(startMark, endMark);
702         }
703         this.tokens.add(token);
704     }
705 
fetchFlowSequenceEnd()706     private void fetchFlowSequenceEnd() {
707         fetchFlowCollectionEnd(false);
708     }
709 
fetchFlowMappingEnd()710     private void fetchFlowMappingEnd() {
711         fetchFlowCollectionEnd(true);
712     }
713 
714     /**
715      * Fetch a flow-style collection end, which is either a sequence or a
716      * mapping. The type is determined by the given boolean.
717      *
718      * A flow-style collection is in a format similar to JSON. Sequences are
719      * started by '[' and ended by ']'; mappings are started by '{' and ended by
720      * '}'.
721      *
722      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
723      */
fetchFlowCollectionEnd(boolean isMappingEnd)724     private void fetchFlowCollectionEnd(boolean isMappingEnd) {
725         // Reset possible simple key on the current level.
726         removePossibleSimpleKey();
727 
728         // Decrease the flow level.
729         this.flowLevel--;
730 
731         // No simple keys after ']' or '}'.
732         this.allowSimpleKey = false;
733 
734         // Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
735         Mark startMark = reader.getMark();
736         reader.forward();
737         Mark endMark = reader.getMark();
738         Token token;
739         if (isMappingEnd) {
740             token = new FlowMappingEndToken(startMark, endMark);
741         } else {
742             token = new FlowSequenceEndToken(startMark, endMark);
743         }
744         this.tokens.add(token);
745     }
746 
747     /**
748      * Fetch an entry in the flow style. Flow-style entries occur either
749      * immediately after the start of a collection, or else after a comma.
750      *
751      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
752      */
fetchFlowEntry()753     private void fetchFlowEntry() {
754         // Simple keys are allowed after ','.
755         this.allowSimpleKey = true;
756 
757         // Reset possible simple key on the current level.
758         removePossibleSimpleKey();
759 
760         // Add FLOW-ENTRY.
761         Mark startMark = reader.getMark();
762         reader.forward();
763         Mark endMark = reader.getMark();
764         Token token = new FlowEntryToken(startMark, endMark);
765         this.tokens.add(token);
766     }
767 
768     /**
769      * Fetch an entry in the block style.
770      *
771      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
772      */
fetchBlockEntry()773     private void fetchBlockEntry() {
774         // Block context needs additional checks.
775         if (this.flowLevel == 0) {
776             // Are we allowed to start a new entry?
777             if (!this.allowSimpleKey) {
778                 throw new ScannerException(null, null, "sequence entries are not allowed here",
779                         reader.getMark());
780             }
781 
782             // We may need to add BLOCK-SEQUENCE-START.
783             if (addIndent(this.reader.getColumn())) {
784                 Mark mark = reader.getMark();
785                 this.tokens.add(new BlockSequenceStartToken(mark, mark));
786             }
787         } else {
788             // It's an error for the block entry to occur in the flow
789             // context,but we let the parser detect this.
790         }
791         // Simple keys are allowed after '-'.
792         this.allowSimpleKey = true;
793 
794         // Reset possible simple key on the current level.
795         removePossibleSimpleKey();
796 
797         // Add BLOCK-ENTRY.
798         Mark startMark = reader.getMark();
799         reader.forward();
800         Mark endMark = reader.getMark();
801         Token token = new BlockEntryToken(startMark, endMark);
802         this.tokens.add(token);
803     }
804 
805     /**
806      * Fetch a key in a block-style mapping.
807      *
808      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
809      */
fetchKey()810     private void fetchKey() {
811         // Block context needs additional checks.
812         if (this.flowLevel == 0) {
813             // Are we allowed to start a key (not necessary a simple)?
814             if (!this.allowSimpleKey) {
815                 throw new ScannerException(null, null, "mapping keys are not allowed here",
816                         reader.getMark());
817             }
818             // We may need to add BLOCK-MAPPING-START.
819             if (addIndent(this.reader.getColumn())) {
820                 Mark mark = reader.getMark();
821                 this.tokens.add(new BlockMappingStartToken(mark, mark));
822             }
823         }
824         // Simple keys are allowed after '?' in the block context.
825         this.allowSimpleKey = this.flowLevel == 0;
826 
827         // Reset possible simple key on the current level.
828         removePossibleSimpleKey();
829 
830         // Add KEY.
831         Mark startMark = reader.getMark();
832         reader.forward();
833         Mark endMark = reader.getMark();
834         Token token = new KeyToken(startMark, endMark);
835         this.tokens.add(token);
836     }
837 
838     /**
839      * Fetch a value in a block-style mapping.
840      *
841      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
842      */
fetchValue()843     private void fetchValue() {
844         // Do we determine a simple key?
845         SimpleKey key = this.possibleSimpleKeys.remove(this.flowLevel);
846         if (key != null) {
847             // Add KEY.
848             this.tokens.add(key.getTokenNumber() - this.tokensTaken, new KeyToken(key.getMark(),
849                     key.getMark()));
850 
851             // If this key starts a new block mapping, we need to add
852             // BLOCK-MAPPING-START.
853             if (this.flowLevel == 0) {
854                 if (addIndent(key.getColumn())) {
855                     this.tokens.add(key.getTokenNumber() - this.tokensTaken,
856                             new BlockMappingStartToken(key.getMark(), key.getMark()));
857                 }
858             }
859             // There cannot be two simple keys one after another.
860             this.allowSimpleKey = false;
861 
862         } else {
863             // It must be a part of a complex key.
864             // Block context needs additional checks. Do we really need them?
865             // They will be caught by the parser anyway.
866             if (this.flowLevel == 0) {
867 
868                 // We are allowed to start a complex value if and only if we can
869                 // start a simple key.
870                 if (!this.allowSimpleKey) {
871                     throw new ScannerException(null, null, "mapping values are not allowed here",
872                             reader.getMark());
873                 }
874             }
875 
876             // If this value starts a new block mapping, we need to add
877             // BLOCK-MAPPING-START. It will be detected as an error later by
878             // the parser.
879             if (flowLevel == 0) {
880                 if (addIndent(reader.getColumn())) {
881                     Mark mark = reader.getMark();
882                     this.tokens.add(new BlockMappingStartToken(mark, mark));
883                 }
884             }
885 
886             // Simple keys are allowed after ':' in the block context.
887             allowSimpleKey = flowLevel == 0;
888 
889             // Reset possible simple key on the current level.
890             removePossibleSimpleKey();
891         }
892         // Add VALUE.
893         Mark startMark = reader.getMark();
894         reader.forward();
895         Mark endMark = reader.getMark();
896         Token token = new ValueToken(startMark, endMark);
897         this.tokens.add(token);
898     }
899 
900     /**
901      * Fetch an alias, which is a reference to an anchor. Aliases take the
902      * format:
903      *
904      * <pre>
905      * *(anchor name)
906      * </pre>
907      *
908      * @see <a href="http://www.yaml.org/spec/1.1/#id863390"></a>
909      */
fetchAlias()910     private void fetchAlias() {
911         // ALIAS could be a simple key.
912         savePossibleSimpleKey();
913 
914         // No simple keys after ALIAS.
915         this.allowSimpleKey = false;
916 
917         // Scan and add ALIAS.
918         Token tok = scanAnchor(false);
919         this.tokens.add(tok);
920     }
921 
922     /**
923      * Fetch an anchor. Anchors take the form:
924      *
925      * <pre>
926      * &(anchor name)
927      * </pre>
928      *
929      * @see <a href="http://www.yaml.org/spec/1.1/#id863390"></a>
930      */
fetchAnchor()931     private void fetchAnchor() {
932         // ANCHOR could start a simple key.
933         savePossibleSimpleKey();
934 
935         // No simple keys after ANCHOR.
936         this.allowSimpleKey = false;
937 
938         // Scan and add ANCHOR.
939         Token tok = scanAnchor(true);
940         this.tokens.add(tok);
941     }
942 
943     /**
944      * Fetch a tag. Tags take a complex form.
945      *
946      * @see <a href="http://www.yaml.org/spec/1.1/#id861700"></a>
947      */
fetchTag()948     private void fetchTag() {
949         // TAG could start a simple key.
950         savePossibleSimpleKey();
951 
952         // No simple keys after TAG.
953         this.allowSimpleKey = false;
954 
955         // Scan and add TAG.
956         Token tok = scanTag();
957         this.tokens.add(tok);
958     }
959 
960     /**
961      * Fetch a literal scalar, denoted with a vertical-bar. This is the type
962      * best used for source code and other content, such as binary data, which
963      * must be included verbatim.
964      *
965      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
966      */
fetchLiteral()967     private void fetchLiteral() {
968         fetchBlockScalar('|');
969     }
970 
971     /**
972      * Fetch a folded scalar, denoted with a greater-than sign. This is the type
973      * best used for long content, such as the text of a chapter or description.
974      *
975      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
976      */
fetchFolded()977     private void fetchFolded() {
978         fetchBlockScalar('>');
979     }
980 
981     /**
982      * Fetch a block scalar (literal or folded).
983      *
984      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
985      *
986      * @param style
987      */
fetchBlockScalar(char style)988     private void fetchBlockScalar(char style) {
989         // A simple key may follow a block scalar.
990         this.allowSimpleKey = true;
991 
992         // Reset possible simple key on the current level.
993         removePossibleSimpleKey();
994 
995         // Scan and add SCALAR.
996         Token tok = scanBlockScalar(style);
997         this.tokens.add(tok);
998     }
999 
1000     /**
1001      * Fetch a single-quoted (') scalar.
1002      */
fetchSingle()1003     private void fetchSingle() {
1004         fetchFlowScalar('\'');
1005     }
1006 
1007     /**
1008      * Fetch a double-quoted (") scalar.
1009      */
fetchDouble()1010     private void fetchDouble() {
1011         fetchFlowScalar('"');
1012     }
1013 
1014     /**
1015      * Fetch a flow scalar (single- or double-quoted).
1016      *
1017      * @see <a href="http://www.yaml.org/spec/1.1/#id863975"></a>
1018      *
1019      * @param style
1020      */
fetchFlowScalar(char style)1021     private void fetchFlowScalar(char style) {
1022         // A flow scalar could be a simple key.
1023         savePossibleSimpleKey();
1024 
1025         // No simple keys after flow scalars.
1026         this.allowSimpleKey = false;
1027 
1028         // Scan and add SCALAR.
1029         Token tok = scanFlowScalar(style);
1030         this.tokens.add(tok);
1031     }
1032 
1033     /**
1034      * Fetch a plain scalar.
1035      */
fetchPlain()1036     private void fetchPlain() {
1037         // A plain scalar could be a simple key.
1038         savePossibleSimpleKey();
1039 
1040         // No simple keys after plain scalars. But note that `scan_plain` will
1041         // change this flag if the scan is finished at the beginning of the
1042         // line.
1043         this.allowSimpleKey = false;
1044 
1045         // Scan and add SCALAR. May change `allow_simple_key`.
1046         Token tok = scanPlain();
1047         this.tokens.add(tok);
1048     }
1049 
1050     // Checkers.
1051     /**
1052      * Returns true if the next thing on the reader is a directive, given that
1053      * the leading '%' has already been checked.
1054      *
1055      * @see <a href="http://www.yaml.org/spec/1.1/#id864824"></a>
1056      */
checkDirective()1057     private boolean checkDirective() {
1058         // DIRECTIVE: ^ '%' ...
1059         // The '%' indicator is already checked.
1060         return reader.getColumn() == 0;
1061     }
1062 
/**
 * Returns true if the next thing on the reader is a document-start marker:
 * "---" in the first column, followed by a NUL, blank, tab or line break.
 */
checkDocumentStart()1067     private boolean checkDocumentStart() {
1068         // DOCUMENT-START: ^ '---' (' '|'\n')
1069         if (reader.getColumn() == 0) {
1070             if ("---".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1071                 return true;
1072             }
1073         }
1074         return false;
1075     }
1076 
/**
 * Returns true if the next thing on the reader is a document-end marker:
 * "..." in the first column, followed by a NUL, blank, tab or line break.
 */
checkDocumentEnd()1081     private boolean checkDocumentEnd() {
1082         // DOCUMENT-END: ^ '...' (' '|'\n')
1083         if (reader.getColumn() == 0) {
1084             if ("...".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1085                 return true;
1086             }
1087         }
1088         return false;
1089     }
1090 
/**
 * Returns true if the '-' under the cursor is a block entry indicator, i.e.
 * the character after it is a NUL, blank, tab or line break.
 */
checkBlockEntry()1094     private boolean checkBlockEntry() {
1095         // BLOCK-ENTRY: '-' (' '|'\n')
1096         return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1097     }
1098 
1099     /**
1100      * Returns true if the next thing on the reader is a key token.
1101      */
checkKey()1102     private boolean checkKey() {
1103         // KEY(flow context): '?'
1104         if (this.flowLevel != 0) {
1105             return true;
1106         } else {
1107             // KEY(block context): '?' (' '|'\n')
1108             return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1109         }
1110     }
1111 
1112     /**
1113      * Returns true if the next thing on the reader is a value token.
1114      */
checkValue()1115     private boolean checkValue() {
1116         // VALUE(flow context): ':'
1117         if (flowLevel != 0) {
1118             return true;
1119         } else {
1120             // VALUE(block context): ':' (' '|'\n')
1121             return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1122         }
1123     }
1124 
1125     /**
1126      * Returns true if the next thing on the reader is a plain token.
1127      */
checkPlain()1128     private boolean checkPlain() {
1129         /**
1130          * <pre>
1131          * A plain scalar may start with any non-space character except:
1132          *   '-', '?', ':', ',', '[', ']', '{', '}',
1133          *   '#', '&amp;', '*', '!', '|', '&gt;', '\'', '\&quot;',
1134          *   '%', '@', '`'.
1135          *
1136          * It may also start with
1137          *   '-', '?', ':'
1138          * if it is followed by a non-space character.
1139          *
1140          * Note that we limit the last rule to the block context (except the
1141          * '-' character) because we want the flow context to be space
1142          * independent.
1143          * </pre>
1144          */
1145         char ch = reader.peek();
1146         // If the next char is NOT one of the forbidden chars above or
1147         // whitespace, then this is the start of a plain scalar.
1148         return Constant.NULL_BL_T_LINEBR.hasNo(ch, "-?:,[]{}#&*!|>\'\"%@`")
1149                 || (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(1)) && (ch == '-' || (this.flowLevel == 0 && "?:"
1150                         .indexOf(ch) != -1)));
1151     }
1152 
1153     // Scanners.
1154 
1155     /**
1156      * <pre>
1157      * We ignore spaces, line breaks and comments.
1158      * If we find a line break in the block context, we set the flag
1159      * `allow_simple_key` on.
1160      * The byte order mark is stripped if it's the first character in the
1161      * stream. We do not yet support BOM inside the stream as the
1162      * specification requires. Any such mark will be considered as a part
1163      * of the document.
1164      * TODO: We need to make tab handling rules more sane. A good rule is
1165      *   Tabs cannot precede tokens
1166      *   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
1167      *   KEY(block), VALUE(block), BLOCK-ENTRY
1168      * So the checking code is
1169      *   if &lt;TAB&gt;:
1170      *       self.allow_simple_keys = False
1171      * We also need to add the check for `allow_simple_keys == True` to
1172      * `unwind_indent` before issuing BLOCK-END.
1173      * Scanners for block, flow, and plain scalars need to be modified.
1174      * </pre>
1175      */
scanToNextToken()1176     private void scanToNextToken() {
1177         // If there is a byte order mark (BOM) at the beginning of the stream,
1178         // forward past it.
1179         if (reader.getIndex() == 0 && reader.peek() == '\uFEFF') {
1180             reader.forward();
1181         }
1182         boolean found = false;
1183         while (!found) {
1184             int ff = 0;
1185             // Peek ahead until we find the first non-space character, then
1186             // move forward directly to that character.
1187             while (reader.peek(ff) == ' ') {
1188                 ff++;
1189             }
1190             if (ff > 0) {
1191                 reader.forward(ff);
1192             }
1193             // If the character we have skipped forward to is a comment (#),
1194             // then peek ahead until we find the next end of line. YAML
1195             // comments are from a # to the next new-line. We then forward
1196             // past the comment.
1197             if (reader.peek() == '#') {
1198                 ff = 0;
1199                 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1200                     ff++;
1201                 }
1202                 if (ff > 0) {
1203                     reader.forward(ff);
1204                 }
1205             }
1206             // If we scanned a line break, then (depending on flow level),
1207             // simple keys may be allowed.
1208             if (scanLineBreak().length() != 0) {// found a line-break
1209                 if (this.flowLevel == 0) {
1210                     // Simple keys are allowed at flow-level 0 after a line
1211                     // break
1212                     this.allowSimpleKey = true;
1213                 }
1214             } else {
1215                 found = true;
1216             }
1217         }
1218     }
1219 
1220     @SuppressWarnings({ "unchecked", "rawtypes" })
scanDirective()1221     private Token scanDirective() {
1222         // See the specification for details.
1223         Mark startMark = reader.getMark();
1224         Mark endMark;
1225         reader.forward();
1226         String name = scanDirectiveName(startMark);
1227         List<?> value = null;
1228         if ("YAML".equals(name)) {
1229             value = scanYamlDirectiveValue(startMark);
1230             endMark = reader.getMark();
1231         } else if ("TAG".equals(name)) {
1232             value = scanTagDirectiveValue(startMark);
1233             endMark = reader.getMark();
1234         } else {
1235             endMark = reader.getMark();
1236             int ff = 0;
1237             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1238                 ff++;
1239             }
1240             if (ff > 0) {
1241                 reader.forward(ff);
1242             }
1243         }
1244         scanDirectiveIgnoredLine(startMark);
1245         return new DirectiveToken(name, value, startMark, endMark);
1246     }
1247 
1248     /**
1249      * Scan a directive name. Directive names are a series of non-space
1250      * characters.
1251      *
1252      * @see <a href="http://www.yaml.org/spec/1.1/#id895217"></a>
1253      */
scanDirectiveName(Mark startMark)1254     private String scanDirectiveName(Mark startMark) {
1255         // See the specification for details.
1256         int length = 0;
1257         // A Directive-name is a sequence of alphanumeric characters
1258         // (a-z,A-Z,0-9). We scan until we find something that isn't.
1259         // FIXME this disagrees with the specification.
1260         char ch = reader.peek(length);
1261         while (Constant.ALPHA.has(ch)) {
1262             length++;
1263             ch = reader.peek(length);
1264         }
1265         // If the name would be empty, an error occurs.
1266         if (length == 0) {
1267             throw new ScannerException("while scanning a directive", startMark,
1268                     "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch)
1269                             + ")", reader.getMark());
1270         }
1271         String value = reader.prefixForward(length);
1272         ch = reader.peek();
1273         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1274             throw new ScannerException("while scanning a directive", startMark,
1275                     "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch)
1276                             + ")", reader.getMark());
1277         }
1278         return value;
1279     }
1280 
scanYamlDirectiveValue(Mark startMark)1281     private List<Integer> scanYamlDirectiveValue(Mark startMark) {
1282         // See the specification for details.
1283         while (reader.peek() == ' ') {
1284             reader.forward();
1285         }
1286         Integer major = scanYamlDirectiveNumber(startMark);
1287         if (reader.peek() != '.') {
1288             throw new ScannerException("while scanning a directive", startMark,
1289                     "expected a digit or '.', but found " + reader.peek() + "("
1290                             + ((int) reader.peek()) + ")", reader.getMark());
1291         }
1292         reader.forward();
1293         Integer minor = scanYamlDirectiveNumber(startMark);
1294         if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) {
1295             throw new ScannerException("while scanning a directive", startMark,
1296                     "expected a digit or ' ', but found " + reader.peek() + "("
1297                             + ((int) reader.peek()) + ")", reader.getMark());
1298         }
1299         List<Integer> result = new ArrayList<Integer>(2);
1300         result.add(major);
1301         result.add(minor);
1302         return result;
1303     }
1304 
1305     /**
1306      * Read a %YAML directive number: this is either the major or the minor
1307      * part. Stop reading at a non-digit character (usually either '.' or '\n').
1308      *
1309      * @see <a href="http://www.yaml.org/spec/1.1/#id895631"></a>
1310      * @see <a href="http://www.yaml.org/spec/1.1/#ns-dec-digit"></a>
1311      */
scanYamlDirectiveNumber(Mark startMark)1312     private Integer scanYamlDirectiveNumber(Mark startMark) {
1313         // See the specification for details.
1314         char ch = reader.peek();
1315         if (!Character.isDigit(ch)) {
1316             throw new ScannerException("while scanning a directive", startMark,
1317                     "expected a digit, but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
1318         }
1319         int length = 0;
1320         while (Character.isDigit(reader.peek(length))) {
1321             length++;
1322         }
1323         Integer value = Integer.parseInt(reader.prefixForward(length));
1324         return value;
1325     }
1326 
1327     /**
1328      * <p>
1329      * Read a %TAG directive value:
1330      *
1331      * <pre>
1332      * s-ignored-space+ c-tag-handle s-ignored-space+ ns-tag-prefix s-l-comments
1333      * </pre>
1334      *
1335      * </p>
1336      *
1337      * @see <a href="http://www.yaml.org/spec/1.1/#id896044"></a>
1338      */
scanTagDirectiveValue(Mark startMark)1339     private List<String> scanTagDirectiveValue(Mark startMark) {
1340         // See the specification for details.
1341         while (reader.peek() == ' ') {
1342             reader.forward();
1343         }
1344         String handle = scanTagDirectiveHandle(startMark);
1345         while (reader.peek() == ' ') {
1346             reader.forward();
1347         }
1348         String prefix = scanTagDirectivePrefix(startMark);
1349         List<String> result = new ArrayList<String>(2);
1350         result.add(handle);
1351         result.add(prefix);
1352         return result;
1353     }
1354 
1355     /**
1356      * Scan a %TAG directive's handle. This is YAML's c-tag-handle.
1357      *
1358      * @see <a href="http://www.yaml.org/spec/1.1/#id896876"></a>
1359      * @param startMark
1360      * @return
1361      */
scanTagDirectiveHandle(Mark startMark)1362     private String scanTagDirectiveHandle(Mark startMark) {
1363         // See the specification for details.
1364         String value = scanTagHandle("directive", startMark);
1365         char ch = reader.peek();
1366         if (ch != ' ') {
1367             throw new ScannerException("while scanning a directive", startMark,
1368                     "expected ' ', but found " + reader.peek() + "(" + ch + ")", reader.getMark());
1369         }
1370         return value;
1371     }
1372 
1373     /**
1374      * Scan a %TAG directive's prefix. This is YAML's ns-tag-prefix.
1375      *
1376      * @see <a href="http://www.yaml.org/spec/1.1/#ns-tag-prefix"></a>
1377      */
scanTagDirectivePrefix(Mark startMark)1378     private String scanTagDirectivePrefix(Mark startMark) {
1379         // See the specification for details.
1380         String value = scanTagUri("directive", startMark);
1381         if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) {
1382             throw new ScannerException("while scanning a directive", startMark,
1383                     "expected ' ', but found " + reader.peek() + "(" + ((int) reader.peek()) + ")",
1384                     reader.getMark());
1385         }
1386         return value;
1387     }
1388 
scanDirectiveIgnoredLine(Mark startMark)1389     private String scanDirectiveIgnoredLine(Mark startMark) {
1390         // See the specification for details.
1391         int ff = 0;
1392         while (reader.peek(ff) == ' ') {
1393             ff++;
1394         }
1395         if (ff > 0) {
1396             reader.forward(ff);
1397         }
1398         if (reader.peek() == '#') {
1399             ff = 0;
1400             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1401                 ff++;
1402             }
1403             reader.forward(ff);
1404         }
1405         char ch = reader.peek();
1406         String lineBreak = scanLineBreak();
1407         if (lineBreak.length() == 0 && ch != '\0') {
1408             throw new ScannerException("while scanning a directive", startMark,
1409                     "expected a comment or a line break, but found " + ch + "(" + ((int) ch) + ")",
1410                     reader.getMark());
1411         }
1412         return lineBreak;
1413     }
1414 
1415     /**
1416      * <pre>
1417      * The specification does not restrict characters for anchors and
1418      * aliases. This may lead to problems, for instance, the document:
1419      *   [ *alias, value ]
1420      * can be interpreted in two ways, as
1421      *   [ &quot;value&quot; ]
1422      * and
1423      *   [ *alias , &quot;value&quot; ]
1424      * Therefore we restrict aliases to numbers and ASCII letters.
1425      * </pre>
1426      */
scanAnchor(boolean isAnchor)1427     private Token scanAnchor(boolean isAnchor) {
1428         Mark startMark = reader.getMark();
1429         char indicator = reader.peek();
1430         String name = indicator == '*' ? "alias" : "anchor";
1431         reader.forward();
1432         int length = 0;
1433         char ch = reader.peek(length);
1434         while (Constant.ALPHA.has(ch)) {
1435             length++;
1436             ch = reader.peek(length);
1437         }
1438         if (length == 0) {
1439             throw new ScannerException("while scanning an " + name, startMark,
1440                     "expected alphabetic or numeric character, but found " + ch,
1441                     reader.getMark());
1442         }
1443         String value = reader.prefixForward(length);
1444         ch = reader.peek();
1445         if (Constant.NULL_BL_T_LINEBR.hasNo(ch, "?:,]}%@`")) {
1446             throw new ScannerException("while scanning an " + name, startMark,
1447                     "expected alphabetic or numeric character, but found " + ch + "("
1448                             + ((int) reader.peek()) + ")", reader.getMark());
1449         }
1450         Mark endMark = reader.getMark();
1451         Token tok;
1452         if (isAnchor) {
1453             tok = new AnchorToken(value, startMark, endMark);
1454         } else {
1455             tok = new AliasToken(value, startMark, endMark);
1456         }
1457         return tok;
1458     }
1459 
1460     /**
1461      * <p>
1462      * Scan a Tag property. A Tag property may be specified in one of three
1463      * ways: c-verbatim-tag, c-ns-shorthand-tag, or c-ns-non-specific-tag
1464      * </p>
1465      *
1466      * <p>
1467      * c-verbatim-tag takes the form !&lt;ns-uri-char+&gt; and must be delivered
1468      * verbatim (as-is) to the application. In particular, verbatim tags are not
1469      * subject to tag resolution.
1470      * </p>
1471      *
1472      * <p>
1473      * c-ns-shorthand-tag is a valid tag handle followed by a non-empty suffix.
1474      * If the tag handle is a c-primary-tag-handle ('!') then the suffix must
1475      * have all exclamation marks properly URI-escaped (%21); otherwise, the
1476      * string will look like a named tag handle: !foo!bar would be interpreted
1477      * as (handle="!foo!", suffix="bar").
1478      * </p>
1479      *
1480      * <p>
1481      * c-ns-non-specific-tag is always a lone '!'; this is only useful for plain
1482      * scalars, where its specification means that the scalar MUST be resolved
1483      * to have type tag:yaml.org,2002:str.
1484      * </p>
1485      *
1486      * TODO SnakeYaml incorrectly ignores c-ns-non-specific-tag right now.
1487      *
1488      * @see <a href="http://www.yaml.org/spec/1.1/#id900262"></a>
1489      *
1490      *      TODO Note that this method does not enforce rules about local versus
1491      *      global tags!
1492      */
scanTag()1493     private Token scanTag() {
1494         // See the specification for details.
1495         Mark startMark = reader.getMark();
1496         // Determine the type of tag property based on the first character
1497         // encountered
1498         char ch = reader.peek(1);
1499         String handle = null;
1500         String suffix = null;
1501         // Verbatim tag! (c-verbatim-tag)
1502         if (ch == '<') {
1503             // Skip the exclamation mark and &gt;, then read the tag suffix (as
1504             // a URI).
1505             reader.forward(2);
1506             suffix = scanTagUri("tag", startMark);
1507             if (reader.peek() != '>') {
1508                 // If there are any characters between the end of the tag-suffix
1509                 // URI and the closing &gt;, then an error has occurred.
1510                 throw new ScannerException("while scanning a tag", startMark,
1511                         "expected '>', but found '" + reader.peek() + "' (" + ((int) reader.peek())
1512                                 + ")", reader.getMark());
1513             }
1514             reader.forward();
1515         } else if (Constant.NULL_BL_T_LINEBR.has(ch)) {
1516             // A NUL, blank, tab, or line-break means that this was a
1517             // c-ns-non-specific tag.
1518             suffix = "!";
1519             reader.forward();
1520         } else {
1521             // Any other character implies c-ns-shorthand-tag type.
1522 
1523             // Look ahead in the stream to determine whether this tag property
1524             // is of the form !foo or !foo!bar.
1525             int length = 1;
1526             boolean useHandle = false;
1527             while (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1528                 if (ch == '!') {
1529                     useHandle = true;
1530                     break;
1531                 }
1532                 length++;
1533                 ch = reader.peek(length);
1534             }
1535             handle = "!";
1536             // If we need to use a handle, scan it in; otherwise, the handle is
1537             // presumed to be '!'.
1538             if (useHandle) {
1539                 handle = scanTagHandle("tag", startMark);
1540             } else {
1541                 handle = "!";
1542                 reader.forward();
1543             }
1544             suffix = scanTagUri("tag", startMark);
1545         }
1546         ch = reader.peek();
1547         // Check that the next character is allowed to follow a tag-property;
1548         // if it is not, raise the error.
1549         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1550             throw new ScannerException("while scanning a tag", startMark,
1551                     "expected ' ', but found '" + ch + "' (" + ((int) ch) + ")", reader.getMark());
1552         }
1553         TagTuple value = new TagTuple(handle, suffix);
1554         Mark endMark = reader.getMark();
1555         return new TagToken(value, startMark, endMark);
1556     }
1557 
scanBlockScalar(char style)1558     private Token scanBlockScalar(char style) {
1559         // See the specification for details.
1560         boolean folded;
1561         // Depending on the given style, we determine whether the scalar is
1562         // folded ('>') or literal ('|')
1563         if (style == '>') {
1564             folded = true;
1565         } else {
1566             folded = false;
1567         }
1568         StringBuilder chunks = new StringBuilder();
1569         Mark startMark = reader.getMark();
1570         // Scan the header.
1571         reader.forward();
1572         Chomping chompi = scanBlockScalarIndicators(startMark);
1573         int increment = chompi.getIncrement();
1574         scanBlockScalarIgnoredLine(startMark);
1575 
1576         // Determine the indentation level and go to the first non-empty line.
1577         int minIndent = this.indent + 1;
1578         if (minIndent < 1) {
1579             minIndent = 1;
1580         }
1581         String breaks = null;
1582         int maxIndent = 0;
1583         int indent = 0;
1584         Mark endMark;
1585         if (increment == -1) {
1586             Object[] brme = scanBlockScalarIndentation();
1587             breaks = (String) brme[0];
1588             maxIndent = ((Integer) brme[1]).intValue();
1589             endMark = (Mark) brme[2];
1590             indent = Math.max(minIndent, maxIndent);
1591         } else {
1592             indent = minIndent + increment - 1;
1593             Object[] brme = scanBlockScalarBreaks(indent);
1594             breaks = (String) brme[0];
1595             endMark = (Mark) brme[1];
1596         }
1597 
1598         String lineBreak = "";
1599 
1600         // Scan the inner part of the block scalar.
1601         while (this.reader.getColumn() == indent && reader.peek() != '\0') {
1602             chunks.append(breaks);
1603             boolean leadingNonSpace = " \t".indexOf(reader.peek()) == -1;
1604             int length = 0;
1605             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(length))) {
1606                 length++;
1607             }
1608             chunks.append(reader.prefixForward(length));
1609             lineBreak = scanLineBreak();
1610             Object[] brme = scanBlockScalarBreaks(indent);
1611             breaks = (String) brme[0];
1612             endMark = (Mark) brme[1];
1613             if (this.reader.getColumn() == indent && reader.peek() != '\0') {
1614 
1615                 // Unfortunately, folding rules are ambiguous.
1616                 //
1617                 // This is the folding according to the specification:
1618                 if (folded && "\n".equals(lineBreak) && leadingNonSpace
1619                         && " \t".indexOf(reader.peek()) == -1) {
1620                     if (breaks.length() == 0) {
1621                         chunks.append(" ");
1622                     }
1623                 } else {
1624                     chunks.append(lineBreak);
1625                 }
1626                 // Clark Evans's interpretation (also in the spec examples) not
1627                 // imported from PyYAML
1628             } else {
1629                 break;
1630             }
1631         }
1632         // Chomp the tail.
1633         if (chompi.chompTailIsNotFalse()) {
1634             chunks.append(lineBreak);
1635         }
1636         if (chompi.chompTailIsTrue()) {
1637             chunks.append(breaks);
1638         }
1639         // We are done.
1640         return new ScalarToken(chunks.toString(), false, startMark, endMark, style);
1641     }
1642 
1643     /**
1644      * Scan a block scalar indicator. The block scalar indicator includes two
1645      * optional components, which may appear in either order.
1646      *
1647      * A block indentation indicator is a non-zero digit describing the
1648      * indentation level of the block scalar to follow. This indentation is an
1649      * additional number of spaces relative to the current indentation level.
1650      *
1651      * A block chomping indicator is a + or -, selecting the chomping mode away
1652      * from the default (clip) to either -(strip) or +(keep).
1653      *
1654      * @see <a href="http://www.yaml.org/spec/1.1/#id868988"></a>
1655      * @see <a href="http://www.yaml.org/spec/1.1/#id927035"></a>
1656      * @see <a href="http://www.yaml.org/spec/1.1/#id927557"></a>
1657      */
scanBlockScalarIndicators(Mark startMark)1658     private Chomping scanBlockScalarIndicators(Mark startMark) {
1659         // See the specification for details.
1660         Boolean chomping = null;
1661         int increment = -1;
1662         char ch = reader.peek();
1663         if (ch == '-' || ch == '+') {
1664             if (ch == '+') {
1665                 chomping = Boolean.TRUE;
1666             } else {
1667                 chomping = Boolean.FALSE;
1668             }
1669             reader.forward();
1670             ch = reader.peek();
1671             if (Character.isDigit(ch)) {
1672                 increment = Integer.parseInt(String.valueOf(ch));
1673                 if (increment == 0) {
1674                     throw new ScannerException("while scanning a block scalar", startMark,
1675                             "expected indentation indicator in the range 1-9, but found 0",
1676                             reader.getMark());
1677                 }
1678                 reader.forward();
1679             }
1680         } else if (Character.isDigit(ch)) {
1681             increment = Integer.parseInt(String.valueOf(ch));
1682             if (increment == 0) {
1683                 throw new ScannerException("while scanning a block scalar", startMark,
1684                         "expected indentation indicator in the range 1-9, but found 0",
1685                         reader.getMark());
1686             }
1687             reader.forward();
1688             ch = reader.peek();
1689             if (ch == '-' || ch == '+') {
1690                 if (ch == '+') {
1691                     chomping = Boolean.TRUE;
1692                 } else {
1693                     chomping = Boolean.FALSE;
1694                 }
1695                 reader.forward();
1696             }
1697         }
1698         ch = reader.peek();
1699         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1700             throw new ScannerException("while scanning a block scalar", startMark,
1701                     "expected chomping or indentation indicators, but found " + ch,
1702                     reader.getMark());
1703         }
1704         return new Chomping(chomping, increment);
1705     }
1706 
1707     /**
1708      * Scan to the end of the line after a block scalar has been scanned; the
1709      * only things that are permitted at this time are comments and spaces.
1710      */
scanBlockScalarIgnoredLine(Mark startMark)1711     private String scanBlockScalarIgnoredLine(Mark startMark) {
1712         // See the specification for details.
1713         int ff = 0;
1714         // Forward past any number of trailing spaces
1715         while (reader.peek(ff) == ' ') {
1716             ff++;
1717         }
1718         if (ff > 0) {
1719             reader.forward(ff);
1720         }
1721         // If a comment occurs, scan to just before the end of line.
1722         if (reader.peek() == '#') {
1723             ff = 0;
1724             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1725                 ff++;
1726             }
1727             if (ff > 0) {
1728                 reader.forward(ff);
1729             }
1730         }
1731         // If the next character is not a null or line break, an error has
1732         // occurred.
1733         char ch = reader.peek();
1734         String lineBreak = scanLineBreak();
1735         if (lineBreak.length() == 0 && ch != '\0') {
1736             throw new ScannerException("while scanning a block scalar", startMark,
1737                     "expected a comment or a line break, but found " + ch, reader.getMark());
1738         }
1739         return lineBreak;
1740     }
1741 
1742     /**
1743      * Scans for the indentation of a block scalar implicitly. This mechanism is
1744      * used only if the block did not explicitly state an indentation to be
1745      * used.
1746      *
1747      * @see <a href="http://www.yaml.org/spec/1.1/#id927035"></a>
1748      */
scanBlockScalarIndentation()1749     private Object[] scanBlockScalarIndentation() {
1750         // See the specification for details.
1751         StringBuilder chunks = new StringBuilder();
1752         int maxIndent = 0;
1753         Mark endMark = reader.getMark();
1754         // Look ahead some number of lines until the first non-blank character
1755         // occurs; the determined indentation will be the maximum number of
1756         // leading spaces on any of these lines.
1757         while (Constant.LINEBR.has(reader.peek(), " \r")) {
1758             if (reader.peek() != ' ') {
1759                 // If the character isn't a space, it must be some kind of
1760                 // line-break; scan the line break and track it.
1761                 chunks.append(scanLineBreak());
1762                 endMark = reader.getMark();
1763             } else {
1764                 // If the character is a space, move forward to the next
1765                 // character; if we surpass our previous maximum for indent
1766                 // level, update that too.
1767                 reader.forward();
1768                 if (this.reader.getColumn() > maxIndent) {
1769                     maxIndent = reader.getColumn();
1770                 }
1771             }
1772         }
1773         // Pass several results back together.
1774         return new Object[] { chunks.toString(), maxIndent, endMark };
1775     }
1776 
scanBlockScalarBreaks(int indent)1777     private Object[] scanBlockScalarBreaks(int indent) {
1778         // See the specification for details.
1779         StringBuilder chunks = new StringBuilder();
1780         Mark endMark = reader.getMark();
1781         int ff = 0;
1782         int col = this.reader.getColumn();
1783         // Scan for up to the expected indentation-level of spaces, then move
1784         // forward past that amount.
1785         while (col < indent && reader.peek(ff) == ' ') {
1786             ff++;
1787             col++;
1788         }
1789         if (ff > 0) {
1790             reader.forward(ff);
1791         }
1792         // Consume one or more line breaks followed by any amount of spaces,
1793         // until we find something that isn't a line-break.
1794         String lineBreak = null;
1795         while ((lineBreak = scanLineBreak()).length() != 0) {
1796             chunks.append(lineBreak);
1797             endMark = reader.getMark();
1798             // Scan past up to (indent) spaces on the next line, then forward
1799             // past them.
1800             ff = 0;
1801             col = this.reader.getColumn();
1802             while (col < indent && reader.peek(ff) == ' ') {
1803                 ff++;
1804                 col++;
1805             }
1806             if (ff > 0) {
1807                 reader.forward(ff);
1808             }
1809         }
1810         // Return both the assembled intervening string and the end-mark.
1811         return new Object[] { chunks.toString(), endMark };
1812     }
1813 
1814     /**
1815      * Scan a flow-style scalar. Flow scalars are presented in one of two forms;
1816      * first, a flow scalar may be a double-quoted string; second, a flow scalar
1817      * may be a single-quoted string.
1818      *
1819      * @see <a href="http://www.yaml.org/spec/1.1/#flow"></a> style/syntax
1820      *
1821      *      <pre>
1822      * See the specification for details.
1823      * Note that we loose indentation rules for quoted scalars. Quoted
1824      * scalars don't need to adhere indentation because &quot; and ' clearly
1825      * mark the beginning and the end of them. Therefore we are less
1826      * restrictive then the specification requires. We only need to check
1827      * that document separators are not included in scalars.
1828      * </pre>
1829      */
scanFlowScalar(char style)1830     private Token scanFlowScalar(char style) {
1831         boolean _double;
1832         // The style will be either single- or double-quoted; we determine this
1833         // by the first character in the entry (supplied)
1834         if (style == '"') {
1835             _double = true;
1836         } else {
1837             _double = false;
1838         }
1839         StringBuilder chunks = new StringBuilder();
1840         Mark startMark = reader.getMark();
1841         char quote = reader.peek();
1842         reader.forward();
1843         chunks.append(scanFlowScalarNonSpaces(_double, startMark));
1844         while (reader.peek() != quote) {
1845             chunks.append(scanFlowScalarSpaces(startMark));
1846             chunks.append(scanFlowScalarNonSpaces(_double, startMark));
1847         }
1848         reader.forward();
1849         Mark endMark = reader.getMark();
1850         return new ScalarToken(chunks.toString(), false, startMark, endMark, style);
1851     }
1852 
1853     /**
1854      * Scan some number of flow-scalar non-space characters.
1855      */
scanFlowScalarNonSpaces(boolean doubleQuoted, Mark startMark)1856     private String scanFlowScalarNonSpaces(boolean doubleQuoted, Mark startMark) {
1857         // See the specification for details.
1858         StringBuilder chunks = new StringBuilder();
1859         while (true) {
1860             // Scan through any number of characters which are not: NUL, blank,
1861             // tabs, line breaks, single-quotes, double-quotes, or backslashes.
1862             int length = 0;
1863             while (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length), "\'\"\\")) {
1864                 length++;
1865             }
1866             if (length != 0) {
1867                 chunks.append(reader.prefixForward(length));
1868             }
1869             // Depending on our quoting-type, the characters ', " and \ have
1870             // differing meanings.
1871             char ch = reader.peek();
1872             if (!doubleQuoted && ch == '\'' && reader.peek(1) == '\'') {
1873                 chunks.append("'");
1874                 reader.forward(2);
1875             } else if ((doubleQuoted && ch == '\'') || (!doubleQuoted && "\"\\".indexOf(ch) != -1)) {
1876                 chunks.append(ch);
1877                 reader.forward();
1878             } else if (doubleQuoted && ch == '\\') {
1879                 reader.forward();
1880                 ch = reader.peek();
1881                 if (ESCAPE_REPLACEMENTS.containsKey(Character.valueOf(ch))) {
1882                     // The character is one of the single-replacement
1883                     // types; these are replaced with a literal character
1884                     // from the mapping.
1885                     chunks.append(ESCAPE_REPLACEMENTS.get(Character.valueOf(ch)));
1886                     reader.forward();
1887                 } else if (ESCAPE_CODES.containsKey(Character.valueOf(ch))) {
1888                     // The character is a multi-digit escape sequence, with
1889                     // length defined by the value in the ESCAPE_CODES map.
1890                     length = ESCAPE_CODES.get(Character.valueOf(ch)).intValue();
1891                     reader.forward();
1892                     String hex = reader.prefix(length);
1893                     if (NOT_HEXA.matcher(hex).find()) {
1894                         throw new ScannerException("while scanning a double-quoted scalar",
1895                                 startMark, "expected escape sequence of " + length
1896                                         + " hexadecimal numbers, but found: " + hex,
1897                                 reader.getMark());
1898                     }
1899                     int decimal = Integer.parseInt(hex, 16);
1900                     String unicode = new String(Character.toChars(decimal));
1901                     chunks.append(unicode);
1902                     reader.forward(length);
1903                 } else if (scanLineBreak().length() != 0) {
1904                     chunks.append(scanFlowScalarBreaks(startMark));
1905                 } else {
1906                     throw new ScannerException("while scanning a double-quoted scalar", startMark,
1907                             "found unknown escape character " + ch + "(" + ((int) ch) + ")",
1908                             reader.getMark());
1909                 }
1910             } else {
1911                 return chunks.toString();
1912             }
1913         }
1914     }
1915 
scanFlowScalarSpaces(Mark startMark)1916     private String scanFlowScalarSpaces(Mark startMark) {
1917         // See the specification for details.
1918         StringBuilder chunks = new StringBuilder();
1919         int length = 0;
1920         // Scan through any number of whitespace (space, tab) characters,
1921         // consuming them.
1922         while (" \t".indexOf(reader.peek(length)) != -1) {
1923             length++;
1924         }
1925         String whitespaces = reader.prefixForward(length);
1926         char ch = reader.peek();
1927         if (ch == '\0') {
1928             // A flow scalar cannot end with an end-of-stream
1929             throw new ScannerException("while scanning a quoted scalar", startMark,
1930                     "found unexpected end of stream", reader.getMark());
1931         }
1932         // If we encounter a line break, scan it into our assembled string...
1933         String lineBreak = scanLineBreak();
1934         if (lineBreak.length() != 0) {
1935             String breaks = scanFlowScalarBreaks(startMark);
1936             if (!"\n".equals(lineBreak)) {
1937                 chunks.append(lineBreak);
1938             } else if (breaks.length() == 0) {
1939                 chunks.append(" ");
1940             }
1941             chunks.append(breaks);
1942         } else {
1943             chunks.append(whitespaces);
1944         }
1945         return chunks.toString();
1946     }
1947 
scanFlowScalarBreaks(Mark startMark)1948     private String scanFlowScalarBreaks(Mark startMark) {
1949         // See the specification for details.
1950         StringBuilder chunks = new StringBuilder();
1951         while (true) {
1952             // Instead of checking indentation, we check for document
1953             // separators.
1954             String prefix = reader.prefix(3);
1955             if (("---".equals(prefix) || "...".equals(prefix))
1956                     && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1957                 throw new ScannerException("while scanning a quoted scalar", startMark,
1958                         "found unexpected document separator", reader.getMark());
1959             }
1960             // Scan past any number of spaces and tabs, ignoring them
1961             while (" \t".indexOf(reader.peek()) != -1) {
1962                 reader.forward();
1963             }
1964             // If we stopped at a line break, add that; otherwise, return the
1965             // assembled set of scalar breaks.
1966             String lineBreak = scanLineBreak();
1967             if (lineBreak.length() != 0) {
1968                 chunks.append(lineBreak);
1969             } else {
1970                 return chunks.toString();
1971             }
1972         }
1973     }
1974 
1975     /**
1976      * Scan a plain scalar.
1977      *
1978      * <pre>
1979      * See the specification for details.
1980      * We add an additional restriction for the flow context:
1981      *   plain scalars in the flow context cannot contain ',', ':' and '?'.
1982      * We also keep track of the `allow_simple_key` flag here.
1983      * Indentation rules are loosed for the flow context.
1984      * </pre>
1985      */
scanPlain()1986     private Token scanPlain() {
1987         StringBuilder chunks = new StringBuilder();
1988         Mark startMark = reader.getMark();
1989         Mark endMark = startMark;
1990         int indent = this.indent + 1;
1991         String spaces = "";
1992         while (true) {
1993             char ch;
1994             int length = 0;
1995             // A comment indicates the end of the scalar.
1996             if (reader.peek() == '#') {
1997                 break;
1998             }
1999             while (true) {
2000                 ch = reader.peek(length);
2001                 if (Constant.NULL_BL_T_LINEBR.has(ch)
2002                         || (this.flowLevel == 0 && ch == ':' && Constant.NULL_BL_T_LINEBR
2003                                 .has(reader.peek(length + 1)))
2004                         || (this.flowLevel != 0 && ",:?[]{}".indexOf(ch) != -1)) {
2005                     break;
2006                 }
2007                 length++;
2008             }
2009             // It's not clear what we should do with ':' in the flow context.
2010             if (this.flowLevel != 0 && ch == ':'
2011                     && Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length + 1), ",[]{}")) {
2012                 reader.forward(length);
2013                 throw new ScannerException("while scanning a plain scalar", startMark,
2014                         "found unexpected ':'", reader.getMark(),
2015                         "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.");
2016             }
2017             if (length == 0) {
2018                 break;
2019             }
2020             this.allowSimpleKey = false;
2021             chunks.append(spaces);
2022             chunks.append(reader.prefixForward(length));
2023             endMark = reader.getMark();
2024             spaces = scanPlainSpaces();
2025             // System.out.printf("spaces[%s]\n", spaces);
2026             if (spaces.length() == 0 || reader.peek() == '#'
2027                     || (this.flowLevel == 0 && this.reader.getColumn() < indent)) {
2028                 break;
2029             }
2030         }
2031         return new ScalarToken(chunks.toString(), startMark, endMark, true);
2032     }
2033 
2034     /**
2035      * See the specification for details. SnakeYAML and libyaml allow tabs
2036      * inside plain scalar
2037      */
scanPlainSpaces()2038     private String scanPlainSpaces() {
2039         int length = 0;
2040         while (reader.peek(length) == ' ' || reader.peek(length) == '\t') {
2041             length++;
2042         }
2043         String whitespaces = reader.prefixForward(length);
2044         String lineBreak = scanLineBreak();
2045         if (lineBreak.length() != 0) {
2046             this.allowSimpleKey = true;
2047             String prefix = reader.prefix(3);
2048             if ("---".equals(prefix) || "...".equals(prefix)
2049                     && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
2050                 return "";
2051             }
2052             StringBuilder breaks = new StringBuilder();
2053             while (true) {
2054                 if (reader.peek() == ' ') {
2055                     reader.forward();
2056                 } else {
2057                     String lb = scanLineBreak();
2058                     if (lb.length() != 0) {
2059                         breaks.append(lb);
2060                         prefix = reader.prefix(3);
2061                         if ("---".equals(prefix) || "...".equals(prefix)
2062                                 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
2063                             return "";
2064                         }
2065                     } else {
2066                         break;
2067                     }
2068                 }
2069             }
2070             if (!"\n".equals(lineBreak)) {
2071                 return lineBreak + breaks;
2072             } else if (breaks.length() == 0) {
2073                 return " ";
2074             }
2075             return breaks.toString();
2076         }
2077         return whitespaces;
2078     }
2079 
2080     /**
2081      * <p>
2082      * Scan a Tag handle. A Tag handle takes one of three forms:
2083      *
2084      * <pre>
2085      * "!" (c-primary-tag-handle)
2086      * "!!" (ns-secondary-tag-handle)
2087      * "!(name)!" (c-named-tag-handle)
2088      * </pre>
2089      *
2090      * Where (name) must be formatted as an ns-word-char.
2091      * </p>
2092      *
2093      * @see <a href="http://www.yaml.org/spec/1.1/#c-tag-handle"></a>
2094      * @see <a href="http://www.yaml.org/spec/1.1/#ns-word-char"></a>
2095      *
2096      *      <pre>
2097      * See the specification for details.
2098      * For some strange reasons, the specification does not allow '_' in
2099      * tag handles. I have allowed it anyway.
2100      * </pre>
2101      */
scanTagHandle(String name, Mark startMark)2102     private String scanTagHandle(String name, Mark startMark) {
2103         char ch = reader.peek();
2104         if (ch != '!') {
2105             throw new ScannerException("while scanning a " + name, startMark,
2106                     "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2107         }
2108         // Look for the next '!' in the stream, stopping if we hit a
2109         // non-word-character. If the first character is a space, then the
2110         // tag-handle is a c-primary-tag-handle ('!').
2111         int length = 1;
2112         ch = reader.peek(length);
2113         if (ch != ' ') {
2114             // Scan through 0+ alphabetic characters.
2115             // FIXME According to the specification, these should be
2116             // ns-word-char only, which prohibits '_'. This might be a
2117             // candidate for a configuration option.
2118             while (Constant.ALPHA.has(ch)) {
2119                 length++;
2120                 ch = reader.peek(length);
2121             }
2122             // Found the next non-word-char. If this is not a space and not an
2123             // '!', then this is an error, as the tag-handle was specified as:
2124             // !(name) or similar; the trailing '!' is missing.
2125             if (ch != '!') {
2126                 reader.forward(length);
2127                 throw new ScannerException("while scanning a " + name, startMark,
2128                         "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2129             }
2130             length++;
2131         }
2132         String value = reader.prefixForward(length);
2133         return value;
2134     }
2135 
2136     /**
2137      * <p>
2138      * Scan a Tag URI. This scanning is valid for both local and global tag
2139      * directives, because both appear to be valid URIs as far as scanning is
2140      * concerned. The difference may be distinguished later, in parsing. This
2141      * method will scan for ns-uri-char*, which covers both cases.
2142      * </p>
2143      *
2144      * <p>
2145      * This method performs no verification that the scanned URI conforms to any
2146      * particular kind of URI specification.
2147      * </p>
2148      *
2149      * @see <a href="http://www.yaml.org/spec/1.1/#ns-uri-char"></a>
2150      */
scanTagUri(String name, Mark startMark)2151     private String scanTagUri(String name, Mark startMark) {
2152         // See the specification for details.
2153         // Note: we do not check if URI is well-formed.
2154         StringBuilder chunks = new StringBuilder();
2155         // Scan through accepted URI characters, which includes the standard
2156         // URI characters, plus the start-escape character ('%'). When we get
2157         // to a start-escape, scan the escaped sequence, then return.
2158         int length = 0;
2159         char ch = reader.peek(length);
2160         while (Constant.URI_CHARS.has(ch)) {
2161             if (ch == '%') {
2162                 chunks.append(reader.prefixForward(length));
2163                 length = 0;
2164                 chunks.append(scanUriEscapes(name, startMark));
2165             } else {
2166                 length++;
2167             }
2168             ch = reader.peek(length);
2169         }
2170         // Consume the last "chunk", which would not otherwise be consumed by
2171         // the loop above.
2172         if (length != 0) {
2173             chunks.append(reader.prefixForward(length));
2174             length = 0;
2175         }
2176         if (chunks.length() == 0) {
2177             // If no URI was found, an error has occurred.
2178             throw new ScannerException("while scanning a " + name, startMark,
2179                     "expected URI, but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2180         }
2181         return chunks.toString();
2182     }
2183 
2184     /**
2185      * <p>
2186      * Scan a sequence of %-escaped URI escape codes and convert them into a
2187      * String representing the unescaped values.
2188      * </p>
2189      *
2190      * FIXME This method fails for more than 256 bytes' worth of URI-encoded
2191      * characters in a row. Is this possible? Is this a use-case?
2192      *
2193      * @see <a href="http://www.ietf.org/rfc/rfc2396.txt"></a>, section 2.4, Escaped Encoding.
2194      */
scanUriEscapes(String name, Mark startMark)2195     private String scanUriEscapes(String name, Mark startMark) {
2196         // First, look ahead to see how many URI-escaped characters we should
2197         // expect, so we can use the correct buffer size.
2198         int length = 1;
2199         while (reader.peek(length * 3) == '%') {
2200             length++;
2201         }
2202         // See the specification for details.
2203         // URIs containing 16 and 32 bit Unicode characters are
2204         // encoded in UTF-8, and then each octet is written as a
2205         // separate character.
2206         Mark beginningMark = reader.getMark();
2207         ByteBuffer buff = ByteBuffer.allocate(length);
2208         while (reader.peek() == '%') {
2209             reader.forward();
2210             try {
2211                 byte code = (byte) Integer.parseInt(reader.prefix(2), 16);
2212                 buff.put(code);
2213             } catch (NumberFormatException nfe) {
2214                 throw new ScannerException("while scanning a " + name, startMark,
2215                         "expected URI escape sequence of 2 hexadecimal numbers, but found "
2216                                 + reader.peek() + "(" + ((int) reader.peek()) + ") and "
2217                                 + reader.peek(1) + "(" + ((int) reader.peek(1)) + ")",
2218                         reader.getMark());
2219             }
2220             reader.forward(2);
2221         }
2222         buff.flip();
2223         try {
2224             return UriEncoder.decode(buff);
2225         } catch (CharacterCodingException e) {
2226             throw new ScannerException("while scanning a " + name, startMark,
2227                     "expected URI in UTF-8: " + e.getMessage(), beginningMark);
2228         }
2229     }
2230 
2231     /**
2232      * Scan a line break, transforming:
2233      *
2234      * <pre>
2235      * '\r\n' : '\n'
2236      * '\r' : '\n'
2237      * '\n' : '\n'
2238      * '\x85' : '\n'
2239      * default : ''
2240      * </pre>
2241      */
scanLineBreak()2242     private String scanLineBreak() {
2243         // Transforms:
2244         // '\r\n' : '\n'
2245         // '\r' : '\n'
2246         // '\n' : '\n'
2247         // '\x85' : '\n'
2248         // default : ''
2249         char ch = reader.peek();
2250         if (ch == '\r' || ch == '\n' || ch == '\u0085') {
2251             if (ch == '\r' && '\n' == reader.peek(1)) {
2252                 reader.forward(2);
2253             } else {
2254                 reader.forward();
2255             }
2256             return "\n";
2257         } else if (ch == '\u2028' || ch == '\u2029') {
2258             reader.forward();
2259             return String.valueOf(ch);
2260         }
2261         return "";
2262     }
2263 
2264     /**
2265      * Chomping the tail may have 3 values - yes, no, not defined.
2266      */
2267     private static class Chomping {
2268         private final Boolean value;
2269         private final int increment;
2270 
Chomping(Boolean value, int increment)2271         public Chomping(Boolean value, int increment) {
2272             this.value = value;
2273             this.increment = increment;
2274         }
2275 
chompTailIsNotFalse()2276         public boolean chompTailIsNotFalse() {
2277             return value == null || value;
2278         }
2279 
chompTailIsTrue()2280         public boolean chompTailIsTrue() {
2281             return value != null && value;
2282         }
2283 
getIncrement()2284         public int getIncrement() {
2285             return increment;
2286         }
2287     }
2288 }
2289