• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.parser;
2 
3 import org.jsoup.internal.StringUtil;
4 import org.jsoup.helper.Validate;
5 
6 /**
7  * A character queue with parsing helpers.
8  *
9  * @author Jonathan Hedley
10  */
11 public class TokenQueue {
12     private String queue;
13     private int pos = 0;
14 
15     private static final char ESC = '\\'; // escape char for chomp balanced.
16 
17     /**
18      Create a new TokenQueue.
19      @param data string of data to back queue.
20      */
TokenQueue(String data)21     public TokenQueue(String data) {
22         Validate.notNull(data);
23         queue = data;
24     }
25 
26     /**
27      * Is the queue empty?
28      * @return true if no data left in queue.
29      */
isEmpty()30     public boolean isEmpty() {
31         return remainingLength() == 0;
32     }
33 
remainingLength()34     private int remainingLength() {
35         return queue.length() - pos;
36     }
37 
38     /**
39      Add a string to the start of the queue.
40      @param seq string to add.
41      */
addFirst(String seq)42     public void addFirst(String seq) {
43         // not very performant, but an edge case
44         queue = seq + queue.substring(pos);
45         pos = 0;
46     }
47 
48     /**
49      * Tests if the next characters on the queue match the sequence. Case insensitive.
50      * @param seq String to check queue for.
51      * @return true if the next characters match.
52      */
matches(String seq)53     public boolean matches(String seq) {
54         return queue.regionMatches(true, pos, seq, 0, seq.length());
55     }
56 
57     /**
58      Tests if the next characters match any of the sequences. Case insensitive.
59      @param seq list of strings to case insensitively check for
60      @return true of any matched, false if none did
61      */
matchesAny(String... seq)62     public boolean matchesAny(String... seq) {
63         for (String s : seq) {
64             if (matches(s))
65                 return true;
66         }
67         return false;
68     }
69 
matchesAny(char... seq)70     public boolean matchesAny(char... seq) {
71         if (isEmpty())
72             return false;
73 
74         for (char c: seq) {
75             if (queue.charAt(pos) == c)
76                 return true;
77         }
78         return false;
79     }
80 
81     /**
82      * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the
83      * queue.
84      * @param seq String to search for, and if found, remove from queue.
85      * @return true if found and removed, false if not found.
86      */
matchChomp(String seq)87     public boolean matchChomp(String seq) {
88         if (matches(seq)) {
89             pos += seq.length();
90             return true;
91         } else {
92             return false;
93         }
94     }
95 
96     /**
97      Tests if queue starts with a whitespace character.
98      @return if starts with whitespace
99      */
matchesWhitespace()100     public boolean matchesWhitespace() {
101         return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos));
102     }
103 
104     /**
105      Test if the queue matches a word character (letter or digit).
106      @return if matches a word character
107      */
matchesWord()108     public boolean matchesWord() {
109         return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos));
110     }
111 
112     /**
113      * Drops the next character off the queue.
114      */
advance()115     public void advance() {
116         if (!isEmpty()) pos++;
117     }
118 
119     /**
120      * Consume one character off queue.
121      * @return first character on queue.
122      */
consume()123     public char consume() {
124         return queue.charAt(pos++);
125     }
126 
127     /**
128      * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will
129      * throw an illegal state exception -- but you should be running match() against that condition.
130      <p>
131      Case insensitive.
132      * @param seq sequence to remove from head of queue.
133      */
consume(String seq)134     public void consume(String seq) {
135         if (!matches(seq))
136             throw new IllegalStateException("Queue did not match expected sequence");
137         int len = seq.length();
138         if (len > remainingLength())
139             throw new IllegalStateException("Queue not long enough to consume sequence");
140 
141         pos += len;
142     }
143 
144     /**
145      * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
146      * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b>
147      * @return The matched data consumed from queue.
148      */
consumeTo(String seq)149     public String consumeTo(String seq) {
150         int offset = queue.indexOf(seq, pos);
151         if (offset != -1) {
152             String consumed = queue.substring(pos, offset);
153             pos += consumed.length();
154             return consumed;
155         } else {
156             return remainder();
157         }
158     }
159 
consumeToIgnoreCase(String seq)160     public String consumeToIgnoreCase(String seq) {
161         int start = pos;
162         String first = seq.substring(0, 1);
163         boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of
164         while (!isEmpty()) {
165             if (matches(seq))
166                 break;
167 
168             if (canScan) {
169                 int skip = queue.indexOf(first, pos) - pos;
170                 if (skip == 0) // this char is the skip char, but not match, so force advance of pos
171                     pos++;
172                 else if (skip < 0) // no chance of finding, grab to end
173                     pos = queue.length();
174                 else
175                     pos += skip;
176             }
177             else
178                 pos++;
179         }
180 
181         return queue.substring(start, pos);
182     }
183 
184     /**
185      Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
186      @param seq any number of terminators to consume to. <b>Case insensitive.</b>
187      @return consumed string
188      */
189     // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this
190     // is a case sensitive time...
consumeToAny(String... seq)191     public String consumeToAny(String... seq) {
192         int start = pos;
193         while (!isEmpty() && !matchesAny(seq)) {
194             pos++;
195         }
196 
197         return queue.substring(start, pos);
198     }
199 
200     /**
201      * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it).
202      * <p>
203      * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go
204      * isEmpty() == true).
205      * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b>
206      * @return Data matched from queue.
207      */
chompTo(String seq)208     public String chompTo(String seq) {
209         String data = consumeTo(seq);
210         matchChomp(seq);
211         return data;
212     }
213 
chompToIgnoreCase(String seq)214     public String chompToIgnoreCase(String seq) {
215         String data = consumeToIgnoreCase(seq); // case insensitive scan
216         matchChomp(seq);
217         return data;
218     }
219 
220     /**
221      * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
222      * and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). Those escapes will be left
223      * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for
224      * contains text strings; use unescape for that.
225      * @param open opener
226      * @param close closer
227      * @return data matched from the queue
228      */
chompBalanced(char open, char close)229     public String chompBalanced(char open, char close) {
230         int start = -1;
231         int end = -1;
232         int depth = 0;
233         char last = 0;
234         boolean inSingleQuote = false;
235         boolean inDoubleQuote = false;
236         boolean inRegexQE = false; // regex \Q .. \E escapes from Pattern.quote()
237 
238         do {
239             if (isEmpty()) break;
240             char c = consume();
241             if (last != ESC) {
242                 if (c == '\'' && c != open && !inDoubleQuote)
243                     inSingleQuote = !inSingleQuote;
244                 else if (c == '"' && c != open && !inSingleQuote)
245                     inDoubleQuote = !inDoubleQuote;
246                 if (inSingleQuote || inDoubleQuote || inRegexQE){
247                     last = c;
248                     continue;
249                 }
250 
251                 if (c == open) {
252                     depth++;
253                     if (start == -1)
254                         start = pos;
255                 }
256                 else if (c == close)
257                     depth--;
258             } else if (c == 'Q') {
259                 inRegexQE = true;
260             } else if (c == 'E') {
261                 inRegexQE = false;
262             }
263 
264             if (depth > 0 && last != 0)
265                 end = pos; // don't include the outer match pair in the return
266             last = c;
267         } while (depth > 0);
268         final String out = (end >= 0) ? queue.substring(start, end) : "";
269         if (depth > 0) {// ran out of queue before seeing enough )
270             Validate.fail("Did not find balanced marker at '" + out + "'");
271         }
272         return out;
273     }
274 
275     /**
276      * Unescape a \ escaped string.
277      * @param in backslash escaped string
278      * @return unescaped string
279      */
unescape(String in)280     public static String unescape(String in) {
281         StringBuilder out = StringUtil.borrowBuilder();
282         char last = 0;
283         for (char c : in.toCharArray()) {
284             if (c == ESC) {
285                 if (last == ESC) {
286                     out.append(c);
287                     c = 0;
288                 }
289             }
290             else
291                 out.append(c);
292             last = c;
293         }
294         return StringUtil.releaseBuilder(out);
295     }
296 
297     /*
298     Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
299     valid in a selector.
300      */
escapeCssIdentifier(String in)301     public static String escapeCssIdentifier(String in) {
302         StringBuilder out = StringUtil.borrowBuilder();
303         TokenQueue q = new TokenQueue(in);
304         while (!q.isEmpty()) {
305             if (q.matchesCssIdentifier(ElementSelectorChars)) {
306                 out.append(q.consume());
307             } else {
308                 out.append(ESC).append(q.consume());
309             }
310         }
311         return StringUtil.releaseBuilder(out);
312     }
313 
314     /**
315      * Pulls the next run of whitespace characters of the queue.
316      * @return Whether consuming whitespace or not
317      */
consumeWhitespace()318     public boolean consumeWhitespace() {
319         boolean seen = false;
320         while (matchesWhitespace()) {
321             pos++;
322             seen = true;
323         }
324         return seen;
325     }
326 
327     /**
328      * Retrieves the next run of word type (letter or digit) off the queue.
329      * @return String of word characters from queue, or empty string if none.
330      */
consumeWord()331     public String consumeWord() {
332         int start = pos;
333         while (matchesWord())
334             pos++;
335         return queue.substring(start, pos);
336     }
337 
338 
339     /**
340      * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
341      *
342      * @return tag name
343      */
consumeElementSelector()344     public String consumeElementSelector() {
345         return consumeEscapedCssIdentifier(ElementSelectorChars);
346     }
347     private static final String[] ElementSelectorChars = {"*|", "|", "_", "-"};
348 
349     /**
350      Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
351      http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
352      @return identifier
353      */
consumeCssIdentifier()354     public String consumeCssIdentifier() {
355         return consumeEscapedCssIdentifier(CssIdentifierChars);
356     }
357     private static final String[] CssIdentifierChars = {"-", "_"};
358 
359 
consumeEscapedCssIdentifier(String... matches)360     private String consumeEscapedCssIdentifier(String... matches) {
361         int start = pos;
362         boolean escaped = false;
363         while (!isEmpty()) {
364             if (queue.charAt(pos) == ESC && remainingLength() >1 ) {
365                 escaped = true;
366                 pos+=2; // skip the escape and the escaped
367             } else if (matchesCssIdentifier(matches)) {
368                 pos++;
369             } else {
370                 break;
371             }
372         }
373 
374         String consumed = queue.substring(start, pos);
375         return escaped ? unescape(consumed) : consumed;
376     }
377 
matchesCssIdentifier(String... matches)378     private boolean matchesCssIdentifier(String... matches) {
379         return matchesWord() || matchesAny(matches);
380     }
381 
382     /**
383      Consume and return whatever is left on the queue.
384      @return remained of queue.
385      */
remainder()386     public String remainder() {
387         final String remainder = queue.substring(pos);
388         pos = queue.length();
389         return remainder;
390     }
391 
392     @Override
toString()393     public String toString() {
394         return queue.substring(pos);
395     }
396 }
397