• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2013, Mike Samuel
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions
6 // are met:
7 //
8 // Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // Neither the name of the OWASP nor the names of its contributors may
14 // be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 // POSSIBILITY OF SUCH DAMAGE.
28 
29 package org.owasp.html;
30 
31 import java.util.ArrayList;
32 import java.util.Iterator;
33 import java.util.List;
34 import java.util.NoSuchElementException;
35 
36 import javax.annotation.Nullable;
37 
38 import com.google.common.collect.ImmutableMap;
39 
40 /**
41  * Given a string of CSS, produces a string of normalized CSS with certain
42  * useful properties detailed below.
43  * <ul>
44  *   <li>All runs of white-space and comment tokens (including CDO and CDC)
45  *     have been replaced with a single space character.</li>
46  *   <li>All strings are quoted and escapes are escaped according to the
47  *     following scheme:
48  *     <table>
49  *       <tr><td>NUL</td>            <td><code>\0</code></tr>
50  *       <tr><td>line feed</td>      <td><code>\a</code></tr>
51  *       <tr><td>vertical feed</td>  <td><code>\c</code></tr>
52  *       <tr><td>carriage return</td><td><code>\d</code></tr>
53  *       <tr><td>double quote</td>   <td><code>\22</code></tr>
54  *       <tr><td>ampersand &amp;</td><td><code>\26</code></tr>
55  *       <tr><td>single quote</td>   <td><code>\27</code></tr>
56  *       <tr><td>left-angle &lt;</td><td><code>\3c</code></tr>
57  *       <tr><td>rt-angle &gt;</td>  <td><code>\3e</code></tr>
58  *       <tr><td>back slash</td>     <td><code>\\</code></tr>
59  *       <tr><td>all others</td>     <td>raw</td></tr>
60  *     </table>
61  *   </li>
62  *   <li>All <code>url(&hellip;)</code> tokens are quoted.
63  *   <li>All keywords, identifiers, and hex literals are lower-case and have
64  *       embedded escape sequences decoded, except that .</li>
65  *   <li>All brackets nest properly.</li>
66  *   <li>Does not contain any case-insensitive variant of the sequences
67  *       {@code <!--}, {@code -->}, {@code <![CDATA[}, {@code ]]>}, or
68  *       {@code </style}.</li>
69  *   <li>All delimiters that can start longer tokens are followed by a space.
70  * </ul>
71  */
72 final class CssTokens implements Iterable<String> {
73 
74   public final String normalizedCss;
75   public final Brackets brackets;
76   private final int[] tokenBreaks;
77   private final TokenType[] tokenTypes;
78 
start()79   public TokenIterator start() {
80     return new TokenIterator(tokenTypes.length);
81   }
82 
iterator()83   public TokenIterator iterator() { return start(); }
84 
lex(String css)85   public static CssTokens lex(String css) {
86     Lexer lexer = new Lexer(css);
87     lexer.lex();
88     return lexer.build();
89   }
90 
91   /** A cursor into a list of tokens. */
92   public final class TokenIterator implements Iterator<String> {
93     private int tokenIndex = 0;
94     private final int limit;
95 
TokenIterator(int limit)96     TokenIterator(int limit) {
97       this.limit = limit;
98     }
99 
hasNext()100     public boolean hasNext() {
101       return hasToken();
102     }
103 
next()104     public String next() {
105       String token = token();
106       advance();
107       return token;
108     }
109 
spliceToEnd()110     public @Nullable TokenIterator spliceToEnd() {
111       if (!hasNext()) { throw new NoSuchElementException(); }
112       int end = brackets.partner(tokenIndex);
113       if (end < 0) {
114         return null;
115       }
116       TokenIterator between = new TokenIterator(end);
117       between.tokenIndex = tokenIndex + 1;
118       tokenIndex = end + 1;
119       return between;
120     }
121 
tokenIndex()122     public int tokenIndex() {
123       return tokenIndex;
124     }
125 
startOffset()126     public int startOffset() {
127       return tokenBreaks[tokenIndex];
128     }
129 
endOffset()130     public int endOffset() {
131       return tokenBreaks[tokenIndex+1];
132     }
133 
token()134     public String token() {
135       return normalizedCss.substring(startOffset(), endOffset());
136     }
137 
hasToken()138     public boolean hasToken() {
139       return tokenIndex < limit;
140     }
141 
hasTokenAfterSpace()142     public boolean hasTokenAfterSpace() {
143       while (hasToken()) {
144         if (type() != TokenType.WHITESPACE) { return true; }
145         advance();
146       }
147       return false;
148     }
149 
150     /** The type of the current token. */
type()151     public TokenType type() {
152       return tokenTypes[tokenIndex];
153     }
154 
seek(int tokenIndex)155     public void seek(int tokenIndex) {
156       this.tokenIndex = tokenIndex;
157     }
158 
advance()159     public void advance() {
160       if (!hasToken()) { throw new NoSuchElementException(); }
161       ++tokenIndex;
162     }
163 
backup()164     public void backup() {
165       if (tokenIndex == 0) { throw new NoSuchElementException(); }
166       --tokenIndex;
167     }
168 
remove()169     public void remove() throws UnsupportedOperationException {
170       throw new UnsupportedOperationException();
171     }
172   }
173 
CssTokens( String normalizedCss, Brackets brackets, int[] tokenBreaks, TokenType[] tokenTypes)174   private CssTokens(
175       String normalizedCss, Brackets brackets, int[] tokenBreaks,
176       TokenType[] tokenTypes) {
177     this.normalizedCss = normalizedCss;
178     this.brackets = brackets;
179     this.tokenBreaks = tokenBreaks;
180     this.tokenTypes = tokenTypes;
181   }
182 
183   public enum TokenType {
184     /** An identifier. */
185     IDENT,
186     /** An identifier prefixed with a period. */
187     DOT_IDENT,
188     /** A function name and opening bracket. */
189     FUNCTION,
190     /** An {@code @<identifier>} directive token. */
191     AT,
192     /** A hash token that contains non-hex characters. */
193     HASH_ID,
194     /** A hash token that could be a color literal. */
195     HASH_UNRESTRICTED,
196     /** A quoted string. */
197     STRING,
198     /** A URL of the form <code>url("...")</code>. */
199     URL,
200     /** A single character. */
201     DELIM,
202     /** A scalar numeric value. */
203     NUMBER,
204     /** A percentage. */
205     PERCENTAGE,
206     /** A numeric value with a unit suffix. */
207     DIMENSION,
208     /** A numeric value with an unknown unit suffix. */
209     BAD_DIMENSION,
210     /** {@code U+<hex-or-qmark>} */
211     UNICODE_RANGE,
212     /**
213      * include-match, dash-match, prefix-match, suffix-match, substring-match
214      */
215     MATCH,
216     /** {@code ||} */
217     COLUMN,
218     /** A run of white-space, comment, CDO, and CDC tokens. */
219     WHITESPACE,
220     /** {@code :} */
221     COLON,
222     /** {@code ;} */
223     SEMICOLON,
224     /** {@code ,} */
225     COMMA,
226     /** {@code [} */
227     LEFT_SQUARE,
228     /** {@code ]} */
229     RIGHT_SQUARE,
230     /** {@code (} */
231     LEFT_PAREN,
232     /** {@code )} */
233     RIGHT_PAREN,
234     /** <code>{</code> */
235     LEFT_CURLY,
236     /** <code>}</code> */
237     RIGHT_CURLY,
238     ;
239   }
240 
241   /**
242    * Maps tokens to their partners.  A close bracket token like {@code (} may
243    * have a partner token like {@code )} if properly nested, and vice-versa.
244    */
245   static final class Brackets {
246     /**
247      * For each token index, the index of the indexed token's partner or -1 if
248      * it has none.
249      */
250     private final int[] brackets;
251 
Brackets(int[] brackets)252     private Brackets(int[] brackets) {
253       this.brackets = brackets;
254     }
255 
256     /** The index of the partner token or -1 if none. */
partner(int tokenIndex)257     int partner(int tokenIndex) {
258       int bracketIndex = bracketIndexForToken(tokenIndex);
259       if (bracketIndex < 0) { return -1; }
260       return brackets[(bracketIndex << 1) + 1];
261     }
262 
bracketIndexForToken(int target)263     int bracketIndexForToken(int target) {
264       // Binary search by leftmost element of pair.
265       int left = 0;
266       int right = brackets.length >> 1;
267       while (left < right) {
268         int mid = left + ((right - left) >> 1);
269         int value = brackets[mid << 1];
270         if (value == target) { return mid; }
271         if (value < target) {
272           left = mid + 1;
273         } else {
274           right = mid;
275         }
276       }
277       return -1;
278     }
279   }
280 
281   private static final int[] ZERO_INTS = new int[0];
282 
283   private static final TokenType[] ZERO_TYPES = new TokenType[0];
284 
285   private static final Brackets EMPTY_BRACKETS = new Brackets(ZERO_INTS);
286 
287   private static final CssTokens EMPTY = new CssTokens(
288       "", EMPTY_BRACKETS, ZERO_INTS, ZERO_TYPES);
289 
290   /**
291    * Tokenizes according to section 4 of http://dev.w3.org/csswg/css-syntax/
292    */
293   private static final class Lexer {
294     private final String css;
295     private final StringBuilder sb;
296     private int pos = 0;
297     private final int cssLimit;
298 
299     private List<TokenType> tokenTypes = null;
300     private int[] tokenBreaks = new int[128];
301     private int tokenBreaksLimit = 0;
302 
303     /**
304      * For each bracket, 2 ints: the token index of the bracket, and the token
305      * index of its partner.
306      * The array is sorted by the first int.
307      * The second int is -1 when the bracket has not yet been closed.
308      */
309     private int[] brackets = ZERO_INTS;
310     /**
311      * The number of elements in {@link #brackets} that are valid.
312      * {@code brackets[bracketsLimit:]} is zeroed space that the list can grow
313      * into.
314      */
315     private int bracketsLimit = 0;
316     /**
317      * For each bracket that has not been closed, 2 ints:
318      * its index in {@link #brackets} and the character of its close bracket
319      * as an int.
320      * This is a bracket stack so the array is sorted by the first int.
321      */
322     private int[] open = ZERO_INTS;
323     /**
324      * The number of elements in {@link #open} that are valid.
325      * {@code open[openLimit:]} is garbage space that the stack can grow into.
326      */
327     private int openLimit = 0;
328 
Lexer(String css)329     Lexer(String css) {
330       this.css = css;
331       this.sb = new StringBuilder();
332       this.cssLimit = css.length();
333     }
334 
openBracket(char bracketChar)335     TokenType openBracket(char bracketChar) {
336       char close;
337       TokenType type;
338       switch (bracketChar) {
339         case '(': close = ')'; type = TokenType.LEFT_PAREN;  break;
340         case '[': close = ']'; type = TokenType.LEFT_SQUARE; break;
341         case '{': close = '}'; type = TokenType.LEFT_CURLY;  break;
342         default:
343           throw new AssertionError("Invalid open bracket " + bracketChar);
344       }
345       brackets = expandIfNecessary(brackets, bracketsLimit, 2);
346       open = expandIfNecessary(open, openLimit, 2);
347       open[openLimit++] = bracketsLimit;
348       open[openLimit++] = close;
349       brackets[bracketsLimit++] = tokenBreaksLimit;
350       brackets[bracketsLimit++] = -1;
351       sb.append(bracketChar);
352       return type;
353     }
354 
closeBracket(char bracketChar)355     void closeBracket(char bracketChar) {
356       int openLimitAfterClose = openLimit;
357       do {
358         if (openLimitAfterClose == 0) {
359           // Drop an orphaned close bracket.
360           breakOutput();
361           return;
362         }
363         openLimitAfterClose -= 2;
364       } while (bracketChar != open[openLimitAfterClose + 1]);
365       closeBrackets(openLimitAfterClose);
366     }
367 
closeBrackets(int openLimitAfterClose)368     private void closeBrackets(int openLimitAfterClose) {
369       // Make sure we've got space on brackets.
370       int spaceNeeded = openLimit - openLimitAfterClose;
371       brackets = expandIfNecessary(brackets, bracketsLimit, spaceNeeded);
372 
373       int closeTokenIndex = tokenBreaksLimit;
374       while (openLimit > openLimitAfterClose) {
375         // Pop the stack.
376         int closeBracket = open[--openLimit];
377         int openBracketIndex = open[--openLimit];
378         int openTokenIndex = brackets[openBracketIndex];
379         // Update open bracket to point to its partner.
380         brackets[openBracketIndex + 1] = closeTokenIndex;
381         // Emit the close bracket.
382         brackets[bracketsLimit++] = closeTokenIndex;
383         brackets[bracketsLimit++] = openTokenIndex;
384         sb.appendCodePoint(closeBracket);
385         closeTokenIndex++;
386       }
387     }
388 
build()389     CssTokens build() {
390       // Close any still open brackets.
391       {
392         int startOfCloseBrackets = sb.length();
393         closeBrackets(0);
394         emitMergedTokens(startOfCloseBrackets, sb.length());
395       }
396 
397       if (tokenTypes == null) { return EMPTY; }
398       int[] bracketsTrunc = truncateOrShare(brackets, bracketsLimit);
399 
400       // Strip any trailing space off, since it may have been inserted by a
401       // breakAfter call anyway.
402       int cssEnd = sb.length();
403       if (cssEnd > 0 && sb.charAt(cssEnd - 1) == ' ') {
404         --cssEnd;
405         tokenTypes.remove(--tokenBreaksLimit);
406       }
407       String normalizedCss = sb.substring(0, cssEnd);
408 
409       // Store the last character on the tokenBreaksList to simplify finding the
410       // end of a token.
411       tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1);
412       tokenBreaks[tokenBreaksLimit++] = normalizedCss.length();
413 
414       int[] tokenBreaksTrunc = truncateOrShare(tokenBreaks, tokenBreaksLimit);
415       TokenType[] tokenTypesArr = tokenTypes.toArray(ZERO_TYPES);
416 
417       return new CssTokens(
418           normalizedCss, new Brackets(bracketsTrunc),
419           tokenBreaksTrunc, tokenTypesArr);
420     }
421 
lex()422     void lex() {
423       // Fast-track no content.
424       consumeIgnorable();
425       sb.setLength(0);
426       if (pos == cssLimit) { return; }
427 
428       tokenTypes = new ArrayList<TokenType>();
429 
430       String css = this.css;
431       int cssLimit = this.cssLimit;
432       while (pos < cssLimit) {
433         assert this.tokenBreaksLimit == this.tokenTypes.size()
434             : "token and types out of sync at " + tokenBreaksLimit
435             + " in `" + css + "`";
436         // SPEC: 4. Tokenization
437         // The output of the tokenization step is a stream of zero
438         // or more of the following tokens: <ident>, <function>,
439         // <at-keyword>, <hash>, <string>, <bad-string>, <url>,
440         // <bad-url>, <delim>, <number>, <percentage>,
441         // <dimension>, <unicode-range>, <include-match>,
442         // <dash-match>, <prefix-match>, <suffix-match>,
443         // <substring-match>, <column>, <whitespace>, <CDO>,
444         // <CDC>, <colon>, <semicolon>, <comma>, <[>, <]>,
445         // <(>, <)>, <{>, and <}>.
446 
447         // IMPLEMENTS: 4.3 Consume a token
448         char ch = css.charAt(pos);
449         int startOfToken = pos;
450         int startOfOutputToken = sb.length();
451         final TokenType type;
452         switch (ch) {
453           case '\t': case '\n': case '\f': case '\r': case ' ': case '\ufeff':
454             consumeIgnorable();
455             type = TokenType.WHITESPACE;
456             break;
457           case '/': {
458             char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
459             if (lookahead == '/' || lookahead == '*') {
460               consumeIgnorable();
461               type = TokenType.WHITESPACE;
462             } else {
463               consumeDelim(ch);
464               type = TokenType.DELIM;
465             }
466             break;
467           }
468           case '<':
469             if (consumeIgnorable()) {  // <!--
470               type = TokenType.WHITESPACE;
471             } else {
472               consumeDelim('<');
473               type = TokenType.DELIM;
474             }
475             break;
476           case '>':
477             breakOutput();
478             sb.append('>');
479             type = TokenType.DELIM;
480             ++pos;
481             break;
482           case '@':
483             if (consumeAtKeyword()) {
484               type = TokenType.AT;
485             } else {
486               consumeDelim(ch);
487               type = TokenType.DELIM;
488             }
489             break;
490           case '#': {
491             sb.append('#');
492             TokenType hashType = consumeHash();
493             if (hashType != null) {
494               type = hashType;
495             } else {
496               ++pos;
497               sb.append(' ');
498               type = TokenType.DELIM;
499             }
500             break;
501           }
502           case '"':
503           case '\'':
504             type = consumeString();
505             break;
506           case 'U': case 'u':
507             // SPEC handle URL under "ident like token".
508             if (consumeUnicodeRange()) {
509               type = TokenType.UNICODE_RANGE;
510             } else {
511               type = consumeIdentOrUrlOrFunction();
512             }
513             break;
514           case '0': case '1': case '2': case '3': case '4':
515           case '5': case '6': case '7': case '8': case '9':
516             type = consumeNumberOrPercentageOrDimension();
517             break;
518           case '+': case '-': case '.': {
519             char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
520             if (isDecimal(lookahead)
521                 || (lookahead == '.' && pos + 2 < cssLimit
522                     && isDecimal(css.charAt(pos + 2)))) {
523               type = consumeNumberOrPercentageOrDimension();
524             } else if (ch == '+') {
525               consumeDelim(ch);
526               type = TokenType.DELIM;
527             } else if (ch == '-') {
528               if (consumeIgnorable()) {  // -->
529                 type = TokenType.WHITESPACE;
530               } else {
531                 type = consumeIdentOrUrlOrFunction();
532               }
533             } else if (isIdentPart(lookahead)) {
534               // treat ".<IDENT>" as one token.
535               sb.append('.');
536               ++pos;
537               consumeIdent(false);
538               if (pos != startOfToken + 1) {
539                 type = TokenType.DOT_IDENT;
540                 if (pos < cssLimit) {
541                   char next = css.charAt(pos);
542                   if ('(' == next) {
543                     // A dotted identifier followed by a parenthesis is
544                     // ambiguously a function.
545                     sb.append(' ');
546                   }
547                 }
548               } else {
549                 type = TokenType.DELIM;
550                 sb.append(' ');
551               }
552             } else {
553               consumeDelim('.');
554               type = TokenType.DELIM;
555             }
556             break;
557           }
558           case ':': consumeDelim(ch); type = TokenType.COLON; break;
559           case ';': consumeDelim(ch); type = TokenType.SEMICOLON; break;
560           case ',': consumeDelim(ch); type = TokenType.COMMA; break;
561           case '[': case '(': case '{':
562             type = openBracket(ch);
563             ++pos;
564             break;
565           case '}': case ')': case ']':
566             closeBracket(ch);
567             ++pos;
568             // Use DELIM so that a later loop will split output into multiple
569             // tokens since we may have inserted missing close brackets for
570             // unclosed open brackets already on the stack.
571             type = TokenType.DELIM;
572             break;
573           case '~': case '|': case '^': case '$': case '*': {
574             char lookahead = pos + 1 < cssLimit ? css.charAt(pos + 1) : 0;
575             if (lookahead == '=') {
576               consumeMatch(ch);
577               type = TokenType.MATCH;
578             } else if (ch == '|' && lookahead == '|') {
579               consumeColumn();
580               type = TokenType.COLUMN;
581             } else {
582               consumeDelim(ch);
583               type = TokenType.DELIM;
584             }
585             break;
586           }
587           case '_':
588             type = consumeIdentOrUrlOrFunction();
589             break;
590           case '\\': {
591             // Optimistically parse as an ident.
592             TokenType identType = consumeIdentOrUrlOrFunction();
593             if (identType == null) {
594               ++pos;  // drop
595               breakOutput();
596               type = TokenType.WHITESPACE;
597             } else {
598               type = identType;
599             }
600             // TODO: handle case where "url" is encoded.
601             break;
602           }
603           default:
604             int chlower = ch | 32;
605             if ('a' <= chlower && chlower <= 'z' || ch >= 0x80) {
606               TokenType identType = consumeIdentOrUrlOrFunction();
607               if (identType != null) {
608                 type = identType;
609               } else {  // Occurs on undefined-codepoints.
610                 ++pos;
611                 breakOutput();
612                 type = TokenType.WHITESPACE;
613               }
614             } else if (ch > 0x20) {
615               consumeDelim(ch);
616               type = TokenType.DELIM;
617             } else {  // Ignore.
618               consumeIgnorable();
619               type = TokenType.WHITESPACE;
620             }
621         }
622         assert pos > startOfToken
623             : "empty token at " + pos + ", ch0=" + css.charAt(startOfToken)
624             + ":U+" + Integer.toHexString(css.charAt(startOfToken));
625         int endOfOutputToken = sb.length();
626         if (endOfOutputToken > startOfOutputToken) {
627           if (type == TokenType.DELIM) {
628             emitMergedTokens(startOfOutputToken, endOfOutputToken);
629           } else {
630             if (type != TokenType.WHITESPACE
631                 && sb.charAt(startOfOutputToken) == ' ') {
632               emitToken(TokenType.WHITESPACE, startOfOutputToken);
633               ++startOfOutputToken;
634               assert startOfOutputToken != endOfOutputToken;
635             }
636             emitToken(type, startOfOutputToken);
637             // Token emitters can emit a space after a token to avoid possible
638             // merges with following tokens
639             if (type != TokenType.WHITESPACE) {
640               int sbLen = sb.length();
641               if (startOfOutputToken + 1 < sbLen
642                   && sb.charAt(sbLen - 1) == ' ') {
643                 emitToken(TokenType.WHITESPACE, sbLen - 1);
644               }
645             }
646           }
647         }
648       }
649     }
650 
651     private void emitMergedTokens(int start, int end) {
652       // Handle breakOutput and merging of output tokens.
653       for (int e = start; e < end; ++e) {
654         TokenType delimType;
655         switch (sb.charAt(e)) {
656           case ' ': delimType = TokenType.WHITESPACE;   break;
657           case '}': delimType = TokenType.RIGHT_CURLY;  break;
658           case ')': delimType = TokenType.RIGHT_PAREN;  break;
659           case ']': delimType = TokenType.RIGHT_SQUARE; break;
660           default : delimType = TokenType.DELIM;        break;
661         }
662         emitToken(delimType, e);
663       }
664     }
665 
666     private void emitToken(TokenType type, int startOfOutputToken) {
667       if (tokenBreaksLimit == 0
668           || tokenBreaks[tokenBreaksLimit - 1] != startOfOutputToken) {
669         tokenBreaks = expandIfNecessary(tokenBreaks, tokenBreaksLimit, 1);
670         tokenBreaks[tokenBreaksLimit++] = startOfOutputToken;
671         tokenTypes.add(type);
672       }
673     }
674 
675     private void consumeDelim(char ch) {
676       sb.append(ch);
677       switch (ch) {
678         // Prevent token merging.
679         case '~': case '|': case '^': case '$': case '\\':
680         case '.': case '+': case '-': case '@': case '/':  case '<':
681           sb.append(' ');
682           break;
683         default:
684           break;
685       }
686       ++pos;
687     }
688 
689     private boolean consumeIgnorable() {
690       String css = this.css;
691       int cssLimit = this.cssLimit;
692       int posBefore = pos;
693       while (pos < cssLimit) {
694         char ch = css.charAt(pos);
695         if (ch <= 0x20
696             // Treat a BOM as white-space so that it is ignored at the beginning
697             // of a file.
698             || ch == '\ufeff') {
699           ++pos;
700         } else if (pos + 1 == cssLimit) {
701           break;
702         } else if (ch == '/') {
703           char next = css.charAt(pos + 1);
704           if (next == '*') {
705             pos += 2;
706             while (pos < cssLimit) {
707               int ast = css.indexOf('*', pos);
708               if (ast < 0) {
709                 pos = cssLimit;  // Unclosed /* comment */
710                 break;
711               } else {
712                 // Advance over a run of '*'s.
713                 pos = ast + 1;
714                 while (pos < cssLimit && css.charAt(pos) == '*') {
715                   ++pos;
716                 }
717                 if (pos < cssLimit && css.charAt(pos) == '/') {
718                   ++pos;
719                   break;
720                 }
721               }
722             }
723           } else if (next == '/') {  // Non-standard but widely supported
724             while (++pos < cssLimit) {
725               if (isLineTerminator(css.charAt(pos))) { break; }
726             }
727           } else {
728             break;
729           }
730         } else if (ch == '<') {
731           if (pos + 3 < cssLimit
732               && '!' == css.charAt(pos + 1)
733               && '-' == css.charAt(pos + 2)
734               && '-' == css.charAt(pos + 3)) {
735             pos += 4;
736           } else {
737             break;
738           }
739         } else if (ch == '-') {
740           if (pos + 2 < cssLimit
741               && '-' == css.charAt(pos + 1)
742               && '>' == css.charAt(pos + 2)) {
743             pos += 3;
744           } else {
745             break;
746           }
747         } else {
748           break;
749         }
750       }
751       if (pos == posBefore) {
752         return false;
753       } else {
754         breakOutput();
755         return true;
756       }
757     }
758 
759     private void breakOutput() {
760       int last = sb.length() - 1;
761       if (last >= 0 && sb.charAt(last) != ' ') { sb.append(' '); }
762     }
763 
764     private void consumeColumn() {
765       pos += 2;
766       sb.append("||");
767     }
768 
769     private void consumeMatch(char ch) {
770       pos += 2;
771       sb.append(ch).append('=');
772     }
773 
774     private void consumeIdent(boolean allowFirstDigit) {
775       int cssLimit = this.cssLimit;
776       int last = -1, nCodepoints = 0;
777       int sbAtStart = sb.length();
778       int posAtStart = pos;
779       while (pos < cssLimit) {
780         int posBefore = pos;
781 
782         int decoded = readCodepoint();
783         if (decoded == '\\') {
784           decoded = consumeAndDecodeEscapeSequence();
785         } else {
786           ++pos;
787         }
788 
789         if (decoded >= 0 && isIdentPart(decoded)) {
790           if (!allowFirstDigit && nCodepoints < 2
791               && '0' <= decoded && decoded <= '9') {
792             // Don't allow encoded identifiers that look like numeric tokens
793             // like \-1 or ones that start with an encoded decimal digit.
794             if (last == '-' || last == -1) {
795               pos = posAtStart;
796               sb.setLength(sbAtStart);
797               return;
798             }
799           }
800           sb.appendCodePoint(decoded);
801           last = decoded;
802           ++nCodepoints;
803         } else {
804           pos = posBefore;
805           return;
806         }
807       }
808     }
809 
810     private boolean consumeAtKeyword() {
811       assert css.charAt(pos) == '@';
812       int bufferLengthBeforeWrite = sb.length();
813       sb.append('@');
814       int posBeforeKeyword = ++pos;
815       consumeIdent(false);
816       if (pos == posBeforeKeyword) {
817         --pos;  // back up over '@'
818         sb.setLength(bufferLengthBeforeWrite);  // Unwrite the '@'
819         return false;
820       } else {
821         return true;
822       }
823     }
824 
825 
826     private int consumeAndDecodeEscapeSequence() {
827       String css = this.css;
828       int cssLimit = this.cssLimit;
829       assert css.charAt(pos) == '\\';
830       if (pos + 1 >= cssLimit) { return -1; }
831       char esc = css.charAt(pos + 1);
832       if (isLineTerminator(esc)) { return -1; }
833       int escLower = esc | 32;
834       if (('0' <= esc && esc <= '9')
835           || ('a' <= escLower && escLower <= 'f')) {
836         int hexValue = 0;
837         int hexStart = pos + 1;
838         int hexLimit = Math.min(pos + 7, cssLimit);
839         int hexEnd = hexStart;
840         do {
841           hexValue = (hexValue << 4)
842               | (esc <= '9' ? esc - '0' : escLower - ('a' - 10));
843           ++hexEnd;
844           if (hexEnd == hexLimit) { break; }
845           esc = css.charAt(hexEnd);
846           escLower = esc | 32;
847         } while (('0' <= esc && esc <= '9')
848                  || ('a' <= escLower && escLower <= 'f'));
849         if (!Character.isDefined(hexValue)) {
850           hexValue = 0xfffd;
851         }
852         pos = hexEnd;
853         if (pos < cssLimit) {
854           // A sequence of hex digits can be followed by a space that allows
855           // so that code-point U+A followed by the letter 'b' can be rendered
856           // as "\a b" since "\ab" specifies the single code-point U+AB.
857           char next = css.charAt(pos);
858           if (next == ' ' || next == '\t' || isLineTerminator(next)) {
859             ++pos;
860           }
861         }
862         return hexValue;
863       }
864       pos += 2;
865       return esc;
866     }
867 
868     private static final long HEX_ENCODED_BITMASK =
869         (1L << 0) | LINE_TERMINATOR_BITMASK
870         | (1L << '"') | (1L << '\'') | (1L << '&') | (1L << '<') | (1L << '>');
871     private static boolean isHexEncoded(int codepoint) {
872       return (0 <= codepoint && codepoint < 63
873               && 0 != ((1L << codepoint) & HEX_ENCODED_BITMASK));
874     }
875 
876     private void encodeCharOntoOutput(int codepoint, int last) {
877       switch (codepoint) {
878         case '\\': sb.append("\\\\"); break;
879         case '\0': sb.append("\\0");  break;
880         case '\n': sb.append("\\a");  break;
881         case '\f': sb.append("\\c");  break;
882         case '\r': sb.append("\\d");  break;
883         case '\"': sb.append("\\22"); break;
884         case '&':  sb.append("\\26"); break;
885         case '\'': sb.append("\\27"); break;
886         case '<':  sb.append("\\3c"); break;
887         case '>':  sb.append("\\3e"); break;
888         // The set of escapes above that end with a hex digit must appear in
889         // HEX_ENCODED_BITMASK.
890         case '-':
891           sb.append('-');
892           break;
893         default:
894           if (isHexEncoded(last)
895               // We need to put a space after a trailing hex digit if the
896               // next encoded character on the output would be another hex
897               // digit or a space character.  The other space characters
898               // are handled above.
899               && (codepoint == ' ' || codepoint == '\t'
900                   || ('0' <= codepoint && codepoint <= '9')
901                   || ('a' <= (codepoint | 32) && (codepoint | 32) <= 'f'))) {
902             sb.append(' ');
903           }
904           sb.appendCodePoint(codepoint);
905           break;
906       }
907     }
908 
909     private TokenType consumeNumberOrPercentageOrDimension() {
910       String css = this.css;
911       int cssLimit = this.cssLimit;
912       boolean isZero = true;
913       int intStart = pos;
914       if (intStart < cssLimit) {
915         char ch = css.charAt(intStart);
916         if (ch == '-' || ch == '+') {
917           ++intStart;
918         }
919       }
920       // Find the integer part after any sign.
921       int intEnd = intStart;
922       for (; intEnd < cssLimit; ++intEnd) {
923         char ch = css.charAt(intEnd);
924         if (!('0' <= ch && ch <= '9')) { break; }
925         if (ch != '0') { isZero = false; }
926       }
927       // Find a fraction like ".5" or ".".
928       int fractionStart = intEnd;
929       int fractionEnd = fractionStart;
930       if (fractionEnd < cssLimit && '.' == css.charAt(fractionEnd)) {
931         ++fractionEnd;
932         for (; fractionEnd < cssLimit; ++fractionEnd) {
933           char ch = css.charAt(fractionEnd);
934           if (!('0' <= ch && ch <= '9')) { break; }
935           if (ch != '0') { isZero = false; }
936         }
937       }
938       int exponentStart = fractionEnd;
939       int exponentIntStart = exponentStart;
940       int exponentEnd = exponentStart;
941       boolean isExponentZero = true;
942       if (exponentStart < cssLimit && 'e' == (css.charAt(exponentStart) | 32)) {
943         // 'e' and 'e' in "5e-f" for a
944         exponentEnd = exponentStart + 1;
945         if (exponentEnd < cssLimit) {
946           char ch = css.charAt(exponentEnd);
947           if (ch == '+' || ch == '-') { ++exponentEnd; }
948         }
949         exponentIntStart = exponentEnd;
950         for (; exponentEnd < cssLimit; ++exponentEnd) {
951           char ch = css.charAt(exponentEnd);
952           if (!('0' <= ch && ch <= '9')) { break; }
953           if (ch != '0') { isExponentZero = false; }
954         }
955         // Since
956         //    dimension := <number> <ident>
957         // the below are technically valid dimensions even though they appear
958         // to have incomplete exponents:
959         //    5e
960         //    5ex
961         //    5e-
962         if (exponentEnd == exponentIntStart) {  // Incomplete exponent.
963           exponentIntStart = exponentEnd = exponentStart;
964           isExponentZero = true;
965         }
966       }
967 
968       int unitStart = exponentEnd;
969       // Skip over space between number and unit.
970       // Many user-agents allow "5 ex" instead of "5ex".
971       while (unitStart < cssLimit) {
972         char ch = css.charAt(unitStart);
973         if (ch == ' ' || isLineTerminator(ch)) {
974           ++unitStart;
975         } else {
976           break;
977         }
978       }
979 
980       if (sb.length() != 0 && isIdentPart(sb.charAt(sb.length() - 1))) {
981         sb.append(' ');
982       }
983       // Normalize the number onto the buffer.
984       // We will normalize and unit later.
985       // Skip the sign if it is positive.
986       if (intStart != pos && '-' == css.charAt(pos) && !isZero) {
987         sb.append('-');
988       }
989       if (isZero) {
990         sb.append('0');
991       } else {
992         // Strip leading zeroes from the integer and exponent and trailing
993         // zeroes from the fraction.
994         while (intStart < intEnd && css.charAt(intStart) == '0') { ++intStart; }
995         while (fractionEnd > fractionStart
996                && css.charAt(fractionEnd - 1) == '0') {
997           --fractionEnd;
998         }
999         if (intStart == intEnd) {
1000           sb.append('0');  // .5 -> 0.5
1001         } else {
1002           sb.append(css, intStart, intEnd);
1003         }
1004         if (fractionEnd > fractionStart + 1) {  // 5. -> 5; 5.0 -> 5
1005           sb.append(css, fractionStart, fractionEnd);
1006         }
1007         if (!isExponentZero) {
1008           sb.append('e');
1009           // 1e+1 -> 1e1
1010           if ('-' == css.charAt(exponentIntStart - 1)) { sb.append('-'); }
1011           while (exponentIntStart < exponentEnd
1012                  && css.charAt(exponentIntStart) == '0') {
1013             ++exponentIntStart;
1014           }
1015           sb.append(css, exponentIntStart, exponentEnd);
1016         }
1017       }
1018 
1019       int unitEnd;
1020       TokenType type;
1021       if (unitStart < cssLimit && '%' == css.charAt(unitStart)) {
1022         unitEnd = unitStart + 1;
1023         type = TokenType.PERCENTAGE;
1024         sb.append('%');
1025       } else {
1026         // The grammar says that any identifier following a number is a unit.
1027         int bufferBeforeUnit = sb.length();
1028         pos = unitStart;
1029         consumeIdent(false);
1030         int bufferAfterUnit = sb.length();
1031         boolean knownUnit = isWellKnownUnit(
1032             sb, bufferBeforeUnit, bufferAfterUnit);
1033         if (unitStart == exponentEnd  // No intervening space
1034             || knownUnit) {
1035           unitEnd = pos;
1036           // 3IN -> 3in
1037           for (int i = bufferBeforeUnit; i < bufferAfterUnit; ++i) {
1038             char ch = sb.charAt(i);
1039             if ('A' <= ch && ch <= 'Z') { sb.setCharAt(i, (char) (ch | 32)); }
1040           }
1041         } else {
1042           unitEnd = unitStart = exponentEnd;
1043           sb.setLength(bufferBeforeUnit);
1044         }
1045         type = unitStart == unitEnd
1046             ? TokenType.NUMBER
1047             : knownUnit
1048             ? TokenType.DIMENSION
1049             : TokenType.BAD_DIMENSION;
1050       }
1051       pos = unitEnd;
1052       if (type != TokenType.PERCENTAGE
1053           && pos < cssLimit && css.charAt(pos) == '.') {
1054         sb.append(' ');
1055       }
1056       return type;
1057     }
1058 
1059     private TokenType consumeString() {
1060       String css = this.css;
1061       int cssLimit = this.cssLimit;
1062 
1063       char delim = css.charAt(pos);
1064       assert delim == '"' || delim == '\'';
1065       ++pos;
1066       int startOfStringOnOutput = sb.length();
1067       sb.append('\'');
1068       int last = -1;
1069       boolean closed = false;
1070       while (pos < cssLimit) {
1071         char ch = css.charAt(pos);
1072         if (ch == delim) {
1073           ++pos;
1074           closed = true;
1075           break;
1076         }
1077         if (isLineTerminator(ch)) { break; }
1078         int decoded = ch;
1079         if (ch == '\\') {
1080           if (pos + 1 < cssLimit && isLineTerminator(css.charAt(pos+1))) {
1081             // consume it but generate no tokens.
1082             // Lookahead to treat a \r\n sequence as one line-terminator.
1083             if (pos + 2 < cssLimit
1084                 && css.charAt(pos+1) == '\r' && css.charAt(pos+2) == '\n') {
1085               pos += 3;
1086             } else {
1087               pos += 2;
1088             }
1089             continue;
1090           } else {
1091             decoded = consumeAndDecodeEscapeSequence();
1092             if (decoded < 0) {
1093               break;
1094             }
1095           }
1096         } else {
1097           ++pos;
1098         }
1099         encodeCharOntoOutput(decoded, last);
1100         last = decoded;
1101       }
1102       if (closed) {
1103         sb.append('\'');
1104         return TokenType.STRING;
1105       } else {  // Drop <bad-string>s
1106         sb.setLength(startOfStringOnOutput);
1107         breakOutput();
1108         return TokenType.WHITESPACE;
1109       }
1110     }
1111 
1112     private @Nullable TokenType consumeHash() {
1113       assert css.charAt(pos) == '#';
1114       ++pos;
1115       int beforeIdent = pos;
1116       consumeIdent(true);
1117       if (pos == beforeIdent) {
1118         pos = beforeIdent - 1;
1119         return null;
1120       }
1121       for (int i = beforeIdent; i < pos; ++i) {
1122         char chLower = (char) (css.charAt(i) | 32);
1123         if (!(('0' <= chLower && chLower <= '9')
1124               || ('a' <= chLower && chLower <= 'f'))) {
1125           return TokenType.HASH_ID;
1126         }
1127       }
1128       return TokenType.HASH_UNRESTRICTED;
1129     }
1130 
1131     private boolean consumeUnicodeRange() {
1132       final String css = this.css;
1133       final int cssLimit = this.cssLimit;
1134 
1135       assert pos < cssLimit && (css.charAt(pos) | 32) == 'u';
1136 
1137       final int start = pos;
1138       final int startOfOutput = sb.length();
1139       ++pos;
1140       boolean ok = false;
1141       parse:
1142       try {
1143         if (pos == cssLimit || css.charAt(pos) != '+') {
1144           break parse;
1145         }
1146         ++pos;
1147         sb.append("U+");
1148         int numStartDigits = 0;
1149         while (pos < cssLimit && numStartDigits < 6) {
1150           char chLower = (char) (css.charAt(pos) | 32);
1151           if (('0' <= chLower && chLower <= '9')
1152               || ('a' <= chLower && chLower <= 'f')) {
1153             sb.append(chLower);
1154             ++numStartDigits;
1155             ++pos;
1156           } else {
1157             break;
1158           }
1159         }
1160         if (numStartDigits == 0) {
1161           break parse;
1162         }
1163         boolean hasQmark = false;
1164         while (pos < cssLimit && numStartDigits < 6 && css.charAt(pos) == '?') {
1165           hasQmark = true;
1166           sb.append('?');
1167           ++numStartDigits;
1168           ++pos;
1169         }
1170         if (numStartDigits == 0) {
1171           break parse;
1172         }
1173         if (pos < cssLimit && css.charAt(pos) == '-') {
1174           if (!hasQmark) {
1175             // Look for end of range.
1176             ++pos;
1177             sb.append('-');
1178             int numEndDigits = 0;
1179             while (pos < cssLimit && numEndDigits < 6) {
1180               char chLower = (char) (css.charAt(pos) | 32);
1181               if (('0' <= chLower && chLower <= '9')
1182                   || ('a' <= chLower && chLower <= 'f')) {
1183                 ++numEndDigits;
1184                 ++pos;
1185                 sb.append(chLower);
1186               } else {
1187                 break;
1188               }
1189             }
1190             if (numEndDigits == 0) {
1191               // Back up over '-'
1192               --pos;
1193               sb.append(' ');
1194             }
1195           } else {
1196             sb.append(' ');
1197           }
1198         }
1199         ok = true;
1200       } finally {
1201         if (!ok) {
1202           pos = start;
1203           sb.setLength(startOfOutput);
1204         }
1205       }
1206       return ok;
1207     }
1208 
1209     private @Nullable TokenType consumeIdentOrUrlOrFunction() {
1210       int bufferStart = sb.length();
1211       int posBefore = pos;
1212       consumeIdent(false);
1213       if (pos == posBefore) { return null; }
1214       boolean parenAfter = pos < cssLimit && css.charAt(pos) == '(';
1215       if (sb.length() - bufferStart == 3
1216           && 'u' == (sb.charAt(bufferStart) | 32)
1217           && 'r' == (sb.charAt(bufferStart + 1) | 32)
1218           && 'l' == (sb.charAt(bufferStart + 2) | 32)) {
1219         if (parenAfter && consumeUrlValue()) {
1220           sb.setCharAt(bufferStart, 'u');
1221           sb.setCharAt(bufferStart + 1, 'r');
1222           sb.setCharAt(bufferStart + 2, 'l');
1223           return TokenType.URL;
1224         } else {
1225           sb.setLength(bufferStart);
1226           breakOutput();
1227           return TokenType.WHITESPACE;
1228         }
1229       } else if (parenAfter) {
1230         openBracket('(');
1231         ++pos;
1232         return TokenType.FUNCTION;
1233       } else {
1234         if (pos + 1 < cssLimit && '.' == css.charAt(pos)) {
1235           // Prevent merging of ident and number as in
1236           //     border:solid.1cm black
1237           // when .1 is rewritten to 0.1 becoming
1238           //     border:solid0.1cm black
1239           char next = css.charAt(pos + 1);
1240           if ('0' <= next && next <= '9') {
1241             sb.append(' ');
1242           }
1243         }
1244         return TokenType.IDENT;
1245       }
1246     }
1247 
1248     private boolean consumeUrlValue() {
1249       String css = this.css;
1250       int cssLimit = this.cssLimit;
1251       if (pos == cssLimit || css.charAt(pos) != '(') { return false; }
1252       ++pos;
1253       // skip space.
1254       for (; pos < cssLimit; ++pos) {
1255         char ch = css.charAt(pos);
1256         if (ch != ' ' && !isLineTerminator(ch)) { break; }
1257       }
1258       // Find the value.
1259       int delim;
1260       if (pos < cssLimit) {
1261         char ch = pos < cssLimit ? css.charAt(pos) : '\0';
1262         if (ch == '"' || ch == '\'') {
1263           delim = ch;
1264           ++pos;
1265         } else {
1266           delim = '\0';
1267         }
1268       } else {
1269         return false;
1270       }
1271       sb.append("('");
1272       while (pos < cssLimit) {
1273         int decoded = readCodepoint();
1274         if (delim != 0) {
1275           if (decoded == delim) {
1276             ++pos;
1277             break;
1278           }
1279         } else if (decoded <= ' ' || decoded == ')') {
1280           break;
1281         }
1282         if (decoded == '\\') {
1283           decoded = consumeAndDecodeEscapeSequence();
1284           if (decoded < 0) {
1285             return false;
1286           }
1287         } else {
1288           ++pos;
1289         }
1290         // Any character not in the RFC 3986 safe set is %-encoded.
1291         if (decoded < URL_SAFE.length && URL_SAFE[decoded]) {
1292           sb.appendCodePoint(decoded);
1293         } else if (decoded < 0x80) {
1294           sb.append('%')
1295             .append(HEX_DIGITS[(decoded >>> 4) & 0xf])
1296             .append(HEX_DIGITS[(decoded >>> 0) & 0xf]);
1297         } else if (decoded < 0x800) {
1298           int octet0 = 0xc0 | ((decoded >>> 6) & 0x1f),
1299               octet1 = 0x80 | (decoded         & 0x3f);
1300           sb.append('%')
1301             .append(HEX_DIGITS[(octet0 >>> 4) & 0xf])
1302             .append(HEX_DIGITS[(octet0 >>> 0) & 0xf])
1303             .append('%')
1304             .append(HEX_DIGITS[(octet1 >>> 4) & 0xf])
1305             .append(HEX_DIGITS[(octet1 >>> 0) & 0xf]);
1306         } else if (decoded < 0x10000) {
1307           int octet0 = 0xe0 | ((decoded >>> 12) & 0xf),
1308               octet1 = 0x80 | ((decoded >>> 6)  & 0x3f),
1309               octet2 = 0x80 | (decoded          & 0x3f);
1310           sb.append('%')
1311             .append(HEX_DIGITS[(octet0 >>> 4) & 0xf])
1312             .append(HEX_DIGITS[(octet0 >>> 0) & 0xf])
1313             .append('%')
1314             .append(HEX_DIGITS[(octet1 >>> 4) & 0xf])
1315             .append(HEX_DIGITS[(octet1 >>> 0) & 0xf])
1316             .append('%')
1317             .append(HEX_DIGITS[(octet2 >>> 4) & 0xf])
1318             .append(HEX_DIGITS[(octet2 >>> 0) & 0xf]);
1319         } else {
1320           int octet0 = 0xf0 | ((decoded >>> 18) & 0x7),
1321               octet1 = 0x80 | ((decoded >>> 12) & 0x3f),
1322               octet2 = 0x80 | ((decoded >>> 6)  & 0x3f),
1323               octet3 = 0x80 | (decoded          & 0x3f);
1324           sb.append('%')
1325             .append(HEX_DIGITS[(octet0 >>> 4) & 0xf])
1326             .append(HEX_DIGITS[(octet0 >>> 0) & 0xf])
1327             .append('%')
1328             .append(HEX_DIGITS[(octet1 >>> 4) & 0xf])
1329             .append(HEX_DIGITS[(octet1 >>> 0) & 0xf])
1330             .append('%')
1331             .append(HEX_DIGITS[(octet2 >>> 4) & 0xf])
1332             .append(HEX_DIGITS[(octet2 >>> 0) & 0xf])
1333             .append('%')
1334             .append(HEX_DIGITS[(octet3 >>> 4) & 0xf])
1335             .append(HEX_DIGITS[(octet3 >>> 0) & 0xf]);
1336         }
1337       }
1338 
1339       // skip space.
1340       for (; pos < cssLimit; ++pos) {
1341         char ch = css.charAt(pos);
1342         if (ch != ' ' && !isLineTerminator(ch)) { break; }
1343       }
1344       if (pos < cssLimit && css.charAt(pos) == ')') {
1345         ++pos;
1346       } else {
1347         // broken-url
1348       }
1349       sb.append("')");
1350       return true;
1351     }
1352 
1353     /**
1354      * Reads the codepoint at pos, leaving pos at the index of the last code
1355      * unit.
1356      */
1357     private int readCodepoint() {
1358       String css = this.css;
1359       char ch = css.charAt(pos);
1360       if (Character.isHighSurrogate(ch) && pos + 1 < cssLimit) {
1361         char next = css.charAt(pos + 1);
1362         if (Character.isLowSurrogate(next)) {
1363           ++pos;
1364           return 0x10000 + (((ch - 0xd800) << 10) | (next - 0xdc00));
1365         }
1366       }
1367       return ch;
1368     }
1369   }
1370 
1371   private static final boolean isIdentPart(int cp) {
1372     return cp >= 0x80
1373         ? Character.isDefined(cp) && cp != '\ufeff'
1374         : IDENT_PART_ASCII[cp];
1375   }
1376 
1377   private static final boolean isDecimal(char ch) {
1378     return '0' <= ch && ch <= '9';
1379   }
1380 
1381   private static final boolean[] IDENT_PART_ASCII = new boolean[128];
1382   static {
1383     for (int i = '0'; i <= '9'; ++i) { IDENT_PART_ASCII[i] = true; }
1384     for (int i = 'A'; i <= 'Z'; ++i) { IDENT_PART_ASCII[i] = true; }
1385     for (int i = 'a'; i <= 'z'; ++i) { IDENT_PART_ASCII[i] = true; }
1386     IDENT_PART_ASCII['_'] = true;
1387     IDENT_PART_ASCII['-'] = true;
1388   }
1389 
1390   private static final int LINE_TERMINATOR_BITMASK =
1391       (1 << '\n') | (1 << '\r') | (1 << '\f');
1392 
1393   private static boolean isLineTerminator(char ch) {
1394     return ch < 0x20 && 0 != (LINE_TERMINATOR_BITMASK & (1 << ch));
1395   }
1396 
1397   private static int[] expandIfNecessary(int[] arr, int limit, int needed) {
1398     int neededLength = limit + needed;
1399     int length = arr.length;
1400     if (length >= neededLength) { return arr; }
1401     int[] newArr = new int[Math.max(16, Math.max(neededLength, length * 2))];
1402     System.arraycopy(arr, 0, newArr, 0, limit);
1403     return newArr;
1404   }
1405 
1406   private static int[] truncateOrShare(int[] arr, int limit) {
1407     if (limit == 0) { return ZERO_INTS; }
1408     if (limit == arr.length) {
1409       return arr;
1410     }
1411     int[] trunc = new int[limit];
1412     System.arraycopy(arr, 0, trunc, 0, limit);
1413     return trunc;
1414   }
1415 
1416   private static final int LENGTH_UNIT_TYPE = 0;
1417   private static final int ANGLE_UNIT_TYPE = 1;
1418   private static final int TIME_UNIT_TYPE = 2;
1419   private static final int FREQUENCY_UNIT_TYPE = 3;
1420   private static final int RESOLUTION_UNIT_TYPE = 4;
1421 
1422   /**
1423    * See http://dev.w3.org/csswg/css-values/#lengths and
1424    *     http://dev.w3.org/csswg/css-values/#other-units
1425    */
1426   private static final Trie UNIT_TRIE = new Trie(
1427       ImmutableMap.<String, Integer>builder()
1428         .put("em", LENGTH_UNIT_TYPE)
1429         .put("ex", LENGTH_UNIT_TYPE)
1430         .put("ch", LENGTH_UNIT_TYPE)  // Width of zero character
1431         .put("rem", LENGTH_UNIT_TYPE)  // Root element font-size
1432         .put("vh", LENGTH_UNIT_TYPE)
1433         .put("vw", LENGTH_UNIT_TYPE)
1434         .put("vmin", LENGTH_UNIT_TYPE)
1435         .put("vmax", LENGTH_UNIT_TYPE)
1436         .put("px", LENGTH_UNIT_TYPE)
1437         .put("mm", LENGTH_UNIT_TYPE)
1438         .put("cm", LENGTH_UNIT_TYPE)
1439         .put("in", LENGTH_UNIT_TYPE)
1440         .put("pt", LENGTH_UNIT_TYPE)
1441         .put("pc", LENGTH_UNIT_TYPE)
1442         .put("deg", ANGLE_UNIT_TYPE)
1443         .put("rad", ANGLE_UNIT_TYPE)
1444         .put("grad", ANGLE_UNIT_TYPE)
1445         .put("turn", ANGLE_UNIT_TYPE)
1446         .put("s", TIME_UNIT_TYPE)
1447         .put("ms", TIME_UNIT_TYPE)
1448         .put("hz", FREQUENCY_UNIT_TYPE)
1449         .put("khz", FREQUENCY_UNIT_TYPE)
1450         .put("dpi", RESOLUTION_UNIT_TYPE)
1451         .put("dpcm", RESOLUTION_UNIT_TYPE)
1452         .put("dppx", RESOLUTION_UNIT_TYPE)
1453         .build());
1454 
1455   static boolean isWellKnownUnit(CharSequence s, int start, int end) {
1456     if (start == end) { return false; }
1457     Trie t = UNIT_TRIE;
1458     for (int i = start; i < end; ++i) {
1459       char ch = s.charAt(i);
1460       t = t.lookup('A' <= ch && ch <= 'Z' ? (char) (ch | 32) : ch);
1461       if (t == null) { return false; }
1462     }
1463     return t.isTerminal();
1464   }
1465 
1466   static boolean isWellKnownUnit(CharSequence s) {
1467     return isWellKnownUnit(s, 0, s.length());
1468   }
1469 
1470   private static final boolean[] URL_SAFE = new boolean[128];
1471   static {
1472     // From RFC 3986
1473     // unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
1474     for (int i = 'A'; i <= 'Z'; ++i) { URL_SAFE[i] = true; }
1475     for (int i = 'a'; i <= 'z'; ++i) { URL_SAFE[i] = true; }
1476     for (int i = '0'; i <= '9'; ++i) { URL_SAFE[i] = true; }
1477     URL_SAFE['-'] = true;
1478     URL_SAFE['.'] = true;
1479     URL_SAFE['_'] = true;
1480     URL_SAFE['~'] = true;
1481     // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
1482     URL_SAFE[':'] = true;
1483     URL_SAFE['/'] = true;
1484     URL_SAFE['?'] = true;
1485     URL_SAFE['#'] = true;
1486     URL_SAFE['['] = true;
1487     URL_SAFE[']'] = true;
1488     URL_SAFE['@'] = true;
1489     // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
1490     //             / "*" / "+" / "," / ";" / "="
1491     URL_SAFE['!'] = true;
1492     URL_SAFE['$'] = true;
1493     URL_SAFE['&'] = true;
1494     // Only used in obsolete mark rule and special in unquoted URLs or comment
1495     // delimiters.
1496     // URL_SAFE['\''] = true;
1497     // URL_SAFE['('] = true;
1498     // URL_SAFE[')'] = true;
1499     // URL_SAFE['*'] = true;
1500     URL_SAFE['+'] = true;
1501     URL_SAFE[','] = true;
1502     URL_SAFE[';'] = true;
1503     URL_SAFE['='] = true;
1504     // % is used to encode unsafe octets.
1505     URL_SAFE['%'] = true;
1506   }
1507 
1508   private static final char[] HEX_DIGITS = {
1509     '0', '1', '2', '3',
1510     '4', '5', '6', '7',
1511     '8', '9', 'a', 'b',
1512     'c', 'd', 'e', 'f'
1513   };
1514 }
1515