• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1'use strict';
2
3var Preprocessor = require('./preprocessor'),
4    LocationInfoMixin = require('./location_info_mixin'),
5    UNICODE = require('../common/unicode'),
6    NAMED_ENTITY_TRIE = require('./named_entity_trie');
7
8//Aliases
//'$' gives access to UNICODE.CODE_POINTS
var $ = UNICODE.CODE_POINTS;

//'$$' gives access to UNICODE.CODE_POINT_SEQUENCES
var $$ = UNICODE.CODE_POINT_SEQUENCES;
11
12//Replacement code points for numeric entities
//Maps the numeric value of a character reference to the code point that must be produced instead.
var NUMERIC_ENTITY_REPLACEMENTS = (function () {
    //NOTE: 0x00 and 0x0D are the only entries that live outside of the contiguous 0x80-0x9F range.
    var table = {0x00: 0xFFFD, 0x0D: 0x000D},
        //Replacements for 0x80..0x9F, listed in ascending order of the referenced value.
        highRange = [
            0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
            0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
            0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
            0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
        ];

    for (var offset = 0; offset < highRange.length; offset++)
        table[0x80 + offset] = highRange[offset];

    return table;
})();
20
21//States
//NOTE: every state is identified by a string equal to the constant's own name. The tokenizer stores
//that string in 'this.state' and uses it as the key of the handler to invoke.
var DATA_STATE = 'DATA_STATE';
var CHARACTER_REFERENCE_IN_DATA_STATE = 'CHARACTER_REFERENCE_IN_DATA_STATE';
var RCDATA_STATE = 'RCDATA_STATE';
var CHARACTER_REFERENCE_IN_RCDATA_STATE = 'CHARACTER_REFERENCE_IN_RCDATA_STATE';
var RAWTEXT_STATE = 'RAWTEXT_STATE';
var SCRIPT_DATA_STATE = 'SCRIPT_DATA_STATE';
var PLAINTEXT_STATE = 'PLAINTEXT_STATE';
var TAG_OPEN_STATE = 'TAG_OPEN_STATE';
var END_TAG_OPEN_STATE = 'END_TAG_OPEN_STATE';
var TAG_NAME_STATE = 'TAG_NAME_STATE';
var RCDATA_LESS_THAN_SIGN_STATE = 'RCDATA_LESS_THAN_SIGN_STATE';
var RCDATA_END_TAG_OPEN_STATE = 'RCDATA_END_TAG_OPEN_STATE';
var RCDATA_END_TAG_NAME_STATE = 'RCDATA_END_TAG_NAME_STATE';
var RAWTEXT_LESS_THAN_SIGN_STATE = 'RAWTEXT_LESS_THAN_SIGN_STATE';
var RAWTEXT_END_TAG_OPEN_STATE = 'RAWTEXT_END_TAG_OPEN_STATE';
var RAWTEXT_END_TAG_NAME_STATE = 'RAWTEXT_END_TAG_NAME_STATE';
var SCRIPT_DATA_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_LESS_THAN_SIGN_STATE';
var SCRIPT_DATA_END_TAG_OPEN_STATE = 'SCRIPT_DATA_END_TAG_OPEN_STATE';
var SCRIPT_DATA_END_TAG_NAME_STATE = 'SCRIPT_DATA_END_TAG_NAME_STATE';
var SCRIPT_DATA_ESCAPE_START_STATE = 'SCRIPT_DATA_ESCAPE_START_STATE';
var SCRIPT_DATA_ESCAPE_START_DASH_STATE = 'SCRIPT_DATA_ESCAPE_START_DASH_STATE';
var SCRIPT_DATA_ESCAPED_STATE = 'SCRIPT_DATA_ESCAPED_STATE';
var SCRIPT_DATA_ESCAPED_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_STATE';
var SCRIPT_DATA_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_ESCAPED_DASH_DASH_STATE';
var SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE';
var SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE';
var SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE = 'SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE';
var SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE';
var SCRIPT_DATA_DOUBLE_ESCAPED_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_STATE';
var SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE';
var SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE';
var SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE';
var SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE = 'SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE';
var BEFORE_ATTRIBUTE_NAME_STATE = 'BEFORE_ATTRIBUTE_NAME_STATE';
var ATTRIBUTE_NAME_STATE = 'ATTRIBUTE_NAME_STATE';
var AFTER_ATTRIBUTE_NAME_STATE = 'AFTER_ATTRIBUTE_NAME_STATE';
var BEFORE_ATTRIBUTE_VALUE_STATE = 'BEFORE_ATTRIBUTE_VALUE_STATE';
var ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE';
var ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE = 'ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE';
var ATTRIBUTE_VALUE_UNQUOTED_STATE = 'ATTRIBUTE_VALUE_UNQUOTED_STATE';
var CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE = 'CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE';
var AFTER_ATTRIBUTE_VALUE_QUOTED_STATE = 'AFTER_ATTRIBUTE_VALUE_QUOTED_STATE';
var SELF_CLOSING_START_TAG_STATE = 'SELF_CLOSING_START_TAG_STATE';
var BOGUS_COMMENT_STATE = 'BOGUS_COMMENT_STATE';
var MARKUP_DECLARATION_OPEN_STATE = 'MARKUP_DECLARATION_OPEN_STATE';
var COMMENT_START_STATE = 'COMMENT_START_STATE';
var COMMENT_START_DASH_STATE = 'COMMENT_START_DASH_STATE';
var COMMENT_STATE = 'COMMENT_STATE';
var COMMENT_END_DASH_STATE = 'COMMENT_END_DASH_STATE';
var COMMENT_END_STATE = 'COMMENT_END_STATE';
var COMMENT_END_BANG_STATE = 'COMMENT_END_BANG_STATE';
var DOCTYPE_STATE = 'DOCTYPE_STATE';
var BEFORE_DOCTYPE_NAME_STATE = 'BEFORE_DOCTYPE_NAME_STATE';
var DOCTYPE_NAME_STATE = 'DOCTYPE_NAME_STATE';
var AFTER_DOCTYPE_NAME_STATE = 'AFTER_DOCTYPE_NAME_STATE';
var AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE = 'AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE';
var BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE';
var DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE';
var DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE';
var AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE = 'AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE';
var BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE = 'BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE';
var AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE = 'AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE';
var BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
var DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE';
var DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE = 'DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE';
var AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE = 'AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE';
var BOGUS_DOCTYPE_STATE = 'BOGUS_DOCTYPE_STATE';
var CDATA_SECTION_STATE = 'CDATA_SECTION_STATE';
90
91//Utils
92
//OPTIMIZATION: these utility functions should not be moved out of this module. V8 Crankshaft will not inline
//these functions if they are located in another module, due to the context switch.
//Always perform an inlining check before modifying these functions ('node --trace-inlining').
//Returns true if the code point is SPACE, LINE FEED, TABULATION or FORM FEED.
function isWhitespace(cp) {
    switch (cp) {
        case $.SPACE:
        case $.LINE_FEED:
        case $.TABULATION:
        case $.FORM_FEED:
            return true;

        default:
            return false;
    }
}
99
//Returns true if the code point lies in the '0'..'9' range (both ends included).
function isAsciiDigit(cp) {
    return $.DIGIT_0 <= cp && $.DIGIT_9 >= cp;
}
103
//Returns true if the code point lies in the 'A'..'Z' range (both ends included).
function isAsciiUpper(cp) {
    return $.LATIN_CAPITAL_A <= cp && $.LATIN_CAPITAL_Z >= cp;
}
107
//Returns true if the code point lies in the 'a'..'z' range (both ends included).
function isAsciiLower(cp) {
    return $.LATIN_SMALL_A <= cp && $.LATIN_SMALL_Z >= cp;
}
111
//Returns true if the code point is an ASCII digit, an ASCII upper case letter or an ASCII lower case letter.
function isAsciiAlphaNumeric(cp) {
    if (isAsciiDigit(cp))
        return true;

    if (isAsciiUpper(cp))
        return true;

    return isAsciiLower(cp);
}
115
//Returns a truthy value if the code point is a digit of the requested radix: '0'..'9' always qualify,
//'A'..'F' and 'a'..'f' qualify only when isHex is truthy.
function isDigit(cp, isHex) {
    var isUpperHexLetter = cp >= $.LATIN_CAPITAL_A && cp <= $.LATIN_CAPITAL_F,
        isLowerHexLetter = cp >= $.LATIN_SMALL_A && cp <= $.LATIN_SMALL_F,
        isAllowedHexLetter = isHex && (isUpperHexLetter || isLowerHexLetter);

    return isAsciiDigit(cp) || isAllowedHexLetter;
}
120
//Returns true if the code point falls into the 0xD800..0xDFFF range or is greater than 0x10FFFF.
function isReservedCodePoint(cp) {
    var inSurrogateRange = cp >= 0xD800 && cp <= 0xDFFF;

    return inSurrogateRange || cp > 0x10FFFF;
}
124
//Converts an ASCII upper case letter code point ('A'..'Z') to its lower case counterpart.
//NOTE: any other code point is returned untouched. Blindly adding 0x20 would turn e.g. '-' into 'M',
//which could produce false positives in case-insensitive pattern matching.
function toAsciiLowerCodePoint(cp) {
    return isAsciiUpper(cp) ? cp + 0x0020 : cp;
}
128
129//NOTE: String.fromCharCode() function can handle only characters from BMP subset.
130//So, we need to workaround this manually.
131//(see: https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/fromCharCode#Getting_it_to_work_with_higher_values)
//Converts a code point to a string: a single UTF-16 code unit for values up to 0xFFFF,
//a high/low surrogate pair for anything above.
function toChar(cp) {
    if (cp <= 0xFFFF)
        return String.fromCharCode(cp);

    var offset = cp - 0x10000,
        highSurrogate = offset >>> 10 & 0x3FF | 0xD800,
        lowSurrogate = 0xDC00 | offset & 0x3FF;

    return String.fromCharCode(highSurrogate, lowSurrogate);
}
139
//Returns the one-character string for the lower-cased version of the given code point.
function toAsciiLowerChar(cp) {
    var lowerCp = toAsciiLowerCodePoint(cp);

    return String.fromCharCode(lowerCp);
}
143
144//Tokenizer
//Creates a tokenizer for the given input.
//  html    - input, handed over to the Preprocessor as is.
//  options - optional settings object:
//      decodeHtmlEntities - if provided and falsy, character references are not decoded;
//      locationInfo       - if truthy, LocationInfoMixin is assigned to this instance.
var Tokenizer = module.exports = function (html, options) {
    //NOTE: character references are decoded by default.
    this.disableEntitiesDecoding = false;

    this.preprocessor = new Preprocessor(html);

    //Tokens that are ready to be returned by getNextToken().
    this.tokenQueue = [];

    this.allowCDATA = false;

    //Name of the state handler which will receive the next consumed code point.
    this.state = DATA_STATE;
    this.returnState = '';

    //Incremented by _consume() and decremented by _unconsume().
    this.consumptionPos = 0;

    this.tempBuff = [];
    this.additionalAllowedCp = void 0;
    this.lastStartTagName = '';

    this.currentCharacterToken = null;
    this.currentToken = null;
    this.currentAttr = null;

    if (options) {
        //NOTE: only an explicitly provided 'decodeHtmlEntities' value can change the default.
        //An options object that just omits the flag (e.g. {locationInfo: true}) must behave
        //the same way as no options at all, i.e. keep the decoding enabled.
        if (options.decodeHtmlEntities !== void 0)
            this.disableEntitiesDecoding = !options.decodeHtmlEntities;

        if (options.locationInfo)
            LocationInfoMixin.assign(this);
    }
};
174
175//Token types
//NOTE: the value of every token type constant is a string equal to the constant's own name.
[
    'CHARACTER_TOKEN',
    'NULL_CHARACTER_TOKEN',
    'WHITESPACE_CHARACTER_TOKEN',
    'START_TAG_TOKEN',
    'END_TAG_TOKEN',
    'COMMENT_TOKEN',
    'DOCTYPE_TOKEN',
    'EOF_TOKEN'
].forEach(function (tokenType) {
    Tokenizer[tokenType] = tokenType;
});
184
185//Tokenizer initial states for different modes
//NOTE: the very same object is exposed both on the prototype and as a static property.
Tokenizer.prototype.MODE = {
    DATA: DATA_STATE,
    RCDATA: RCDATA_STATE,
    RAWTEXT: RAWTEXT_STATE,
    SCRIPT_DATA: SCRIPT_DATA_STATE,
    PLAINTEXT: PLAINTEXT_STATE
};

Tokenizer.MODE = Tokenizer.prototype.MODE;
193
194//Static
//Returns the value of the token's attribute with the given name, or null if there is no such attribute.
Tokenizer.getTokenAttr = function (token, attrName) {
    var attrs = token.attrs,
        idx = attrs.length;

    //NOTE: the list is scanned backwards, so the last attribute with a matching name is the one reported.
    while (idx-- > 0) {
        var attr = attrs[idx];

        if (attr.name === attrName)
            return attr.value;
    }

    return null;
};
203
204//Get token
//Runs the state machine until at least one token is queued, then returns the oldest queued token.
Tokenizer.prototype.getNextToken = function () {
    for (;;) {
        if (this.tokenQueue.length)
            return this.tokenQueue.shift();

        //NOTE: feed the next code point to the handler of the current state.
        this[this.state](this._consume());
    }
};
211
212//Consumption
//Advances the preprocessor by one code point and returns that code point.
Tokenizer.prototype._consume = function () {
    this.consumptionPos += 1;

    var cp = this.preprocessor.advanceAndPeekCodePoint();

    return cp;
};
217
//Undoes a single _consume() call: steps the position counter back and retreats the preprocessor.
Tokenizer.prototype._unconsume = function () {
    this.consumptionPos -= 1;
    this.preprocessor.retreat();
};
222
//Undoes the given number of _consume() calls.
//NOTE: the loop is bounded by an explicit counter, so a negative or fractional 'count' is a no-op
//(or is rounded up) instead of looping forever, as a 'while (count--)' loop would.
Tokenizer.prototype._unconsumeSeveral = function (count) {
    for (var i = 0; i < count; i++)
        this._unconsume();
};
227
//Switches to the given state and gives the current code point back,
//so the handler of the new state will receive it again.
Tokenizer.prototype._reconsumeInState = function (state) {
    this.state = state;

    this._unconsume();
};
232
//Tries to match the input against 'pattern' (a sequence of code points), starting with the already
//consumed 'startCp'. On success the matched code points stay consumed and true is returned.
//On failure everything consumed by this method is unconsumed and false is returned.
//If 'caseSensitive' is falsy, an input code point also matches the lower-cased pattern code point.
Tokenizer.prototype._consumeSubsequentIfMatch = function (pattern, startCp, caseSensitive) {
    var startPos = this.consumptionPos,
        cp = startCp;

    for (var i = 0; i < pattern.length; i++) {
        //NOTE: the first pattern code point is compared with startCp, which is consumed already.
        if (i > 0)
            cp = this._consume();

        var expectedCp = pattern[i],
            isSameCp = cp === expectedCp,
            isSameIgnoringCase = !caseSensitive && cp === toAsciiLowerCodePoint(expectedCp);

        if (cp === $.EOF || !(isSameCp || isSameIgnoringCase)) {
            //NOTE: roll back to the position we had on entry.
            this._unconsumeSeveral(this.consumptionPos - startPos);
            return false;
        }
    }

    return true;
};
263
264//Lookahead
//Returns the next code point without consuming it: the preprocessor is advanced and immediately retreated.
//NOTE: consumptionPos is not touched here.
Tokenizer.prototype._lookahead = function () {
    var upcomingCp = this.preprocessor.advanceAndPeekCodePoint();

    this.preprocessor.retreat();

    return upcomingCp;
};
271
272//Temp buffer
//Returns true if the temp buffer holds exactly the code points of $$.SCRIPT_STRING.
Tokenizer.prototype.isTempBufferEqualToScriptString = function () {
    var expected = $$.SCRIPT_STRING,
        actual = this.tempBuff,
        idx = actual.length;

    if (idx !== expected.length)
        return false;

    while (idx-- > 0) {
        if (actual[idx] !== expected[idx])
            return false;
    }

    return true;
};
284
285//Token creation
//Builds a fresh start tag token: not self-closing, without attributes.
Tokenizer.prototype.buildStartTagToken = function (tagName) {
    var token = {
        type: Tokenizer.START_TAG_TOKEN,
        tagName: tagName,
        selfClosing: false,
        attrs: []
    };

    return token;
};
294
//Builds a fresh end tag token: not ignored, without attributes.
Tokenizer.prototype.buildEndTagToken = function (tagName) {
    var token = {
        type: Tokenizer.END_TAG_TOKEN,
        tagName: tagName,
        ignored: false,
        attrs: []
    };

    return token;
};
303
//Makes a new start tag token the current token. The tag name starts with the given character(s).
Tokenizer.prototype._createStartTagToken = function (tagNameFirstCh) {
    var token = this.buildStartTagToken(tagNameFirstCh);

    this.currentToken = token;
};
307
//Makes a new end tag token the current token. The tag name starts with the given character(s).
Tokenizer.prototype._createEndTagToken = function (tagNameFirstCh) {
    var token = this.buildEndTagToken(tagNameFirstCh);

    this.currentToken = token;
};
311
//Makes a new comment token with empty data the current token.
Tokenizer.prototype._createCommentToken = function () {
    var token = {type: Tokenizer.COMMENT_TOKEN, data: ''};

    this.currentToken = token;
};
318
//Makes a new DOCTYPE token the current token. The name falls back to an empty string
//if no (or a falsy) first character is given; both identifiers start out as null.
Tokenizer.prototype._createDoctypeToken = function (doctypeNameFirstCh) {
    var token = {
        type: Tokenizer.DOCTYPE_TOKEN,
        name: doctypeNameFirstCh || '',
        forceQuirks: false,
        publicId: null,
        systemId: null
    };

    this.currentToken = token;
};
328
//Makes a new character token of the given type, seeded with 'ch', the current character token.
Tokenizer.prototype._createCharacterToken = function (type, ch) {
    var token = {type: type, chars: ch};

    this.currentCharacterToken = token;
};
335
336//Tag attributes
//Makes a new attribute with an empty value the current attribute. Its name starts with the given character(s).
Tokenizer.prototype._createAttr = function (attrNameFirstCh) {
    var attr = {name: attrNameFirstCh, value: ''};

    this.currentAttr = attr;
};
343
//Returns true if the current token already has an attribute named like the current attribute.
Tokenizer.prototype._isDuplicateAttr = function () {
    var existingValue = Tokenizer.getTokenAttr(this.currentToken, this.currentAttr.name);

    //NOTE: getTokenAttr() reports a missing attribute with null; an empty string value still counts as present.
    return existingValue !== null;
};
347
//Switches to the given state and attaches the current attribute to the current token,
//unless the token already has an attribute with that name.
Tokenizer.prototype._leaveAttrName = function (toState) {
    this.state = toState;

    if (this._isDuplicateAttr())
        return;

    this.currentToken.attrs.push(this.currentAttr);
};
354
355//Appropriate end tag token
356//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#appropriate-end-tag-token)
//Returns true if the tag name of the current token equals the tag name of the last emitted start tag.
Tokenizer.prototype._isAppropriateEndTagToken = function () {
    var currentTagName = this.currentToken.tagName;

    return currentTagName === this.lastStartTagName;
};
360
361//Token emission
//Queues the current token (preceded by the pending character token, if any) and resets it.
Tokenizer.prototype._emitCurrentToken = function () {
    //NOTE: pending characters precede the token in the input, so they must be queued first.
    this._emitCurrentCharacterToken();

    var token = this.currentToken;

    //NOTE: store the emitted start tag's tagName to determine whether the following end tag token is appropriate.
    if (token.type === Tokenizer.START_TAG_TOKEN)
        this.lastStartTagName = token.tagName;

    this.tokenQueue.push(token);
    this.currentToken = null;
};
372
//Queues the pending character token and resets it. Does nothing if there is no pending character token.
Tokenizer.prototype._emitCurrentCharacterToken = function () {
    if (!this.currentCharacterToken)
        return;

    this.tokenQueue.push(this.currentCharacterToken);
    this.currentCharacterToken = null;
};
379
//Queues the pending character token (if any) followed by an EOF token.
Tokenizer.prototype._emitEOFToken = function () {
    var eofToken = {type: Tokenizer.EOF_TOKEN};

    this._emitCurrentCharacterToken();
    this.tokenQueue.push(eofToken);
};
384
385//Characters emission
386
//OPTIMIZATION: the specification uses only one type of character token (one token per character).
//This causes a huge memory overhead and a lot of unnecessary parser loops. parse5 uses 3 groups of characters.
//If we have a sequence of characters that belong to the same group, the parser can process it
//as a single solid character token.
//So, there are 3 types of character tokens in parse5:
//1)NULL_CHARACTER_TOKEN - \u0000-character sequences (e.g. '\u0000\u0000\u0000')
//2)WHITESPACE_CHARACTER_TOKEN - any whitespace/new-line character sequences (e.g. '\n  \r\t   \f')
//3)CHARACTER_TOKEN - any character sequence that doesn't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^')
//Appends 'ch' to the pending character token if it has the same type. Otherwise the pending token
//(if any) is queued and a new character token of the given type is started with 'ch'.
Tokenizer.prototype._appendCharToCurrentCharacterToken = function (type, ch) {
    //NOTE: a pending token of another type can't take this character, so flush it first.
    if (this.currentCharacterToken && this.currentCharacterToken.type !== type)
        this._emitCurrentCharacterToken();

    if (!this.currentCharacterToken) {
        this._createCharacterToken(type, ch);
        return;
    }

    this.currentCharacterToken.chars += ch;
};
405
//Appends the code point to the character token of the matching group:
//whitespace, null character or regular character.
Tokenizer.prototype._emitCodePoint = function (cp) {
    var type = isWhitespace(cp) ? Tokenizer.WHITESPACE_CHARACTER_TOKEN :
               (cp === $.NULL ? Tokenizer.NULL_CHARACTER_TOKEN : Tokenizer.CHARACTER_TOKEN);

    this._appendCharToCurrentCharacterToken(type, toChar(cp));
};
417
//Emits the given code points one by one, in order.
Tokenizer.prototype._emitSeveralCodePoints = function (codePoints) {
    var idx = 0;

    while (idx < codePoints.length) {
        this._emitCodePoint(codePoints[idx]);
        idx++;
    }
};
422
//NOTE: used when we emit a character explicitly. Such a character is always a non-whitespace and
//a non-null one, so the group checks performed by _emitCodePoint() can be skipped here.
Tokenizer.prototype._emitChar = function (ch) {
    var type = Tokenizer.CHARACTER_TOKEN;

    this._appendCharToCurrentCharacterToken(type, ch);
};
428
429//Character reference tokenization
//Consumes the digits of a numeric character reference (plus the terminating semicolon, if present)
//and returns the code point it stands for.
//NOTE: _consumeCharacterReference() calls this method only after it has seen a digit ahead,
//so at least one code point is always consumed here.
Tokenizer.prototype._consumeNumericEntity = function (isHex) {
    var digits = '';

    for (;;) {
        digits += toChar(this._consume());

        var aheadCp = this._lookahead();

        if (aheadCp === $.EOF || !isDigit(aheadCp, isHex))
            break;
    }

    //NOTE: the semicolon is optional, but it's a part of the reference if it's there.
    if (this._lookahead() === $.SEMICOLON)
        this._consume();

    var radix = isHex ? 16 : 10,
        referencedCp = parseInt(digits, radix),
        replacement = NUMERIC_ENTITY_REPLACEMENTS[referencedCp];

    if (replacement)
        return replacement;

    return isReservedCodePoint(referencedCp) ? $.REPLACEMENT_CHARACTER : referencedCp;
};
453
//Tries to consume a named character reference which starts with the already consumed 'startCp'.
//Returns the referenced code points, or null if there is no match. In the latter case everything,
//including startCp, is unconsumed. 'inAttr' enables the legacy rules for attribute values.
Tokenizer.prototype._consumeNamedEntity = function (startCp, inAttr) {
    var referencedCodePoints = null,
        entityCodePointsCount = 0,
        semicolonTerminated = false,
        consumedCount = 1,
        cp = startCp,
        leaf = NAMED_ENTITY_TRIE[cp];

    //NOTE: walk down the trie for as long as the input keeps matching.
    while (leaf && cp !== $.EOF) {
        if (leaf.c) {
            //NOTE: we have at least one named reference match. But we don't stop the lookup at this point,
            //because longer matches can still be found (e.g. '&not' and '&notin;'), except the case
            //when the found match is terminated by a semicolon.
            referencedCodePoints = leaf.c;
            entityCodePointsCount = consumedCount;

            if (cp === $.SEMICOLON) {
                semicolonTerminated = true;
                break;
            }
        }

        cp = this._consume();
        consumedCount++;
        leaf = leaf.l && leaf.l[cp];
    }

    //NOTE: nothing matched, give back everything we've taken (including startCp).
    if (!referencedCodePoints) {
        this._unconsumeSeveral(consumedCount);
        return null;
    }

    if (semicolonTerminated)
        return referencedCodePoints;

    //NOTE: unconsume excess (e.g. 'it' in '&notit')
    this._unconsumeSeveral(consumedCount - entityCodePointsCount);

    //NOTE: If the character reference is being consumed as part of an attribute and the next character
    //is either a U+003D EQUALS SIGN character (=) or an alphanumeric ASCII character, then, for historical
    //reasons, all the characters that were matched after the U+0026 AMPERSAND character (&) must be
    //unconsumed, and nothing is returned.
    //However, if this next character is in fact a U+003D EQUALS SIGN character (=), then this is a
    //parse error, because some legacy user agents will misinterpret the markup in those cases.
    //(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tokenizing-character-references)
    if (inAttr) {
        var nextCp = this._lookahead();

        if (nextCp === $.EQUALS_SIGN || isAsciiAlphaNumeric(nextCp)) {
            this._unconsumeSeveral(entityCodePointsCount);
            return null;
        }
    }

    return referencedCodePoints;
};
506
//Tries to consume a character reference. 'startCp' is the already consumed code point which follows
//the ampersand. Returns an array of the referenced code points, or null if this is not a character
//reference. In the latter case everything consumed, including startCp, is given back.
Tokenizer.prototype._consumeCharacterReference = function (startCp, inAttr) {
    var isNotReference = this.disableEntitiesDecoding || isWhitespace(startCp) ||
                         startCp === $.GREATER_THAN_SIGN || startCp === $.AMPERSAND ||
                         startCp === this.additionalAllowedCp || startCp === $.EOF;

    if (isNotReference) {
        //NOTE: not a character reference. No characters are consumed, and nothing is returned.
        this._unconsume();
        return null;
    }

    if (startCp !== $.NUMBER_SIGN)
        return this._consumeNamedEntity(startCp, inAttr);

    //NOTE: we have a numeric entity candidate, now we should determine if it's hex or decimal
    var isHex = false,
        nextCp = this._lookahead();

    if (nextCp === $.LATIN_SMALL_X || nextCp === $.LATIN_CAPITAL_X) {
        this._consume();
        isHex = true;
    }

    nextCp = this._lookahead();

    //NOTE: if we have at least one digit this is a numeric entity for sure, so we consume it
    if (nextCp !== $.EOF && isDigit(nextCp, isHex))
        return [this._consumeNumericEntity(isHex)];

    //NOTE: otherwise this is a bogus number entity and a parse error. Unconsume the number sign
    //and the 'x'-character if appropriate.
    this._unconsumeSeveral(isHex ? 2 : 1);

    return null;
};
542
543//State machine
//NOTE: short alias for the prototype. The state handlers are attached to it, keyed by the state name,
//so getNextToken() can dispatch with 'this[this.state](cp)'.
var _ = Tokenizer.prototype;
545
546//12.2.4.1 Data state
547//------------------------------------------------------------------
//Handles a code point in the data state: '&' and '<' switch the state, EOF emits the EOF token,
//everything else is emitted as a character.
_[DATA_STATE] = function dataState(cp) {
    switch (cp) {
        case $.AMPERSAND:
            this.state = CHARACTER_REFERENCE_IN_DATA_STATE;
            break;

        case $.LESS_THAN_SIGN:
            this.state = TAG_OPEN_STATE;
            break;

        case $.EOF:
            this._emitEOFToken();
            break;

        default:
            //NOTE: U+0000 NULL is not replaced in this state, it's emitted like any other code point.
            this._emitCodePoint(cp);
    }
};
564
565
566//12.2.4.2 Character reference in data state
567//------------------------------------------------------------------
//Returns to the data state and emits either the code points of the consumed character reference,
//or a plain '&' if there was no valid reference.
_[CHARACTER_REFERENCE_IN_DATA_STATE] = function characterReferenceInDataState(cp) {
    this.state = DATA_STATE;
    this.additionalAllowedCp = void 0;

    var referencedCodePoints = this._consumeCharacterReference(cp, false);

    if (!referencedCodePoints) {
        this._emitChar('&');
        return;
    }

    this._emitSeveralCodePoints(referencedCodePoints);
};
579
580
581//12.2.4.3 RCDATA state
582//------------------------------------------------------------------
//Handles a code point in the RCDATA state: '&' and '<' switch the state, U+0000 NULL is replaced
//with the replacement character, EOF emits the EOF token, everything else is emitted as a character.
_[RCDATA_STATE] = function rcdataState(cp) {
    switch (cp) {
        case $.AMPERSAND:
            this.state = CHARACTER_REFERENCE_IN_RCDATA_STATE;
            break;

        case $.LESS_THAN_SIGN:
            this.state = RCDATA_LESS_THAN_SIGN_STATE;
            break;

        case $.NULL:
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._emitEOFToken();
            break;

        default:
            this._emitCodePoint(cp);
    }
};
599
600
601//12.2.4.4 Character reference in RCDATA state
602//------------------------------------------------------------------
//Returns to the RCDATA state and emits either the code points of the consumed character reference,
//or a plain '&' if there was no valid reference.
_[CHARACTER_REFERENCE_IN_RCDATA_STATE] = function characterReferenceInRcdataState(cp) {
    this.state = RCDATA_STATE;
    this.additionalAllowedCp = void 0;

    var referencedCodePoints = this._consumeCharacterReference(cp, false);

    if (!referencedCodePoints) {
        this._emitChar('&');
        return;
    }

    this._emitSeveralCodePoints(referencedCodePoints);
};
614
615
616//12.2.4.5 RAWTEXT state
617//------------------------------------------------------------------
//Handles a code point in the RAWTEXT state: '<' switches the state, U+0000 NULL is replaced with
//the replacement character, EOF emits the EOF token, everything else is emitted as a character.
_[RAWTEXT_STATE] = function rawtextState(cp) {
    if (cp === $.LESS_THAN_SIGN) {
        this.state = RAWTEXT_LESS_THAN_SIGN_STATE;
        return;
    }

    if (cp === $.NULL) {
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
        return;
    }

    if (cp === $.EOF) {
        this._emitEOFToken();
        return;
    }

    this._emitCodePoint(cp);
};
631
632
633//12.2.4.6 Script data state
634//------------------------------------------------------------------
//Handles a code point in the script data state: '<' switches the state, U+0000 NULL is replaced with
//the replacement character, EOF emits the EOF token, everything else is emitted as a character.
_[SCRIPT_DATA_STATE] = function scriptDataState(cp) {
    switch (cp) {
        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_LESS_THAN_SIGN_STATE;
            break;

        case $.NULL:
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._emitEOFToken();
            break;

        default:
            this._emitCodePoint(cp);
    }
};
648
649
650//12.2.4.7 PLAINTEXT state
651//------------------------------------------------------------------
//Handles a code point in the PLAINTEXT state: U+0000 NULL is replaced with the replacement character,
//EOF emits the EOF token, everything else is emitted as a character. This handler never changes the state.
_[PLAINTEXT_STATE] = function plaintextState(cp) {
    if (cp === $.NULL) {
        this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
        return;
    }

    if (cp === $.EOF) {
        this._emitEOFToken();
        return;
    }

    this._emitCodePoint(cp);
};
662
663
664//12.2.4.8 Tag open state
665//------------------------------------------------------------------
//Handles the code point which follows '<' in the data state.
_[TAG_OPEN_STATE] = function tagOpenState(cp) {
    if (cp === $.EXCLAMATION_MARK) {
        this.state = MARKUP_DECLARATION_OPEN_STATE;
        return;
    }

    if (cp === $.SOLIDUS) {
        this.state = END_TAG_OPEN_STATE;
        return;
    }

    var isUpper = isAsciiUpper(cp);

    if (isUpper || isAsciiLower(cp)) {
        //NOTE: a letter starts the tag name, which is always stored in lower case.
        this._createStartTagToken(isUpper ? toAsciiLowerChar(cp) : toChar(cp));
        this.state = TAG_NAME_STATE;
        return;
    }

    if (cp === $.QUESTION_MARK) {
        //NOTE: call bogus comment state directly with current consumed character to avoid unnecessary reconsumption.
        this[BOGUS_COMMENT_STATE](cp);
        return;
    }

    //NOTE: not a tag after all, so '<' is just a character and the current code point belongs to the data state.
    this._emitChar('<');
    this._reconsumeInState(DATA_STATE);
};
693
694
695//12.2.4.9 End tag open state
696//------------------------------------------------------------------
//Handles the code point which follows '</' in the data state.
_[END_TAG_OPEN_STATE] = function endTagOpenState(cp) {
    var isUpper = isAsciiUpper(cp);

    if (isUpper || isAsciiLower(cp)) {
        //NOTE: a letter starts the tag name, which is always stored in lower case.
        this._createEndTagToken(isUpper ? toAsciiLowerChar(cp) : toChar(cp));
        this.state = TAG_NAME_STATE;
        return;
    }

    switch (cp) {
        case $.GREATER_THAN_SIGN:
            //NOTE: '</>' produces nothing.
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            this._emitChar('<');
            this._emitChar('/');
            break;

        default:
            //NOTE: call bogus comment state directly with current consumed character to avoid unnecessary reconsumption.
            this[BOGUS_COMMENT_STATE](cp);
    }
};
722
723
724//12.2.4.10 Tag name state
725//------------------------------------------------------------------
//Accumulates the tag name of the current tag token, until whitespace, '/', '>' or EOF is met.
_[TAG_NAME_STATE] = function tagNameState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_ATTRIBUTE_NAME_STATE;
        return;
    }

    switch (cp) {
        case $.SOLIDUS:
            this.state = SELF_CLOSING_START_TAG_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.NULL:
            this.currentToken.tagName += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            //NOTE: the unfinished tag token is not emitted here.
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            //NOTE: upper case ASCII letters are lower-cased, everything else is appended as is.
            this.currentToken.tagName += isAsciiUpper(cp) ? toAsciiLowerChar(cp) : toChar(cp);
    }
};
750
751
752//12.2.4.11 RCDATA less-than sign state
753//------------------------------------------------------------------
//Handles the code point which follows '<' in the RCDATA state.
_[RCDATA_LESS_THAN_SIGN_STATE] = function rcdataLessThanSignState(cp) {
    if (cp !== $.SOLIDUS) {
        //NOTE: '<' was just a character; the current code point belongs to the RCDATA state.
        this._emitChar('<');
        this._reconsumeInState(RCDATA_STATE);
        return;
    }

    //NOTE: a possible end tag starts here, so begin collecting its code points from scratch.
    this.tempBuff = [];
    this.state = RCDATA_END_TAG_OPEN_STATE;
};
765
766
767//12.2.4.12 RCDATA end tag open state
768//------------------------------------------------------------------
//Handles the code point which follows '</' in the RCDATA state.
_[RCDATA_END_TAG_OPEN_STATE] = function rcdataEndTagOpenState(cp) {
    var isUpper = isAsciiUpper(cp);

    if (!isUpper && !isAsciiLower(cp)) {
        //NOTE: not an end tag, so '</' is just text; the current code point belongs to the RCDATA state.
        this._emitChar('<');
        this._emitChar('/');
        this._reconsumeInState(RCDATA_STATE);
        return;
    }

    //NOTE: the tag name is stored in lower case, while the temp buffer keeps the original code point.
    this._createEndTagToken(isUpper ? toAsciiLowerChar(cp) : toChar(cp));
    this.tempBuff.push(cp);
    this.state = RCDATA_END_TAG_NAME_STATE;
};
788
789
790//12.2.4.13 RCDATA end tag name state
791//------------------------------------------------------------------
//Accumulates the name of a possible end tag in the RCDATA state. The tag is accepted only if it is
//an appropriate end tag followed by whitespace, '/' or '>'. Otherwise everything collected so far
//is emitted as text.
_[RCDATA_END_TAG_NAME_STATE] = function rcdataEndTagNameState(cp) {
    var isUpper = isAsciiUpper(cp);

    if (isUpper || isAsciiLower(cp)) {
        //NOTE: the tag name is stored in lower case, while the temp buffer keeps the original code point.
        this.currentToken.tagName += isUpper ? toAsciiLowerChar(cp) : toChar(cp);
        this.tempBuff.push(cp);
        return;
    }

    if (this._isAppropriateEndTagToken()) {
        if (isWhitespace(cp)) {
            this.state = BEFORE_ATTRIBUTE_NAME_STATE;
            return;
        }

        switch (cp) {
            case $.SOLIDUS:
                this.state = SELF_CLOSING_START_TAG_STATE;
                return;

            case $.GREATER_THAN_SIGN:
                this.state = DATA_STATE;
                this._emitCurrentToken();
                return;
        }
    }

    //NOTE: not an end tag we are waiting for, so '</' and the collected name are just text.
    this._emitChar('<');
    this._emitChar('/');
    this._emitSeveralCodePoints(this.tempBuff);
    this._reconsumeInState(RCDATA_STATE);
};
828
829
830//12.2.4.14 RAWTEXT less-than sign state
831//------------------------------------------------------------------
//Handles the code point which follows '<' in the RAWTEXT state.
_[RAWTEXT_LESS_THAN_SIGN_STATE] = function rawtextLessThanSignState(cp) {
    if (cp !== $.SOLIDUS) {
        //NOTE: '<' was just a character; the current code point belongs to the RAWTEXT state.
        this._emitChar('<');
        this._reconsumeInState(RAWTEXT_STATE);
        return;
    }

    //NOTE: a possible end tag starts here, so begin collecting its code points from scratch.
    this.tempBuff = [];
    this.state = RAWTEXT_END_TAG_OPEN_STATE;
};
843
844
845//12.2.4.15 RAWTEXT end tag open state
846//------------------------------------------------------------------
//Handles the code point which follows '</' in the RAWTEXT state.
_[RAWTEXT_END_TAG_OPEN_STATE] = function rawtextEndTagOpenState(cp) {
    var isUpper = isAsciiUpper(cp);

    if (!isUpper && !isAsciiLower(cp)) {
        //NOTE: not an end tag, so '</' is just text; the current code point belongs to the RAWTEXT state.
        this._emitChar('<');
        this._emitChar('/');
        this._reconsumeInState(RAWTEXT_STATE);
        return;
    }

    //NOTE: the tag name is stored in lower case, while the temp buffer keeps the original code point.
    this._createEndTagToken(isUpper ? toAsciiLowerChar(cp) : toChar(cp));
    this.tempBuff.push(cp);
    this.state = RAWTEXT_END_TAG_NAME_STATE;
};
866
867
868//12.2.4.16 RAWTEXT end tag name state
869//------------------------------------------------------------------
//Accumulates the name of a possible end tag in the RAWTEXT state. The tag is accepted only if it is
//an appropriate end tag followed by whitespace, '/' or '>'. Otherwise everything collected so far
//is emitted as text.
_[RAWTEXT_END_TAG_NAME_STATE] = function rawtextEndTagNameState(cp) {
    var isUpper = isAsciiUpper(cp);

    if (isUpper || isAsciiLower(cp)) {
        //NOTE: the tag name is stored in lower case, while the temp buffer keeps the original code point.
        this.currentToken.tagName += isUpper ? toAsciiLowerChar(cp) : toChar(cp);
        this.tempBuff.push(cp);
        return;
    }

    if (this._isAppropriateEndTagToken()) {
        if (isWhitespace(cp)) {
            this.state = BEFORE_ATTRIBUTE_NAME_STATE;
            return;
        }

        switch (cp) {
            case $.SOLIDUS:
                this.state = SELF_CLOSING_START_TAG_STATE;
                return;

            case $.GREATER_THAN_SIGN:
                this._emitCurrentToken();
                this.state = DATA_STATE;
                return;
        }
    }

    //NOTE: not an end tag we are waiting for, so '</' and the collected name are just text.
    this._emitChar('<');
    this._emitChar('/');
    this._emitSeveralCodePoints(this.tempBuff);
    this._reconsumeInState(RAWTEXT_STATE);
};
906
907
//12.2.4.17 Script data less-than sign state
//------------------------------------------------------------------
//'/' resets the temp buffer and moves to the end tag open state; '!' moves to the
//escape start state and emits '<!'; anything else emits '<' and is reconsumed
//in the script data state.
_[SCRIPT_DATA_LESS_THAN_SIGN_STATE] = function scriptDataLessThanSignState(cp) {
    switch (cp) {
        case $.SOLIDUS:
            this.tempBuff = [];
            this.state = SCRIPT_DATA_END_TAG_OPEN_STATE;
            break;

        case $.EXCLAMATION_MARK:
            this.state = SCRIPT_DATA_ESCAPE_START_STATE;
            this._emitChar('<');
            this._emitChar('!');
            break;

        default:
            this._emitChar('<');
            this._reconsumeInState(SCRIPT_DATA_STATE);
    }
};
927
928
//12.2.4.18 Script data end tag open state
//------------------------------------------------------------------
//An ASCII letter starts an end tag token (name lowercased, raw code point buffered).
//Anything else emits '</' as characters and is reconsumed in the script data state.
_[SCRIPT_DATA_END_TAG_OPEN_STATE] = function scriptDataEndTagOpenState(cp) {
    var isUpperLetter = isAsciiUpper(cp),
        isLetter = isUpperLetter || isAsciiLower(cp);

    if (isLetter) {
        this._createEndTagToken(isUpperLetter ? toAsciiLowerChar(cp) : toChar(cp));
        this.tempBuff.push(cp);
        this.state = SCRIPT_DATA_END_TAG_NAME_STATE;
        return;
    }

    this._emitChar('<');
    this._emitChar('/');
    this._reconsumeInState(SCRIPT_DATA_STATE);
};
950
951
//12.2.4.19 Script data end tag name state
//------------------------------------------------------------------
//Accumulates ASCII letters into the end tag name (lowercased) and into the temp buffer.
//A non-letter completes the tag only when the token is an appropriate end tag and the
//code point is whitespace, '/' or '>'; otherwise '</' plus the buffer are emitted as
//text and the code point is reconsumed in the script data state.
_[SCRIPT_DATA_END_TAG_NAME_STATE] = function scriptDataEndTagNameState(cp) {
    var upper = isAsciiUpper(cp);

    if (upper || isAsciiLower(cp)) {
        this.currentToken.tagName += upper ? toAsciiLowerChar(cp) : toChar(cp);
        this.tempBuff.push(cp);
        return;
    }

    if (this._isAppropriateEndTagToken()) {
        if (isWhitespace(cp)) {
            this.state = BEFORE_ATTRIBUTE_NAME_STATE;
            return;
        }

        switch (cp) {
            case $.SOLIDUS:
                this.state = SELF_CLOSING_START_TAG_STATE;
                return;

            case $.GREATER_THAN_SIGN:
                this._emitCurrentToken();
                this.state = DATA_STATE;
                return;
        }
    }

    //Fall back to treating the consumed input as script text.
    this._emitChar('<');
    this._emitChar('/');
    this._emitSeveralCodePoints(this.tempBuff);
    this._reconsumeInState(SCRIPT_DATA_STATE);
};
990
991
//12.2.4.20 Script data escape start state
//------------------------------------------------------------------
//A '-' moves to the escape start dash state and is emitted; anything else is
//reconsumed in the script data state.
_[SCRIPT_DATA_ESCAPE_START_STATE] = function scriptDataEscapeStartState(cp) {
    if (cp !== $.HYPHEN_MINUS) {
        this._reconsumeInState(SCRIPT_DATA_STATE);
        return;
    }

    this.state = SCRIPT_DATA_ESCAPE_START_DASH_STATE;
    this._emitChar('-');
};
1003
1004
//12.2.4.21 Script data escape start dash state
//------------------------------------------------------------------
//A second '-' moves to the escaped dash dash state and is emitted; anything else is
//reconsumed in the script data state.
_[SCRIPT_DATA_ESCAPE_START_DASH_STATE] = function scriptDataEscapeStartDashState(cp) {
    if (cp !== $.HYPHEN_MINUS) {
        this._reconsumeInState(SCRIPT_DATA_STATE);
        return;
    }

    this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
    this._emitChar('-');
};
1016
1017
//12.2.4.22 Script data escaped state
//------------------------------------------------------------------
//Emits code points as characters. '-' and '<' switch to their sub-states, NULL is
//emitted as the replacement character, and EOF is reconsumed in the data state.
_[SCRIPT_DATA_ESCAPED_STATE] = function scriptDataEscapedState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = SCRIPT_DATA_ESCAPED_DASH_STATE;
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
            break;

        case $.NULL:
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this._emitCodePoint(cp);
    }
};
1038
1039
//12.2.4.23 Script data escaped dash state
//------------------------------------------------------------------
//Handles the code point after a single '-' in escaped script data. Another '-' goes
//to the dash dash state, '<' to the less-than sign state, EOF back to the data state;
//everything else returns to the escaped state after being emitted.
_[SCRIPT_DATA_ESCAPED_DASH_STATE] = function scriptDataEscapedDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = SCRIPT_DATA_ESCAPED_DASH_DASH_STATE;
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
            break;

        case $.NULL:
            this.state = SCRIPT_DATA_ESCAPED_STATE;
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.state = SCRIPT_DATA_ESCAPED_STATE;
            this._emitCodePoint(cp);
    }
};
1064
1065
//12.2.4.24 Script data escaped dash dash state
//------------------------------------------------------------------
//Handles the code point after '--' in escaped script data. Further '-' are emitted
//without leaving the state, '>' is emitted and switches to the script data state,
//'<' goes to the less-than sign state, EOF is reconsumed in the data state and
//everything else is emitted and returns to the escaped state.
_[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE] = function scriptDataEscapedDashDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = SCRIPT_DATA_STATE;
            this._emitChar('>');
            break;

        case $.NULL:
            this.state = SCRIPT_DATA_ESCAPED_STATE;
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.state = SCRIPT_DATA_ESCAPED_STATE;
            this._emitCodePoint(cp);
    }
};
1093
1094
//12.2.4.25 Script data escaped less-than sign state
//------------------------------------------------------------------
//'/' resets the temp buffer and moves to the escaped end tag open state. An ASCII
//letter seeds the temp buffer with its lowercase code point, moves to the double
//escape start state and emits '<' plus the letter as written. Anything else emits
//'<' and is reconsumed in the escaped state.
_[SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN_STATE] = function scriptDataEscapedLessThanSignState(cp) {
    if (cp === $.SOLIDUS) {
        this.tempBuff = [];
        this.state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE;
        return;
    }

    var upper = isAsciiUpper(cp);

    if (!upper && !isAsciiLower(cp)) {
        this._emitChar('<');
        this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
        return;
    }

    this.tempBuff = [];
    this.tempBuff.push(upper ? toAsciiLowerCodePoint(cp) : cp);
    this.state = SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE;
    this._emitChar('<');
    this._emitCodePoint(cp);
};
1124
1125
//12.2.4.26 Script data escaped end tag open state
//------------------------------------------------------------------
//An ASCII letter starts an end tag token (name lowercased, raw code point buffered).
//Anything else emits '</' as characters and is reconsumed in the escaped state.
_[SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE] = function scriptDataEscapedEndTagOpenState(cp) {
    var upperLetter = isAsciiUpper(cp);

    if (upperLetter || isAsciiLower(cp)) {
        this._createEndTagToken(upperLetter ? toAsciiLowerChar(cp) : toChar(cp));
        this.tempBuff.push(cp);
        this.state = SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE;
    }

    else {
        this._emitChar('<');
        this._emitChar('/');
        this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
    }
};
1147
1148
//12.2.4.27 Script data escaped end tag name state
//------------------------------------------------------------------
//Accumulates ASCII letters into the end tag name (lowercased) and into the temp buffer.
//A non-letter completes the tag only when the token is an appropriate end tag and the
//code point is whitespace, '/' or '>'; otherwise '</' plus the buffer are emitted as
//text and the code point is reconsumed in the escaped state.
_[SCRIPT_DATA_ESCAPED_END_TAG_NAME_STATE] = function scriptDataEscapedEndTagNameState(cp) {
    var upper = isAsciiUpper(cp);

    if (upper || isAsciiLower(cp)) {
        this.currentToken.tagName += upper ? toAsciiLowerChar(cp) : toChar(cp);
        this.tempBuff.push(cp);
        return;
    }

    if (this._isAppropriateEndTagToken()) {
        if (isWhitespace(cp)) {
            this.state = BEFORE_ATTRIBUTE_NAME_STATE;
            return;
        }

        switch (cp) {
            case $.SOLIDUS:
                this.state = SELF_CLOSING_START_TAG_STATE;
                return;

            case $.GREATER_THAN_SIGN:
                this._emitCurrentToken();
                this.state = DATA_STATE;
                return;
        }
    }

    //Fall back to treating the consumed input as escaped script text.
    this._emitChar('<');
    this._emitChar('/');
    this._emitSeveralCodePoints(this.tempBuff);
    this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
};
1187
1188
//12.2.4.28 Script data double escape start state
//------------------------------------------------------------------
//Letters are buffered (lowercased) and emitted as written. Whitespace, '/' or '>'
//ends the name: the double escaped state is entered when
//isTempBufferEqualToScriptString() is truthy, the escaped state otherwise, and the
//code point is emitted. Any other code point is reconsumed in the escaped state.
_[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE] = function scriptDataDoubleEscapeStartState(cp) {
    if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
        if (this.isTempBufferEqualToScriptString())
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
        else
            this.state = SCRIPT_DATA_ESCAPED_STATE;

        this._emitCodePoint(cp);
        return;
    }

    var upper = isAsciiUpper(cp);

    if (upper || isAsciiLower(cp)) {
        this.tempBuff.push(upper ? toAsciiLowerCodePoint(cp) : cp);
        this._emitCodePoint(cp);
        return;
    }

    this._reconsumeInState(SCRIPT_DATA_ESCAPED_STATE);
};
1210
1211
//12.2.4.29 Script data double escaped state
//------------------------------------------------------------------
//Emits code points as characters. '-' and '<' are emitted and switch to their
//sub-states, NULL is emitted as the replacement character, and EOF is reconsumed
//in the data state.
_[SCRIPT_DATA_DOUBLE_ESCAPED_STATE] = function scriptDataDoubleEscapedState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE;
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
            this._emitChar('<');
            break;

        case $.NULL:
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this._emitCodePoint(cp);
    }
};
1234
1235
//12.2.4.30 Script data double escaped dash state
//------------------------------------------------------------------
//Handles the code point after a single '-' in double escaped script data. Another '-'
//goes to the dash dash state, '<' to the less-than sign state (both emitted), EOF
//back to the data state; everything else is emitted and returns to the double
//escaped state.
_[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE] = function scriptDataDoubleEscapedDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE;
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
            this._emitChar('<');
            break;

        case $.NULL:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
            this._emitCodePoint(cp);
    }
};
1262
1263
//12.2.4.31 Script data double escaped dash dash state
//------------------------------------------------------------------
//Handles the code point after '--' in double escaped script data. Further '-' are
//emitted without leaving the state, '>' is emitted and switches to the script data
//state, '<' is emitted and goes to the less-than sign state, EOF is reconsumed in
//the data state and everything else is emitted and returns to the double escaped state.
_[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE] = function scriptDataDoubleEscapedDashDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this._emitChar('-');
            break;

        case $.LESS_THAN_SIGN:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE;
            this._emitChar('<');
            break;

        case $.GREATER_THAN_SIGN:
            this.state = SCRIPT_DATA_STATE;
            this._emitChar('>');
            break;

        case $.NULL:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
            this._emitChar(UNICODE.REPLACEMENT_CHARACTER);
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;
            this._emitCodePoint(cp);
    }
};
1293
1294
//12.2.4.32 Script data double escaped less-than sign state
//------------------------------------------------------------------
//'/' resets the temp buffer, moves to the double escape end state and is emitted;
//anything else is reconsumed in the double escaped state.
_[SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN_STATE] = function scriptDataDoubleEscapedLessThanSignState(cp) {
    if (cp !== $.SOLIDUS) {
        this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
        return;
    }

    this.tempBuff = [];
    this.state = SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE;
    this._emitChar('/');
};
1307
1308
//12.2.4.33 Script data double escape end state
//------------------------------------------------------------------
//Mirror of the double escape start state: letters are buffered (lowercased) and
//emitted as written. Whitespace, '/' or '>' ends the name: the escaped state is
//entered when isTempBufferEqualToScriptString() is truthy, the double escaped state
//otherwise, and the code point is emitted. Any other code point is reconsumed in
//the double escaped state.
_[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE] = function scriptDataDoubleEscapeEndState(cp) {
    if (isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN) {
        if (this.isTempBufferEqualToScriptString())
            this.state = SCRIPT_DATA_ESCAPED_STATE;
        else
            this.state = SCRIPT_DATA_DOUBLE_ESCAPED_STATE;

        this._emitCodePoint(cp);
        return;
    }

    var upper = isAsciiUpper(cp);

    if (upper || isAsciiLower(cp)) {
        this.tempBuff.push(upper ? toAsciiLowerCodePoint(cp) : cp);
        this._emitCodePoint(cp);
        return;
    }

    this._reconsumeInState(SCRIPT_DATA_DOUBLE_ESCAPED_STATE);
};
1331
1332
//12.2.4.34 Before attribute name state
//------------------------------------------------------------------
//Skips whitespace. '/' and '>' end the attribute list, EOF is reconsumed in the data
//state, and any other code point becomes the first character of a new attribute name
//(uppercase ASCII lowercased, NULL replaced by the replacement character).
_[BEFORE_ATTRIBUTE_NAME_STATE] = function beforeAttributeNameState(cp) {
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.SOLIDUS:
            this.state = SELF_CLOSING_START_TAG_STATE;
            return;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            return;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            return;
    }

    //NOTE: '"', "'", '<' and '=' get no special treatment here: like every other
    //remaining code point they simply start the attribute name.
    var firstNameChar;

    if (isAsciiUpper(cp))
        firstNameChar = toAsciiLowerChar(cp);

    else if (cp === $.NULL)
        firstNameChar = UNICODE.REPLACEMENT_CHARACTER;

    else
        firstNameChar = toChar(cp);

    this._createAttr(firstNameChar);
    this.state = ATTRIBUTE_NAME_STATE;
};
1370
1371
//12.2.4.35 Attribute name state
//------------------------------------------------------------------
//Whitespace, '/', '=' and '>' finish the name via _leaveAttrName() with the state to
//enter next ('>' also emits the current token). EOF is reconsumed in the data state.
//Every other code point is appended to the name (uppercase ASCII lowercased, NULL
//replaced by the replacement character).
_[ATTRIBUTE_NAME_STATE] = function attributeNameState(cp) {
    if (isWhitespace(cp)) {
        this._leaveAttrName(AFTER_ATTRIBUTE_NAME_STATE);
        return;
    }

    switch (cp) {
        case $.SOLIDUS:
            this._leaveAttrName(SELF_CLOSING_START_TAG_STATE);
            break;

        case $.EQUALS_SIGN:
            this._leaveAttrName(BEFORE_ATTRIBUTE_VALUE_STATE);
            break;

        case $.GREATER_THAN_SIGN:
            this._leaveAttrName(DATA_STATE);
            this._emitCurrentToken();
            break;

        case $.NULL:
            this.currentAttr.name += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            //Includes '"', "'" and '<', which are appended like any other character.
            this.currentAttr.name += isAsciiUpper(cp) ? toAsciiLowerChar(cp) : toChar(cp);
    }
};
1404
1405
//12.2.4.36 After attribute name state
//------------------------------------------------------------------
//Skips whitespace. '/', '=' and '>' select the next state ('>' also emits the current
//token), EOF is reconsumed in the data state, and any other code point starts a new
//attribute (uppercase ASCII lowercased, NULL replaced by the replacement character).
_[AFTER_ATTRIBUTE_NAME_STATE] = function afterAttributeNameState(cp) {
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.SOLIDUS:
            this.state = SELF_CLOSING_START_TAG_STATE;
            return;

        case $.EQUALS_SIGN:
            this.state = BEFORE_ATTRIBUTE_VALUE_STATE;
            return;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            return;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            return;
    }

    //NOTE: '"', "'" and '<' are not special-cased: they start the new attribute
    //name like any other remaining code point.
    var firstNameChar;

    if (isAsciiUpper(cp))
        firstNameChar = toAsciiLowerChar(cp);

    else if (cp === $.NULL)
        firstNameChar = UNICODE.REPLACEMENT_CHARACTER;

    else
        firstNameChar = toChar(cp);

    this._createAttr(firstNameChar);
    this.state = ATTRIBUTE_NAME_STATE;
};
1446
1447
//12.2.4.37 Before attribute value state
//------------------------------------------------------------------
//Skips whitespace, then picks the value flavour: '"' and "'" open quoted values, '&'
//is reconsumed in the unquoted state, '>' ends the tag, EOF is reconsumed in the data
//state, and any other code point becomes the first character of an unquoted value
//(NULL replaced by the replacement character).
_[BEFORE_ATTRIBUTE_VALUE_STATE] = function beforeAttributeValueState(cp) {
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.QUOTATION_MARK:
            this.state = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE;
            break;

        case $.AMPERSAND:
            this._reconsumeInState(ATTRIBUTE_VALUE_UNQUOTED_STATE);
            break;

        case $.APOSTROPHE:
            this.state = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            //Includes '<', '=' and '`', which start an unquoted value like any other character.
            this.currentAttr.value += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
            this.state = ATTRIBUTE_VALUE_UNQUOTED_STATE;
    }
};
1486
1487
//12.2.4.38 Attribute value (double-quoted) state
//------------------------------------------------------------------
//'"' closes the value. '&' records '"' as the additional allowed code point, remembers
//this state as the return state and switches to the character reference state. EOF is
//reconsumed in the data state; everything else is appended to the value (NULL as the
//replacement character).
_[ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE] = function attributeValueDoubleQuotedState(cp) {
    switch (cp) {
        case $.QUOTATION_MARK:
            this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
            break;

        case $.AMPERSAND:
            this.additionalAllowedCp = $.QUOTATION_MARK;
            this.returnState = this.state;
            this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
            break;

        case $.NULL:
            this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentAttr.value += toChar(cp);
    }
};
1509
1510
//12.2.4.39 Attribute value (single-quoted) state
//------------------------------------------------------------------
//"'" closes the value. '&' records "'" as the additional allowed code point, remembers
//this state as the return state and switches to the character reference state. EOF is
//reconsumed in the data state; everything else is appended to the value (NULL as the
//replacement character).
_[ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE] = function attributeValueSingleQuotedState(cp) {
    switch (cp) {
        case $.APOSTROPHE:
            this.state = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE;
            break;

        case $.AMPERSAND:
            this.additionalAllowedCp = $.APOSTROPHE;
            this.returnState = this.state;
            this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
            break;

        case $.NULL:
            this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentAttr.value += toChar(cp);
    }
};
1532
1533
//12.2.4.40 Attribute value (unquoted) state
//------------------------------------------------------------------
//Whitespace ends the value and '>' ends the tag. '&' records '>' as the additional
//allowed code point, remembers this state as the return state and switches to the
//character reference state. EOF is reconsumed in the data state; everything else is
//appended to the value (NULL as the replacement character).
_[ATTRIBUTE_VALUE_UNQUOTED_STATE] = function attributeValueUnquotedState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_ATTRIBUTE_NAME_STATE;
        return;
    }

    switch (cp) {
        case $.AMPERSAND:
            this.additionalAllowedCp = $.GREATER_THAN_SIGN;
            this.returnState = this.state;
            this.state = CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.NULL:
            this.currentAttr.value += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            //Includes '"', "'", '<', '=' and '`', which are appended like any other character.
            this.currentAttr.value += toChar(cp);
    }
};
1565
1566
//12.2.4.41 Character reference in attribute value state
//------------------------------------------------------------------
//Asks _consumeCharacterReference() to resolve the reference. Each returned code point
//is appended to the attribute value; a falsy result appends a literal '&' instead.
//The tokenizer then goes back to the state saved in returnState.
_[CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE_STATE] = function characterReferenceInAttributeValueState(cp) {
    var resolved = this._consumeCharacterReference(cp, true);

    if (!resolved)
        this.currentAttr.value += '&';

    else {
        var idx = 0;

        while (idx < resolved.length) {
            this.currentAttr.value += toChar(resolved[idx]);
            idx++;
        }
    }

    this.state = this.returnState;
};
1580
1581
//12.2.4.42 After attribute value (quoted) state
//------------------------------------------------------------------
//Whitespace, '/' and '>' select the next state ('>' also emits the current token).
//EOF is reconsumed in the data state; any other code point is reconsumed in the
//before attribute name state.
_[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE] = function afterAttributeValueQuotedState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_ATTRIBUTE_NAME_STATE;
        return;
    }

    switch (cp) {
        case $.SOLIDUS:
            this.state = SELF_CLOSING_START_TAG_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this._reconsumeInState(BEFORE_ATTRIBUTE_NAME_STATE);
    }
};
1602
1603
//12.2.4.43 Self-closing start tag state
//------------------------------------------------------------------
//'>' marks the current token as self-closing, switches to the data state and emits
//the token. EOF is reconsumed in the data state; anything else is reconsumed in the
//before attribute name state.
_[SELF_CLOSING_START_TAG_STATE] = function selfClosingStartTagState(cp) {
    if (cp !== $.GREATER_THAN_SIGN) {
        this._reconsumeInState(cp === $.EOF ? DATA_STATE : BEFORE_ATTRIBUTE_NAME_STATE);
        return;
    }

    this.currentToken.selfClosing = true;
    this.state = DATA_STATE;
    this._emitCurrentToken();
};
1619
1620
//12.2.4.44 Bogus comment state
//------------------------------------------------------------------
//Creates a comment token and consumes input in a single call: starting with the
//code point passed in, everything up to (not including) '>' or EOF is appended to
//the comment data, with NULL replaced by the replacement character. '>' switches to
//the data state, EOF is reconsumed there; the comment token is emitted either way.
_[BOGUS_COMMENT_STATE] = function bogusCommentState(cp) {
    this._createCommentToken();

    for (; cp !== $.GREATER_THAN_SIGN && cp !== $.EOF; cp = this._consume())
        this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);

    if (cp === $.GREATER_THAN_SIGN)
        this.state = DATA_STATE;

    else
        this._reconsumeInState(DATA_STATE);

    this._emitCurrentToken();
};
1645
1646
//12.2.4.45 Markup declaration open state
//------------------------------------------------------------------
//Tries, in order, to match the '--', DOCTYPE and (only when allowCDATA is set) CDATA
//start sequences via _consumeSubsequentIfMatch(). The first match decides the next
//state; if none matches the input is handled as a bogus comment.
_[MARKUP_DECLARATION_OPEN_STATE] = function markupDeclarationOpenState(cp) {
    if (this._consumeSubsequentIfMatch($$.DASH_DASH_STRING, cp, true)) {
        this._createCommentToken();
        this.state = COMMENT_START_STATE;
        return;
    }

    if (this._consumeSubsequentIfMatch($$.DOCTYPE_STRING, cp, false)) {
        this.state = DOCTYPE_STATE;
        return;
    }

    if (this.allowCDATA && this._consumeSubsequentIfMatch($$.CDATA_START_STRING, cp, true)) {
        this.state = CDATA_SECTION_STATE;
        return;
    }

    //NOTE: call bogus comment state directly with current consumed character to avoid unnecessary reconsumption.
    this[BOGUS_COMMENT_STATE](cp);
};
1666
1667
//12.2.4.46 Comment start state
//------------------------------------------------------------------
//'-' moves to the comment start dash state. '>' switches to the data state and emits
//the comment; EOF emits it and is reconsumed in the data state. Any other code point
//is appended to the comment data (NULL as the replacement character) and the
//tokenizer moves to the comment state.
_[COMMENT_START_STATE] = function commentStartState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = COMMENT_START_DASH_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
            this.state = COMMENT_STATE;
    }
};
1694
1695
//12.2.4.47 Comment start dash state
//------------------------------------------------------------------
//A second '-' moves to the comment end state. '>' switches to the data state and
//emits the comment; EOF emits it and is reconsumed in the data state. Otherwise the
//pending '-' and the code point (NULL as the replacement character) are appended to
//the comment data and the tokenizer moves to the comment state.
_[COMMENT_START_DASH_STATE] = function commentStartDashState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = COMMENT_END_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += '-';
            this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
            this.state = COMMENT_STATE;
    }
};
1724
1725
//12.2.4.48 Comment state
//------------------------------------------------------------------
//'-' moves to the comment end dash state. EOF emits the comment and is reconsumed in
//the data state. Everything else is appended to the comment data (NULL as the
//replacement character).
_[COMMENT_STATE] = function commentState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.state = COMMENT_END_DASH_STATE;
            break;

        case $.NULL:
            this.currentToken.data += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += toChar(cp);
    }
};
1743
1744
//12.2.4.49 Comment end dash state
//------------------------------------------------------------------
//A second '-' moves to the comment end state. EOF emits the comment and is reconsumed
//in the data state. Otherwise the pending '-' and the code point (NULL as the
//replacement character) are appended to the comment data and the tokenizer returns
//to the comment state.
_[COMMENT_END_DASH_STATE] = function commentEndDashState(cp) {
    if (cp === $.HYPHEN_MINUS) {
        this.state = COMMENT_END_STATE;
        return;
    }

    if (cp === $.EOF) {
        this._emitCurrentToken();
        this._reconsumeInState(DATA_STATE);
        return;
    }

    this.currentToken.data += '-';
    this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
    this.state = COMMENT_STATE;
};
1768
1769
//12.2.4.50 Comment end state
//------------------------------------------------------------------
//Handles the code point after '--' inside a comment. '>' switches to the data state
//and emits the comment, '!' moves to the comment end bang state, an extra '-' is
//appended to the data. Otherwise the pending '--' and the code point (NULL as the
//replacement character) are appended and the tokenizer returns to the comment state.
_[COMMENT_END_STATE] = function commentEndState(cp) {
    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EXCLAMATION_MARK:
            this.state = COMMENT_END_BANG_STATE;
            break;

        case $.HYPHEN_MINUS:
            this.currentToken.data += '-';
            break;

        case $.EOF:
            //NOTE(review): reconsume happens before emit here, the reverse of the other
            //comment states' EOF branches. Order kept as found; confirm it is intentional.
            this._reconsumeInState(DATA_STATE);
            this._emitCurrentToken();
            break;

        default:
            this.currentToken.data += '--';
            this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
            this.state = COMMENT_STATE;
    }
};
1801
1802
//12.2.4.51 Comment end bang state
//------------------------------------------------------------------
//Handles the code point after '--!' inside a comment. '>' switches to the data state
//and emits the comment; EOF emits it and is reconsumed in the data state. In every
//other case '--!' is appended to the data: '-' then moves to the comment end dash
//state, anything else is appended too (NULL as the replacement character) and the
//tokenizer returns to the comment state.
_[COMMENT_END_BANG_STATE] = function commentEndBangState(cp) {
    switch (cp) {
        case $.HYPHEN_MINUS:
            this.currentToken.data += '--!';
            this.state = COMMENT_END_DASH_STATE;
            break;

        case $.GREATER_THAN_SIGN:
            this.state = DATA_STATE;
            this._emitCurrentToken();
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.data += '--!';
            this.currentToken.data += cp === $.NULL ? UNICODE.REPLACEMENT_CHARACTER : toChar(cp);
            this.state = COMMENT_STATE;
    }
};
1833
1834
//12.2.4.52 DOCTYPE state
//------------------------------------------------------------------
//Whitespace moves to the before DOCTYPE name state and any other non-EOF code point
//is reconsumed there. EOF creates a DOCTYPE token with forceQuirks set, emits it and
//is reconsumed in the data state.
_[DOCTYPE_STATE] = function doctypeState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_DOCTYPE_NAME_STATE;
        return;
    }

    if (cp !== $.EOF) {
        this._reconsumeInState(BEFORE_DOCTYPE_NAME_STATE);
        return;
    }

    this._createDoctypeToken();
    this.currentToken.forceQuirks = true;
    this._emitCurrentToken();
    this._reconsumeInState(DATA_STATE);
};
1851
1852
//12.2.4.53 Before DOCTYPE name state
//------------------------------------------------------------------
//Skips whitespace. '>' and EOF produce a nameless DOCTYPE token with forceQuirks set,
//which is emitted before entering (or reconsuming in) the data state. Any other code
//point becomes the first character of the DOCTYPE name (uppercase ASCII lowercased,
//NULL replaced by the replacement character).
_[BEFORE_DOCTYPE_NAME_STATE] = function beforeDoctypeNameState(cp) {
    if (isWhitespace(cp))
        return;

    if (cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
        this._createDoctypeToken();
        this.currentToken.forceQuirks = true;
        this._emitCurrentToken();

        if (cp === $.EOF)
            this._reconsumeInState(DATA_STATE);

        else
            this.state = DATA_STATE;

        return;
    }

    var initialName;

    if (isAsciiUpper(cp))
        initialName = toAsciiLowerChar(cp);

    else if (cp === $.NULL)
        initialName = UNICODE.REPLACEMENT_CHARACTER;

    else
        initialName = toChar(cp);

    this._createDoctypeToken(initialName);
    this.state = DOCTYPE_NAME_STATE;
};
1888
1889
1890//12.2.4.54 DOCTYPE name state
1891//------------------------------------------------------------------
_[DOCTYPE_NAME_STATE] = function doctypeNameState(cp) {
    //Whitespace ends the name.
    if (isWhitespace(cp)) {
        this.state = AFTER_DOCTYPE_NAME_STATE;
        return;
    }

    //'>' ends the DOCTYPE: emit the token as it stands.
    if (cp === $.GREATER_THAN_SIGN) {
        this._emitCurrentToken();
        this.state = DATA_STATE;
        return;
    }

    //Upper-case ASCII letters are appended lower-cased.
    if (isAsciiUpper(cp)) {
        this.currentToken.name += toAsciiLowerChar(cp);
        return;
    }

    switch (cp) {
        case $.NULL:
            this.currentToken.name += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            //Input ended inside the name: emit a force-quirks DOCTYPE.
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.name += toChar(cp);
    }
};
1916
1917
1918//12.2.4.55 After DOCTYPE name state
1919//------------------------------------------------------------------
_[AFTER_DOCTYPE_NAME_STATE] = function afterDoctypeNameState(cp) {
    //Whitespace after the name is skipped.
    if (isWhitespace(cp))
        return;

    if (cp === $.GREATER_THAN_SIGN) {
        this.state = DATA_STATE;
        this._emitCurrentToken();
        return;
    }

    if (cp === $.EOF) {
        this.currentToken.forceQuirks = true;
        this._emitCurrentToken();
        this._reconsumeInState(DATA_STATE);
        return;
    }

    //The PUBLIC keyword is probed before SYSTEM; both probes pass false as the last argument.
    if (this._consumeSubsequentIfMatch($$.PUBLIC_STRING, cp, false)) {
        this.state = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE;
        return;
    }

    if (this._consumeSubsequentIfMatch($$.SYSTEM_STRING, cp, false)) {
        this.state = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE;
        return;
    }

    //Neither keyword matched: the rest is handled as a bogus, force-quirks DOCTYPE.
    this.currentToken.forceQuirks = true;
    this.state = BOGUS_DOCTYPE_STATE;
};
1946
1947
1948//12.2.4.56 After DOCTYPE public keyword state
1949//------------------------------------------------------------------
_[AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE] = function afterDoctypePublicKeywordState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
        return;
    }

    switch (cp) {
        //A quote directly after the keyword opens the public identifier.
        case $.QUOTATION_MARK:
            this.currentToken.publicId = '';
            this.state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.publicId = '';
            this.state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        //The DOCTYPE ends without a public identifier: emit it force-quirks.
        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};
1981
1982
1983//12.2.4.57 Before DOCTYPE public identifier state
1984//------------------------------------------------------------------
_[BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE] = function beforeDoctypePublicIdentifierState(cp) {
    //Whitespace before the identifier is skipped.
    if (isWhitespace(cp))
        return;

    //Either quote opens an (initially empty) public identifier.
    if (cp === $.QUOTATION_MARK || cp === $.APOSTROPHE) {
        this.currentToken.publicId = '';
        this.state = cp === $.QUOTATION_MARK ?
                     DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE :
                     DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE;
        return;
    }

    //Everything else sets force-quirks; only the follow-up differs.
    this.currentToken.forceQuirks = true;

    if (cp === $.GREATER_THAN_SIGN) {
        this._emitCurrentToken();
        this.state = DATA_STATE;
    }

    else if (cp === $.EOF) {
        this._emitCurrentToken();
        this._reconsumeInState(DATA_STATE);
    }

    else
        this.state = BOGUS_DOCTYPE_STATE;
};
2016
2017
2018//12.2.4.58 DOCTYPE public identifier (double-quoted) state
2019//------------------------------------------------------------------
_[DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE] = function doctypePublicIdentifierDoubleQuotedState(cp) {
    switch (cp) {
        //Closing quote: the public identifier is complete.
        case $.QUOTATION_MARK:
            this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
            break;

        case $.NULL:
            this.currentToken.publicId += UNICODE.REPLACEMENT_CHARACTER;
            break;

        //'>' inside the quotes ends the DOCTYPE early: emit it force-quirks.
        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.publicId += toChar(cp);
    }
};
2042
2043
2044//12.2.4.59 DOCTYPE public identifier (single-quoted) state
2045//------------------------------------------------------------------
_[DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE] = function doctypePublicIdentifierSingleQuotedState(cp) {
    //Closing apostrophe: the public identifier is complete.
    if (cp === $.APOSTROPHE) {
        this.state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE;
        return;
    }

    if (cp === $.NULL) {
        this.currentToken.publicId += UNICODE.REPLACEMENT_CHARACTER;
        return;
    }

    //'>' or EOF inside the quotes: emit the DOCTYPE force-quirks.
    if (cp === $.GREATER_THAN_SIGN || cp === $.EOF) {
        this.currentToken.forceQuirks = true;
        this._emitCurrentToken();

        if (cp === $.GREATER_THAN_SIGN)
            this.state = DATA_STATE;
        else
            this._reconsumeInState(DATA_STATE);

        return;
    }

    this.currentToken.publicId += toChar(cp);
};
2068
2069
2070//12.2.4.60 After DOCTYPE public identifier state
2071//------------------------------------------------------------------
_[AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE] = function afterDoctypePublicIdentifierState(cp) {
    if (isWhitespace(cp)) {
        this.state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE;
        return;
    }

    switch (cp) {
        //The DOCTYPE ends right after the public identifier: emit without touching force-quirks.
        case $.GREATER_THAN_SIGN:
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        //A quote with no separating whitespace still opens the system identifier.
        case $.QUOTATION_MARK:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};
2102
2103
2104//12.2.4.61 Between DOCTYPE public and system identifiers state
2105//------------------------------------------------------------------
_[BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS_STATE] = function betweenDoctypePublicAndSystemIdentifiersState(cp) {
    //Whitespace between the two identifiers is skipped.
    if (isWhitespace(cp))
        return;

    //'>' here ends the DOCTYPE; force-quirks is left untouched.
    if (cp === $.GREATER_THAN_SIGN) {
        this._emitCurrentToken();
        this.state = DATA_STATE;
        return;
    }

    //Either quote opens an (initially empty) system identifier.
    if (cp === $.QUOTATION_MARK) {
        this.currentToken.systemId = '';
        this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
        return;
    }

    if (cp === $.APOSTROPHE) {
        this.currentToken.systemId = '';
        this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
        return;
    }

    //EOF and every remaining code point set force-quirks.
    this.currentToken.forceQuirks = true;

    if (cp === $.EOF) {
        this._emitCurrentToken();
        this._reconsumeInState(DATA_STATE);
    }

    else
        this.state = BOGUS_DOCTYPE_STATE;
};
2137
2138
2139//12.2.4.62 After DOCTYPE system keyword state
2140//------------------------------------------------------------------
_[AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE] = function afterDoctypeSystemKeywordState(cp) {
    if (isWhitespace(cp)) {
        this.state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
        return;
    }

    //A quote directly after the keyword opens the system identifier.
    if (cp === $.QUOTATION_MARK) {
        this.currentToken.systemId = '';
        this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
        return;
    }

    if (cp === $.APOSTROPHE) {
        this.currentToken.systemId = '';
        this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
        return;
    }

    //'>' or EOF without a system identifier: emit the DOCTYPE force-quirks.
    if (cp === $.GREATER_THAN_SIGN) {
        this.currentToken.forceQuirks = true;
        this._emitCurrentToken();
        this.state = DATA_STATE;
        return;
    }

    if (cp === $.EOF) {
        this.currentToken.forceQuirks = true;
        this._emitCurrentToken();
        this._reconsumeInState(DATA_STATE);
        return;
    }

    this.currentToken.forceQuirks = true;
    this.state = BOGUS_DOCTYPE_STATE;
};
2172
2173
2174//12.2.4.63 Before DOCTYPE system identifier state
2175//------------------------------------------------------------------
_[BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE] = function beforeDoctypeSystemIdentifierState(cp) {
    //Whitespace before the identifier is skipped.
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.QUOTATION_MARK:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE;
            break;

        case $.APOSTROPHE:
            this.currentToken.systemId = '';
            this.state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE;
            break;

        //The DOCTYPE ends before any system identifier: emit it force-quirks.
        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.forceQuirks = true;
            this.state = BOGUS_DOCTYPE_STATE;
    }
};
2207
2208
2209//12.2.4.64 DOCTYPE system identifier (double-quoted) state
2210//------------------------------------------------------------------
_[DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE] = function doctypeSystemIdentifierDoubleQuotedState(cp) {
    //Closing quote: the system identifier is complete.
    if (cp === $.QUOTATION_MARK) {
        this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
        return;
    }

    //'>' inside the quotes ends the DOCTYPE early: emit it force-quirks.
    if (cp === $.GREATER_THAN_SIGN) {
        this.currentToken.forceQuirks = true;
        this._emitCurrentToken();
        this.state = DATA_STATE;
        return;
    }

    if (cp === $.NULL) {
        this.currentToken.systemId += UNICODE.REPLACEMENT_CHARACTER;
        return;
    }

    if (cp === $.EOF) {
        this.currentToken.forceQuirks = true;
        this._emitCurrentToken();
        this._reconsumeInState(DATA_STATE);
        return;
    }

    this.currentToken.systemId += toChar(cp);
};
2233
2234
2235//12.2.4.65 DOCTYPE system identifier (single-quoted) state
2236//------------------------------------------------------------------
_[DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE] = function doctypeSystemIdentifierSingleQuotedState(cp) {
    switch (cp) {
        //Closing apostrophe: the system identifier is complete.
        case $.APOSTROPHE:
            this.state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE;
            break;

        //'>' inside the quotes ends the DOCTYPE early: emit it force-quirks.
        case $.GREATER_THAN_SIGN:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.NULL:
            this.currentToken.systemId += UNICODE.REPLACEMENT_CHARACTER;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        default:
            this.currentToken.systemId += toChar(cp);
    }
};
2259
2260
2261//12.2.4.66 After DOCTYPE system identifier state
2262//------------------------------------------------------------------
_[AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE] = function afterDoctypeSystemIdentifierState(cp) {
    //Whitespace after the system identifier is skipped.
    if (isWhitespace(cp))
        return;

    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this.currentToken.forceQuirks = true;
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;

        //Unlike the other DOCTYPE states, this branch does not set force-quirks.
        default:
            this.state = BOGUS_DOCTYPE_STATE;
    }
};
2281
2282
2283//12.2.4.67 Bogus DOCTYPE state
2284//------------------------------------------------------------------
_[BOGUS_DOCTYPE_STATE] = function bogusDoctypeState(cp) {
    //Only '>' and EOF act here; every other code point is dropped with no state change.
    switch (cp) {
        case $.GREATER_THAN_SIGN:
            this._emitCurrentToken();
            this.state = DATA_STATE;
            break;

        case $.EOF:
            this._emitCurrentToken();
            this._reconsumeInState(DATA_STATE);
            break;
    }
};
2296
2297
2298//12.2.4.68 CDATA section state
2299//------------------------------------------------------------------
_[CDATA_SECTION_STATE] = function cdataSectionState(cp) {
    //Handles the whole section in one call: each code point is emitted
    //and the next one pulled via _consume() until the end marker or EOF.
    for (; cp !== $.EOF; cp = this._consume()) {
        //The end-marker probe runs before the current code point is emitted.
        if (this._consumeSubsequentIfMatch($$.CDATA_END_STRING, cp, true)) {
            this.state = DATA_STATE;
            return;
        }

        this._emitCodePoint(cp);
    }

    //EOF reached without an end marker: reconsume it in the data state.
    this._reconsumeInState(DATA_STATE);
};
2318