'use strict';

const { Transform } = require('stream');
const Tokenizer = require('parse5/lib/tokenizer');
const LocationInfoTokenizerMixin = require('parse5/lib/extensions/location-info/tokenizer-mixin');
const Mixin = require('parse5/lib/utils/mixin');
const mergeOptions = require('parse5/lib/utils/merge-options');
const DevNullStream = require('./dev-null-stream');
const ParserFeedbackSimulator = require('./parser-feedback-simulator');

const DEFAULT_OPTIONS = {
    sourceCodeLocationInfo: false
};

/**
 * Streaming SAX-style HTML parser built on the parse5 tokenizer.
 *
 * A Transform stream: write HTML chunks (strings) in, and the same chunks
 * pass through unchanged while `startTag`, `endTag`, `comment`, `doctype`
 * and `text` events are emitted for the markup encountered.
 *
 * @param {Object} [options] - Parsing options.
 * @param {boolean} [options.sourceCodeLocationInfo=false] - Attach source
 *        location info (`sourceCodeLocation`) to emitted tokens.
 */
class SAXParser extends Transform {
    constructor(options) {
        super({ encoding: 'utf8', decodeStrings: false });

        this.options = mergeOptions(DEFAULT_OPTIONS, options);

        // FIX: pass the merged options (defaults applied) to the tokenizer.
        // The original passed the raw `options` argument, which bypasses
        // DEFAULT_OPTIONS and may be undefined.
        this.tokenizer = new Tokenizer(this.options);
        this.locInfoMixin = null;

        if (this.options.sourceCodeLocationInfo) {
            this.locInfoMixin = Mixin.install(this.tokenizer, LocationInfoTokenizerMixin);
        }

        // Simulates tree-construction feedback so the tokenizer switches
        // state correctly (e.g. inside <script>/<style>/RAWTEXT elements).
        this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.tokenizer);

        // Accumulates consecutive character tokens into one `text` event.
        this.pendingText = null;

        this.lastChunkWritten = false;
        this.stopped = false;

        // NOTE: always pipe stream to the /dev/null stream to avoid
        // `highWaterMark` hit even if we don't have consumers.
        // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
        this.pipe(new DevNullStream());
    }

    //TransformStream implementation
    /**
     * Parses a chunk and passes it through unchanged.
     * @throws {TypeError} If a non-string chunk is written.
     */
    _transform(chunk, encoding, callback) {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        callback(null, this._transformChunk(chunk));
    }

    /**
     * Flush hook: marks the final chunk so the tokenizer can emit EOF.
     */
    _final(callback) {
        this.lastChunkWritten = true;
        callback(null, this._transformChunk(''));
    }

    /**
     * Stops parsing. Remaining chunks still flow through the stream,
     * but no further events are emitted.
     */
    stop() {
        this.stopped = true;
    }

    //Internals
    /**
     * Feeds a chunk to the tokenizer (unless stopped) and returns it
     * so the Transform stream passes it downstream unchanged.
     */
    _transformChunk(chunk) {
        if (!this.stopped) {
            this.tokenizer.write(chunk, this.lastChunkWritten);
            this._runParsingLoop();
        }
        return chunk;
    }

    /**
     * Pulls tokens until EOF, hibernation (more input needed), or stop().
     * Consecutive character tokens are coalesced into a single pending
     * text token; any other token first flushes the pending text.
     */
    _runParsingLoop() {
        let token = null;

        do {
            token = this.parserFeedbackSimulator.getNextToken();

            if (token.type === Tokenizer.HIBERNATION_TOKEN) {
                break;
            }

            if (
                token.type === Tokenizer.CHARACTER_TOKEN ||
                token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN ||
                token.type === Tokenizer.NULL_CHARACTER_TOKEN
            ) {
                if (this.pendingText === null) {
                    // Normalize all character-token flavors to CHARACTER_TOKEN
                    // so the merged run maps to a single `text` event.
                    token.type = Tokenizer.CHARACTER_TOKEN;
                    this.pendingText = token;
                } else {
                    this.pendingText.chars += token.chars;

                    if (this.options.sourceCodeLocationInfo) {
                        // Extend the pending run's location to the new end.
                        const { endLine, endCol, endOffset } = token.location;
                        Object.assign(this.pendingText.location, {
                            endLine,
                            endCol,
                            endOffset
                        });
                    }
                }
            } else {
                this._emitPendingText();
                this._handleToken(token);
            }
        } while (!this.stopped && token.type !== Tokenizer.EOF_TOKEN);
    }

    /**
     * Emits the stream event for a token.
     * @returns {boolean} Whether the token was "consumed" (EOF counts as
     *          consumed; tokens with no listeners are skipped and return false).
     */
    _handleToken(token) {
        if (token.type === Tokenizer.EOF_TOKEN) {
            return true;
        }

        const { eventName, reshapeToken } = TOKEN_EMISSION_HELPERS[token.type];

        // Skip the (potentially costly) reshape when nobody is listening.
        if (this.listenerCount(eventName) === 0) {
            return false;
        }

        this._emitToken(eventName, reshapeToken(token));

        return true;
    }

    // Separate method so subclasses (e.g. a rewriting stream) can intercept
    // token emission.
    _emitToken(eventName, token) {
        this.emit(eventName, token);
    }

    /** Flushes any coalesced character tokens as a single `text` event. */
    _emitPendingText() {
        if (this.pendingText !== null) {
            this._handleToken(this.pendingText);
            this.pendingText = null;
        }
    }
}

// Maps tokenizer token types to stream event names and converters that
// reshape internal tokens into the public event payloads.
const TOKEN_EMISSION_HELPERS = {
    [Tokenizer.START_TAG_TOKEN]: {
        eventName: 'startTag',
        reshapeToken: origToken => ({
            tagName: origToken.tagName,
            attrs: origToken.attrs,
            selfClosing: origToken.selfClosing,
            sourceCodeLocation: origToken.location
        })
    },
    [Tokenizer.END_TAG_TOKEN]: {
        eventName: 'endTag',
        reshapeToken: origToken => ({ tagName: origToken.tagName, sourceCodeLocation: origToken.location })
    },
    [Tokenizer.COMMENT_TOKEN]: {
        eventName: 'comment',
        reshapeToken: origToken => ({ text: origToken.data, sourceCodeLocation: origToken.location })
    },
    [Tokenizer.DOCTYPE_TOKEN]: {
        eventName: 'doctype',
        reshapeToken: origToken => ({
            name: origToken.name,
            publicId: origToken.publicId,
            systemId: origToken.systemId,
            sourceCodeLocation: origToken.location
        })
    },
    [Tokenizer.CHARACTER_TOKEN]: {
        eventName: 'text',
        reshapeToken: origToken => ({ text: origToken.chars, sourceCodeLocation: origToken.location })
    }
};

module.exports = SAXParser;