import { Transform } from 'node:stream';
import type { Tokenizer, TokenHandler, Token } from 'parse5';
import { DevNullStream } from './dev-null-stream.js';
import { ParserFeedbackSimulator } from './parser-feedback-simulator.js';

/** Options accepted by the {@link SAXParser} constructor. */
export interface SAXParserOptions {
    /**
     * Turns on source code location tracking.
     *
     * When set, every emitted token carries a `sourceCodeLocation` property.
     */
    sourceCodeLocationInfo?: boolean;
}

/**
 * Streaming, [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML parser.
 *
 * It is a [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform):
 * every chunk written to it is handed to the tokenizer and then passed
 * through unchanged, so the parser can sit in the middle of a pipe chain.
 *
 * @example
 *
 * ```js
 * const { SAXParser } = require('parse5-sax-parser');
 * const http = require('http');
 * const fs = require('fs');
 *
 * const file = fs.createWriteStream('/home/google.com.html');
 * const parser = new SAXParser();
 *
 * parser.on('text', text => {
 *    // Handle page text content
 *    ...
 * });
 *
 * http.get('http://google.com', res => {
 *    // `SAXParser` is a `Transform` stream, so data can be piped
 *    // through it: the page is analyzed and, at the same time,
 *    // saved to the file.
 *    res.pipe(parser).pipe(file);
 * });
 * ```
 */
export class SAXParser extends Transform implements TokenHandler {
    protected options: SAXParserOptions;
    protected parserFeedbackSimulator: ParserFeedbackSimulator;
    /** Text accumulated from consecutive character tokens, not yet emitted. */
    private pendingText: Text | null = null;
    /** Set once the writable side has finished; forwarded to the tokenizer. */
    private lastChunkWritten = false;
    /** When true, incoming chunks bypass the tokenizer. */
    private stopped = false;
    protected tokenizer: Tokenizer;

    /**
     * @param options Parsing options.
     */
    constructor(options: SAXParserOptions = {}) {
        super({ encoding: 'utf8', decodeStrings: false });

        // Caller-supplied values override the defaults.
        this.options = { sourceCodeLocationInfo: false, ...options };

        const simulator = new ParserFeedbackSimulator(this.options, this);
        this.parserFeedbackSimulator = simulator;
        this.tokenizer = simulator.tokenizer;

        // NOTE: the readable side is always drained into a /dev/null
        // stream so that `highWaterMark` is never hit when nobody
        // consumes the output.
        // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
        this.pipe(new DevNullStream());
    }

    // `Transform` implementation

    /**
     * Feeds a chunk to the tokenizer and passes it downstream unchanged.
     *
     * @throws TypeError if the chunk is not a string.
     */
    override _transform(
        chunk: string,
        _encoding: string,
        callback: (error?: Error | null, data?: string) => void
    ): void {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        const passedThrough = this._transformChunk(chunk);

        callback(null, passedThrough);
    }

    /**
     * Marks the input as complete and gives the tokenizer a final,
     * empty write carrying the "last chunk" flag.
     */
    override _final(callback: (error?: Error | null, data?: string) => void): void {
        this.lastChunkWritten = true;

        const passedThrough = this._transformChunk('');

        callback(null, passedThrough);
    }

    /**
     * Stops parsing. Handy when the needed information has already been
     * obtained and the parser should not spend more CPU time on the rest
     * of the input. Piping is not affected: data keeps flowing through
     * the parser as usual.
     *
     * @example
     *
     * ```js
     * const { SAXParser } = require('parse5-sax-parser');
     * const http = require('http');
     * const fs = require('fs');
     *
     * const file = fs.createWriteStream('google.com.html');
     * const parser = new SAXParser();
     *
     * parser.on('doctype', ({ name, publicId, systemId }) => {
     *    // Process doctype info and stop parsing
     *    ...
     *    parser.stop();
     * });
     *
     * http.get('http://google.com', res => {
     *    // Even though parser.stop() was called, the whole
     *    // content of the page is still written to the file
     *    res.pipe(parser).pipe(file);
     * });
     * ```
     */
    public stop(): void {
        this.stopped = true;
        this.tokenizer.pause();
    }

    // Internals

    /**
     * Writes the chunk to the tokenizer unless parsing has been stopped.
     *
     * @returns The chunk itself, untouched.
     */
    protected _transformChunk(chunk: string): string {
        if (this.stopped) {
            return chunk;
        }

        this.tokenizer.write(chunk, this.lastChunkWritten);

        return chunk;
    }

    /** @internal */
    onCharacter(token: Token.CharacterToken): void {
        const { chars, location } = token;
        const pending = this.pendingText;

        if (pending === null) {
            // First character token of a new text run.
            this.pendingText = { text: chars, sourceCodeLocation: location };
        } else {
            pending.text += chars;

            const startLocation = pending.sourceCodeLocation;

            if (location && startLocation) {
                // Keep the start of the run, stretch its end to cover this token.
                pending.sourceCodeLocation = {
                    ...startLocation,
                    endLine: location.endLine,
                    endCol: location.endCol,
                    endOffset: location.endOffset,
                };
            }
        }

        // Emit the accumulated text when the preprocessor reports that it
        // will drop the parsed chunk.
        if (this.tokenizer.preprocessor.willDropParsedChunk()) {
            this._emitPendingText();
        }
    }

    /** @internal */
    onWhitespaceCharacter(token: Token.CharacterToken): void {
        // Whitespace is accumulated exactly like any other text.
        this.onCharacter(token);
    }

    /** @internal */
    onNullCharacter(token: Token.CharacterToken): void {
        // Null characters are accumulated exactly like any other text.
        this.onCharacter(token);
    }

    /** @internal */
    onEof(): void {
        this._emitPendingText();
        this.stopped = true;
    }

    /** @internal */
    onStartTag(token: Token.TagToken): void {
        this._emitPendingText();

        const { tagName, attrs, selfClosing, location } = token;
        const startTag: StartTag = {
            tagName,
            attrs,
            selfClosing,
            sourceCodeLocation: location,
        };

        this.emitIfListenerExists('startTag', startTag);
    }

    /** @internal */
    onEndTag(token: Token.TagToken): void {
        this._emitPendingText();

        const { tagName, location } = token;
        const endTag: EndTag = { tagName, sourceCodeLocation: location };

        this.emitIfListenerExists('endTag', endTag);
    }

    /** @internal */
    onDoctype(token: Token.DoctypeToken): void {
        this._emitPendingText();

        const { name, publicId, systemId, location } = token;
        const doctype: Doctype = {
            name,
            publicId,
            systemId,
            sourceCodeLocation: location,
        };

        this.emitIfListenerExists('doctype', doctype);
    }

    /** @internal */
    onComment(token: Token.CommentToken): void {
        this._emitPendingText();

        const { data, location } = token;
        const comment: Comment = { text: data, sourceCodeLocation: location };

        this.emitIfListenerExists('comment', comment);
    }

    /**
     * Emits `token` under `eventName`, but only if somebody listens for it.
     *
     * @returns `true` if the token was emitted, `false` otherwise.
     */
    protected emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        const hasListeners = this.listenerCount(eventName) > 0;

        if (hasListeners) {
            this._emitToken(eventName, token);
        }

        return hasListeners;
    }

    /** Emits `token` as an `eventName` event. */
    protected _emitToken(eventName: string, token: SaxToken): void {
        this.emit(eventName, token);
    }

    /** Emits the accumulated text (if any) as a `text` event and resets it. */
    private _emitPendingText(): void {
        const text = this.pendingText;

        if (text === null) {
            return;
        }

        this.emitIfListenerExists('text', text);
        this.pendingText = null;
    }
}

/** Common shape of every token emitted by {@link SAXParser}. */
export interface SaxToken {
    /** Source code location info. Present when location info is enabled via {@link SAXParserOptions}. */
    sourceCodeLocation?: Token.Location | null;
}

/** Payload of the `startTag` event. */
export interface StartTag extends SaxToken {
    /** Name of the tag. */
    tagName: string;
    /** Attributes of the tag. */
    attrs: Token.Attribute[];
    /** Whether the tag is self-closing. */
    selfClosing: boolean;
}

/** Payload of the `endTag` event. */
export interface EndTag extends SaxToken {
    /** Name of the tag. */
    tagName: string;
}

/** Payload of the `text` event. */
export interface Text extends SaxToken {
    /** Text content. */
    text: string;
}

/** Payload of the `comment` event. */
export interface Comment extends SaxToken {
    /** Text of the comment. */
    text: string;
}

/** Payload of the `doctype` event. */
export interface Doctype extends SaxToken {
    /** Name of the document type. */
    name: string | null;
    /** Public identifier of the document type. */
    publicId: string | null;
    /** System identifier of the document type. */
    systemId: string | null;
}

export interface SAXParser {
    /** Raised when the parser encounters a start tag. */
    on(event: 'startTag', listener: (startTag: StartTag) => void): this;
    /** Raised when the parser encounters an end tag. */
    on(event: 'endTag', listener: (endTag: EndTag) => void): this;
    /** Raised when the parser encounters a comment. */
    on(event: 'comment', listener: (comment: Comment) => void): this;
    /** Raised when the parser encounters text content. */
    on(event: 'text', listener: (text: Text) => void): this;
    /** Raised when the parser encounters a [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration). */
    on(event: 'doctype', listener: (doctype: Doctype) => void): this;
    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}