import { Transform } from 'node:stream';
import type { Tokenizer, TokenHandler, Token } from 'parse5';
import { DevNullStream } from './dev-null-stream.js';
import { ParserFeedbackSimulator } from './parser-feedback-simulator.js';

/** Options accepted by the {@link SAXParser} constructor. */
export interface SAXParserOptions {
    /**
     * Turns on source code location tracking.
     *
     * With this flag set, every emitted token carries a `sourceCodeLocation` property.
     */
    sourceCodeLocationInfo?: boolean;
}

/**
 * Streaming, [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML parser.
 *
 * It is a [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform):
 * every chunk written to it is handed to the tokenizer and then passed on unchanged, so the
 * parser can sit in the middle of a pipe chain (see example).
 *
 * @example
 *
 * ```js
 * const { SAXParser } = require('parse5-sax-parser');
 * const http = require('http');
 * const fs = require('fs');
 *
 * const file = fs.createWriteStream('/home/google.com.html');
 * const parser = new SAXParser();
 *
 * parser.on('text', text => {
 *    // Handle page text content
 *    ...
 * });
 *
 * http.get('http://google.com', res => {
 *    // `SAXParser` is a `Transform` stream, so the response can be piped
 *    // through it: the page is analyzed and, at the same time, saved
 *    // to the file.
 *    res.pipe(parser).pipe(file);
 * });
 * ```
 */
export class SAXParser extends Transform implements TokenHandler {
    /** Effective options: the defaults overlaid with whatever the caller supplied. */
    protected options: SAXParserOptions;
    /** @internal */
    protected parserFeedbackSimulator: ParserFeedbackSimulator;
    /** Text accumulated from consecutive character tokens that has not been emitted yet. */
    private pendingText: Text | null = null;
    /** Becomes `true` in `_final`; forwarded to the tokenizer as the "last chunk" flag. */
    private lastChunkWritten = false;
    /** Once `true`, incoming chunks are no longer fed to the tokenizer. */
    private stopped = false;
    /** Tokenizer owned by the parser feedback simulator. */
    protected tokenizer: Tokenizer;

    /**
     * @param options Parsing options.
     */
    constructor(options: SAXParserOptions = {}) {
        super({ encoding: 'utf8', decodeStrings: false });

        this.options = { sourceCodeLocationInfo: false, ...options };

        const simulator = new ParserFeedbackSimulator(this.options, this);
        this.parserFeedbackSimulator = simulator;
        this.tokenizer = simulator.tokenizer;

        // NOTE: the stream is always piped into a /dev/null stream so that
        // the `highWaterMark` is not hit even when there are no consumers.
        // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
        this.pipe(new DevNullStream());
    }

    // ---- `Transform` implementation ----

    /**
     * Feeds a written chunk to the tokenizer and passes it downstream unchanged.
     *
     * @throws TypeError (synchronously) when `chunk` is not a string.
     */
    override _transform(
        chunk: string,
        _encoding: string,
        callback: (error?: Error | null, data?: string) => void,
    ): void {
        if (typeof chunk === 'string') {
            callback(null, this._transformChunk(chunk));
            return;
        }

        throw new TypeError('Parser can work only with string streams.');
    }

    /** Marks the input as finished and pushes a final empty chunk through the tokenizer. */
    override _final(callback: (error?: Error | null, data?: string) => void): void {
        this.lastChunkWritten = true;

        const passthrough = this._transformChunk('');
        callback(null, passthrough);
    }

    /**
     * Stops parsing. Handy when the information you were after has been
     * obtained and the parser should not spend more CPU time on the rest of
     * the input. Piping is unaffected: data keeps flowing through the parser
     * as usual.
     *
     * @example
     *
     * ```js
     * const { SAXParser } = require('parse5-sax-parser');
     * const http = require('http');
     * const fs = require('fs');
     *
     * const file = fs.createWriteStream('google.com.html');
     * const parser = new SAXParser();
     *
     * parser.on('doctype', ({ name, publicId, systemId }) => {
     *    // Process doctype info and stop parsing
     *    ...
     *    parser.stop();
     * });
     *
     * http.get('http://google.com', res => {
     *    // Even though parser.stop() was called, the whole
     *    // content of the page is still written to the file
     *    res.pipe(parser).pipe(file);
     * });
     * ```
     */
    public stop(): void {
        this.stopped = true;
        this.tokenizer.pause();
    }

    // ---- Internals ----

    /**
     * Hands `chunk` to the tokenizer unless the parser has been stopped.
     *
     * @returns The same `chunk`, untouched, so it can be forwarded downstream.
     */
    protected _transformChunk(chunk: string): string {
        if (this.stopped) {
            return chunk;
        }

        this.tokenizer.write(chunk, this.lastChunkWritten);
        return chunk;
    }

    /**
     * Collects character tokens into a single pending text token. When both the
     * pending text and the new token have a location, the end position of the
     * pending location is moved to the end of the new token. The pending text is
     * emitted as soon as `preprocessor.willDropParsedChunk()` returns `true`.
     *
     * @internal
     */
    onCharacter({ chars, location }: Token.CharacterToken): void {
        const pending = this.pendingText;

        if (pending === null) {
            this.pendingText = { text: chars, sourceCodeLocation: location };
        } else {
            pending.text += chars;

            const start = pending.sourceCodeLocation;

            if (location && start) {
                pending.sourceCodeLocation = {
                    ...start,
                    endLine: location.endLine,
                    endCol: location.endCol,
                    endOffset: location.endOffset,
                };
            }
        }

        if (this.tokenizer.preprocessor.willDropParsedChunk()) {
            this._emitPendingText();
        }
    }

    /**
     * Whitespace tokens are handled exactly like any other character token.
     *
     * @internal
     */
    onWhitespaceCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    /**
     * Null-character tokens are handled exactly like any other character token.
     *
     * @internal
     */
    onNullCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    /**
     * Emits any pending text and marks the parser as stopped.
     *
     * @internal
     */
    onEof(): void {
        this._emitPendingText();
        this.stopped = true;
    }

    /**
     * Emits pending text, then a `startTag` event built from the token.
     *
     * @internal
     */
    onStartTag(token: Token.TagToken): void {
        this._emitPendingText();

        const { tagName, attrs, selfClosing, location } = token;
        const startTag: StartTag = { tagName, attrs, selfClosing, sourceCodeLocation: location };

        this.emitIfListenerExists('startTag', startTag);
    }

    /**
     * Emits pending text, then an `endTag` event built from the token.
     *
     * @internal
     */
    onEndTag(token: Token.TagToken): void {
        this._emitPendingText();

        const { tagName, location } = token;
        const endTag: EndTag = { tagName, sourceCodeLocation: location };

        this.emitIfListenerExists('endTag', endTag);
    }

    /**
     * Emits pending text, then a `doctype` event built from the token.
     *
     * @internal
     */
    onDoctype(token: Token.DoctypeToken): void {
        this._emitPendingText();

        const { name, publicId, systemId, location } = token;
        const doctype: Doctype = { name, publicId, systemId, sourceCodeLocation: location };

        this.emitIfListenerExists('doctype', doctype);
    }

    /**
     * Emits pending text, then a `comment` event whose `text` is the token's `data`.
     *
     * @internal
     */
    onComment(token: Token.CommentToken): void {
        this._emitPendingText();

        const { data, location } = token;
        const comment: Comment = { text: data, sourceCodeLocation: location };

        this.emitIfListenerExists('comment', comment);
    }

    /**
     * Emits `token` under `eventName`, but only when at least one listener is
     * registered for that event.
     *
     * @returns `true` if the token was emitted, `false` if there were no listeners.
     */
    protected emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        const hasListeners = this.listenerCount(eventName) !== 0;

        if (hasListeners) {
            this._emitToken(eventName, token);
        }

        return hasListeners;
    }

    /** Emits `token` as the payload of the `eventName` event. */
    protected _emitToken(eventName: string, token: SaxToken): void {
        this.emit(eventName, token);
    }

    /** Emits the accumulated text (if there is any) as a `text` event and clears it. */
    private _emitPendingText(): void {
        const pending = this.pendingText;

        if (pending === null) {
            return;
        }

        this.emitIfListenerExists('text', pending);
        this.pendingText = null;
    }
}

/** Fields shared by every token the parser emits. */
export interface SaxToken {
    /** Source code location info. Available if location info is enabled via {@link SAXParserOptions}. */
    sourceCodeLocation?: Token.Location | null;
}

/** Payload of the `startTag` event. */
export interface StartTag extends SaxToken {
    /** Name of the tag. */
    tagName: string;
    /** Attributes of the tag. */
    attrs: Token.Attribute[];
    /** Whether the tag is self-closing. */
    selfClosing: boolean;
}

/** Payload of the `endTag` event. */
export interface EndTag extends SaxToken {
    /** Name of the tag. */
    tagName: string;
}

/** Payload of the `text` event. */
export interface Text extends SaxToken {
    /** The text content. */
    text: string;
}

/** Payload of the `comment` event. */
export interface Comment extends SaxToken {
    /** The comment text. */
    text: string;
}

/** Payload of the `doctype` event. */
export interface Doctype extends SaxToken {
    /** Name of the document type. */
    name: string | null;
    /** Public identifier of the document type. */
    publicId: string | null;
    /** System identifier of the document type. */
    systemId: string | null;
}

export interface SAXParser {
    /** Fired for every start tag the parser encounters. */
    on(event: 'startTag', listener: (startTag: StartTag) => void): this;
    /** Fired for every end tag the parser encounters. */
    on(event: 'endTag', listener: (endTag: EndTag) => void): this;
    /** Fired for every comment the parser encounters. */
    on(event: 'comment', listener: (comment: Comment) => void): this;
    /** Fired for text content the parser encounters. */
    on(event: 'text', listener: (text: Text) => void): this;
    /** Fired when the parser encounters a [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration) */
    on(event: 'doctype', listener: (doctype: Doctype) => void): this;
    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}