import { Writable } from 'node:stream';
import { Parser, type ParserOptions, type TreeAdapterTypeMap, type DefaultTreeAdapterMap } from 'parse5';

/**
 * Streaming HTML parser with scripting support.
 * A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
 *
 * @example
 *
 * ```js
 * const { ParserStream } = require('parse5-parser-stream');
 * const http = require('http');
 * const { finished } = require('node:stream');
 *
 * // Fetch the page content and obtain its <head> node
 * http.get('http://inikulin.github.io/parse5/', res => {
 *     const parser = new ParserStream();
 *
 *     finished(parser, () => {
 *         console.log(parser.document.childNodes[1].childNodes[0].tagName); //> 'head'
 *     });
 *
 *     res.pipe(parser);
 * });
 * ```
 *
 */
export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> extends Writable {
    /**
     * Creates a stream backed by a fragment parser
     * (obtained from `Parser.getFragmentParser`).
     *
     * @param fragmentContext Context node forwarded to `Parser.getFragmentParser`.
     * @param options Parsing options, shared by the fragment parser and the stream.
     * @returns A stream that feeds written chunks to the fragment parser.
     */
    static getFragmentStream<T extends TreeAdapterTypeMap>(
        fragmentContext?: T['parentNode'] | null,
        options?: ParserOptions<T>,
    ): ParserStream<T> {
        const parser = Parser.getFragmentParser(fragmentContext, options);
        const stream = new ParserStream(options, parser);
        return stream;
    }

    /** Set by `end()`; forwarded to the tokenizer as its "last chunk" flag on every write. */
    private lastChunkWritten = false;
    /** Completion callback of the most recent `_write` call; also handed to the tokenizer on resume. */
    private writeCallback: undefined | (() => void) = undefined;

    /** HTML queued through `documentWrite` while a `script` event is being handled. */
    private pendingHtmlInsertions: string[] = [];
    /** The resulting document node. */
    public get document(): T['document'] {
        return this.parser.document;
    }
    /** Returns the result of the underlying parser's `getFragment()`. */
    public getFragment(): T['documentFragment'] {
        return this.parser.getFragment();
    }

    /**
     * @param options Parsing options.
     * @param parser Parser instance to drive; defaults to a new `Parser` built from `options`.
     */
    constructor(
        options?: ParserOptions<T>,
        public parser: Parser<T> = new Parser(options),
    ) {
        super({ decodeStrings: false });

        const resume = (): void => {
            // Each snippet is inserted at the tokenizer's current position, so the queue is
            // walked back-to-front (presumably so the snippets end up in the order written).
            for (let i = this.pendingHtmlInsertions.length - 1; i >= 0; i--) {
                this.parser.tokenizer.insertHtmlAtCurrentPos(this.pendingHtmlInsertions[i]);
            }

            this.pendingHtmlInsertions.length = 0;

            //NOTE: keep parsing if we don't wait for the next input chunk
            this.parser.tokenizer.resume(this.writeCallback);
        };

        const documentWrite = (html: string): void => {
            // Writes issued after the parser has stopped are dropped.
            if (!this.parser.stopped) {
                this.pendingHtmlInsertions.push(html);
            }
        };

        const scriptHandler = (scriptElement: T['element']): void => {
            // Only suspend parsing when somebody is actually listening for scripts;
            // otherwise nobody would ever call `resume`.
            if (this.listenerCount('script') > 0) {
                this.parser.tokenizer.pause();
                this.emit('script', scriptElement, documentWrite, resume);
            }
        };

        this.parser.scriptHandler = scriptHandler;
    }

    //WritableStream implementation
    /**
     * Feeds a chunk to the tokenizer.
     *
     * @param chunk String chunk to parse.
     * @param _encoding Unused.
     * @param callback Stored and passed to the tokenizer to signal that the chunk was consumed.
     * @throws {TypeError} If `chunk` is not a string.
     */
    override _write(chunk: string, _encoding: string, callback: () => void): void {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        this.writeCallback = callback;
        this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback);
    }

    // TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`.

    /**
     * Marks the input as finished and ends the stream.
     *
     * Accepts the same call shapes as `Writable#end`: `end(cb)`, `end(chunk, cb)` and
     * `end(chunk, encoding, cb)`. When no chunk is supplied an empty string is written
     * instead, so that one final write is issued with the last-chunk flag set.
     *
     * @param chunk Optional final chunk (or the callback).
     * @param encoding Optional encoding (or the callback).
     * @param callback Optional callback invoked once the stream has finished.
     * @returns Whatever `Writable#end` returns (the stream itself), to allow chaining.
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    override end(chunk?: any, encoding?: any, callback?: any): any {
        let finalChunk = chunk;
        let finalEncoding = encoding;
        let onFinish = callback;

        // Untangle the overloaded signature up front: otherwise a callback passed in the
        // `chunk` slot would be truthy and suppress the empty final write below.
        if (typeof finalChunk === 'function') {
            onFinish = finalChunk;
            finalChunk = undefined;
            finalEncoding = undefined;
        } else if (typeof finalEncoding === 'function') {
            onFinish = finalEncoding;
            finalEncoding = undefined;
        }

        this.lastChunkWritten = true;
        return super.end(finalChunk || '', finalEncoding, onFinish);
    }
}

export interface ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> {
    /**
     * Raised when parser encounters a `<script>` element. If this event has listeners, parsing will be suspended once
     * it is emitted. So, if `<script>` has the `src` attribute, you can fetch it, execute and then resume parsing just
     * like browsers do.
     *
     * @example
     *
     * ```js
     * const { ParserStream } = require('parse5-parser-stream');
     * const http = require('http');
     *
     * const parser = new ParserStream();
     *
     * parser.on('script', (scriptElement, documentWrite, resume) => {
     *     const src = scriptElement.attrs.find(({ name }) => name === 'src').value;
     *
     *     http.get(src, res => {
     *         // Fetch the script content, execute it with DOM built around `parser.document` and
     *         // `document.write` implemented using `documentWrite`.
     *         ...
     *         // Then resume parsing.
     *         resume();
     *     });
     * });
     *
     * parser.end('<script src="example.com/script.js"></script>');
     * ```
     *
     * @param event Name of the event
     * @param handler Receives the script element, a `documentWrite` function that queues HTML for insertion,
     *   and a `resume` function that inserts the queued HTML and resumes parsing.
     */
    on(
        event: 'script',
        handler: (scriptElement: T['element'], documentWrite: (html: string) => void, resume: () => void) => void,
    ): this;
    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}