import { Writable } from 'node:stream';
import { Parser, type ParserOptions, type TreeAdapterTypeMap, type DefaultTreeAdapterMap } from 'parse5';

/* eslint-disable unicorn/consistent-function-scoping -- The rule seems to be broken here. */

/**
 * Streaming HTML parser with scripting support.
 * A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
 *
 * @example
 *
 * ```js
 * const ParserStream = require('parse5-parser-stream');
 * const http = require('http');
 * const { finished } = require('node:stream');
 *
 * // Fetch the page content and obtain its <head> node
 * http.get('http://inikulin.github.io/parse5/', res => {
 *     const parser = new ParserStream();
 *
 *     finished(parser, () => {
 *         console.log(parser.document.childNodes[1].childNodes[0].tagName); //> 'head'
 *     });
 *
 *     res.pipe(parser);
 * });
 * ```
 *
 */
export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> extends Writable {
    /**
     * Creates a stream backed by a fragment parser obtained from `Parser.getFragmentParser`.
     *
     * @param fragmentContext Context node forwarded to `Parser.getFragmentParser`.
     * @param options Parsing options, forwarded to both the fragment parser and the stream.
     * @returns A `ParserStream` that wraps the fragment parser.
     */
    static getFragmentStream<T extends TreeAdapterTypeMap>(
        fragmentContext?: T['parentNode'] | null,
        options?: ParserOptions<T>
    ): ParserStream<T> {
        const parser = Parser.getFragmentParser(fragmentContext, options);
        const stream = new ParserStream(options, parser);
        return stream;
    }

    /** Set by `end()`; forwarded to the tokenizer as the "last chunk" flag on every write. */
    private lastChunkWritten = false;
    /** Callback of the most recent `_write` call; also handed to the tokenizer when parsing is resumed. */
    private writeCallback: undefined | (() => void) = undefined;

    /** HTML passed to `documentWrite` by `script` listeners, queued until `resume` is called. */
    private pendingHtmlInsertions: string[] = [];

    /** The resulting document node. */
    public get document(): T['document'] {
        return this.parser.document;
    }

    /** Returns the result of the underlying parser's `getFragment()`. */
    public getFragment(): T['documentFragment'] {
        return this.parser.getFragment();
    }

    /**
     * @param options Parsing options.
     * @param parser Parser instance to drive. Defaults to a new `Parser` created with `options`.
     */
    constructor(options?: ParserOptions<T>, public parser: Parser<T> = new Parser(options)) {
        // `decodeStrings: false` keeps string chunks as strings instead of converting them to Buffers.
        super({ decodeStrings: false });

        const resume = (): void => {
            // Queued insertions are applied last-to-first at the current tokenizer position.
            // NOTE(review): presumably so that earlier `documentWrite` calls end up ahead of later ones.
            for (let i = this.pendingHtmlInsertions.length - 1; i >= 0; i--) {
                this.parser.tokenizer.insertHtmlAtCurrentPos(this.pendingHtmlInsertions[i]);
            }

            this.pendingHtmlInsertions.length = 0;

            //NOTE: keep parsing if we don't wait for the next input chunk
            this.parser.tokenizer.resume(this.writeCallback);
        };

        const documentWrite = (html: string): void => {
            // Writes are dropped once the parser reports that it has stopped.
            if (!this.parser.stopped) {
                this.pendingHtmlInsertions.push(html);
            }
        };

        const scriptHandler = (scriptElement: T['element']): void => {
            // The tokenizer is only paused when somebody listens for `script`;
            // otherwise nothing would ever call `resume`.
            if (this.listenerCount('script') > 0) {
                this.parser.tokenizer.pause();
                this.emit('script', scriptElement, documentWrite, resume);
            }
        };

        this.parser.scriptHandler = scriptHandler;
    }

    //WritableStream implementation
    /**
     * Feeds a chunk to the tokenizer.
     *
     * @param chunk HTML text to parse.
     * @param _encoding Unused.
     * @param callback Passed to the tokenizer together with the chunk.
     * @throws {TypeError} If `chunk` is not a string.
     */
    override _write(chunk: string, _encoding: string, callback: () => void): void {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        this.writeCallback = callback;
        this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback);
    }

    // TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`.

    /**
     * Marks the input as complete and ends the stream.
     *
     * Accepts the same call shapes as `Writable.end`: `end()`, `end(callback)`, `end(chunk)`,
     * `end(chunk, callback)` and `end(chunk, encoding, callback)`.
     *
     * @param chunk Optional final chunk of HTML.
     * @param encoding Optional encoding, forwarded to `Writable.end`.
     * @param callback Optional callback, forwarded to `Writable.end`.
     * @returns The value returned by `Writable.end` (the stream itself).
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    override end(chunk?: any, encoding?: any, callback?: any): any {
        let finalChunk = chunk;
        let finalEncoding = encoding;
        let onFinish = callback;

        // Untangle the shorthand call shapes. Without this, `end(callback)` would forward the
        // function in the chunk position, no chunk would be written, and the tokenizer would
        // never receive a write flagged as the last one.
        if (typeof finalChunk === 'function') {
            onFinish = finalChunk;
            finalChunk = undefined;
            finalEncoding = undefined;
        } else if (typeof finalEncoding === 'function') {
            onFinish = finalEncoding;
            finalEncoding = undefined;
        }

        this.lastChunkWritten = true;

        // A falsy chunk is replaced with an empty string so that a final write still happens.
        return super.end(finalChunk || '', finalEncoding, onFinish);
    }
}

export interface ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> {
    /**
     * Raised when parser encounters a `<script>` element. If this event has listeners, parsing will be suspended once
     * it is emitted. So, if `<script>` has the `src` attribute, you can fetch it, execute and then resume parsing just
     * like browsers do.
     *
     * @example
     *
     * ```js
     * const ParserStream = require('parse5-parser-stream');
     * const http = require('http');
     *
     * const parser = new ParserStream();
     *
     * parser.on('script', (scriptElement, documentWrite, resume) => {
     *     const src = scriptElement.attrs.find(({ name }) => name === 'src').value;
     *
     *     http.get(src, res => {
     *         // Fetch the script content, execute it with DOM built around `parser.document` and
     *         // `document.write` implemented using `documentWrite`.
     *         ...
     *         // Then resume parsing.
     *         resume();
     *     });
     * });
     *
     * parser.end('<script src="example.com/script.js"></script>');
     * ```
     *
     * @param event Name of the event
     * @param handler Receives the script element, a `documentWrite` function and a `resume` function.
     * @returns The stream itself, so calls can be chained.
     */
    on(
        event: 'script',
        handler: (scriptElement: T['element'], documentWrite: (html: string) => void, resume: () => void) => void
    ): this;
    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    on(event: string, handler: (...args: any[]) => void): this;
}