• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import { Writable } from 'node:stream';
2import { Parser, type ParserOptions, type TreeAdapterTypeMap, type DefaultTreeAdapterMap } from 'parse5';
3
/**
 * Streaming HTML parser with scripting support.
 * A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
 *
 * @example
 *
 * ```js
 * const { ParserStream } = require('parse5-parser-stream');
 * const http = require('http');
 * const { finished } = require('node:stream');
 *
 * // Fetch the page content and obtain its <head> node
 * http.get('http://inikulin.github.io/parse5/', res => {
 *     const parser = new ParserStream();
 *
 *     finished(parser, () => {
 *         console.log(parser.document.childNodes[1].childNodes[0].tagName); //> 'head'
 *     });
 *
 *     res.pipe(parser);
 * });
 * ```
 *
 */
export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> extends Writable {
    /**
     * Creates a stream backed by a fragment parser (`Parser.getFragmentParser`).
     * Read the result with {@link ParserStream.getFragment}.
     *
     * @param fragmentContext Context node passed through to the fragment parser.
     * @param options Parsing options.
     */
    static getFragmentStream<T extends TreeAdapterTypeMap>(
        fragmentContext?: T['parentNode'] | null,
        options?: ParserOptions<T>,
    ): ParserStream<T> {
        const parser = Parser.getFragmentParser(fragmentContext, options);
        const stream = new ParserStream(options, parser);
        return stream;
    }

    /** Set by `end()`; forwarded to the tokenizer as the "is last chunk" flag of every later write. */
    private lastChunkWritten = false;
    /** Completion callback of the write currently in flight; also handed to the tokenizer on `resume`. */
    private writeCallback: undefined | (() => void) = undefined;

    /** HTML passed to `documentWrite` while a script is pending; flushed into the tokenizer by `resume`. */
    private pendingHtmlInsertions: string[] = [];
    /** The resulting document node. */
    public get document(): T['document'] {
        return this.parser.document;
    }
    /** The fragment produced by the underlying parser (delegates to `parser.getFragment()`). */
    public getFragment(): T['documentFragment'] {
        return this.parser.getFragment();
    }

    /**
     * @param options Parsing options.
     * @param parser Parser instance to drive; defaults to a new `Parser` built from `options`.
     */
    constructor(
        options?: ParserOptions<T>,
        public parser: Parser<T> = new Parser(options),
    ) {
        super({ decodeStrings: false });

        const resume = (): void => {
            // Each insertion lands at the tokenizer's current position, so insert in reverse
            // to keep the queued `documentWrite` calls in their original order.
            for (let i = this.pendingHtmlInsertions.length - 1; i >= 0; i--) {
                this.parser.tokenizer.insertHtmlAtCurrentPos(this.pendingHtmlInsertions[i]);
            }

            this.pendingHtmlInsertions.length = 0;

            //NOTE: keep parsing if we don't wait for the next input chunk
            this.parser.tokenizer.resume(this.writeCallback);
        };

        const documentWrite = (html: string): void => {
            // Writes issued after the parser has stopped are dropped.
            if (!this.parser.stopped) {
                this.pendingHtmlInsertions.push(html);
            }
        };

        const scriptHandler = (scriptElement: T['element']): void => {
            // Only suspend parsing when somebody is listening; otherwise nothing would ever resume it.
            if (this.listenerCount('script') > 0) {
                this.parser.tokenizer.pause();
                this.emit('script', scriptElement, documentWrite, resume);
            }
        };

        this.parser.scriptHandler = scriptHandler;
    }

    //WritableStream implementation
    /**
     * Feeds one chunk to the tokenizer.
     *
     * @throws {TypeError} If `chunk` is not a string.
     */
    override _write(chunk: string, _encoding: string, callback: () => void): void {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        this.writeCallback = callback;
        this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback);
    }

    // TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`.

    /**
     * Ends the stream. Always performs one final write (an empty string when no chunk is given)
     * so the tokenizer receives a write flagged as the last chunk.
     *
     * Accepts the same call shapes as `Writable.end`: `end()`, `end(cb)`, `end(chunk)`,
     * `end(chunk, cb)` and `end(chunk, encoding, cb)`.
     *
     * @returns This stream, matching `Writable.end`.
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    override end(chunk?: any, encoding?: any, callback?: any): this {
        let finalChunk = chunk;
        let finalEncoding = encoding;
        let done = callback;

        // Normalise the callback-first call shapes here. If a function were forwarded as the
        // chunk, `Writable.end` would treat it as the callback and skip the final write.
        if (typeof finalChunk === 'function') {
            done = finalChunk;
            finalChunk = undefined;
            finalEncoding = undefined;
        } else if (typeof finalEncoding === 'function') {
            done = finalEncoding;
            finalEncoding = undefined;
        }

        this.lastChunkWritten = true;
        return super.end(finalChunk || '', finalEncoding, done);
    }
}
104
export interface ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> {
    /**
     * Raised when parser encounters a `<script>` element. If this event has listeners, parsing will be suspended once
     * it is emitted. So, if `<script>` has the `src` attribute, you can fetch it, execute and then resume parsing just
     * like browsers do.
     *
     * @example
     *
     * ```js
     * const { ParserStream } = require('parse5-parser-stream');
     * const http = require('http');
     *
     * const parser = new ParserStream();
     *
     * parser.on('script', (scriptElement, documentWrite, resume) => {
     *     const src = scriptElement.attrs.find(({ name }) => name === 'src').value;
     *
     *     http.get(src, res => {
     *         // Fetch the script content, execute it with DOM built around `parser.document` and
     *         // `document.write` implemented using `documentWrite`.
     *         ...
     *         // Then resume parsing.
     *         resume();
     *     });
     * });
     *
     * parser.end('<script src="example.com/script.js"></script>');
     * ```
     *
     * @param event Name of the event
     * @param handler Called with the `<script>` element, a `documentWrite(html)` function that queues HTML to be
     * inserted at the tokenizer's current position, and a `resume()` function that flushes the queued HTML and
     * continues parsing.
     * @returns This stream, so that calls can be chained (same as `EventEmitter.on`).
     */
    on(
        event: 'script',
        handler: (scriptElement: T['element'], documentWrite: (html: string) => void, resume: () => void) => void,
    ): this;
    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     * @returns This stream, so that calls can be chained.
     */
    on(event: string, handler: (...args: any[]) => void): this;
}
149