• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import { Writable } from 'node:stream';
2import { Parser, type ParserOptions, type TreeAdapterTypeMap, type DefaultTreeAdapterMap } from 'parse5';
3
4/* eslint-disable unicorn/consistent-function-scoping -- The rule seems to be broken here. */
5
6/**
7 * Streaming HTML parser with scripting support.
8 * A [writable stream](https://nodejs.org/api/stream.html#stream_class_stream_writable).
9 *
10 * @example
11 *
12 * ```js
13 * const ParserStream = require('parse5-parser-stream');
14 * const http = require('http');
15 * const { finished } = require('node:stream');
16 *
17 * // Fetch the page content and obtain its <head> node
18 * http.get('http://inikulin.github.io/parse5/', res => {
19 *     const parser = new ParserStream();
20 *
21 *     finished(parser, () => {
22 *         console.log(parser.document.childNodes[1].childNodes[0].tagName); //> 'head'
23 *     });
24 *
25 *     res.pipe(parser);
26 * });
27 * ```
28 *
29 */
export class ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> extends Writable {
    /**
     * Creates a stream backed by a fragment parser rather than a document parser.
     *
     * @param fragmentContext Context node forwarded to `Parser.getFragmentParser`.
     * @param options Parsing options, passed to both the fragment parser and the stream.
     * @returns A `ParserStream` wrapping the fragment parser.
     */
    static getFragmentStream<T extends TreeAdapterTypeMap>(
        fragmentContext?: T['parentNode'] | null,
        options?: ParserOptions<T>
    ): ParserStream<T> {
        const parser = Parser.getFragmentParser(fragmentContext, options);
        const stream = new ParserStream(options, parser);
        return stream;
    }

    /** Set by `end()`; forwarded to the tokenizer as the "last chunk" flag on every write. */
    private lastChunkWritten = false;
    /** Completion callback of the most recent `_write` call; handed to the tokenizer again on `resume`. */
    private writeCallback: undefined | (() => void) = undefined;

    /** HTML handed to `documentWrite` by a `script` listener, queued until `resume` is called. */
    private pendingHtmlInsertions: string[] = [];
    /** The resulting document node. */
    public get document(): T['document'] {
        return this.parser.document;
    }
    /** Returns the fragment produced by the underlying parser (delegates to `parser.getFragment()`). */
    public getFragment(): T['documentFragment'] {
        return this.parser.getFragment();
    }

    /**
     * @param options Parsing options.
     * @param parser Parser instance to drive; defaults to a new `Parser` built from `options`.
     */
    constructor(options?: ParserOptions<T>, public parser: Parser<T> = new Parser(options)) {
        // `decodeStrings: false` keeps written strings as strings instead of converting them to Buffers.
        super({ decodeStrings: false });

        // Handed to `script` listeners: flushes queued `documentWrite` HTML and restarts the tokenizer.
        const resume = (): void => {
            // Walk the queue backwards: every insertion happens at the current position, so
            // (presumably) the last-inserted string ends up first in the tokenizer input.
            for (let i = this.pendingHtmlInsertions.length - 1; i >= 0; i--) {
                this.parser.tokenizer.insertHtmlAtCurrentPos(this.pendingHtmlInsertions[i]);
            }

            this.pendingHtmlInsertions.length = 0;

            //NOTE: keep parsing if we don't wait for the next input chunk
            this.parser.tokenizer.resume(this.writeCallback);
        };

        // Handed to `script` listeners as their `document.write` hook; ignored once the parser has stopped.
        const documentWrite = (html: string): void => {
            if (!this.parser.stopped) {
                this.pendingHtmlInsertions.push(html);
            }
        };

        // Pause only when somebody listens for `script`: the listener receives `resume`
        // and is the one expected to restart parsing.
        const scriptHandler = (scriptElement: T['element']): void => {
            if (this.listenerCount('script') > 0) {
                this.parser.tokenizer.pause();
                this.emit('script', scriptElement, documentWrite, resume);
            }
        };

        this.parser.scriptHandler = scriptHandler;
    }

    //WritableStream implementation
    /**
     * Feeds one chunk to the tokenizer.
     *
     * @param chunk String chunk of HTML.
     * @param _encoding Unused.
     * @param callback Passed to the tokenizer, and remembered so `resume` can pass it again.
     * @throws {TypeError} If `chunk` is not a string.
     */
    override _write(chunk: string, _encoding: string, callback: () => void): void {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        this.writeCallback = callback;
        this.parser.tokenizer.write(chunk, this.lastChunkWritten, this.writeCallback);
    }

    // TODO [engine:node@>=16]: Due to issues with Node < 16, we are overriding `end` instead of `_final`.

    /**
     * Marks the input as complete and writes the final chunk.
     *
     * Accepts the same call shapes as `Writable#end`: `end()`, `end(callback)`, `end(chunk)`,
     * `end(chunk, callback)` and `end(chunk, encoding, callback)`. A final (possibly empty) string
     * chunk is always written, so `_write` runs once more with the last-chunk flag set.
     *
     * @param chunk Optional last chunk, or the callback.
     * @param encoding Optional encoding, or the callback.
     * @param callback Optional callback passed through to `Writable#end`.
     * @returns This stream, like `Writable#end`.
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    override end(chunk?: any, encoding?: any, callback?: any): any {
        let finalChunk = chunk;
        let finalEncoding = encoding;
        let onFinish = callback;

        // Untangle the overloads here: a callback passed in the `chunk` slot would otherwise
        // survive `chunk || ''`, and the base class would then write no final chunk at all.
        if (typeof chunk === 'function') {
            onFinish = chunk;
            finalChunk = undefined;
            finalEncoding = undefined;
        } else if (typeof encoding === 'function') {
            onFinish = encoding;
            finalEncoding = undefined;
        }

        this.lastChunkWritten = true;
        super.end(finalChunk || '', finalEncoding, onFinish);
        return this;
    }
}
103
export interface ParserStream<T extends TreeAdapterTypeMap = DefaultTreeAdapterMap> {
    /**
     * Raised when parser encounters a `<script>` element. If this event has listeners, parsing will be suspended once
     * it is emitted. So, if `<script>` has the `src` attribute, you can fetch it, execute and then resume parsing just
     * like browsers do.
     *
     * @example
     *
     * ```js
     * const ParserStream = require('parse5-parser-stream');
     * const http = require('http');
     *
     * const parser = new ParserStream();
     *
     * parser.on('script', (scriptElement, documentWrite, resume) => {
     *     const src = scriptElement.attrs.find(({ name }) => name === 'src').value;
     *
     *     http.get(src, res => {
     *         // Fetch the script content, execute it with DOM built around `parser.document` and
     *         // `document.write` implemented using `documentWrite`.
     *         ...
     *         // Then resume parsing.
     *         resume();
     *     });
     * });
     *
     * parser.end('<script src="example.com/script.js"></script>');
     * ```
     *
     * @param event Name of the event
     * @param handler Receives the script element, a `documentWrite` function for injecting HTML, and a `resume`
     * function that continues parsing.
     * @returns This stream, so listener registrations can be chained.
     */
    on(
        event: 'script',
        handler: (scriptElement: T['element'], documentWrite: (html: string) => void, resume: () => void) => void
    ): this;
    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     * @returns This stream, so listener registrations can be chained.
     */
    on(event: string, handler: (...args: any[]) => void): this;
}
148