• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import { Transform } from 'node:stream';
2import type { Tokenizer, TokenHandler, Token } from 'parse5';
3import { DevNullStream } from './dev-null-stream.js';
4import { ParserFeedbackSimulator } from './parser-feedback-simulator.js';
5
export interface SAXParserOptions {
    /**
     * Whether emitted tokens should carry source code location information.
     *
     * If `true`, every token gets a `sourceCodeLocation` property.
     *
     * @default false
     */
    sourceCodeLocationInfo?: boolean;
}
14
/**
 * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML parser.
 *
 * The parser is a [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform):
 * every chunk written to it is fed to the tokenizer and then passed downstream
 * unchanged, so you can pipe _through_ it (see example). Tokens are reported
 * as `startTag`, `endTag`, `text`, `comment` and `doctype` events, and an event
 * is only emitted when at least one listener is registered for it.
 *
 * @example
 *
 * ```js
 *     const { SAXParser } = require('parse5-sax-parser');
 *     const http = require('http');
 *     const fs = require('fs');
 *
 *     const file = fs.createWriteStream('/home/google.com.html');
 *     const parser = new SAXParser();
 *
 *     parser.on('text', text => {
 *        // Handle page text content
 *        ...
 *     });
 *
 *     http.get('http://google.com', res => {
 *        // `SAXParser` is a `Transform` stream, so the response can be piped
 *        // through it: the page content is analyzed and, e.g., saved to
 *        // the file at the same time.
 *        res.pipe(parser).pipe(file);
 *     });
 * ```
 */
export class SAXParser extends Transform implements TokenHandler {
    protected options: SAXParserOptions;
    protected parserFeedbackSimulator: ParserFeedbackSimulator;
    /** Text collected from consecutive character tokens that has not been emitted yet. */
    private pendingText: Text | null = null;
    /** Set by `_final`; forwarded to the tokenizer as the "last chunk" flag. */
    private lastChunkWritten = false;
    /** While `true`, incoming chunks are no longer handed to the tokenizer. */
    private stopped = false;
    protected tokenizer: Tokenizer;

    /**
     * @param options Parsing options. Missing fields fall back to their defaults.
     */
    constructor(options: SAXParserOptions = {}) {
        super({ encoding: 'utf8', decodeStrings: false });

        const defaults: SAXParserOptions = { sourceCodeLocationInfo: false };
        this.options = { ...defaults, ...options };

        const simulator = new ParserFeedbackSimulator(this.options, this);
        this.parserFeedbackSimulator = simulator;
        this.tokenizer = simulator.tokenizer;

        // NOTE: the stream is always piped into a /dev/null stream so that
        // the `highWaterMark` is not hit even when there are no consumers.
        // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
        this.pipe(new DevNullStream());
    }

    //`Transform` implementation

    /**
     * Tokenizes an incoming chunk and passes it downstream unchanged.
     *
     * @param chunk Next piece of HTML; must be a string.
     * @param _encoding Unused.
     * @param callback Receives the untouched chunk.
     * @throws TypeError If `chunk` is not a string.
     */
    override _transform(
        chunk: string,
        _encoding: string,
        callback: (error?: Error | null, data?: string) => void
    ): void {
        const isText = typeof chunk === 'string';

        if (!isText) {
            throw new TypeError('Parser can work only with string streams.');
        }

        const passedThrough = this._transformChunk(chunk);

        callback(null, passedThrough);
    }

    /**
     * Marks the input as complete and feeds an empty chunk to the tokenizer
     * with the last-chunk flag set.
     *
     * @param callback Completion callback.
     */
    override _final(callback: (error?: Error | null, data?: string) => void): void {
        this.lastChunkWritten = true;

        const trailing = this._transformChunk('');

        callback(null, trailing);
    }

    /**
     * Stops parsing. Handy when the information you need has already been
     * extracted and the parser should not spend more CPU time on the rest of
     * the input. Piping is not affected: data keeps flowing through the
     * parser as usual.
     *
     * @example
     *
     * ```js
     * const { SAXParser } = require('parse5-sax-parser');
     * const http = require('http');
     * const fs = require('fs');
     *
     * const file = fs.createWriteStream('google.com.html');
     * const parser = new SAXParser();
     *
     * parser.on('doctype', ({ name, publicId, systemId }) => {
     *     // Process doctype info and stop parsing
     *     ...
     *     parser.stop();
     * });
     *
     * http.get('http://google.com', res => {
     *     // Even though parser.stop() was called, the whole
     *     // content of the page will be written to the file
     *     res.pipe(parser).pipe(file);
     * });
     * ```
     */
    public stop(): void {
        this.stopped = true;
        this.tokenizer.pause();
    }

    //Internals

    /**
     * Hands `chunk` to the tokenizer unless parsing has been stopped.
     *
     * @param chunk Chunk to tokenize.
     * @returns The same chunk, unmodified.
     */
    protected _transformChunk(chunk: string): string {
        if (this.stopped) {
            return chunk;
        }

        this.tokenizer.write(chunk, this.lastChunkWritten);

        return chunk;
    }

    /**
     * Buffers character data so that adjacent character tokens are reported
     * as a single `text` event.
     *
     * @internal
     */
    onCharacter({ chars, location }: Token.CharacterToken): void {
        const pending = this.pendingText;

        if (pending === null) {
            this.pendingText = { text: chars, sourceCodeLocation: location };
        } else {
            pending.text += chars;

            const startLocation = pending.sourceCodeLocation;

            if (location && startLocation) {
                // Keep the start of the first token, extend the end to the newest one.
                pending.sourceCodeLocation = {
                    ...startLocation,
                    endLine: location.endLine,
                    endCol: location.endCol,
                    endOffset: location.endOffset,
                };
            }
        }

        // Flush right away when the preprocessor reports that it will drop the parsed chunk.
        if (this.tokenizer.preprocessor.willDropParsedChunk()) {
            this._emitPendingText();
        }
    }

    /**
     * Whitespace is handled exactly like any other character data.
     *
     * @internal
     */
    onWhitespaceCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    /**
     * Null characters are handled exactly like any other character data.
     *
     * @internal
     */
    onNullCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    /**
     * Flushes buffered text and marks the parser as stopped.
     *
     * @internal
     */
    onEof(): void {
        this._emitPendingText();
        this.stopped = true;
    }

    /**
     * Flushes buffered text, then reports the start tag.
     *
     * @internal
     */
    onStartTag(token: Token.TagToken): void {
        this._emitPendingText();

        const { tagName, attrs, selfClosing, location } = token;
        const startTag: StartTag = { tagName, attrs, selfClosing, sourceCodeLocation: location };

        this.emitIfListenerExists('startTag', startTag);
    }

    /**
     * Flushes buffered text, then reports the end tag.
     *
     * @internal
     */
    onEndTag(token: Token.TagToken): void {
        this._emitPendingText();

        const { tagName, location } = token;
        const endTag: EndTag = { tagName, sourceCodeLocation: location };

        this.emitIfListenerExists('endTag', endTag);
    }

    /**
     * Flushes buffered text, then reports the doctype.
     *
     * @internal
     */
    onDoctype(token: Token.DoctypeToken): void {
        this._emitPendingText();

        const { name, publicId, systemId, location } = token;
        const doctype: Doctype = { name, publicId, systemId, sourceCodeLocation: location };

        this.emitIfListenerExists('doctype', doctype);
    }

    /**
     * Flushes buffered text, then reports the comment.
     *
     * @internal
     */
    onComment(token: Token.CommentToken): void {
        this._emitPendingText();

        const { data, location } = token;
        const comment: Comment = { text: data, sourceCodeLocation: location };

        this.emitIfListenerExists('comment', comment);
    }

    /**
     * Emits `token` under `eventName`, but only if somebody is listening.
     *
     * @param eventName Name of the event to emit.
     * @param token Payload passed to the listeners.
     * @returns `true` if the token was emitted, `false` if there were no listeners.
     */
    protected emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        const hasListeners = this.listenerCount(eventName) > 0;

        if (hasListeners) {
            this._emitToken(eventName, token);
        }

        return hasListeners;
    }

    /**
     * Emits `token` under `eventName` unconditionally.
     *
     * @param eventName Name of the event to emit.
     * @param token Payload passed to the listeners.
     */
    protected _emitToken(eventName: string, token: SaxToken): void {
        this.emit(eventName, token);
    }

    /** Reports the buffered text (if any) as a `text` event and clears the buffer. */
    private _emitPendingText(): void {
        if (this.pendingText === null) {
            return;
        }

        this.emitIfListenerExists('text', this.pendingText);
        this.pendingText = null;
    }
}
237
export interface SaxToken {
    /**
     * Source code location of the token.
     * Available if location info is enabled via {@link SAXParserOptions}.
     */
    sourceCodeLocation?: Token.Location | null;
}
242
export interface StartTag extends SaxToken {
    /** Name of the tag. */
    tagName: string;
    /** Attributes of the tag. */
    attrs: Token.Attribute[];
    /** `true` if the tag is self-closing. */
    selfClosing: boolean;
}
251
export interface EndTag extends SaxToken {
    /** Name of the tag. */
    tagName: string;
}
256
export interface Text extends SaxToken {
    /** The text content. */
    text: string;
}
261
export interface Comment extends SaxToken {
    /** The text of the comment. */
    text: string;
}
266
export interface Doctype extends SaxToken {
    /** Name of the document type. */
    name: string | null;
    /** Public identifier of the document type. */
    publicId: string | null;
    /** System identifier of the document type. */
    systemId: string | null;
}
275
export interface SAXParser {
    /** Emitted for each start tag the parser encounters. */
    on(event: 'startTag', listener: (startTag: StartTag) => void): this;
    /** Emitted for each end tag the parser encounters. */
    on(event: 'endTag', listener: (endTag: EndTag) => void): this;
    /** Emitted for each comment the parser encounters. */
    on(event: 'comment', listener: (comment: Comment) => void): this;
    /** Emitted for text content the parser encounters. */
    on(event: 'text', listener: (text: Text) => void): this;
    /** Emitted when the parser encounters a [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration). */
    on(event: 'doctype', listener: (doctype: Doctype) => void): this;
    /**
     * Generic event subscription, for any other event name.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}
295