// NOTE: source-browser navigation text (Home, Line#, Scopes#, Navigate#, Raw, Download) was captured here; it is not part of the module.
import { Transform } from 'node:stream';
import type { Tokenizer, TokenHandler, Token } from 'parse5';
import { DevNullStream } from './dev-null-stream.js';
import { ParserFeedbackSimulator } from './parser-feedback-simulator.js';

/** Options accepted by the {@link SAXParser} constructor. */
export interface SAXParserOptions {
    /**
     * Enables source code location information for tokens.
     *
     * When enabled, each token will have a `sourceCodeLocation` property.
     *
     * @default false
     */
    sourceCodeLocationInfo?: boolean;
}

/**
 * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML parser.
 * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
 *
 * Every chunk written to the parser is passed downstream unchanged; the
 * tokens found in it are reported through the `startTag`, `endTag`,
 * `comment`, `text` and `doctype` events. An event object is only built
 * into an emission when at least one listener is registered for it.
 *
 * @example
 *
 * ```js
 *     const { SAXParser } = require('parse5-sax-parser');
 *     const http = require('http');
 *     const fs = require('fs');
 *
 *     const file = fs.createWriteStream('/home/google.com.html');
 *     const parser = new SAXParser();
 *
 *     parser.on('text', text => {
 *        // Handle page text content
 *        ...
 *     });
 *
 *     http.get('http://google.com', res => {
 *        // `SAXParser` is the `Transform` stream, which means you can pipe
 *        // through it. So, you can analyze the page content and, e.g., save it
 *        // to the file at the same time:
 *        res.pipe(parser).pipe(file);
 *     });
 * ```
 */
export class SAXParser extends Transform implements TokenHandler {
    /** Effective options: the defaults overlaid with the values passed to the constructor. */
    protected options: SAXParserOptions;
    /** @internal */
    protected parserFeedbackSimulator: ParserFeedbackSimulator;
    /** Text gathered from consecutive character tokens that has not been emitted yet. */
    private pendingText: Text | null = null;
    /** Set by `_final`; passed as the second argument of `tokenizer.write`. */
    private lastChunkWritten = false;
    /** Once `true`, incoming chunks are no longer handed to the tokenizer. */
    private stopped = false;
    /** Tokenizer exposed by the parser feedback simulator. */
    protected tokenizer: Tokenizer;

    /**
     * @param options Parsing options.
     */
    constructor(options: SAXParserOptions = {}) {
        super({ encoding: 'utf8', decodeStrings: false });

        // Caller-supplied values win over the defaults.
        this.options = {
            sourceCodeLocationInfo: false,
            ...options,
        };

        this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.options, this);
        this.tokenizer = this.parserFeedbackSimulator.tokenizer;

        // NOTE: always pipe the stream to the /dev/null stream to avoid
        // the `highWaterMark` to be hit even if we don't have consumers.
        // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
        this.pipe(new DevNullStream());
    }

    //`Transform` implementation
    /**
     * Feeds one chunk to the tokenizer and passes it on to the readable side.
     *
     * @param chunk Chunk of HTML text.
     * @param _encoding Unused.
     * @param callback Invoked with `null` and the value returned by `_transformChunk`.
     * @throws {TypeError} Synchronously, if `chunk` is not a string.
     */
    override _transform(
        chunk: string,
        _encoding: string,
        callback: (error?: Error | null, data?: string) => void,
    ): void {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        callback(null, this._transformChunk(chunk));
    }

    /**
     * Marks the input as complete and runs one last, empty chunk through
     * `_transformChunk` so the tokenizer receives the "last chunk" flag.
     *
     * @param callback Completion callback.
     */
    override _final(callback: (error?: Error | null, data?: string) => void): void {
        this.lastChunkWritten = true;
        callback(null, this._transformChunk(''));
    }

    /**
     * Stops parsing. Useful if you want the parser to stop consuming CPU time
     * once you've obtained the desired info from the input stream. Doesn't
     * prevent piping, so that data will flow through the parser as usual.
     *
     * @example
     *
     * ```js
     * const { SAXParser } = require('parse5-sax-parser');
     * const http = require('http');
     * const fs = require('fs');
     *
     * const file = fs.createWriteStream('google.com.html');
     * const parser = new SAXParser();
     *
     * parser.on('doctype', ({ name, publicId, systemId }) => {
     *     // Process doctype info and stop parsing
     *     ...
     *     parser.stop();
     * });
     *
     * http.get('http://google.com', res => {
     *     // Despite the fact that parser.stop() was called whole
     *     // content of the page will be written to the file
     *     res.pipe(parser).pipe(file);
     * });
     * ```
     */
    public stop(): void {
        this.stopped = true;
        this.tokenizer.pause();
    }

    //Internals
    /**
     * Hands `chunk` to the tokenizer unless parsing has been stopped.
     *
     * @param chunk Chunk of HTML text.
     * @returns The chunk, unchanged.
     */
    protected _transformChunk(chunk: string): string {
        if (!this.stopped) {
            this.tokenizer.write(chunk, this.lastChunkWritten);
        }
        return chunk;
    }

    /** @internal */
    onCharacter({ chars, location }: Token.CharacterToken): void {
        if (this.pendingText === null) {
            this.pendingText = { text: chars, sourceCodeLocation: location };
        } else {
            this.pendingText.text += chars;

            // Extend the pending location: the start of the run is kept and
            // only the end coordinates are taken from the newest token.
            if (location && this.pendingText.sourceCodeLocation) {
                const { endLine, endCol, endOffset } = location;
                this.pendingText.sourceCodeLocation = {
                    ...this.pendingText.sourceCodeLocation,
                    endLine,
                    endCol,
                    endOffset,
                };
            }
        }

        // NOTE(review): presumably the text has to be flushed before the
        // preprocessor discards the already-parsed part of its buffer —
        // confirm against the tokenizer's preprocessor.
        if (this.tokenizer.preprocessor.willDropParsedChunk()) {
            this._emitPendingText();
        }
    }

    /** @internal */
    onWhitespaceCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    /** @internal */
    onNullCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }

    /** @internal */
    onEof(): void {
        this._emitPendingText();
        this.stopped = true;
    }

    /** @internal */
    onStartTag(token: Token.TagToken): void {
        // Pending text precedes the tag, so it has to be reported first.
        this._emitPendingText();

        const startTag: StartTag = {
            tagName: token.tagName,
            attrs: token.attrs,
            selfClosing: token.selfClosing,
            sourceCodeLocation: token.location,
        };
        this.emitIfListenerExists('startTag', startTag);
    }

    /** @internal */
    onEndTag(token: Token.TagToken): void {
        this._emitPendingText();

        const endTag: EndTag = {
            tagName: token.tagName,
            sourceCodeLocation: token.location,
        };
        this.emitIfListenerExists('endTag', endTag);
    }

    /** @internal */
    onDoctype(token: Token.DoctypeToken): void {
        this._emitPendingText();

        const doctype: Doctype = {
            name: token.name,
            publicId: token.publicId,
            systemId: token.systemId,
            sourceCodeLocation: token.location,
        };
        this.emitIfListenerExists('doctype', doctype);
    }

    /** @internal */
    onComment(token: Token.CommentToken): void {
        this._emitPendingText();

        const comment: Comment = {
            text: token.data,
            sourceCodeLocation: token.location,
        };
        this.emitIfListenerExists('comment', comment);
    }

    /**
     * Emits `token` under `eventName`, but only if that event has listeners.
     *
     * @param eventName Name of the event to emit.
     * @param token Event payload.
     * @returns `true` if the token was emitted, `false` if nobody listens.
     */
    protected emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        if (this.listenerCount(eventName) === 0) {
            return false;
        }

        this._emitToken(eventName, token);

        return true;
    }

    /**
     * Emits `token` under `eventName` unconditionally.
     *
     * @param eventName Name of the event to emit.
     * @param token Event payload.
     */
    protected _emitToken(eventName: string, token: SaxToken): void {
        this.emit(eventName, token);
    }

    /** Reports the accumulated text, if any, as a `text` event and resets the accumulator. */
    private _emitPendingText(): void {
        if (this.pendingText !== null) {
            this.emitIfListenerExists('text', this.pendingText);
            this.pendingText = null;
        }
    }
}

/** Fields shared by every token object passed to {@link SAXParser} event listeners. */
export interface SaxToken {
    /** Source code location info. Available if location info is enabled via {@link SAXParserOptions}. */
    sourceCodeLocation?: Token.Location | null;
}

/** Payload of the `startTag` event. */
export interface StartTag extends SaxToken {
    /** Tag name */
    tagName: string;
    /** List of attributes */
    attrs: Token.Attribute[];
    /** Indicates if the tag is self-closing */
    selfClosing: boolean;
}

/** Payload of the `endTag` event. */
export interface EndTag extends SaxToken {
    /** Tag name */
    tagName: string;
}

/** Payload of the `text` event. */
export interface Text extends SaxToken {
    /** Text content. */
    text: string;
}

/** Payload of the `comment` event. */
export interface Comment extends SaxToken {
    /** Comment text. */
    text: string;
}

/** Payload of the `doctype` event. */
export interface Doctype extends SaxToken {
    /** Document type name. */
    name: string | null;
    /** Document type public identifier. */
    publicId: string | null;
    /** Document type system identifier. */
    systemId: string | null;
}

/**
 * Typed `on` overloads for the events raised by the parser. This interface
 * merges with the `SAXParser` class declaration.
 */
export interface SAXParser {
    /** Raised when the parser encounters a start tag. */
    on(event: 'startTag', listener: (startTag: StartTag) => void): this;
    /** Raised when the parser encounters an end tag. */
    on(event: 'endTag', listener: (endTag: EndTag) => void): this;
    /** Raised when the parser encounters a comment. */
    on(event: 'comment', listener: (comment: Comment) => void): this;
    /** Raised when the parser encounters text content. */
    on(event: 'text', listener: (text: Text) => void): this;
    /** Raised when the parser encounters a [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration) */
    on(event: 'doctype', listener: (doctype: Doctype) => void): this;
    /**
     * Base event handler.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(event: string, handler: (...args: any[]) => void): this;
}
