• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import { html, type Token } from 'parse5';
2import {
3    SAXParser,
4    type EndTag,
5    type StartTag,
6    type Doctype,
7    type Text,
8    type Comment,
9    type SaxToken,
10} from 'parse5-sax-parser';
11import { escapeText, escapeAttribute } from 'entities/lib/escape.js';
12
/**
 * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter.
 * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example).
 *
 * The rewriter uses the raw source representation of tokens if they are not modified by the user. Therefore, the resulting
 * HTML is not affected by parser error-recovery mechanisms as in a classical parsing-serialization roundtrip.
 *
 * @example
 *
 * ```js
 * const RewritingStream = require('parse5-html-rewriting-stream');
 * const http = require('http');
 * const fs = require('fs');
 *
 * const file = fs.createWriteStream('/home/google.com.html');
 * const rewriter = new RewritingStream();
 *
 * // Replace spans with divs
 * rewriter.on('startTag', startTag => {
 *     if (startTag.tagName === 'span') {
 *         startTag.tagName = 'div';
 *     }
 *
 *     rewriter.emitStartTag(startTag);
 * });
 *
 * rewriter.on('endTag', endTag => {
 *     if (endTag.tagName === 'span') {
 *         endTag.tagName = 'div';
 *     }
 *
 *     rewriter.emitEndTag(endTag);
 * });
 *
 * // Wrap all text nodes with an <i> tag
 * rewriter.on('text', (_, raw) => {
 *     // Use the raw representation of text without HTML entities decoding
 *     rewriter.emitRaw(`<i>${raw}</i>`);
 * });
 *
 * http.get('http://google.com', res => {
 *    // Assumes response is UTF-8.
 *    res.setEncoding('utf8');
 *    // `RewritingStream` is a `Transform` stream, which means you can pipe
 *    // through it.
 *    res.pipe(rewriter).pipe(file);
 * });
 * ```
 */
export class RewritingStream extends SAXParser {
    /** Note: `sourceCodeLocationInfo` is always enabled. */
    constructor() {
        super({ sourceCodeLocationInfo: true });
    }

    /**
     * Feeds a chunk to the base parser and discards whatever it returns.
     *
     * @param chunk Chunk of HTML source to process.
     * @returns Always an empty string; output is written via `push` by the `emit*` methods instead.
     */
    override _transformChunk(chunk: string): string {
        // NOTE: ignore upstream return values as we want to push to
        // the `Writable` part of the `Transform` stream ourselves.
        super._transformChunk(chunk);
        return '';
    }

    /**
     * Returns the slice of the preprocessor's current `html` buffer that
     * corresponds to the given source location.
     *
     * @param location Source location whose `startOffset`/`endOffset` delimit the slice.
     * @returns The raw HTML text for that location.
     */
    private _getRawHtml(location: Token.Location): string {
        const { droppedBufferSize, html } = this.tokenizer.preprocessor;
        // Both offsets are shifted back by `droppedBufferSize` before they
        // are used as indices into the current buffer.
        const start = location.startOffset - droppedBufferSize;
        const end = location.endOffset - droppedBufferSize;

        return html.slice(start, end);
    }

    // Events
    /**
     * Delegates to the base implementation; when that returns `false`, the
     * token's raw HTML is written to the output unchanged.
     *
     * @param eventName Name of the event associated with the token.
     * @param token Token being processed.
     * @returns Always `true`.
     */
    protected override emitIfListenerExists(eventName: string, token: SaxToken): boolean {
        if (!super.emitIfListenerExists(eventName, token)) {
            this.emitRaw(this._getRawHtml(token.sourceCodeLocation!));
        }

        // NOTE: don't skip new lines after `<pre>` and other tags,
        // otherwise we'll have incorrect raw data.
        this.parserFeedbackSimulator.skipNextNewLine = false;
        return true;
    }

    // Emitter API
    /**
     * Emits `eventName` with the token and, as a second argument, the token's raw HTML.
     *
     * @param eventName Name of the event to emit.
     * @param token Token passed to listeners.
     */
    protected override _emitToken(eventName: string, token: SaxToken): void {
        this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation!));
    }

    /**
     * Emits a serialized document type token into the output stream.
     *
     * A missing (`null`/`undefined`) name is serialized as an empty name
     * rather than as the literal text `null`.
     *
     * @param token Document type token to serialize.
     */
    public emitDoctype(token: Doctype): void {
        let res = `<!DOCTYPE ${token.name ?? ''}`;

        if (token.publicId !== null) {
            res += ` PUBLIC "${token.publicId}"`;
        } else if (token.systemId !== null) {
            // The `SYSTEM` keyword is only written when there is no public identifier.
            res += ' SYSTEM';
        }

        if (token.systemId !== null) {
            res += ` "${token.systemId}"`;
        }

        res += '>';

        this.push(res);
    }

    /**
     * Emits a serialized start tag token into the output stream.
     * Attribute values are escaped with `escapeAttribute`.
     *
     * @param token Start tag token to serialize.
     */
    public emitStartTag(token: StartTag): void {
        let res = `<${token.tagName}`;

        for (const attr of token.attrs) {
            res += ` ${attr.name}="${escapeAttribute(attr.value)}"`;
        }

        res += token.selfClosing ? '/>' : '>';

        this.push(res);
    }

    /**
     * Emits a serialized end tag token into the output stream.
     *
     * @param token End tag token to serialize.
     */
    public emitEndTag(token: EndTag): void {
        this.push(`</${token.tagName}>`);
    }

    /**
     * Emits a serialized text token into the output stream.
     *
     * The text is written as-is when the rewriter is not in foreign content
     * and `html.hasUnescapedText` accepts the last start tag name; otherwise
     * it is escaped with `escapeText`.
     *
     * @param token Text token to serialize (only its `text` is used).
     */
    public emitText({ text }: Text): void {
        this.push(
            !this.parserFeedbackSimulator.inForeignContent &&
                html.hasUnescapedText(this.tokenizer.lastStartTagName, true)
                ? text
                : escapeText(text)
        );
    }

    /**
     * Emits a serialized comment token into the output stream.
     *
     * @param token Comment token to serialize.
     */
    public emitComment(token: Comment): void {
        this.push(`<!--${token.text}-->`);
    }

    /**
     * Emits a raw HTML string into the output stream.
     *
     * @param html String pushed to the output without any processing.
     */
    public emitRaw(html: string): void {
        this.push(html);
    }
}
157
export interface RewritingStream {
    /**
     * Subscribes to start tags found by the rewriter.
     * The listener receives the token and its raw HTML.
     */
    on(
        event: 'startTag',
        listener: (startTag: StartTag, rawHtml: string) => void
    ): this;

    /**
     * Subscribes to end tags found by the rewriter.
     * The listener receives the token and its raw HTML.
     */
    on(
        event: 'endTag',
        listener: (endTag: EndTag, rawHtml: string) => void
    ): this;

    /**
     * Subscribes to comments found by the rewriter.
     * The listener receives the token and its raw HTML.
     */
    on(
        event: 'comment',
        listener: (comment: Comment, rawHtml: string) => void
    ): this;

    /**
     * Subscribes to text content found by the rewriter.
     * The listener receives the token and its raw HTML.
     */
    on(
        event: 'text',
        listener: (text: Text, rawHtml: string) => void
    ): this;

    /**
     * Subscribes to [document type declarations](https://en.wikipedia.org/wiki/Document_type_declaration)
     * found by the rewriter. The listener receives the token and its raw HTML.
     */
    on(
        event: 'doctype',
        listener: (doctype: Doctype, rawHtml: string) => void
    ): this;

    /**
     * Catch-all overload for any other event name.
     *
     * @param event Name of the event
     * @param handler Event handler
     */
    on(
        event: string,
        handler: (...args: any[]) => void
    ): this;
}
178