1import { html, type Token } from 'parse5'; 2import { 3 SAXParser, 4 type EndTag, 5 type StartTag, 6 type Doctype, 7 type Text, 8 type Comment, 9 type SaxToken, 10} from 'parse5-sax-parser'; 11import { escapeText, escapeAttribute } from 'entities/lib/escape.js'; 12 13/** 14 * Streaming [SAX](https://en.wikipedia.org/wiki/Simple_API_for_XML)-style HTML rewriter. 15 * A [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform) (which means you can pipe _through_ it, see example). 16 * 17 * The rewriter uses the raw source representation of tokens if they are not modified by the user. Therefore, the resulting 18 * HTML is not affected by parser error-recovery mechanisms as in a classical parsing-serialization roundtrip. 19 * 20 * @example 21 * 22 * ```js 23 * const RewritingStream = require('parse5-html-rewriting-stream'); 24 * const http = require('http'); 25 * const fs = require('fs'); 26 * 27 * const file = fs.createWriteStream('/home/google.com.html'); 28 * const rewriter = new RewritingStream(); 29 * 30 * // Replace divs with spans 31 * rewriter.on('startTag', startTag => { 32 * if (startTag.tagName === 'span') { 33 * startTag.tagName = 'div'; 34 * } 35 * 36 * rewriter.emitStartTag(startTag); 37 * }); 38 * 39 * rewriter.on('endTag', endTag => { 40 * if (endTag.tagName === 'span') { 41 * endTag.tagName = 'div'; 42 * } 43 * 44 * rewriter.emitEndTag(endTag); 45 * }); 46 * 47 * // Wrap all text nodes with an <i> tag 48 * rewriter.on('text', (_, raw) => { 49 * // Use the raw representation of text without HTML entities decoding 50 * rewriter.emitRaw(`<i>${raw}</i>`); 51 * }); 52 * 53 * http.get('http://google.com', res => { 54 * // Assumes response is UTF-8. 55 * res.setEncoding('utf8'); 56 * // `RewritingStream` is a `Transform` stream, which means you can pipe 57 * // through it. 58 * res.pipe(rewriter).pipe(file); 59 * }); 60 * ``` 61 */ 62export class RewritingStream extends SAXParser { 63 /** Note: `sourceCodeLocationInfo` is always enabled. */ 64 constructor() { 65 super({ sourceCodeLocationInfo: true }); 66 } 67 68 override _transformChunk(chunk: string): string { 69 // NOTE: ignore upstream return values as we want to push to 70 // the `Writable` part of the `Transform` stream ourselves. 71 super._transformChunk(chunk); 72 return ''; 73 } 74 75 private _getRawHtml(location: Token.Location): string { 76 const { droppedBufferSize, html } = this.tokenizer.preprocessor; 77 const start = location.startOffset - droppedBufferSize; 78 const end = location.endOffset - droppedBufferSize; 79 80 return html.slice(start, end); 81 } 82 83 // Events 84 protected override emitIfListenerExists(eventName: string, token: SaxToken): boolean { 85 if (!super.emitIfListenerExists(eventName, token)) { 86 this.emitRaw(this._getRawHtml(token.sourceCodeLocation!)); 87 } 88 89 // NOTE: don't skip new lines after `<pre>` and other tags, 90 // otherwise we'll have incorrect raw data. 91 this.parserFeedbackSimulator.skipNextNewLine = false; 92 return true; 93 } 94 95 // Emitter API 96 protected override _emitToken(eventName: string, token: SaxToken): void { 97 this.emit(eventName, token, this._getRawHtml(token.sourceCodeLocation!)); 98 } 99 100 /** Emits a serialized document type token into the output stream. */ 101 public emitDoctype(token: Doctype): void { 102 let res = `<!DOCTYPE ${token.name}`; 103 104 if (token.publicId !== null) { 105 res += ` PUBLIC "${token.publicId}"`; 106 } else if (token.systemId !== null) { 107 res += ' SYSTEM'; 108 } 109 110 if (token.systemId !== null) { 111 res += ` "${token.systemId}"`; 112 } 113 114 res += '>'; 115 116 this.push(res); 117 } 118 119 /** Emits a serialized start tag token into the output stream. */ 120 public emitStartTag(token: StartTag): void { 121 let res = `<${token.tagName}`; 122 123 for (const attr of token.attrs) { 124 res += ` ${attr.name}="${escapeAttribute(attr.value)}"`; 125 } 126 127 res += token.selfClosing ? '/>' : '>'; 128 129 this.push(res); 130 } 131 132 /** Emits a serialized end tag token into the output stream. */ 133 public emitEndTag(token: EndTag): void { 134 this.push(`</${token.tagName}>`); 135 } 136 137 /** Emits a serialized text token into the output stream. */ 138 public emitText({ text }: Text): void { 139 this.push( 140 !this.parserFeedbackSimulator.inForeignContent && 141 html.hasUnescapedText(this.tokenizer.lastStartTagName, true) 142 ? text 143 : escapeText(text) 144 ); 145 } 146 147 /** Emits a serialized comment token into the output stream. */ 148 public emitComment(token: Comment): void { 149 this.push(`<!--${token.text}-->`); 150 } 151 152 /** Emits a raw HTML string into the output stream. */ 153 public emitRaw(html: string): void { 154 this.push(html); 155 } 156} 157 158export interface RewritingStream { 159 /** Raised when the rewriter encounters a start tag. */ 160 on(event: 'startTag', listener: (startTag: StartTag, rawHtml: string) => void): this; 161 /** Raised when rewriter encounters an end tag. */ 162 on(event: 'endTag', listener: (endTag: EndTag, rawHtml: string) => void): this; 163 /** Raised when rewriter encounters a comment. */ 164 on(event: 'comment', listener: (comment: Comment, rawHtml: string) => void): this; 165 /** Raised when rewriter encounters text content. */ 166 on(event: 'text', listener: (text: Text, rawHtml: string) => void): this; 167 /** Raised when rewriter encounters a [document type declaration](https://en.wikipedia.org/wiki/Document_type_declaration). */ 168 on(event: 'doctype', listener: (doctype: Doctype, rawHtml: string) => void): this; 169 170 /** 171 * Base event handler. 172 * 173 * @param event Name of the event 174 * @param handler Event handler 175 */ 176 on(event: string, handler: (...args: any[]) => void): this; 177} 178