• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import {
2    CODE_POINTS as $,
3    getSurrogatePairCodePoint,
4    isControlCodePoint,
5    isSurrogate,
6    isSurrogatePair,
7    isUndefinedCodePoint,
8} from '../common/unicode.js';
9import { ERR, type ParserError, type ParserErrorHandler } from '../common/error-codes.js';
10
/** Default buffer waterline, in UTF-16 code units (2^16 = 65536). */
const DEFAULT_BUFFER_WATERLINE = 2 ** 16;
13
/**
 * HTML input-stream preprocessor.
 *
 * Feeds code points to its consumer one at a time while applying the input
 * preprocessing rules of the HTML parsing specification
 * (see: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream):
 *
 *  - CR and CRLF are normalised to a single LF;
 *  - UTF-16 surrogate pairs are combined into one code point;
 *  - isolated surrogates, control characters and noncharacters are reported
 *    through `handler.onParseError` (when a callback is provided).
 *
 * Input may arrive in several chunks (see `write`). When the buffer is
 * exhausted before the last chunk was written, `endOfChunkHit` is raised and
 * `EOF` is returned so that the caller can wait for more data.
 *
 * The class also tracks the current `line`, `col` and absolute `offset`.
 */
export class Preprocessor {
    /** Buffered input that has been written but not yet dropped. */
    public html = '';
    /** Index into `html` of the code unit returned by the last `advance()`; -1 before the first one. */
    private pos = -1;
    // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
    private lastGapPos = -2;
    /** Earlier values of `lastGapPos`, restored by `retreat()` when it steps back over a gap. */
    private gapStack: number[] = [];
    /** Set after a CR so that a directly following LF is swallowed. */
    private skipNextNewLine = false;
    /** Whether the most recent `write()` was flagged as the last chunk. */
    private lastChunkWritten = false;
    /** Raised when a read ran past the buffered data while more chunks are still expected. */
    public endOfChunkHit = false;
    /** `pos` threshold above which `dropParsedChunk()` discards the consumed prefix of `html`. */
    public bufferWaterline = DEFAULT_BUFFER_WATERLINE;

    /** The last returned code point ended a line; `line` is bumped on the next `advance()`. */
    private isEol = false;
    /** Index (relative to `html`) of the first character of the current line; may go negative after a drop. */
    private lineStartPos = 0;
    /** Number of code units discarded from the front of `html` so far. */
    public droppedBufferSize = 0;
    /** Current line number, starting at 1. */
    public line = 1;

    //NOTE: avoid reporting errors twice on advance/retreat
    private lastErrOffset = -1;

    /**
     * @param handler Object whose optional `onParseError` callback receives
     * input-stream errors. A missing or `null` callback disables reporting.
     */
    constructor(private handler: { onParseError?: ParserErrorHandler | null }) {}

    /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
    public get col(): number {
        return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
    }

    /** Absolute offset of the current position, including chunks that were already dropped. */
    public get offset(): number {
        return this.droppedBufferSize + this.pos;
    }

    /**
     * Builds a zero-width error record located at the current position.
     *
     * @param code Error code to put into the record.
     * @returns An error whose start and end location are both the current line/col/offset.
     */
    public getError(code: ERR): ParserError {
        const { line, col, offset } = this;

        return {
            code,
            startLine: line,
            endLine: line,
            startCol: col,
            endCol: col,
            startOffset: offset,
            endOffset: offset,
        };
    }

    /** Reports `code` to the handler, at most once in a row for the same offset. */
    private _err(code: ERR): void {
        if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
            this.lastErrOffset = this.offset;
            this.handler.onParseError(this.getError(code));
        }
    }

    /** Marks the current position as a gap, remembering the previous gap for `retreat()`. */
    private _addGap(): void {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }

    /**
     * Handles a surrogate code unit found at the current position.
     *
     * @param cp The surrogate code unit at `pos`.
     * @returns The combined code point for a valid pair, `EOF` when the
     * buffer ends here and more chunks are expected, or `cp` itself (after
     * reporting `surrogateInInputStream`) for an isolated surrogate.
     */
    private _processSurrogate(cp: number): number {
        //NOTE: try to peek a surrogate pair
        if (this.pos !== this.html.length - 1) {
            // Only a lead (high) surrogate, U+D800..U+DBFF, can open a pair.
            // Checked here explicitly: a trail surrogate followed by another
            // trail surrogate is not a pair, both are isolated surrogates.
            const isLeadSurrogate = cp >= 0xd8_00 && cp <= 0xdb_ff;

            if (isLeadSurrogate) {
                const nextCp = this.html.charCodeAt(this.pos + 1);

                if (isSurrogatePair(nextCp)) {
                    //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                    this.pos++;

                    //NOTE: add a gap that should be avoided during retreat
                    this._addGap();

                    return getSurrogatePairCodePoint(cp, nextCp);
                }
            }
        }

        //NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
        else if (!this.lastChunkWritten) {
            this.endOfChunkHit = true;
            return $.EOF;
        }

        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);

        return cp;
    }

    /** @returns `true` when `pos` has moved past `bufferWaterline`, i.e. `dropParsedChunk()` would drop data. */
    public willDropParsedChunk(): boolean {
        return this.pos > this.bufferWaterline;
    }

    /**
     * Discards everything before the current position once the waterline is
     * exceeded. Positions are rebased so that `offset` and `col` stay
     * continuous; gap history is forgotten.
     */
    public dropParsedChunk(): void {
        if (this.willDropParsedChunk()) {
            this.html = this.html.substring(this.pos);
            this.lineStartPos -= this.pos;
            this.droppedBufferSize += this.pos;
            this.pos = 0;
            this.lastGapPos = -2;
            this.gapStack.length = 0;
        }
    }

    /**
     * Appends a chunk of input to the buffer.
     *
     * @param chunk Text to append.
     * @param isLastChunk Whether no further chunks will follow.
     */
    public write(chunk: string, isLastChunk: boolean): void {
        if (this.html.length > 0) {
            this.html += chunk;
        } else {
            this.html = chunk;
        }

        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    /**
     * Inserts `chunk` into the buffer right after the current position, so
     * that it is the next text to be read.
     */
    public insertHtmlAtCurrentPos(chunk: string): void {
        this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);

        this.endOfChunkHit = false;
    }

    /**
     * Tests whether the buffer, starting at the current position, begins with `pattern`.
     *
     * @param pattern Text to look for. For a case-insensitive match each
     * buffered code unit is OR-ed with 0x20 before comparison, so `pattern`
     * is expected to already be lowercase.
     * @param caseSensitive Compare exactly when `true`.
     * @returns `true` on a match. `false` is also returned when too little
     * data is buffered; `endOfChunkHit` is then raised if more chunks are expected.
     */
    public startsWith(pattern: string, caseSensitive: boolean): boolean {
        // Check if our buffer has enough characters
        if (this.pos + pattern.length > this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return false;
        }

        if (caseSensitive) {
            return this.html.startsWith(pattern, this.pos);
        }

        for (let i = 0; i < pattern.length; i++) {
            const cp = this.html.charCodeAt(this.pos + i) | 0x20;

            if (cp !== pattern.charCodeAt(i)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Reads the code unit `offset` positions away from the current one
     * without consuming anything.
     *
     * @returns The code unit (CR is reported as LF), or `EOF` when the
     * position lies beyond the buffered data.
     */
    public peek(offset: number): number {
        const pos = this.pos + offset;

        if (pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        const code = this.html.charCodeAt(pos);

        return code === $.CARRIAGE_RETURN ? $.LINE_FEED : code;
    }

    /**
     * Consumes and returns the next preprocessed code point.
     *
     * @returns The next code point, or `EOF` when the buffered data is exhausted.
     */
    public advance(): number {
        this.pos++;

        //NOTE: LF should be in the last column of the line
        if (this.isEol) {
            this.isEol = false;
            this.line++;
            this.lineStartPos = this.pos;
        }

        if (this.pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        let cp = this.html.charCodeAt(this.pos);

        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.isEol = true;
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }

        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (cp === $.LINE_FEED) {
            this.isEol = true;

            if (this.skipNextNewLine) {
                // `line` will be bumped again in the recursive call.
                this.line--;
                this.skipNextNewLine = false;
                this._addGap();
                return this.advance();
            }
        }

        this.skipNextNewLine = false;

        if (isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }

        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into detailed performance cost validation.
        //Validation is skipped entirely when there is no error callback
        //(`null` or omitted), since nothing could be reported anyway.
        const isCommonValidRange =
            !this.handler.onParseError ||
            (cp > 0x1f && cp < 0x7f) ||
            cp === $.LINE_FEED ||
            cp === $.CARRIAGE_RETURN ||
            (cp > 0x9f && cp < 0xfd_d0);

        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }

        return cp;
    }

    /** Reports an error if `cp` is a control character or a noncharacter. */
    private _checkForProblematicCharacters(cp: number): void {
        if (isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        } else if (isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    /**
     * Steps back by `count` code points. Gaps recorded by `advance()` are
     * skipped over, one extra code unit per gap.
     */
    public retreat(count: number): void {
        this.pos -= count;

        while (this.pos < this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop()!;
            this.pos--;
        }

        this.isEol = false;
    }
}
247