import {
    CODE_POINTS as $,
    getSurrogatePairCodePoint,
    isControlCodePoint,
    isSurrogate,
    isSurrogatePair,
    isUndefinedCodePoint,
} from '../common/unicode.js';
import { ERR, type ParserError, type ParserErrorHandler } from '../common/error-codes.js';

//Const
const DEFAULT_BUFFER_WATERLINE = 1 << 16;

//Preprocessor
//NOTE: HTML input preprocessing
//(see: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream)
/**
 * Buffers HTML source text (optionally delivered in several chunks) and hands it
 * out one code point at a time via `advance()`.
 *
 * While doing so it:
 *  - reports CR and CR LF as a single LF,
 *  - combines UTF-16 surrogate pairs into one code point,
 *  - tracks `line`, `col` and `offset` for error locations,
 *  - reports control characters, noncharacters and isolated surrogates to
 *    `handler.onParseError`, when such a handler is set.
 */
export class Preprocessor {
    /** Buffered input. Already-consumed text stays here until `dropParsedChunk()` removes it. */
    public html = '';
    /** Index in `html` of the code unit most recently consumed by `advance()`; -1 before the first call. */
    private pos = -1;
    // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
    /**
     * Position of the most recent "gap": a code unit that was consumed without being
     * returned on its own (the LF of a CR LF sequence, or the second half of a surrogate pair).
     */
    private lastGapPos = -2;
    /** Earlier values of `lastGapPos`, restored one by one while retreating. */
    private gapStack: number[] = [];
    /** True right after a CR was consumed, so that a directly following LF is swallowed. */
    private skipNextNewLine = false;
    /** Value of `isLastChunk` passed to the most recent `write()` call. */
    private lastChunkWritten = false;
    /** Set when a read ran past the end of the buffer while more chunks may still arrive. */
    public endOfChunkHit = false;
    /** `dropParsedChunk()` only discards consumed input once `pos` exceeds this value. */
    public bufferWaterline = DEFAULT_BUFFER_WATERLINE;

    /** True when the code unit at `pos` ended a line; the next `advance()` then starts a new line. */
    private isEol = false;
    /** Index in `html` at which the current line starts (shifted by `dropParsedChunk()`). */
    private lineStartPos = 0;
    /** Number of code units removed from the front of `html` so far. */
    public droppedBufferSize = 0;
    /** Current line number, starting at 1. */
    public line = 1;

    /**
     * @param handler Object whose optional `onParseError` callback receives input-stream errors.
     */
    constructor(private handler: { onParseError?: ParserErrorHandler | null }) {}

    /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
    public get col(): number {
        return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
    }

    /** Position of the current code unit counted from the start of all input written so far. */
    public get offset(): number {
        return this.droppedBufferSize + this.pos;
    }

    /**
     * Builds a zero-width error record located at the current position.
     *
     * @param code Error code to report.
     * @returns Error object whose start and end line/column/offset are all the current position.
     */
    public getError(code: ERR): ParserError {
        const { line, col, offset } = this;

        return {
            code,
            startLine: line,
            endLine: line,
            startCol: col,
            endCol: col,
            startOffset: offset,
            endOffset: offset,
        };
    }

    //NOTE: avoid reporting errors twice on advance/retreat
    private lastErrOffset = -1;

    /** Reports `code` to the handler unless an error was already reported at this offset. */
    private _err(code: ERR): void {
        if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
            this.lastErrOffset = this.offset;
            this.handler.onParseError(this.getError(code));
        }
    }

    /** Marks the current position as a gap, remembering the previous gap so `retreat()` can restore it. */
    private _addGap(): void {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }

    /**
     * Handles a surrogate code unit found at the current position.
     *
     * @param cp The surrogate code unit at `pos`.
     * @returns The combined code point when the next code unit completes a pair, `EOF` when the
     * buffer ends here and more chunks may follow, otherwise `cp` itself (reported as an error).
     */
    private _processSurrogate(cp: number): number {
        //NOTE: try to peek a surrogate pair
        if (this.pos !== this.html.length - 1) {
            const nextCp = this.html.charCodeAt(this.pos + 1);

            if (isSurrogatePair(nextCp)) {
                //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                this.pos++;

                //NOTE: add a gap that should be avoided during retreat
                this._addGap();

                return getSurrogatePairCodePoint(cp, nextCp);
            }
        }

        //NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
        else if (!this.lastChunkWritten) {
            this.endOfChunkHit = true;
            return $.EOF;
        }

        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);

        return cp;
    }

    /** @returns True when `pos` has passed `bufferWaterline`, i.e. `dropParsedChunk()` would discard input. */
    public willDropParsedChunk(): boolean {
        return this.pos > this.bufferWaterline;
    }

    /**
     * Discards everything before the current position once the waterline has been passed.
     * The code unit at `pos` is kept and becomes index 0; gap history is reset.
     */
    public dropParsedChunk(): void {
        if (this.willDropParsedChunk()) {
            this.html = this.html.substring(this.pos);
            // Keep `col` and `offset` stable after the buffer has been shifted.
            this.lineStartPos -= this.pos;
            this.droppedBufferSize += this.pos;
            this.pos = 0;
            this.lastGapPos = -2;
            this.gapStack.length = 0;
        }
    }

    /**
     * Appends a chunk of input to the buffer.
     *
     * @param chunk Text to append.
     * @param isLastChunk Whether no further chunks will be written after this one.
     */
    public write(chunk: string, isLastChunk: boolean): void {
        if (this.html.length > 0) {
            this.html += chunk;
        } else {
            this.html = chunk;
        }

        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    /**
     * Inserts text directly after the current position, so it is the next input to be consumed.
     *
     * @param chunk Text to insert.
     */
    public insertHtmlAtCurrentPos(chunk: string): void {
        this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);

        this.endOfChunkHit = false;
    }

    /**
     * Tests whether the buffer, starting at the current position, begins with `pattern`.
     *
     * @param pattern Text to look for. For a case-insensitive match it has to be given in
     * lower case, because only the input side is folded.
     * @param caseSensitive When false, each input code unit is OR-ed with 0x20 before comparing.
     * @returns True on a match. False on a mismatch or when the buffer is too short; in the
     * latter case `endOfChunkHit` is set if more chunks may still arrive.
     */
    public startsWith(pattern: string, caseSensitive: boolean): boolean {
        // Check if our buffer has enough characters
        if (this.pos + pattern.length > this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return false;
        }

        if (caseSensitive) {
            return this.html.startsWith(pattern, this.pos);
        }

        for (let i = 0; i < pattern.length; i++) {
            const cp = this.html.charCodeAt(this.pos + i) | 0x20;

            if (cp !== pattern.charCodeAt(i)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Reads the code unit `offset` positions after the current one without consuming it.
     * CR is reported as LF; surrogate pairs are not combined.
     *
     * @param offset Distance from the current position.
     * @returns The code unit, or `EOF` when it lies beyond the buffer (in which case
     * `endOfChunkHit` is set if more chunks may still arrive).
     */
    public peek(offset: number): number {
        const pos = this.pos + offset;

        if (pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        const code = this.html.charCodeAt(pos);

        return code === $.CARRIAGE_RETURN ? $.LINE_FEED : code;
    }

    /**
     * Consumes and returns the next code point.
     *
     * @returns The next code point with CR / CR LF reported as a single LF and surrogate
     * pairs combined, or `EOF` when the buffer is exhausted.
     */
    public advance(): number {
        this.pos++;

        //NOTE: LF should be in the last column of the line
        if (this.isEol) {
            this.isEol = false;
            this.line++;
            this.lineStartPos = this.pos;
        }

        if (this.pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        let cp = this.html.charCodeAt(this.pos);

        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.isEol = true;
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }

        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (cp === $.LINE_FEED) {
            this.isEol = true;

            if (this.skipNextNewLine) {
                // `line` will be bumped again in the recursive call.
                this.line--;
                this.skipNextNewLine = false;
                this._addGap();
                return this.advance();
            }
        }

        this.skipNextNewLine = false;

        if (isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }

        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into detailed performance cost validation.
        //NOTE: the handler is optional, so it may be `undefined` as well as `null`. Use the same
        //truthiness test as `_err`, so validation is skipped whenever nothing could be reported.
        const isCommonValidRange =
            !this.handler.onParseError ||
            (cp > 0x1f && cp < 0x7f) ||
            cp === $.LINE_FEED ||
            cp === $.CARRIAGE_RETURN ||
            (cp > 0x9f && cp < 0xfd_d0);

        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }

        return cp;
    }

    /** Reports `cp` as an error if it is classified as a control code point or as a noncharacter. */
    private _checkForProblematicCharacters(cp: number): void {
        if (isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        } else if (isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    /**
     * Moves the current position backwards.
     *
     * @param count Number of positions to step back. Gaps recorded by `advance()` that are
     * crossed on the way are skipped in addition. `line` is not adjusted.
     */
    public retreat(count: number): void {
        this.pos -= count;

        // Step over every gap we moved past, restoring the gap that preceded it.
        while (this.pos < this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop()!;
            this.pos--;
        }

        this.isEol = false;
    }
}