import * as assert from 'node:assert';
import * as fs from 'node:fs';
import * as path from 'node:path';
import type { ParserError, Token } from 'parse5';
import { type Tokenizer, TokenizerMode, type TokenHandler } from 'parse5';
import { makeChunks } from './common.js';

export type HtmlLibToken = [string, string | null, ...unknown[]];

interface TokenError {
    code: string;
    line: number;
    col: number;
}

interface TokenSourceData {
    tokens: HtmlLibToken[];
    errors: TokenError[];
}

type TokenSourceCreator = (data: TokenizeHandler) => Tokenizer;

/** Receives events and immediately compares them against the expected values. We check the entire output again at the end. */
class TokenizeHandler implements TokenSourceData, TokenHandler {
    constructor(private testData: LoadedTest) {}

    private addToken(token: HtmlLibToken): void {
        assert.deepStrictEqual(token, this.testData.expected[this.tokens.length]);

        this.tokens.push(token);
    }

    onComment(token: Token.CommentToken): void {
        this.addToken(['Comment', token.data]);
    }
    onDoctype(token: Token.DoctypeToken): void {
        this.addToken(['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]);
    }
    onStartTag(token: Token.TagToken): void {
        const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value]));
        const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs];

        if (token.selfClosing) {
            startTagEntry.push(true);
        }

        this.addToken(startTagEntry);
    }
    onEndTag(token: Token.TagToken): void {
        // NOTE: parser feedback simulator can produce adjusted SVG
        // tag names for end tag tokens so we need to lower case it
        this.addToken(['EndTag', token.tagName.toLowerCase()]);
    }
    onEof(): void {
        this.sawEof = true;
    }
    onCharacter(token: Token.CharacterToken): void {
        const lastEntry = this.tokens[this.tokens.length - 1];

        if (lastEntry && lastEntry[0] === 'Character' && lastEntry[1] != null) {
            lastEntry[1] += token.chars;
        } else {
            this.tokens.push(['Character', token.chars]);
        }

        const actual = this.tokens[this.tokens.length - 1];
        const expected = this.testData.expected[this.tokens.length - 1];
        assert.strictEqual('Character', expected[0]);
        assert.ok(typeof actual[1] === 'string');
        assert.ok(expected[1]?.startsWith(actual[1]));
    }
    onNullCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }
    onWhitespaceCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }
    onParseError(err: ParserError): void {
        assert.ok(
            this.testData.expectedErrors.some(
                ({ code, line, col }) => code === err.code && line === err.startLine && col === err.startCol
            )
        );

        this.errors.push({
            code: err.code,
            line: err.startLine,
            col: err.startCol,
        });
    }

    public sawEof = false;
    public tokens: HtmlLibToken[] = [];
    public errors: TokenError[] = [];
}

function tokenize(createTokenSource: TokenSourceCreator, chunks: string[], testData: LoadedTest): TokenSourceData {
    const result = new TokenizeHandler(testData);
    const tokenizer = createTokenSource(result);

    // NOTE: set small waterline for testing purposes
    tokenizer.preprocessor.bufferWaterline = 8;
    tokenizer.state = testData.initialState;

    if (testData.lastStartTag) {
        tokenizer.lastStartTagName = testData.lastStartTag;
    }

    for (let i = 0; i < chunks.length; i++) {
        assert.ok(!result.sawEof);
        tokenizer.write(chunks[i], i === chunks.length - 1);
    }

    assert.ok(result.sawEof);
    assert.ok(!tokenizer.active);

    // Sort errors by line and column
    result.errors.sort((err1, err2) => err1.line - err2.line || err1.col - err2.col);

    return result;
}

function unicodeUnescape(str: string): string {
    return str.replace(/\\[Uu]\w{4}/g, (match: string) => String.fromCharCode(Number.parseInt(match.slice(2), 16)));
}

function unescapeDescrIO(testDescr: TestDescription): void {
    testDescr.input = unicodeUnescape(testDescr.input);

    for (const tokenEntry of testDescr.output) {
        //NOTE: unescape token tagName (for StartTag and EndTag tokens), comment data (for Comment token),
        //character token data (for Character token).
        if (tokenEntry[1]) {
            tokenEntry[1] = unicodeUnescape(tokenEntry[1]);
        }
    }
}

function getTokenizerSuitableStateName(testDataStateName: string): Tokenizer['state'] {
    const name = testDataStateName.slice(0, -6).replace(' ', '_').toUpperCase();
    return TokenizerMode[name as keyof typeof TokenizerMode];
}

interface TestDescription {
    initialStates: string[];
    doubleEscaped?: boolean;
    output: HtmlLibToken[];
    description: string;
    input: string;
    lastStartTag: string;
    errors?: TokenError[];
}

interface LoadedTest {
    idx: number;
    setName: string;
    name: string;
    input: string;
    expected: HtmlLibToken[];
    initialState: Tokenizer['state'];
    initialStateName: string;
    lastStartTag: string;
    expectedErrors: TokenError[];
}

function loadTests(dataDirPath: string): LoadedTest[] {
    const testSetFileNames = fs.readdirSync(dataDirPath);
    const tests: LoadedTest[] = [];
    let testIdx = 0;

    for (const fileName of testSetFileNames) {
        if (path.extname(fileName) !== '.test') {
            continue;
        }

        const filePath = path.join(dataDirPath, fileName);
        const testSetJson = fs.readFileSync(filePath).toString();
        const testSet = JSON.parse(testSetJson);
        const testDescrs: TestDescription[] = testSet.tests;

        if (!testDescrs) {
            continue;
        }

        const setName = fileName.replace('.test', '');

        for (const descr of testDescrs) {
            if (!descr.initialStates) {
                descr.initialStates = ['Data state'];
            }

            if (descr.doubleEscaped) {
                unescapeDescrIO(descr);
            }

            const expected = descr.output;

            for (const initialStateName of descr.initialStates) {
                tests.push({
                    idx: ++testIdx,
                    setName,
                    name: descr.description,
                    input: descr.input,
                    expected,
                    initialState: getTokenizerSuitableStateName(initialStateName),
                    initialStateName,
                    lastStartTag: descr.lastStartTag,
                    expectedErrors: descr.errors || [],
                });
            }
        }
    }

    return tests;
}

export function generateTokenizationTests(
    prefix: string,
    testSuite: string,
    createTokenSource: TokenSourceCreator
): void {
    for (const testData of loadTests(testSuite)) {
        const testName = `${prefix} - ${testData.idx}.${testData.setName} - ${testData.name} - Initial state: ${testData.initialStateName}`;

        it(testName, () => {
            const chunks = makeChunks(testData.input);
            const result = tokenize(createTokenSource, chunks, testData);

            assert.deepEqual(result.tokens, testData.expected, `Chunks: ${JSON.stringify(chunks)}`);
            assert.deepEqual(result.errors, testData.expectedErrors || []);
        });
    }
}
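
// Minimal usage sketch (an assumption for illustration, not part of this module):
// a test file calls `generateTokenizationTests` with a name prefix, a path to the
// html5lib tokenizer fixtures, and a factory that attaches the provided
// TokenizeHandler to a concrete tokenizer. The suite path and tokenizer options
// below are hypothetical placeholders.
//
//   generateTokenizationTests(
//       'Tokenizer',
//       'test/data/html5lib-tests/tokenizer', // hypothetical fixture directory
//       (handler) => new Tokenizer({ sourceCodeLocationInfo: true }, handler)
//   );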