import * as assert from 'node:assert';
import * as fs from 'node:fs';
import * as path from 'node:path';
import type { ParserError, Token } from 'parse5';
import { type Tokenizer, TokenizerMode, type TokenHandler } from 'parse5';
import { makeChunks } from './common.js';

export type HtmlLibToken = [string, string | null, ...unknown[]];
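// Illustrative tuple shapes, mirroring the handlers below (values are made up):
//   ['Character', 'foo']
//   ['StartTag', 'div', { id: 'x' }, /* selfClosing */ true]
//   ['DOCTYPE', 'html', null, null, /* !forceQuirks */ true]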

interface TokenError {
    code: string;
    line: number;
    col: number;
}

interface TokenSourceData {
    tokens: HtmlLibToken[];
    errors: TokenError[];
}

type TokenSourceCreator = (data: TokenizeHandler) => Tokenizer;

/** Receives events and immediately compares them against the expected values. We check the entire output again at the end. */
class TokenizeHandler implements TokenSourceData, TokenHandler {
    constructor(private testData: LoadedTest) {}

    private addToken(token: HtmlLibToken): void {
        assert.deepStrictEqual(token, this.testData.expected[this.tokens.length]);

        this.tokens.push(token);
    }

    onComment(token: Token.CommentToken): void {
        this.addToken(['Comment', token.data]);
    }
    onDoctype(token: Token.DoctypeToken): void {
        this.addToken(['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]);
    }
    onStartTag(token: Token.TagToken): void {
        const reformattedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value]));
        const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformattedAttrs];

        if (token.selfClosing) {
            startTagEntry.push(true);
        }

        this.addToken(startTagEntry);
    }
    onEndTag(token: Token.TagToken): void {
        // NOTE: The parser feedback simulator can produce adjusted (mixed-case)
        // SVG tag names for end tag tokens, so we lowercase them before comparing.
        this.addToken(['EndTag', token.tagName.toLowerCase()]);
    }
    onEof(): void {
        this.sawEof = true;
    }
    onCharacter(token: Token.CharacterToken): void {
        const lastEntry = this.tokens[this.tokens.length - 1];

        if (lastEntry && lastEntry[0] === 'Character' && lastEntry[1] != null) {
            lastEntry[1] += token.chars;
        } else {
            this.tokens.push(['Character', token.chars]);
        }

        const actual = this.tokens[this.tokens.length - 1];
        const expected = this.testData.expected[this.tokens.length - 1];
        assert.strictEqual(expected[0], 'Character');
        assert.ok(typeof actual[1] === 'string');
        assert.ok(expected[1]?.startsWith(actual[1]));
    }
    onNullCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }
    onWhitespaceCharacter(token: Token.CharacterToken): void {
        this.onCharacter(token);
    }
    onParseError(err: ParserError): void {
        assert.ok(
            this.testData.expectedErrors.some(
                ({ code, line, col }) => code === err.code && line === err.startLine && col === err.startCol
            )
        );

        this.errors.push({
            code: err.code,
            line: err.startLine,
            col: err.startCol,
        });
    }

    public sawEof = false;
    public tokens: HtmlLibToken[] = [];
    public errors: TokenError[] = [];
}
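
// A minimal sketch of driving the handler directly (hypothetical `test` data;
// note that `Tokenizer` is imported above as a type only, so a value import
// from 'parse5' would be needed):
//
//   const handler = new TokenizeHandler(test);
//   const tokenizer = new Tokenizer({}, handler);
//   tokenizer.write(test.input, true);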

function tokenize(createTokenSource: TokenSourceCreator, chunks: string[], testData: LoadedTest): TokenSourceData {
    const result = new TokenizeHandler(testData);
    const tokenizer = createTokenSource(result);

    // NOTE: Set a small waterline for testing purposes, so the preprocessor's
    // buffer handling is exercised even by short test inputs.
    tokenizer.preprocessor.bufferWaterline = 8;
    tokenizer.state = testData.initialState;

    if (testData.lastStartTag) {
        tokenizer.lastStartTagName = testData.lastStartTag;
    }

    for (let i = 0; i < chunks.length; i++) {
        assert.ok(!result.sawEof);
        tokenizer.write(chunks[i], i === chunks.length - 1);
    }

    assert.ok(result.sawEof);
    assert.ok(!tokenizer.active);

    // Sort errors by line and column
    result.errors.sort((err1, err2) => err1.line - err2.line || err1.col - err2.col);

    return result;
}
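
// For example, tokenize(createTokenSource, ['<di', 'v>'], test) feeds the
// input in two chunks and passes `true` for the final write so the tokenizer
// emits EOF (an illustrative split; makeChunks decides the real chunking).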

function unicodeUnescape(str: string): string {
    // Only match 4 hex digits; String.fromCharCode suffices because \uXXXX
    // escapes always encode code units within the BMP.
    return str.replace(/\\[Uu][\dA-Fa-f]{4}/g, (match: string) =>
        String.fromCharCode(Number.parseInt(match.slice(2), 16))
    );
}
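
// e.g. unicodeUnescape('\\u0041bc') === 'Abc' (an illustrative input; the
// html5lib fixtures use this double-escaping for non-ASCII and control
// characters).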

function unescapeDescrIO(testDescr: TestDescription): void {
    testDescr.input = unicodeUnescape(testDescr.input);

    for (const tokenEntry of testDescr.output) {
        // NOTE: unescape the token tagName (for StartTag and EndTag tokens), comment data
        // (for Comment tokens) and character data (for Character tokens).
        if (tokenEntry[1]) {
            tokenEntry[1] = unicodeUnescape(tokenEntry[1]);
        }
    }
}
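
// e.g. a double-escaped entry { input: '\\u003C', output: [['Character', '\\u003C']] }
// becomes { input: '<', output: [['Character', '<']] } before the test runs
// (illustrative values).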

function getTokenizerSuitableStateName(testDataStateName: string): Tokenizer['state'] {
    // Drop the ' state' suffix and upper-snake-case the rest to obtain a
    // TokenizerMode key.
    const name = testDataStateName.slice(0, -6).replaceAll(' ', '_').toUpperCase();
    return TokenizerMode[name as keyof typeof TokenizerMode];
}
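
// e.g. 'Data state' -> TokenizerMode.DATA, and 'Script data state' ->
// TokenizerMode.SCRIPT_DATA (assuming the TokenizerMode keys mirror the
// html5lib state names).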

interface TestDescription {
    initialStates: string[];
    doubleEscaped?: boolean;
    output: HtmlLibToken[];
    description: string;
    input: string;
    lastStartTag: string;
    errors?: TokenError[];
}

interface LoadedTest {
    idx: number;
    setName: string;
    name: string;
    input: string;
    expected: HtmlLibToken[];
    initialState: Tokenizer['state'];
    initialStateName: string;
    lastStartTag: string;
    expectedErrors: TokenError[];
}

function loadTests(dataDirPath: string): LoadedTest[] {
    const testSetFileNames = fs.readdirSync(dataDirPath);
    const tests: LoadedTest[] = [];
    let testIdx = 0;

    for (const fileName of testSetFileNames) {
        if (path.extname(fileName) !== '.test') {
            continue;
        }

        const filePath = path.join(dataDirPath, fileName);
        const testSetJson = fs.readFileSync(filePath, 'utf8');
        const testSet = JSON.parse(testSetJson);
        const testDescrs: TestDescription[] = testSet.tests;

        if (!testDescrs) {
            continue;
        }

        const setName = fileName.replace('.test', '');

        for (const descr of testDescrs) {
            if (!descr.initialStates) {
                descr.initialStates = ['Data state'];
            }

            if (descr.doubleEscaped) {
                unescapeDescrIO(descr);
            }

            const expected = descr.output;

            for (const initialStateName of descr.initialStates) {
                tests.push({
                    idx: ++testIdx,
                    setName,
                    name: descr.description,
                    input: descr.input,
                    expected,
                    initialState: getTokenizerSuitableStateName(initialStateName),
                    initialStateName,
                    lastStartTag: descr.lastStartTag,
                    expectedErrors: descr.errors || [],
                });
            }
        }
    }

    return tests;
}

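/**
 * Registers one `it()` case per html5lib tokenizer test and initial state
 * (a BDD-style `it` global is assumed to be provided by the test runner).
 * A minimal usage sketch, assuming a checked-out html5lib-tests directory
 * and parse5's two-argument Tokenizer constructor:
 *
 *   generateTokenizationTests('Tokenizer', 'test/data/html5lib-tests/tokenizer', (handler) =>
 *       new Tokenizer({}, handler)
 *   );
 */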
export function generateTokenizationTests(
    prefix: string,
    testSuite: string,
    createTokenSource: TokenSourceCreator
): void {
    for (const testData of loadTests(testSuite)) {
        const testName = `${prefix} - ${testData.idx}.${testData.setName} - ${testData.name} - Initial state: ${testData.initialStateName}`;

        it(testName, () => {
            const chunks = makeChunks(testData.input);
            const result = tokenize(createTokenSource, chunks, testData);

            assert.deepEqual(result.tokens, testData.expected, `Chunks: ${JSON.stringify(chunks)}`);
            assert.deepEqual(result.errors, testData.expectedErrors || []);
        });
    }
}