'use strict';

const assert = require('assert');
const fs = require('fs');
const path = require('path');
const Tokenizer = require('../../packages/parse5/lib/tokenizer');
const { makeChunks } = require('./common');

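// Converts a parse5 tokenizer token into the array form used by the html5lib-tests
// tokenizer suite, e.g. ['StartTag', 'div', { id: 'foo' }] or ['Character', 'text'].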
function convertTokenToHtml5Lib(token) {
    switch (token.type) {
        case Tokenizer.CHARACTER_TOKEN:
        case Tokenizer.NULL_CHARACTER_TOKEN:
        case Tokenizer.WHITESPACE_CHARACTER_TOKEN:
            return ['Character', token.chars];

        case Tokenizer.START_TAG_TOKEN: {
            const reformatedAttrs = {};

            token.attrs.forEach(attr => {
                reformatedAttrs[attr.name] = attr.value;
            });

            const startTagEntry = ['StartTag', token.tagName, reformatedAttrs];

            if (token.selfClosing) {
                startTagEntry.push(true);
            }

            return startTagEntry;
        }

        case Tokenizer.END_TAG_TOKEN:
            // NOTE: the parser feedback simulator can produce adjusted SVG
            // tag names for end tag tokens, so we need to lower-case them.
            return ['EndTag', token.tagName.toLowerCase()];

        case Tokenizer.COMMENT_TOKEN:
            return ['Comment', token.data];

        case Tokenizer.DOCTYPE_TOKEN:
            return ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks];

        default:
            throw new TypeError('Unrecognized token type: ' + token.type);
    }
}

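// Sorts collected parse errors by line and then by column so that the comparison with
// the expected errors does not depend on the order in which they were reported.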
function sortErrors(result) {
    result.errors = result.errors.sort((err1, err2) => {
        const lineDiff = err1.line - err2.line;

        if (lineDiff !== 0) {
            return lineDiff;
        }

        return err1.col - err2.col;
    });
}

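// Feeds the input to the tokenizer chunk by chunk. A hibernation token signals that the
// tokenizer needs more input, so the next chunk is written (the final chunk is written
// with the "last chunk" flag set). Adjacent character tokens are merged via
// appendTokenEntry, and collected errors are sorted before the result is returned.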
function tokenize(createTokenSource, chunks, initialState, lastStartTag) {
    const result = { tokens: [], errors: [] };
    const { tokenizer, getNextToken } = createTokenSource(result);
    let token = { type: Tokenizer.HIBERNATION_TOKEN };
    let chunkIdx = 0;

    // NOTE: set small waterline for testing purposes
    tokenizer.preprocessor.bufferWaterline = 8;
    tokenizer.state = initialState;

    if (lastStartTag) {
        tokenizer.lastStartTagName = lastStartTag;
    }

    function writeChunk() {
        const chunk = chunks[chunkIdx];

        tokenizer.write(chunk, ++chunkIdx === chunks.length);
    }

    do {
        if (token.type === Tokenizer.HIBERNATION_TOKEN) {
            writeChunk();
        } else {
            appendTokenEntry(result.tokens, convertTokenToHtml5Lib(token));
        }

        token = getNextToken();
    } while (token.type !== Tokenizer.EOF_TOKEN);

    sortErrors(result);

    return result;
}

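// Replaces \uXXXX escape sequences in a string with the corresponding characters.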
function unicodeUnescape(str) {
    return str.replace(/\\u([\d\w]{4})/gi, (match, chCodeStr) => String.fromCharCode(parseInt(chCodeStr, 16)));
}

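// html5lib-tests marks some tests as `doubleEscaped`; for those, the input, the token
// names/data and the attribute name/value pairs in the expected output all need to be
// unescaped before use.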
function unescapeDescrIO(testDescr) {
    testDescr.input = unicodeUnescape(testDescr.input);

    testDescr.output.forEach(tokenEntry => {
        //NOTE: unescape token tagName (for StartTag and EndTag tokens), comment data (for Comment token),
        //character token data (for Character token).
        tokenEntry[1] = unicodeUnescape(tokenEntry[1]);

        //NOTE: unescape token attributes (if we have them).
        if (tokenEntry.length > 2) {
            Object.keys(tokenEntry[2]).forEach(attrName => {
                const attrVal = tokenEntry[2][attrName];

                delete tokenEntry[2][attrName];
                tokenEntry[2][unicodeUnescape(attrName)] = unicodeUnescape(attrVal);
            });
        }
    });
}

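// Pushes a token entry to the result, merging it into the previous entry when both are
// character tokens, since the expected output represents consecutive characters as a
// single token.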
function appendTokenEntry(result, tokenEntry) {
    if (tokenEntry[0] === 'Character') {
        const lastEntry = result[result.length - 1];

        if (lastEntry && lastEntry[0] === 'Character') {
            lastEntry[1] += tokenEntry[1];
            return;
        }
    }

    result.push(tokenEntry);
}

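// Normalizes an array of expected token entries by concatenating adjacent character tokens.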
function concatCharacterTokens(tokenEntries) {
    const result = [];

    tokenEntries.forEach(tokenEntry => {
        appendTokenEntry(result, tokenEntry);
    });

    return result;
}

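// Maps an html5lib state name to the tokenizer's state constant name,
// e.g. 'Data state' -> 'DATA_STATE'.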
function getTokenizerSuitableStateName(testDataStateName) {
    return testDataStateName.toUpperCase().replace(/\s/g, '_');
}

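// Loads every *.test file from the given directory and flattens it into a list of test
// descriptors: one entry per (test, initial state) pair, with the 'Data state' default
// applied, double-escaped descriptions unescaped and 'ParseError' markers filtered out
// of the expected token list.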
function loadTests(dataDirPath) {
    const testSetFileNames = fs.readdirSync(dataDirPath);
    const tests = [];
    let testIdx = 0;

    testSetFileNames.forEach(fileName => {
        if (path.extname(fileName) !== '.test') {
            return;
        }

        const filePath = path.join(dataDirPath, fileName);
        const testSetJson = fs.readFileSync(filePath).toString();
        const testSet = JSON.parse(testSetJson);
        const testDescrs = testSet.tests;

        if (!testDescrs) {
            return;
        }

        const setName = fileName.replace('.test', '');

        testDescrs.forEach(descr => {
            if (!descr.initialStates) {
                descr.initialStates = ['Data state'];
            }

            if (descr.doubleEscaped) {
                unescapeDescrIO(descr);
            }

            const expected = [];

            descr.output.forEach(tokenEntry => {
                if (tokenEntry !== 'ParseError') {
                    expected.push(tokenEntry);
                }
            });

            descr.initialStates.forEach(initialState => {
                tests.push({
                    idx: ++testIdx,
                    setName: setName,
                    name: descr.description,
                    input: descr.input,
                    expected: concatCharacterTokens(expected),
                    initialState: getTokenizerSuitableStateName(initialState),
                    lastStartTag: descr.lastStartTag,
                    expectedErrors: descr.errors || []
                });
            });
        });
    });

    return tests;
}

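// Generates one test function per loaded test case on `moduleExports`. The
// `createTokenSource` factory receives the shared result object and returns a
// `{ tokenizer, getNextToken }` pair; it is also the place where parse errors can be
// wired into `result.errors`, since this module never writes to it directly.
// A hypothetical call (paths and factory shown for illustration only):
//
//   generateTokenizationTests(module.exports, 'tokenizer', 'test/data/html5lib-tests/tokenizer', result => {
//       const tokenizer = new Tokenizer();
//       // ...report parse errors into result.errors here if the suite checks them...
//       return { tokenizer, getNextToken: () => tokenizer.getNextToken() };
//   });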
module.exports = function generateTokenizationTests(moduleExports, prefix, testSuite, createTokenSource) {
    loadTests(testSuite).forEach(test => {
        const testName = `${prefix} - ${test.idx}.${test.setName} - ${test.name} - Initial state: ${test.initialState}`;

        moduleExports[testName] = function() {
            const chunks = makeChunks(test.input);
            const result = tokenize(createTokenSource, chunks, test.initialState, test.lastStartTag);

            assert.deepEqual(result.tokens, test.expected, 'Chunks: ' + JSON.stringify(chunks));
            assert.deepEqual(result.errors, test.expectedErrors || []);
        };
    });
};

module.exports.convertTokenToHtml5Lib = convertTokenToHtml5Lib;