'use strict';

const assert = require('assert');
const fs = require('fs');
const path = require('path');
const Tokenizer = require('../../packages/parse5/lib/tokenizer');
const { makeChunks } = require('./common');

function convertTokenToHtml5Lib(token) {
    switch (token.type) {
        case Tokenizer.CHARACTER_TOKEN:
        case Tokenizer.NULL_CHARACTER_TOKEN:
        case Tokenizer.WHITESPACE_CHARACTER_TOKEN:
            return ['Character', token.chars];

        case Tokenizer.START_TAG_TOKEN: {
            const reformattedAttrs = {};

            token.attrs.forEach(attr => {
                reformattedAttrs[attr.name] = attr.value;
            });

            const startTagEntry = ['StartTag', token.tagName, reformattedAttrs];

            if (token.selfClosing) {
                startTagEntry.push(true);
            }

            return startTagEntry;
        }

        case Tokenizer.END_TAG_TOKEN:
            // NOTE: the parser feedback simulator can produce adjusted SVG
            // tag names for end tag tokens, so we need to lower-case them.
            return ['EndTag', token.tagName.toLowerCase()];

        case Tokenizer.COMMENT_TOKEN:
            return ['Comment', token.data];

        case Tokenizer.DOCTYPE_TOKEN:
            return ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks];

        default:
            throw new TypeError('Unrecognized token type: ' + token.type);
    }
}

function sortErrors(result) {
    result.errors = result.errors.sort((err1, err2) => {
        const lineDiff = err1.line - err2.line;

        if (lineDiff !== 0) {
            return lineDiff;
        }

        return err1.col - err2.col;
    });
}

function tokenize(createTokenSource, chunks, initialState, lastStartTag) {
    const result = { tokens: [], errors: [] };
    const { tokenizer, getNextToken } = createTokenSource(result);
    let token = { type: Tokenizer.HIBERNATION_TOKEN };
    let chunkIdx = 0;

    // NOTE: set a small waterline for testing purposes
    tokenizer.preprocessor.bufferWaterline = 8;
    tokenizer.state = initialState;

    if (lastStartTag) {
        tokenizer.lastStartTagName = lastStartTag;
    }

    function writeChunk() {
        const chunk = chunks[chunkIdx];

        tokenizer.write(chunk, ++chunkIdx === chunks.length);
    }

    // NOTE: a hibernation token means the tokenizer needs more input, so feed it the next chunk.
    do {
        if (token.type === Tokenizer.HIBERNATION_TOKEN) {
            writeChunk();
        } else {
            appendTokenEntry(result.tokens, convertTokenToHtml5Lib(token));
        }

        token = getNextToken();
    } while (token.type !== Tokenizer.EOF_TOKEN);

    sortErrors(result);

    return result;
}

function unicodeUnescape(str) {
    return str.replace(/\\u([\d\w]{4})/gi, (match, chCodeStr) => String.fromCharCode(parseInt(chCodeStr, 16)));
}

function unescapeDescrIO(testDescr) {
    testDescr.input = unicodeUnescape(testDescr.input);

    testDescr.output.forEach(tokenEntry => {
        // NOTE: unescape the token tagName (for StartTag and EndTag tokens), comment data
        // (for Comment tokens) and character data (for Character tokens).
        tokenEntry[1] = unicodeUnescape(tokenEntry[1]);

        // NOTE: unescape token attributes (if we have them).
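        // For StartTag entries the attribute map sits at tokenEntry[2]; in a double-escaped test
        // both its keys and its values may carry literal \uXXXX escape sequences, so the map is
        // rebuilt in place with unescaped keys and values.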
        if (tokenEntry.length > 2) {
            Object.keys(tokenEntry[2]).forEach(attrName => {
                const attrVal = tokenEntry[2][attrName];

                delete tokenEntry[2][attrName];
                tokenEntry[2][unicodeUnescape(attrName)] = unicodeUnescape(attrVal);
            });
        }
    });
}

// NOTE: the html5lib reference output merges adjacent character tokens, so we do the same here.
function appendTokenEntry(result, tokenEntry) {
    if (tokenEntry[0] === 'Character') {
        const lastEntry = result[result.length - 1];

        if (lastEntry && lastEntry[0] === 'Character') {
            lastEntry[1] += tokenEntry[1];
            return;
        }
    }

    result.push(tokenEntry);
}

function concatCharacterTokens(tokenEntries) {
    const result = [];

    tokenEntries.forEach(tokenEntry => {
        appendTokenEntry(result, tokenEntry);
    });

    return result;
}

// NOTE: e.g. 'RCDATA state' -> 'RCDATA_STATE'
function getTokenizerSuitableStateName(testDataStateName) {
    return testDataStateName.toUpperCase().replace(/\s/g, '_');
}

function loadTests(dataDirPath) {
    const testSetFileNames = fs.readdirSync(dataDirPath);
    const tests = [];
    let testIdx = 0;

    testSetFileNames.forEach(fileName => {
        if (path.extname(fileName) !== '.test') {
            return;
        }

        const filePath = path.join(dataDirPath, fileName);
        const testSetJson = fs.readFileSync(filePath).toString();
        const testSet = JSON.parse(testSetJson);
        const testDescrs = testSet.tests;

        if (!testDescrs) {
            return;
        }

        const setName = fileName.replace('.test', '');

        testDescrs.forEach(descr => {
            if (!descr.initialStates) {
                descr.initialStates = ['Data state'];
            }

            if (descr.doubleEscaped) {
                unescapeDescrIO(descr);
            }

            const expected = [];

            descr.output.forEach(tokenEntry => {
                if (tokenEntry !== 'ParseError') {
                    expected.push(tokenEntry);
                }
            });

            descr.initialStates.forEach(initialState => {
                tests.push({
                    idx: ++testIdx,
                    setName: setName,
                    name: descr.description,
                    input: descr.input,
                    expected: concatCharacterTokens(expected),
                    initialState: getTokenizerSuitableStateName(initialState),
                    lastStartTag: descr.lastStartTag,
                    expectedErrors: descr.errors || []
                });
            });
        });
    });

    return tests;
}

module.exports = function generateTokenizationTests(moduleExports, prefix, testSuite, createTokenSource) {
    loadTests(testSuite).forEach(test => {
        const testName = `${prefix} - ${test.idx}.${test.setName} - ${test.name} - Initial state: ${test.initialState}`;

        moduleExports[testName] = function() {
            const chunks = makeChunks(test.input);
            const result = tokenize(createTokenSource, chunks, test.initialState, test.lastStartTag);

            assert.deepEqual(result.tokens, test.expected, 'Chunks: ' + JSON.stringify(chunks));
            assert.deepEqual(result.errors, test.expectedErrors || []);
        };
    });
};

module.exports.convertTokenToHtml5Lib = convertTokenToHtml5Lib;
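
// Illustrative usage (a sketch, not part of the module): a test file passes its own `exports`
// object, a name prefix, the directory of html5lib-style *.test files, and a `createTokenSource`
// factory. The factory receives the shared `result` object so it can collect parse errors into
// `result.errors`; the require path, directory path and error wiring below are hypothetical.
//
//   const generateTokenizationTests = require('./generate-tokenization-tests');
//
//   generateTokenizationTests(exports, 'tokenizer', path.join(__dirname, '../data/tokenization'), result => {
//       const tokenizer = new Tokenizer();
//
//       // Error collection omitted here; it depends on the concrete tokenizer/error-reporting API.
//       return { tokenizer, getNextToken: () => tokenizer.getNextToken() };
//   });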