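// A SAX-style HTML parser built on top of the parse5 tokenizer. The parser
// is a Transform stream: chunks written to it are re-emitted unchanged,
// while 'startTag', 'endTag', 'comment', 'doctype' and 'text' events are
// fired for the markup they contain.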
'use strict';

const { Transform } = require('stream');
const Tokenizer = require('parse5/lib/tokenizer');
const LocationInfoTokenizerMixin = require('parse5/lib/extensions/location-info/tokenizer-mixin');
const Mixin = require('parse5/lib/utils/mixin');
const mergeOptions = require('parse5/lib/utils/merge-options');
const DevNullStream = require('./dev-null-stream');
const ParserFeedbackSimulator = require('./parser-feedback-simulator');

const DEFAULT_OPTIONS = {
    sourceCodeLocationInfo: false
};

class SAXParser extends Transform {
    constructor(options) {
        super({ encoding: 'utf8', decodeStrings: false });

        this.options = mergeOptions(DEFAULT_OPTIONS, options);

        this.tokenizer = new Tokenizer(this.options);
        this.locInfoMixin = null;

        if (this.options.sourceCodeLocationInfo) {
            this.locInfoMixin = Mixin.install(this.tokenizer, LocationInfoTokenizerMixin);
        }

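        // The feedback simulator stands in for the tree builder: it feeds
        // state back to the tokenizer (e.g. switching it into raw-text mode
        // for <script> or <title> content) without constructing a tree.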
        this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.tokenizer);

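        // Buffer for coalescing consecutive character tokens, so that a run
        // of text is emitted as a single 'text' event.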
        this.pendingText = null;

        this.lastChunkWritten = false;
        this.stopped = false;

        // NOTE: always pipe the stream into a /dev/null stream so that
        // `highWaterMark` is never hit, even when there are no consumers
        // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774).
        this.pipe(new DevNullStream());
    }

    // TransformStream implementation
    _transform(chunk, encoding, callback) {
        if (typeof chunk !== 'string') {
            throw new TypeError('Parser can work only with string streams.');
        }

        callback(null, this._transformChunk(chunk));
    }

    _final(callback) {
        this.lastChunkWritten = true;
        callback(null, this._transformChunk(''));
    }

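    // Stops parsing. Already-buffered input is not tokenized further, but
    // subsequent chunks still pass through the stream unchanged.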
    stop() {
        this.stopped = true;
    }

    // Internals
    _transformChunk(chunk) {
        if (!this.stopped) {
            this.tokenizer.write(chunk, this.lastChunkWritten);
            this._runParsingLoop();
        }
        return chunk;
    }

    _runParsingLoop() {
        let token = null;

        do {
            token = this.parserFeedbackSimulator.getNextToken();

            if (token.type === Tokenizer.HIBERNATION_TOKEN) {
                break;
            }

            if (
                token.type === Tokenizer.CHARACTER_TOKEN ||
                token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN ||
                token.type === Tokenizer.NULL_CHARACTER_TOKEN
            ) {
                if (this.pendingText === null) {
                    token.type = Tokenizer.CHARACTER_TOKEN;
                    this.pendingText = token;
                } else {
                    // Coalesce consecutive character tokens into a single
                    // pending text token.
                    this.pendingText.chars += token.chars;

                    if (this.options.sourceCodeLocationInfo) {
                        // Extend the pending text's location to the end of
                        // the current token.
                        const { endLine, endCol, endOffset } = token.location;
                        Object.assign(this.pendingText.location, {
                            endLine,
                            endCol,
                            endOffset
                        });
                    }
                }
            } else {
                this._emitPendingText();
                this._handleToken(token);
            }
        } while (!this.stopped && token.type !== Tokenizer.EOF_TOKEN);
    }

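    // Emits the event corresponding to the given token, unless the token is
    // an EOF token or its event has no listeners. Returns `false` only in
    // the latter case.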
    _handleToken(token) {
        if (token.type === Tokenizer.EOF_TOKEN) {
            return true;
        }

        const { eventName, reshapeToken } = TOKEN_EMISSION_HELPERS[token.type];

        if (this.listenerCount(eventName) === 0) {
            return false;
        }

        this._emitToken(eventName, reshapeToken(token));

        return true;
    }

    _emitToken(eventName, token) {
        this.emit(eventName, token);
    }

    _emitPendingText() {
        if (this.pendingText !== null) {
            this._handleToken(this.pendingText);
            this.pendingText = null;
        }
    }
}

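// Maps tokenizer token types to the event names the parser emits and
// reshapes raw tokenizer tokens into the objects passed to listeners.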
const TOKEN_EMISSION_HELPERS = {
    [Tokenizer.START_TAG_TOKEN]: {
        eventName: 'startTag',
        reshapeToken: origToken => ({
            tagName: origToken.tagName,
            attrs: origToken.attrs,
            selfClosing: origToken.selfClosing,
            sourceCodeLocation: origToken.location
        })
    },
    [Tokenizer.END_TAG_TOKEN]: {
        eventName: 'endTag',
        reshapeToken: origToken => ({ tagName: origToken.tagName, sourceCodeLocation: origToken.location })
    },
    [Tokenizer.COMMENT_TOKEN]: {
        eventName: 'comment',
        reshapeToken: origToken => ({ text: origToken.data, sourceCodeLocation: origToken.location })
    },
    [Tokenizer.DOCTYPE_TOKEN]: {
        eventName: 'doctype',
        reshapeToken: origToken => ({
            name: origToken.name,
            publicId: origToken.publicId,
            systemId: origToken.systemId,
            sourceCodeLocation: origToken.location
        })
    },
    [Tokenizer.CHARACTER_TOKEN]: {
        eventName: 'text',
        reshapeToken: origToken => ({ text: origToken.chars, sourceCodeLocation: origToken.location })
    }
};

module.exports = SAXParser;
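
// A minimal usage sketch, assuming this module is consumed as the
// `parse5-sax-parser` package (the HTML below is illustrative):
//
//   const SAXParser = require('parse5-sax-parser');
//
//   const parser = new SAXParser({ sourceCodeLocationInfo: true });
//
//   parser.on('startTag', ({ tagName, attrs, selfClosing }) => {
//       console.log(`<${tagName}>: ${attrs.length} attribute(s)`);
//   });
//   parser.on('text', ({ text }) => console.log(`text: ${text}`));
//
//   parser.write('<div class="example">Hello, world!</div>');
//   parser.end();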