1 // Copyright 2014 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "xfa/fxfa/formcalc/cxfa_fmlexer.h"
8
9 #include <algorithm>
10
11 #include "core/fxcrt/compiler_specific.h"
12 #include "core/fxcrt/fx_extension.h"
13 #include "core/fxcrt/stl_util.h"
14
15 namespace {
16
// Returns true when |c| lies in one of three inclusive code point ranges:
// 0x09-0x0D, 0x20-0xD7FF or 0xE000-0xFFFD. Everything else is rejected,
// including NUL, the remaining control characters below 0x20, the surrogate
// block 0xD800-0xDFFF, and 0xFFFE and above.
bool IsFormCalcCharacter(wchar_t c) {
  if (c >= 0x09 && c <= 0x0D) {
    return true;
  }
  if (c >= 0x20 && c <= 0xD7FF) {
    return true;
  }
  return c >= 0xE000 && c <= 0xFFFD;
}
21
IsIdentifierCharacter(wchar_t c)22 bool IsIdentifierCharacter(wchar_t c) {
23 return FXSYS_iswalnum(c) || c == 0x005F || // '_'
24 c == 0x0024; // '$'
25 }
26
IsInitialIdentifierCharacter(wchar_t c)27 bool IsInitialIdentifierCharacter(wchar_t c) {
28 return FXSYS_iswalpha(c) || c == 0x005F || // '_'
29 c == 0x0024 || // '$'
30 c == 0x0021; // '!'
31 }
32
// Returns true for the four characters this lexer skips as in-line
// whitespace. Line feed (0x0A) and carriage return (0x0D) are not part of
// this set.
bool IsWhitespaceCharacter(wchar_t c) {
  switch (c) {
    case 0x0009:  // Horizontal tab
    case 0x000B:  // Vertical tab
    case 0x000C:  // Form feed
    case 0x0020:  // Space
      return true;
    default:
      return false;
  }
}
39
40 struct XFA_FMKeyword {
41 XFA_FM_TOKEN m_type;
42 const char* m_keyword; // Raw, POD struct.
43 };
44
45 const XFA_FMKeyword kKeyWords[] = {
46 {TOKdo, "do"},
47 {TOKkseq, "eq"},
48 {TOKksge, "ge"},
49 {TOKksgt, "gt"},
50 {TOKif, "if"},
51 {TOKin, "in"},
52 {TOKksle, "le"},
53 {TOKkslt, "lt"},
54 {TOKksne, "ne"},
55 {TOKksor, "or"},
56 {TOKnull, "null"},
57 {TOKbreak, "break"},
58 {TOKksand, "and"},
59 {TOKend, "end"},
60 {TOKeof, "eof"},
61 {TOKfor, "for"},
62 {TOKnan, "nan"},
63 {TOKksnot, "not"},
64 {TOKvar, "var"},
65 {TOKthen, "then"},
66 {TOKelse, "else"},
67 {TOKexit, "exit"},
68 {TOKdownto, "downto"},
69 {TOKreturn, "return"},
70 {TOKinfinity, "infinity"},
71 {TOKendwhile, "endwhile"},
72 {TOKforeach, "foreach"},
73 {TOKendfunc, "endfunc"},
74 {TOKelseif, "elseif"},
75 {TOKwhile, "while"},
76 {TOKendfor, "endfor"},
77 {TOKthrow, "throw"},
78 {TOKstep, "step"},
79 {TOKupto, "upto"},
80 {TOKcontinue, "continue"},
81 {TOKfunc, "func"},
82 {TOKendif, "endif"},
83 };
84
85 #ifndef NDEBUG
86 constexpr auto kTokenStrings = fxcrt::ToArray<const char*>({
87 "TOKand", "TOKlparen", "TOKrparen", "TOKmul",
88 "TOKplus", "TOKcomma", "TOKminus", "TOKdot",
89 "TOKdiv", "TOKlt", "TOKassign", "TOKgt",
90 "TOKlbracket", "TOKrbracket", "TOKor", "TOKdotscream",
91 "TOKdotstar", "TOKdotdot", "TOKle", "TOKne",
92 "TOKeq", "TOKge", "TOKdo", "TOKkseq",
93 "TOKksge", "TOKksgt", "TOKif", "TOKin",
94 "TOKksle", "TOKkslt", "TOKksne", "TOKksor",
95 "TOKnull", "TOKbreak", "TOKksand", "TOKend",
96 "TOKeof", "TOKfor", "TOKnan", "TOKksnot",
97 "TOKvar", "TOKthen", "TOKelse", "TOKexit",
98 "TOKdownto", "TOKreturn", "TOKinfinity", "TOKendwhile",
99 "TOKforeach", "TOKendfunc", "TOKelseif", "TOKwhile",
100 "TOKendfor", "TOKthrow", "TOKstep", "TOKupto",
101 "TOKcontinue", "TOKfunc", "TOKendif", "TOKstar",
102 "TOKidentifier", "TOKunderscore", "TOKdollar", "TOKexclamation",
103 "TOKcall", "TOKstring", "TOKnumber", "TOKreserver",
104 });
105 #endif // NDEBUG
106
TokenizeIdentifier(WideStringView str)107 XFA_FM_TOKEN TokenizeIdentifier(WideStringView str) {
108 const XFA_FMKeyword* result =
109 std::find_if(std::begin(kKeyWords), std::end(kKeyWords),
110 [str](const XFA_FMKeyword& iter) {
111 return str.EqualsASCII(iter.m_keyword);
112 });
113 if (result != std::end(kKeyWords) && str.EqualsASCII(result->m_keyword)) {
114 return result->m_type;
115 }
116 return TOKidentifier;
117 }
118
119 } // namespace
120
121 CXFA_FMLexer::Token::Token() = default;
122
Token(XFA_FM_TOKEN token)123 CXFA_FMLexer::Token::Token(XFA_FM_TOKEN token) : m_type(token) {}
124
Token(XFA_FM_TOKEN token,WideStringView str)125 CXFA_FMLexer::Token::Token(XFA_FM_TOKEN token, WideStringView str)
126 : m_type(token), m_string(str) {}
127
128 CXFA_FMLexer::Token::Token(const Token& that) = default;
129
130 CXFA_FMLexer::Token::~Token() = default;
131
132 #ifndef NDEBUG
ToDebugString() const133 WideString CXFA_FMLexer::Token::ToDebugString() const {
134 WideString str = WideString::FromASCII("type = ");
135 str += WideString::FromASCII(kTokenStrings[m_type]);
136 str += WideString::FromASCII(", string = ");
137 str += m_string;
138 return str;
139 }
140 #endif // NDEBUG
141
CXFA_FMLexer(WideStringView wsFormCalc)142 CXFA_FMLexer::CXFA_FMLexer(WideStringView wsFormCalc)
143 : m_spInput(wsFormCalc.span()) {}
144
145 CXFA_FMLexer::~CXFA_FMLexer() = default;
146
NextToken()147 CXFA_FMLexer::Token CXFA_FMLexer::NextToken() {
148 if (m_bLexerError)
149 return Token();
150
151 while (!IsComplete() && m_spInput[m_nCursor]) {
152 if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
153 RaiseError();
154 return Token();
155 }
156
157 switch (m_spInput[m_nCursor]) {
158 case '\n':
159 ++m_nCursor;
160 break;
161 case '\r':
162 ++m_nCursor;
163 break;
164 case ';':
165 AdvanceForComment();
166 break;
167 case '"':
168 return AdvanceForString();
169 case '0':
170 case '1':
171 case '2':
172 case '3':
173 case '4':
174 case '5':
175 case '6':
176 case '7':
177 case '8':
178 case '9':
179 return AdvanceForNumber();
180 case '=':
181 ++m_nCursor;
182 if (m_nCursor >= m_spInput.size())
183 return Token(TOKassign);
184
185 if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
186 RaiseError();
187 return Token();
188 }
189 if (m_spInput[m_nCursor] == '=') {
190 ++m_nCursor;
191 return Token(TOKeq);
192 }
193 return Token(TOKassign);
194 case '<':
195 ++m_nCursor;
196 if (m_nCursor >= m_spInput.size())
197 return Token(TOKlt);
198
199 if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
200 RaiseError();
201 return Token();
202 }
203 if (m_spInput[m_nCursor] == '=') {
204 ++m_nCursor;
205 return Token(TOKle);
206 }
207 if (m_spInput[m_nCursor] == '>') {
208 ++m_nCursor;
209 return Token(TOKne);
210 }
211 return Token(TOKlt);
212 case '>':
213 ++m_nCursor;
214 if (m_nCursor >= m_spInput.size())
215 return Token(TOKgt);
216
217 if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
218 RaiseError();
219 return Token();
220 }
221 if (m_spInput[m_nCursor] == '=') {
222 ++m_nCursor;
223 return Token(TOKge);
224 }
225 return Token(TOKgt);
226 case ',':
227 ++m_nCursor;
228 return Token(TOKcomma);
229 case '(':
230 ++m_nCursor;
231 return Token(TOKlparen);
232 case ')':
233 ++m_nCursor;
234 return Token(TOKrparen);
235 case '[':
236 ++m_nCursor;
237 return Token(TOKlbracket);
238 case ']':
239 ++m_nCursor;
240 return Token(TOKrbracket);
241 case '&':
242 ++m_nCursor;
243 return Token(TOKand);
244 case '|':
245 ++m_nCursor;
246 return Token(TOKor);
247 case '+':
248 ++m_nCursor;
249 return Token(TOKplus);
250 case '-':
251 ++m_nCursor;
252 return Token(TOKminus);
253 case '*':
254 ++m_nCursor;
255 return Token(TOKmul);
256 case '/': {
257 ++m_nCursor;
258 if (m_nCursor >= m_spInput.size())
259 return Token(TOKdiv);
260
261 if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
262 RaiseError();
263 return Token();
264 }
265 if (m_spInput[m_nCursor] != '/')
266 return Token(TOKdiv);
267
268 AdvanceForComment();
269 break;
270 }
271 case '.':
272 ++m_nCursor;
273 if (m_nCursor >= m_spInput.size())
274 return Token(TOKdot);
275
276 if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
277 RaiseError();
278 return Token();
279 }
280
281 if (m_spInput[m_nCursor] == '.') {
282 ++m_nCursor;
283 return Token(TOKdotdot);
284 }
285 if (m_spInput[m_nCursor] == '*') {
286 ++m_nCursor;
287 return Token(TOKdotstar);
288 }
289 if (m_spInput[m_nCursor] == '#') {
290 ++m_nCursor;
291 return Token(TOKdotscream);
292 }
293 if (FXSYS_IsDecimalDigit(m_spInput[m_nCursor])) {
294 --m_nCursor;
295 return AdvanceForNumber();
296 }
297 return Token(TOKdot);
298 default:
299 if (IsWhitespaceCharacter(m_spInput[m_nCursor])) {
300 ++m_nCursor;
301 break;
302 }
303 if (!IsInitialIdentifierCharacter(m_spInput[m_nCursor])) {
304 RaiseError();
305 return Token();
306 }
307 return AdvanceForIdentifier();
308 }
309 }
310 return Token(TOKeof);
311 }
312
AdvanceForNumber()313 CXFA_FMLexer::Token CXFA_FMLexer::AdvanceForNumber() {
314 // This will set end to the character after the end of the number.
315 size_t used_length = 0;
316 if (m_nCursor < m_spInput.size()) {
317 FXSYS_wcstof(WideStringView(m_spInput.subspan(m_nCursor)), &used_length);
318 }
319 size_t end = m_nCursor + used_length;
320 if (used_length == 0 ||
321 (end < m_spInput.size() && FXSYS_iswalpha(m_spInput[end]))) {
322 RaiseError();
323 return Token();
324 }
325 WideStringView str(m_spInput.subspan(m_nCursor, end - m_nCursor));
326 m_nCursor = end;
327 return Token(TOKnumber, str);
328 }
329
AdvanceForString()330 CXFA_FMLexer::Token CXFA_FMLexer::AdvanceForString() {
331 size_t start = m_nCursor;
332 ++m_nCursor;
333 while (!IsComplete() && m_spInput[m_nCursor]) {
334 if (!IsFormCalcCharacter(m_spInput[m_nCursor]))
335 break;
336
337 if (m_spInput[m_nCursor] == '"') {
338 // Check for escaped "s, i.e. "".
339 ++m_nCursor;
340 // If the end of the input has been reached it was not escaped.
341 if (m_nCursor >= m_spInput.size()) {
342 return Token(TOKstring, WideStringView(m_spInput.subspan(
343 start, m_nCursor - start)));
344 }
345 // If the next character is not a " then the end of the string has been
346 // found.
347 if (m_spInput[m_nCursor] != '"') {
348 if (!IsFormCalcCharacter(m_spInput[m_nCursor]))
349 break;
350
351 return Token(TOKstring, WideStringView(m_spInput.subspan(
352 start, m_nCursor - start)));
353 }
354 }
355 ++m_nCursor;
356 }
357
358 // Didn't find the end of the string.
359 RaiseError();
360 return Token();
361 }
362
AdvanceForIdentifier()363 CXFA_FMLexer::Token CXFA_FMLexer::AdvanceForIdentifier() {
364 size_t start = m_nCursor;
365 ++m_nCursor;
366 while (!IsComplete() && m_spInput[m_nCursor]) {
367 if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
368 RaiseError();
369 return Token();
370 }
371 if (!IsIdentifierCharacter(m_spInput[m_nCursor]))
372 break;
373
374 ++m_nCursor;
375 }
376
377 WideStringView str(m_spInput.subspan(start, m_nCursor - start));
378 return Token(TokenizeIdentifier(str), str);
379 }
380
AdvanceForComment()381 void CXFA_FMLexer::AdvanceForComment() {
382 ++m_nCursor;
383 while (!IsComplete() && m_spInput[m_nCursor]) {
384 if (!IsFormCalcCharacter(m_spInput[m_nCursor])) {
385 RaiseError();
386 return;
387 }
388 if (m_spInput[m_nCursor] == L'\r') {
389 ++m_nCursor;
390 return;
391 }
392 if (m_spInput[m_nCursor] == L'\n') {
393 ++m_nCursor;
394 return;
395 }
396 ++m_nCursor;
397 }
398 }
399