// Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "xfa/fxfa/fm2js/cxfa_fmlexer.h"
8
9 #include <algorithm>
10
11 #include "core/fxcrt/fx_extension.h"
12 #include "third_party/base/ptr_util.h"
13 #include "third_party/icu/source/common/unicode/uchar.h"
14
15 namespace {
16
// Returns true when |c| falls in one of the character ranges the lexer
// accepts: 0x09-0x0D, 0x20-0xD7FF or 0xE000-0xFFFD. Everything else
// (including NUL) is rejected.
bool IsFormCalcCharacter(wchar_t c) {
  // Walk the range boundaries in ascending order; each test settles the
  // answer for everything below the boundary it names.
  if (c < 0x09)
    return false;
  if (c <= 0x0D)
    return true;
  if (c < 0x20)
    return false;
  if (c <= 0xD7FF)
    return true;
  if (c < 0xE000)
    return false;
  return c <= 0xFFFD;
}
21
IsIdentifierCharacter(wchar_t c)22 bool IsIdentifierCharacter(wchar_t c) {
23 return u_isalnum(c) || c == 0x005F || // '_'
24 c == 0x0024; // '$'
25 }
26
IsInitialIdentifierCharacter(wchar_t c)27 bool IsInitialIdentifierCharacter(wchar_t c) {
28 return u_isalpha(c) || c == 0x005F || // '_'
29 c == 0x0024 || // '$'
30 c == 0x0021; // '!'
31 }
32
// Returns true for the blank characters the lexer skips silently. Line feed
// and carriage return are deliberately absent: NextToken() handles those in
// dedicated cases.
bool IsWhitespaceCharacter(wchar_t c) {
  switch (c) {
    case 0x0009:  // Horizontal tab
    case 0x000B:  // Vertical tab
    case 0x000C:  // Form feed
    case 0x0020:  // Space
      return true;
    default:
      return false;
  }
}
39
40 const XFA_FMKeyword keyWords[] = {
41 {TOKand, 0x00000026, L"&"},
42 {TOKlparen, 0x00000028, L"("},
43 {TOKrparen, 0x00000029, L")"},
44 {TOKmul, 0x0000002a, L"*"},
45 {TOKplus, 0x0000002b, L"+"},
46 {TOKcomma, 0x0000002c, L","},
47 {TOKminus, 0x0000002d, L"-"},
48 {TOKdot, 0x0000002e, L"."},
49 {TOKdiv, 0x0000002f, L"/"},
50 {TOKlt, 0x0000003c, L"<"},
51 {TOKassign, 0x0000003d, L"="},
52 {TOKgt, 0x0000003e, L">"},
53 {TOKlbracket, 0x0000005b, L"["},
54 {TOKrbracket, 0x0000005d, L"]"},
55 {TOKor, 0x0000007c, L"|"},
56 {TOKdotscream, 0x0000ec11, L".#"},
57 {TOKdotstar, 0x0000ec18, L".*"},
58 {TOKdotdot, 0x0000ec1c, L".."},
59 {TOKle, 0x000133f9, L"<="},
60 {TOKne, 0x000133fa, L"<>"},
61 {TOKeq, 0x0001391a, L"=="},
62 {TOKge, 0x00013e3b, L">="},
63 {TOKdo, 0x00020153, L"do"},
64 {TOKkseq, 0x00020676, L"eq"},
65 {TOKksge, 0x000210ac, L"ge"},
66 {TOKksgt, 0x000210bb, L"gt"},
67 {TOKif, 0x00021aef, L"if"},
68 {TOKin, 0x00021af7, L"in"},
69 {TOKksle, 0x00022a51, L"le"},
70 {TOKkslt, 0x00022a60, L"lt"},
71 {TOKksne, 0x00023493, L"ne"},
72 {TOKksor, 0x000239c1, L"or"},
73 {TOKnull, 0x052931bb, L"null"},
74 {TOKbreak, 0x05518c25, L"break"},
75 {TOKksand, 0x09f9db33, L"and"},
76 {TOKend, 0x0a631437, L"end"},
77 {TOKeof, 0x0a63195a, L"eof"},
78 {TOKfor, 0x0a7d67a7, L"for"},
79 {TOKnan, 0x0b4f91dd, L"nan"},
80 {TOKksnot, 0x0b4fd9b1, L"not"},
81 {TOKvar, 0x0c2203e9, L"var"},
82 {TOKthen, 0x2d5738cf, L"then"},
83 {TOKelse, 0x45f65ee9, L"else"},
84 {TOKexit, 0x4731d6ba, L"exit"},
85 {TOKdownto, 0x4caadc3b, L"downto"},
86 {TOKreturn, 0x4db8bd60, L"return"},
87 {TOKinfinity, 0x5c0a010a, L"infinity"},
88 {TOKendwhile, 0x5c64bff0, L"endwhile"},
89 {TOKforeach, 0x67e31f38, L"foreach"},
90 {TOKendfunc, 0x68f984a3, L"endfunc"},
91 {TOKelseif, 0x78253218, L"elseif"},
92 {TOKwhile, 0x84229259, L"while"},
93 {TOKendfor, 0x8ab49d7e, L"endfor"},
94 {TOKthrow, 0x8db05c94, L"throw"},
95 {TOKstep, 0xa7a7887c, L"step"},
96 {TOKupto, 0xb5155328, L"upto"},
97 {TOKcontinue, 0xc0340685, L"continue"},
98 {TOKfunc, 0xcdce60ec, L"func"},
99 {TOKendif, 0xe0e8fee6, L"endif"},
100 };
101
102 const XFA_FM_TOKEN KEYWORD_START = TOKdo;
103 const XFA_FM_TOKEN KEYWORD_END = TOKendif;
104
// Debug names of the token types, indexed by token value (see
// CXFA_FMToken::ToDebugString()). Entries are presumably listed in the same
// order as the XFA_FM_TOKEN enum declared in the header - keep the two in
// sync when adding tokens.
//
// Both the strings and the array slots are const so the table cannot be
// modified at run time.
const wchar_t* const tokenStrings[] = {
    L"TOKand",        L"TOKlparen",     L"TOKrparen",   L"TOKmul",
    L"TOKplus",       L"TOKcomma",      L"TOKminus",    L"TOKdot",
    L"TOKdiv",        L"TOKlt",         L"TOKassign",   L"TOKgt",
    L"TOKlbracket",   L"TOKrbracket",   L"TOKor",       L"TOKdotscream",
    L"TOKdotstar",    L"TOKdotdot",     L"TOKle",       L"TOKne",
    L"TOKeq",         L"TOKge",         L"TOKdo",       L"TOKkseq",
    L"TOKksge",       L"TOKksgt",       L"TOKif",       L"TOKin",
    L"TOKksle",       L"TOKkslt",       L"TOKksne",     L"TOKksor",
    L"TOKnull",       L"TOKbreak",      L"TOKksand",    L"TOKend",
    L"TOKeof",        L"TOKfor",        L"TOKnan",      L"TOKksnot",
    L"TOKvar",        L"TOKthen",       L"TOKelse",     L"TOKexit",
    L"TOKdownto",     L"TOKreturn",     L"TOKinfinity", L"TOKendwhile",
    L"TOKforeach",    L"TOKendfunc",    L"TOKelseif",   L"TOKwhile",
    L"TOKendfor",     L"TOKthrow",      L"TOKstep",     L"TOKupto",
    L"TOKcontinue",   L"TOKfunc",       L"TOKendif",    L"TOKstar",
    L"TOKidentifier", L"TOKunderscore", L"TOKdollar",   L"TOKexclamation",
    L"TOKcall",       L"TOKstring",     L"TOKnumber",   L"TOKreserver",
};
124
TokenizeIdentifier(const WideStringView & str)125 XFA_FM_TOKEN TokenizeIdentifier(const WideStringView& str) {
126 uint32_t key = FX_HashCode_GetW(str, true);
127
128 const XFA_FMKeyword* end = std::begin(keyWords) + KEYWORD_END + 1;
129 const XFA_FMKeyword* result =
130 std::lower_bound(std::begin(keyWords) + KEYWORD_START, end, key,
131 [](const XFA_FMKeyword& iter, const uint32_t& val) {
132 return iter.m_hash < val;
133 });
134 if (result != end && result->m_hash == key)
135 return result->m_type;
136 return TOKidentifier;
137 }
138
139 } // namespace
140
CXFA_FMToken()141 CXFA_FMToken::CXFA_FMToken() : m_type(TOKreserver), m_line_num(1) {}
142
CXFA_FMToken(uint32_t line_num)143 CXFA_FMToken::CXFA_FMToken(uint32_t line_num)
144 : m_type(TOKreserver), m_line_num(line_num) {}
145
~CXFA_FMToken()146 CXFA_FMToken::~CXFA_FMToken() {}
147
ToDebugString() const148 WideString CXFA_FMToken::ToDebugString() const {
149 WideString str(L"type = ");
150 str += tokenStrings[m_type];
151 str += L", string = ";
152 str += m_string;
153 str += L", line_num = ";
154 str += std::to_wstring(m_line_num).c_str();
155 return str;
156 }
157
CXFA_FMLexer(const WideStringView & wsFormCalc)158 CXFA_FMLexer::CXFA_FMLexer(const WideStringView& wsFormCalc)
159 : m_cursor(wsFormCalc.unterminated_c_str()),
160 m_end(m_cursor + wsFormCalc.GetLength() - 1),
161 m_current_line(1),
162 m_lexer_error(false) {}
163
~CXFA_FMLexer()164 CXFA_FMLexer::~CXFA_FMLexer() {}
165
NextToken()166 std::unique_ptr<CXFA_FMToken> CXFA_FMLexer::NextToken() {
167 if (m_lexer_error)
168 return nullptr;
169
170 m_token = pdfium::MakeUnique<CXFA_FMToken>(m_current_line);
171 while (m_cursor <= m_end && *m_cursor) {
172 if (!IsFormCalcCharacter(*m_cursor)) {
173 RaiseError();
174 return nullptr;
175 }
176
177 switch (*m_cursor) {
178 case '\n':
179 ++m_current_line;
180 m_token->m_line_num = m_current_line;
181 ++m_cursor;
182 break;
183 case '\r':
184 ++m_cursor;
185 break;
186 case ';':
187 AdvanceForComment();
188 break;
189 case '"':
190 m_token->m_type = TOKstring;
191 AdvanceForString();
192 return std::move(m_token);
193 case '0':
194 case '1':
195 case '2':
196 case '3':
197 case '4':
198 case '5':
199 case '6':
200 case '7':
201 case '8':
202 case '9':
203 m_token->m_type = TOKnumber;
204 AdvanceForNumber();
205 return std::move(m_token);
206 case '=':
207 ++m_cursor;
208 if (m_cursor > m_end) {
209 m_token->m_type = TOKassign;
210 return std::move(m_token);
211 }
212
213 if (!IsFormCalcCharacter(*m_cursor)) {
214 RaiseError();
215 return nullptr;
216 }
217 if (*m_cursor == '=') {
218 m_token->m_type = TOKeq;
219 ++m_cursor;
220 } else {
221 m_token->m_type = TOKassign;
222 }
223 return std::move(m_token);
224 case '<':
225 ++m_cursor;
226 if (m_cursor > m_end) {
227 m_token->m_type = TOKlt;
228 return std::move(m_token);
229 }
230
231 if (!IsFormCalcCharacter(*m_cursor)) {
232 RaiseError();
233 return nullptr;
234 }
235 if (*m_cursor == '=') {
236 m_token->m_type = TOKle;
237 ++m_cursor;
238 } else if (*m_cursor == '>') {
239 m_token->m_type = TOKne;
240 ++m_cursor;
241 } else {
242 m_token->m_type = TOKlt;
243 }
244 return std::move(m_token);
245 case '>':
246 ++m_cursor;
247 if (m_cursor > m_end) {
248 m_token->m_type = TOKgt;
249 return std::move(m_token);
250 }
251
252 if (!IsFormCalcCharacter(*m_cursor)) {
253 RaiseError();
254 return nullptr;
255 }
256 if (*m_cursor == '=') {
257 m_token->m_type = TOKge;
258 ++m_cursor;
259 } else {
260 m_token->m_type = TOKgt;
261 }
262 return std::move(m_token);
263 case ',':
264 m_token->m_type = TOKcomma;
265 ++m_cursor;
266 return std::move(m_token);
267 case '(':
268 m_token->m_type = TOKlparen;
269 ++m_cursor;
270 return std::move(m_token);
271 case ')':
272 m_token->m_type = TOKrparen;
273 ++m_cursor;
274 return std::move(m_token);
275 case '[':
276 m_token->m_type = TOKlbracket;
277 ++m_cursor;
278 return std::move(m_token);
279 case ']':
280 m_token->m_type = TOKrbracket;
281 ++m_cursor;
282 return std::move(m_token);
283 case '&':
284 ++m_cursor;
285 m_token->m_type = TOKand;
286 return std::move(m_token);
287 case '|':
288 ++m_cursor;
289 m_token->m_type = TOKor;
290 return std::move(m_token);
291 case '+':
292 ++m_cursor;
293 m_token->m_type = TOKplus;
294 return std::move(m_token);
295 case '-':
296 ++m_cursor;
297 m_token->m_type = TOKminus;
298 return std::move(m_token);
299 case '*':
300 ++m_cursor;
301 m_token->m_type = TOKmul;
302 return std::move(m_token);
303 case '/': {
304 ++m_cursor;
305 if (m_cursor > m_end) {
306 m_token->m_type = TOKdiv;
307 return std::move(m_token);
308 }
309
310 if (!IsFormCalcCharacter(*m_cursor)) {
311 RaiseError();
312 return nullptr;
313 }
314 if (*m_cursor != '/') {
315 m_token->m_type = TOKdiv;
316 return std::move(m_token);
317 }
318 AdvanceForComment();
319 break;
320 }
321 case '.':
322 ++m_cursor;
323 if (m_cursor > m_end) {
324 m_token->m_type = TOKdot;
325 return std::move(m_token);
326 }
327
328 if (!IsFormCalcCharacter(*m_cursor)) {
329 RaiseError();
330 return nullptr;
331 }
332
333 if (*m_cursor == '.') {
334 m_token->m_type = TOKdotdot;
335 ++m_cursor;
336 } else if (*m_cursor == '*') {
337 m_token->m_type = TOKdotstar;
338 ++m_cursor;
339 } else if (*m_cursor == '#') {
340 m_token->m_type = TOKdotscream;
341 ++m_cursor;
342 } else if (*m_cursor <= '9' && *m_cursor >= '0') {
343 m_token->m_type = TOKnumber;
344 --m_cursor;
345 AdvanceForNumber();
346 } else {
347 m_token->m_type = TOKdot;
348 }
349 return std::move(m_token);
350 default:
351 if (IsWhitespaceCharacter(*m_cursor)) {
352 ++m_cursor;
353 break;
354 }
355 if (!IsInitialIdentifierCharacter(*m_cursor)) {
356 RaiseError();
357 return nullptr;
358 }
359 AdvanceForIdentifier();
360 return std::move(m_token);
361 }
362 }
363
364 // If there isn't currently a token type then mark it EOF.
365 if (m_token->m_type == TOKreserver)
366 m_token->m_type = TOKeof;
367 return std::move(m_token);
368 }
369
AdvanceForNumber()370 void CXFA_FMLexer::AdvanceForNumber() {
371 // This will set end to the character after the end of the number.
372 wchar_t* end = nullptr;
373 if (m_cursor)
374 wcstod(const_cast<wchar_t*>(m_cursor), &end);
375 if (!end || FXSYS_iswalpha(*end)) {
376 RaiseError();
377 return;
378 }
379
380 m_token->m_string =
381 WideStringView(m_cursor, static_cast<size_t>(end - m_cursor));
382 m_cursor = end;
383 }
384
AdvanceForString()385 void CXFA_FMLexer::AdvanceForString() {
386 const wchar_t* start = m_cursor;
387 ++m_cursor;
388 while (m_cursor <= m_end && *m_cursor) {
389 if (!IsFormCalcCharacter(*m_cursor))
390 break;
391
392 if (*m_cursor == '"') {
393 // Check for escaped "s, i.e. "".
394 ++m_cursor;
395 // If the end of the input has been reached it was not escaped.
396 if (m_cursor > m_end) {
397 m_token->m_string =
398 WideStringView(start, static_cast<size_t>(m_cursor - start));
399 return;
400 }
401 // If the next character is not a " then the end of the string has been
402 // found.
403 if (*m_cursor != '"') {
404 if (!IsFormCalcCharacter(*m_cursor)) {
405 break;
406 }
407 m_token->m_string = WideStringView(start, (m_cursor - start));
408 return;
409 }
410 }
411 ++m_cursor;
412 }
413
414 // Didn't find the end of the string.
415 RaiseError();
416 }
417
AdvanceForIdentifier()418 void CXFA_FMLexer::AdvanceForIdentifier() {
419 const wchar_t* start = m_cursor;
420 ++m_cursor;
421 while (m_cursor <= m_end && *m_cursor) {
422 if (!IsFormCalcCharacter(*m_cursor)) {
423 RaiseError();
424 return;
425 }
426
427 if (!IsIdentifierCharacter(*m_cursor)) {
428 break;
429 }
430 ++m_cursor;
431 }
432 m_token->m_string =
433 WideStringView(start, static_cast<size_t>(m_cursor - start));
434 m_token->m_type = TokenizeIdentifier(m_token->m_string);
435 }
436
AdvanceForComment()437 void CXFA_FMLexer::AdvanceForComment() {
438 m_cursor++;
439 while (m_cursor <= m_end && *m_cursor) {
440 if (!IsFormCalcCharacter(*m_cursor)) {
441 RaiseError();
442 return;
443 }
444
445 if (*m_cursor == L'\r') {
446 ++m_cursor;
447 return;
448 }
449 if (*m_cursor == L'\n') {
450 ++m_current_line;
451 ++m_cursor;
452 return;
453 }
454 ++m_cursor;
455 }
456 }
457