1 /*
2 * JSON lexer
3 *
4 * Copyright IBM, Corp. 2009
5 *
6 * Authors:
7 * Anthony Liguori <aliguori@us.ibm.com>
8 *
9 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
10 * See the COPYING.LIB file in the top-level directory.
11 *
12 */
13
14 #include "qstring.h"
15 #include "qlist.h"
16 #include "qdict.h"
17 #include "qint.h"
18 #include "qemu-common.h"
19 #include "json-lexer.h"
20
21 /*
22 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
23 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
24 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+))
25 * [{}\[\],:]
26 * [a-z]+
27 *
28 */
29
/* Drop any ERROR macro that an earlier header may have defined, so the
 * enumerator below can use the name. */
#undef ERROR

/*
 * States of the lexer's finite state machine.
 *
 * The numeric values are used as row indices into the json_lexer[]
 * transition table, so the order of the enumerators matters.  ERROR must
 * stay 0: every table entry that is not explicitly initialised is zero and
 * therefore means "invalid input".
 */
enum json_lexer_state {
    ERROR = 0,

    /* closing quote seen; string is complete once the next char arrives */
    IN_DONE_STRING,

    /* "..." strings: \uXXXX digits, char after a backslash, string body */
    IN_DQ_UCODE3,
    IN_DQ_UCODE2,
    IN_DQ_UCODE1,
    IN_DQ_UCODE0,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,

    /* '...' strings: same layout as the double-quote states */
    IN_SQ_UCODE3,
    IN_SQ_UCODE2,
    IN_SQ_UCODE1,
    IN_SQ_UCODE0,
    IN_SQ_STRING_ESCAPE,
    IN_SQ_STRING,

    /* numbers */
    IN_ZERO,
    IN_DIGITS,
    IN_DIGIT,
    IN_EXP_E,
    IN_MANTISSA,
    IN_MANTISSA_DIGITS,
    IN_NONZERO_NUMBER,
    IN_NEG_NONZERO_NUMBER,

    /* bare words made of [a-z] */
    IN_KEYWORD,

    /* '%' escapes: %d %i %p %s %f %ld %lld %I64d */
    IN_ESCAPE,
    IN_ESCAPE_L,
    IN_ESCAPE_LL,
    IN_ESCAPE_I,
    IN_ESCAPE_I6,
    IN_ESCAPE_I64,
    IN_ESCAPE_DONE,

    /* separators */
    IN_WHITESPACE,
    IN_OPERATOR_DONE,

    /* initial state; also the row used to restart after each token */
    IN_START,
};
67
68 #define TERMINAL(state) [0 ... 0x7F] = (state)
69
70 static const uint8_t json_lexer[][256] = {
71 [IN_DONE_STRING] = {
72 TERMINAL(JSON_STRING),
73 },
74
75 /* double quote string */
76 [IN_DQ_UCODE3] = {
77 ['0' ... '9'] = IN_DQ_STRING,
78 ['a' ... 'f'] = IN_DQ_STRING,
79 ['A' ... 'F'] = IN_DQ_STRING,
80 },
81 [IN_DQ_UCODE2] = {
82 ['0' ... '9'] = IN_DQ_UCODE3,
83 ['a' ... 'f'] = IN_DQ_UCODE3,
84 ['A' ... 'F'] = IN_DQ_UCODE3,
85 },
86 [IN_DQ_UCODE1] = {
87 ['0' ... '9'] = IN_DQ_UCODE2,
88 ['a' ... 'f'] = IN_DQ_UCODE2,
89 ['A' ... 'F'] = IN_DQ_UCODE2,
90 },
91 [IN_DQ_UCODE0] = {
92 ['0' ... '9'] = IN_DQ_UCODE1,
93 ['a' ... 'f'] = IN_DQ_UCODE1,
94 ['A' ... 'F'] = IN_DQ_UCODE1,
95 },
96 [IN_DQ_STRING_ESCAPE] = {
97 ['b'] = IN_DQ_STRING,
98 ['f'] = IN_DQ_STRING,
99 ['n'] = IN_DQ_STRING,
100 ['r'] = IN_DQ_STRING,
101 ['t'] = IN_DQ_STRING,
102 ['\''] = IN_DQ_STRING,
103 ['\"'] = IN_DQ_STRING,
104 ['u'] = IN_DQ_UCODE0,
105 },
106 [IN_DQ_STRING] = {
107 [1 ... 0xFF] = IN_DQ_STRING,
108 ['\\'] = IN_DQ_STRING_ESCAPE,
109 ['"'] = IN_DONE_STRING,
110 },
111
112 /* single quote string */
113 [IN_SQ_UCODE3] = {
114 ['0' ... '9'] = IN_SQ_STRING,
115 ['a' ... 'f'] = IN_SQ_STRING,
116 ['A' ... 'F'] = IN_SQ_STRING,
117 },
118 [IN_SQ_UCODE2] = {
119 ['0' ... '9'] = IN_SQ_UCODE3,
120 ['a' ... 'f'] = IN_SQ_UCODE3,
121 ['A' ... 'F'] = IN_SQ_UCODE3,
122 },
123 [IN_SQ_UCODE1] = {
124 ['0' ... '9'] = IN_SQ_UCODE2,
125 ['a' ... 'f'] = IN_SQ_UCODE2,
126 ['A' ... 'F'] = IN_SQ_UCODE2,
127 },
128 [IN_SQ_UCODE0] = {
129 ['0' ... '9'] = IN_SQ_UCODE1,
130 ['a' ... 'f'] = IN_SQ_UCODE1,
131 ['A' ... 'F'] = IN_SQ_UCODE1,
132 },
133 [IN_SQ_STRING_ESCAPE] = {
134 ['b'] = IN_SQ_STRING,
135 ['f'] = IN_SQ_STRING,
136 ['n'] = IN_SQ_STRING,
137 ['r'] = IN_SQ_STRING,
138 ['t'] = IN_SQ_STRING,
139 ['\''] = IN_SQ_STRING,
140 ['\"'] = IN_SQ_STRING,
141 ['u'] = IN_SQ_UCODE0,
142 },
143 [IN_SQ_STRING] = {
144 [1 ... 0xFF] = IN_SQ_STRING,
145 ['\\'] = IN_SQ_STRING_ESCAPE,
146 ['\''] = IN_DONE_STRING,
147 },
148
149 /* Zero */
150 [IN_ZERO] = {
151 TERMINAL(JSON_INTEGER),
152 ['0' ... '9'] = ERROR,
153 ['.'] = IN_MANTISSA,
154 },
155
156 /* Float */
157 [IN_DIGITS] = {
158 TERMINAL(JSON_FLOAT),
159 ['0' ... '9'] = IN_DIGITS,
160 },
161
162 [IN_DIGIT] = {
163 ['0' ... '9'] = IN_DIGITS,
164 },
165
166 [IN_EXP_E] = {
167 ['-'] = IN_DIGIT,
168 ['+'] = IN_DIGIT,
169 ['0' ... '9'] = IN_DIGITS,
170 },
171
172 [IN_MANTISSA_DIGITS] = {
173 TERMINAL(JSON_FLOAT),
174 ['0' ... '9'] = IN_MANTISSA_DIGITS,
175 ['e'] = IN_EXP_E,
176 ['E'] = IN_EXP_E,
177 },
178
179 [IN_MANTISSA] = {
180 ['0' ... '9'] = IN_MANTISSA_DIGITS,
181 },
182
183 /* Number */
184 [IN_NONZERO_NUMBER] = {
185 TERMINAL(JSON_INTEGER),
186 ['0' ... '9'] = IN_NONZERO_NUMBER,
187 ['e'] = IN_EXP_E,
188 ['E'] = IN_EXP_E,
189 ['.'] = IN_MANTISSA,
190 },
191
192 [IN_NEG_NONZERO_NUMBER] = {
193 ['0'] = IN_ZERO,
194 ['1' ... '9'] = IN_NONZERO_NUMBER,
195 },
196
197 /* keywords */
198 [IN_KEYWORD] = {
199 TERMINAL(JSON_KEYWORD),
200 ['a' ... 'z'] = IN_KEYWORD,
201 },
202
203 /* whitespace */
204 [IN_WHITESPACE] = {
205 TERMINAL(JSON_SKIP),
206 [' '] = IN_WHITESPACE,
207 ['\t'] = IN_WHITESPACE,
208 ['\r'] = IN_WHITESPACE,
209 ['\n'] = IN_WHITESPACE,
210 },
211
212 /* operator */
213 [IN_OPERATOR_DONE] = {
214 TERMINAL(JSON_OPERATOR),
215 },
216
217 /* escape */
218 [IN_ESCAPE_DONE] = {
219 TERMINAL(JSON_ESCAPE),
220 },
221
222 [IN_ESCAPE_LL] = {
223 ['d'] = IN_ESCAPE_DONE,
224 },
225
226 [IN_ESCAPE_L] = {
227 ['d'] = IN_ESCAPE_DONE,
228 ['l'] = IN_ESCAPE_LL,
229 },
230
231 [IN_ESCAPE_I64] = {
232 ['d'] = IN_ESCAPE_DONE,
233 },
234
235 [IN_ESCAPE_I6] = {
236 ['4'] = IN_ESCAPE_I64,
237 },
238
239 [IN_ESCAPE_I] = {
240 ['6'] = IN_ESCAPE_I6,
241 },
242
243 [IN_ESCAPE] = {
244 ['d'] = IN_ESCAPE_DONE,
245 ['i'] = IN_ESCAPE_DONE,
246 ['p'] = IN_ESCAPE_DONE,
247 ['s'] = IN_ESCAPE_DONE,
248 ['f'] = IN_ESCAPE_DONE,
249 ['l'] = IN_ESCAPE_L,
250 ['I'] = IN_ESCAPE_I,
251 },
252
253 /* top level rule */
254 [IN_START] = {
255 ['"'] = IN_DQ_STRING,
256 ['\''] = IN_SQ_STRING,
257 ['0'] = IN_ZERO,
258 ['1' ... '9'] = IN_NONZERO_NUMBER,
259 ['-'] = IN_NEG_NONZERO_NUMBER,
260 ['{'] = IN_OPERATOR_DONE,
261 ['}'] = IN_OPERATOR_DONE,
262 ['['] = IN_OPERATOR_DONE,
263 [']'] = IN_OPERATOR_DONE,
264 [','] = IN_OPERATOR_DONE,
265 [':'] = IN_OPERATOR_DONE,
266 ['a' ... 'z'] = IN_KEYWORD,
267 ['%'] = IN_ESCAPE,
268 [' '] = IN_WHITESPACE,
269 ['\t'] = IN_WHITESPACE,
270 ['\r'] = IN_WHITESPACE,
271 ['\n'] = IN_WHITESPACE,
272 },
273 };
274
/*
 * Prepare @lexer for use.
 *
 * @lexer: lexer to initialise
 * @func:  callback stored in lexer->emit and invoked for every completed
 *         token
 *
 * The lexer starts in IN_START with an empty token buffer.  The position
 * counters are zeroed here because json_lexer_feed_char() increments and
 * reports them; without this they would start from whatever the caller's
 * storage happened to contain.
 */
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
{
    lexer->emit = func;
    lexer->state = IN_START;
    lexer->token = qstring_new();
    lexer->x = 0;
    lexer->y = 0;
}
281
/*
 * Advance the state machine by one input character.
 *
 * @lexer: lexer state
 * @ch:    next input byte; 0 is what json_lexer_flush() feeds to terminate
 *         a pending token
 *
 * Tokens are recognised with one character of lookahead: when the table
 * yields a JSON_* token type, the text collected so far is handed to
 * lexer->emit() and @ch is then re-examined as the first character of the
 * next token.
 *
 * Returns 0 on success, -EINVAL if @ch is not valid at this point.
 */
static int json_lexer_feed_char(JSONLexer *lexer, char ch)
{
    char buf[2] = { ch, '\0' };

    /* Maintain the x/y position that is passed to the emitter. */
    lexer->x++;
    if (ch == '\n') {
        lexer->x = 0;
        lexer->y++;
    }

    lexer->state = json_lexer[lexer->state][(uint8_t)ch];

    switch (lexer->state) {
    case JSON_OPERATOR:
    case JSON_ESCAPE:
    case JSON_INTEGER:
    case JSON_FLOAT:
    case JSON_KEYWORD:
    case JSON_STRING:
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
        /* fall through */
    case JSON_SKIP:
        /* Start a fresh token and restart the machine on the lookahead. */
        QDECREF(lexer->token);
        lexer->token = qstring_new();
        lexer->state = json_lexer[IN_START][(uint8_t)ch];
        /*
         * Report a character that cannot begin a token right away instead
         * of on the following call.  NUL is exempt: it is the terminator
         * fed by json_lexer_flush() and never starts a token.
         */
        if (lexer->state == ERROR && ch != '\0') {
            return -EINVAL;
        }
        break;
    case ERROR:
        return -EINVAL;
    default:
        break;
    }

    /* The character belongs to the token currently being collected. */
    qstring_append(lexer->token, buf);

    return 0;
}
320
/*
 * Run @size bytes starting at @buffer through the lexer, one at a time.
 *
 * Stops at the first byte json_lexer_feed_char() rejects and returns its
 * negative error code; returns 0 once every byte has been consumed.
 */
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
{
    const char *cursor = buffer;
    size_t remaining = size;

    while (remaining-- > 0) {
        int ret = json_lexer_feed_char(lexer, *cursor++);

        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}
336
/*
 * Terminate the token currently being collected, if any.
 *
 * Because tokens are only emitted when the following character arrives,
 * the last token of the input needs an explicit terminator; a NUL byte is
 * fed for that purpose.
 *
 * Returns 0 on success or when nothing is pending, and a negative error
 * code if the pending input does not form a complete token.
 */
int json_lexer_flush(JSONLexer *lexer)
{
    int ret;

    /* Idle lexer: feeding NUL into IN_START would be reported as an error. */
    if (lexer->state == IN_START) {
        return 0;
    }

    ret = json_lexer_feed_char(lexer, 0);
    if (ret == 0) {
        /*
         * The restart on NUL leaves the state at json_lexer[IN_START][0],
         * which is ERROR; put the lexer back into its initial state so it
         * can accept further input.
         */
        lexer->state = IN_START;
    }

    return ret;
}
341
/*
 * Release the resources held by @lexer.
 *
 * Drops the lexer's reference to its token buffer and clears the pointer so
 * that a stale reference is not left behind in the structure.  The lexer
 * must be passed to json_lexer_init() again before it is reused.
 */
void json_lexer_destroy(JSONLexer *lexer)
{
    QDECREF(lexer->token);
    lexer->token = NULL;
}
346