1 /*
2 * Copyright (C) 2008 Esmertec AG.
3 * Copyright (C) 2008 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <setjmp.h>
21 #include <assert.h>
22 #include "wbxml_parser.h"
23 #include "csp13_data.h"
24 #ifdef SUPPORT_SYNCML
25 #include "syncml_data.h"
26 #endif
27
28 #ifdef PLATFORM_ANDROID
29 extern "C" void *bsearch(const void *key, const void *base0, size_t nmemb,
30 size_t size, int (*compar)(const void *, const void *));
31 #endif
32
/* Element count of a statically sized array; the argument must be an array, not a pointer. */
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
34
35 //#define WBXML_DEBUG 1
36
37 /* Major TODO items:
38 - Attribute value tokens (not used by IMPS CSP)
39 - EXT_* except EXT_T_0 (not used by IMPS CSP)
40 - PI (not used by IMPS CSP)
41 - cleanups
42
43 Other TODO:
- Support more public IDs? Only IMPS (plus SyncML when built with SUPPORT_SYNCML) is supported now.
45 - Support other charsets than UTF-8
46 */
47
compareTokenData(const void * t1,const void * t2)48 static int compareTokenData(const void * t1, const void * t2)
49 {
50 return ((TokenData *)t1)->token - ((TokenData *)t2)->token;
51 }
52
compareAttrData(const void * t1,const void * t2)53 static int compareAttrData(const void * t1, const void * t2)
54 {
55 return ((AttrData *)t1)->token - ((AttrData *)t2)->token;
56 }
57
isTagStart(int token)58 static bool isTagStart(int token)
59 {
60 if (token == TOKEN_SWITCH_PAGE)
61 return true;
62
63 token &= 0x3f;
64 return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0);
65 }
66
isAttrStart(int token)67 static bool isAttrStart(int token)
68 {
69 return (token >= TOKEN_LITERAL && token < TOKEN_EXT_I_0) ||
70 (token > TOKEN_LITERAL_C && token < 0x80);
71 }
72
WbxmlParser(uint32_t transportEncoding)73 WbxmlParser::WbxmlParser(uint32_t transportEncoding) :
74 mTransportEncoding(transportEncoding)
75 {
76 reset();
77 }
78
~WbxmlParser()79 WbxmlParser::~WbxmlParser()
80 {
81 }
82
reset(void)83 void WbxmlParser::reset(void)
84 {
85 mContentHandler = NULL;
86
87 mExternalChunk = NULL;
88 mExternalChunkLen = 0;
89 mLastChunk.clear();
90 mDataOffset = 0;
91 mIsDataEnd = false;
92
93 mStartElemStack.clear();
94 mStringTable.clear();
95
96 mCurrTagPage = mCurrAttrPage = 0;
97 mPublicId = 0;
98
99 mState = EXPECT_HEADER;
100 mLastError = ERROR_NO_ERROR;
101 }
102
setContentHandler(WbxmlContentHandler * handler)103 void WbxmlParser::setContentHandler(WbxmlContentHandler * handler)
104 {
105 mContentHandler = handler;
106 }
107
parse(const char * data,uint32_t dataLen,bool end)108 int WbxmlParser::parse(const char * data, uint32_t dataLen, bool end)
109 {
110 if (data == NULL) {
111 mLastError = ERROR_INVALID_DATA;
112 return WBXML_STATUS_ERROR;
113 }
114
115 // All temporary C++ varaibles must be declared before setjmp to make
116 // sure they get properly destructed after longjmp.
117 vector<Attribute> attribs;
118 Attribute attrib;
119 string tagName;
120 string characters;
121 string opaque;
122
123 #ifdef WBXML_DEBUG
124 printf("\nparse dataLen %d; end %d; readPos %d; availData %d\n",
125 dataLen, end, getReadPos(), availDataSize());
126 #endif
127 appendData(data, dataLen, end);
128 volatile int readPos = getReadPos();
129 int setjmpRet;
130 switch (setjmpRet = setjmp(mJmpbuf)) {
131 case 0:
132 break;
133
134 case ERROR_NEED_MORE_DATA:
135 if (!mIsDataEnd) {
136 #ifdef WBXML_DEBUG
137 printf("\nneed more data: readPos %d\n", readPos);
138 #endif
139 setReadPos(readPos);
140 saveRemainingData();
141 return WBXML_STATUS_OK;
142 } else {
143 #ifdef WBXML_DEBUG
144 printf("wbxml parser error: unexpected data end\n");
145 #endif
146 mLastError = ERROR_NEED_MORE_DATA;
147 return WBXML_STATUS_ERROR;
148 }
149 break;
150
151 case ERROR_UNSUPPORTED_PUBID:
152 case ERROR_UNSUPPORTED_CHARSET:
153 case ERROR_INVALID_STRING_TABLE:
154 case ERROR_INVALID_STRING_TABLE_REFERENCE:
155 case ERROR_INVALID_EXT_TOKEN:
156 case ERROR_INVALID_MBUINT:
157 case ERROR_INVALID_ENTITY:
158 case ERROR_UNRECOGNIZED_TAG:
159 case ERROR_UNRECOGNIZED_ATTR:
160 case ERROR_MISSING_ATTR:
161 case ERROR_MISSING_TOKEN_END:
162 #ifdef WBXML_DEBUG
163 printf("wbxml parser error %d\n", setjmpRet);
164 #endif
165 mLastError = ParserError(setjmpRet);
166 return WBXML_STATUS_ERROR;
167 break;
168
169 case ERROR_NOT_SUPPORTED_YET:
170 printf("wbxml parser error: Not implemented feature.\n");
171 mLastError = ParserError(setjmpRet);
172 return WBXML_STATUS_ERROR;
173 break;
174
175 default:
176 printf("wbxml parser error: Impossible execution path.\n");
177 mLastError = ParserError(setjmpRet);
178 return WBXML_STATUS_ERROR;
179 break;
180 }
181
182 for (;;) {
183 // save readPos for error recovery
184 readPos = getReadPos();
185
186 switch (mState) {
187 case EXPECT_HEADER:
188 mDocVersion = readByte();
189
190 mPublicId = readMbuint32();
191 if (mPublicId != 0) {
192 if (!selectTokenMapping(mPublicId)) {
193 #ifdef WBXML_DEBUG
194 printf("wbxml parser error: unsupported public id \n");
195 #endif
196 longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
197 }
198 } else {
199 mPublicId = -readMbuint32();
200 }
201 mCharset = readMbuint32();
202 if (!mCharset) {
203 mCharset = mTransportEncoding;
204 if (!mCharset) {
205 mCharset = CHARSET_UTF8;
206 }
207 }
208 // TODO: support more charsets other than UTF-8
209 if (mCharset != CHARSET_UTF8) {
210 #ifdef WBXML_DEBUG
211 printf("wbxml parser error: unsupported charset\n");
212 #endif
213 longjmp(mJmpbuf, ERROR_UNSUPPORTED_CHARSET);
214 }
215
216 // now advance to next state
217 if (mContentHandler) {
218 mContentHandler->handlePublicId(mPublicId);
219 }
220 mState = EXPECT_STRING_TABLE;
221 break;
222
223 case EXPECT_STRING_TABLE:
224 {
225 uint32_t len = readMbuint32();
226 if (availDataSize() < len) {
227 longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
228 }
229 mStringTable.clear();
230 // TODO: optimize this
231 while (len--) {
232 mStringTable += readByte();
233 }
234 if (mStringTable.size()) {
235 if (mStringTable[mStringTable.size() - 1] != 0) {
236 // must have an ending \0
237 //TODO:the byte array returned by SCTS does not contain '\0' at the
238 //end,should this be fixed accordingly?
239 #ifdef WBXML_DEBUG
240 printf("wbxml parser error: invalid string table\n");
241 #endif
242 longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE);
243 }
244 }
245 mState = EXPECT_BODY_START;
246 if (mPublicId <= 0) {
247 const char * s = mStringTable.c_str() + (-mPublicId);
248 #ifdef SUPPORT_SYNCML
249 if (strcmp(s, "-//SYNCML//DTD SyncML 1.2//EN") == 0) {
250 mPublicId = PUBLICID_SYNCML_1_2;
251 } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.1//EN") == 0) {
252 mPublicId = PUBLICID_SYNCML_1_1;
253 } else if (strcmp(s, "-//SYNCML//DTD SyncML 1.0//EN") == 0) {
254 mPublicId = PUBLICID_SYNCML_1_0;
255 }
256 #endif
257 if ((mPublicId <= 0) || !selectTokenMapping(mPublicId)) {
258 longjmp(mJmpbuf, ERROR_UNSUPPORTED_PUBID);
259 }
260 }
261 break;
262 }
263
264 case EXPECT_BODY_START:
265 //TODO: handle possible PIs
266 mState = EXPECT_ELEMENT_START;
267 break;
268
269 case EXPECT_ELEMENT_START:
270 {
271 int stag = readByte();
272 const char * name;
273 if ((stag & 0x3f) == TOKEN_LITERAL) {
274 name = resolveStrTableRef();
275 } else {
276 if (stag == TOKEN_SWITCH_PAGE) {
277 mCurrTagPage = readByte();
278 stag = readByte();
279 }
280 name = lookupTagName(stag);
281 }
282 if (name == NULL) {
283 #ifdef WBXML_DEBUG
284 printf("wbxml parser error: unrecognized tag\n");
285 #endif
286 longjmp(mJmpbuf, ERROR_UNRECOGNIZED_TAG);
287 }
288 attribs.clear();
289 if (stag & 0x80) {
290 // followed by 1 or more attributes
291 while (peekByte() != TOKEN_END) {
292 readAttribute(&attrib);
293 attribs.push_back(attrib);
294 }
295 if (!attribs.size()) {
296 #ifdef WBXML_DEBUG
297 printf("wbxml parser error: missing attributes\n");
298 #endif
299 longjmp(mJmpbuf, ERROR_MISSING_ATTR);
300 }
301 // TOKEN_END
302 readByte();
303 }
304 if (mContentHandler) {
305 mContentHandler->startElement(name, attribs);
306 }
307 if (stag & 0x40) {
308 mState = EXPECT_CONTENT;
309 } else {
310 mState = ELEMENT_END;
311 }
312 tagName = name;
313 mStartElemStack.push_back(name);
314 break;
315 }
316
317 case EXPECT_CONTENT:
318 {
319 int byte = peekByte();
320 if (byte == TOKEN_SWITCH_PAGE) {
321 readByte();
322 mCurrTagPage = readByte();
323 byte = peekByte();
324 }
325 if (isTagStart(byte) || byte == TOKEN_END) {
326 if (characters.size() && mContentHandler) {
327 mContentHandler->characters(characters.c_str(), characters.size());
328 characters.clear();
329 }
330 if (byte == TOKEN_END) {
331 mState = EXPECT_ELEMENT_END;
332 } else {
333 mState = EXPECT_ELEMENT_START;
334 }
335 } else {
336 // TODO: handle extension and pi
337 switch (byte) {
338 case TOKEN_ENTITY:
339 case TOKEN_STR_I:
340 case TOKEN_STR_T:
341 readString(characters);
342 break;
343
344 case TOKEN_EXT_T_0:
345 {
346 readByte();
347 uint32_t valueToken = readMbuint32();
348 if (mPublicId == PUBLICID_IMPS_1_1
349 || mPublicId == PUBLICID_IMPS_1_2
350 || mPublicId == PUBLICID_IMPS_1_3) {
351 TokenData t = {valueToken, NULL};
352 const TokenData * res = (TokenData *)bsearch(&t,
353 csp13ExtValueTokens, ARRAY_SIZE(csp13ExtValueTokens),
354 sizeof(csp13ExtValueTokens[0]), compareTokenData);
355 if (res) {
356 characters.append(res->tagName);
357 } else {
358 longjmp(mJmpbuf, ERROR_INVALID_EXT_TOKEN);
359 }
360 } else {
361 printf ("Token 0x%x\n", byte);
362 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
363 }
364 break;
365 }
366
367 case TOKEN_OPAQUE:
368 {
369 readByte();
370 uint32_t opaqueDataLen = readMbuint32();
371 opaque.clear();
372 while (opaqueDataLen--) {
373 opaque += (char)readByte();
374 }
375 if (mContentHandler) {
376 mContentHandler->opaque(opaque.c_str(), opaque.size());
377 }
378 break;
379 }
380
381 default:
382 printf ("Token 0x%x\n", byte);
383 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
384 break;
385 }
386 }
387 break;
388 }
389
390 case EXPECT_ELEMENT_END:
391 if (readByte() != TOKEN_END) {
392 #ifdef WBXML_DEBUG
393 printf("wbxml parser error: TOKEN_END expected\n");
394 #endif
395 longjmp(mJmpbuf, ERROR_MISSING_TOKEN_END);
396 }
397 mState = ELEMENT_END;
398 break;
399
400 case ELEMENT_END:
401 assert(!mStartElemStack.empty());
402
403 tagName = mStartElemStack.back();
404 mStartElemStack.pop_back();
405 if (mContentHandler) {
406 mContentHandler->endElement(tagName.c_str());
407 }
408 if (mStartElemStack.empty()) {
409 mState = EXPECT_BODY_END;
410 } else {
411 mState = EXPECT_CONTENT;
412 }
413 break;
414
415 case EXPECT_BODY_END:
416 // TODO: handle possible PIs
417
418 // we're done
419 return WBXML_STATUS_OK;
420 break;
421 }
422 }
423 }
424
425 /*
426 * We don't make a copy of the data chunk for the current parse() until
427 * it returns.
428 * The remaining data will be saved in saveRemainingData() before parse()
429 * returns.
430 */
appendData(const char * data,uint32_t len,bool end)431 void WbxmlParser::appendData(const char * data, uint32_t len, bool end)
432 {
433 mExternalChunk = data;
434 mExternalChunkLen = len;
435 mIsDataEnd = end;
436 }
437
saveRemainingData()438 void WbxmlParser::saveRemainingData()
439 {
440 if (mDataOffset > mLastChunk.size()) {
441 uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size();
442 assert(offsetToExtChunk <= mExternalChunkLen);
443 mLastChunk.assign(mExternalChunk + offsetToExtChunk,
444 mExternalChunkLen - offsetToExtChunk);
445 mDataOffset = 0;
446 } else {
447 mLastChunk.append(mExternalChunk, mExternalChunkLen);
448 }
449 mExternalChunk = NULL;
450 mExternalChunkLen = 0;
451 }
452
readByte()453 int WbxmlParser::readByte()
454 {
455 if (mDataOffset < mLastChunk.size()) {
456 #ifdef WBXML_DEBUG
457 printf ("rb 0x%x; ", (unsigned char)mLastChunk[mDataOffset]);
458 #endif
459 return (unsigned char)mLastChunk[mDataOffset++];
460 } else {
461 uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size();
462 if (offsetToExtChunk < mExternalChunkLen) {
463 mDataOffset++;
464 #ifdef WBXML_DEBUG
465 printf ("rb 0x%x; ", (unsigned char)mExternalChunk[offsetToExtChunk]);
466 #endif
467 return (unsigned char)mExternalChunk[offsetToExtChunk];
468 }
469 longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
470 }
471 }
472
peekByte()473 int WbxmlParser::peekByte()
474 {
475 if (mDataOffset < mLastChunk.size()) {
476 return (unsigned char)mLastChunk[mDataOffset];
477 } else {
478 uint32_t offsetToExtChunk = mDataOffset - mLastChunk.size();
479 if (offsetToExtChunk < mExternalChunkLen) {
480 return (unsigned char)mExternalChunk[offsetToExtChunk];
481 }
482 longjmp(mJmpbuf, ERROR_NEED_MORE_DATA);
483 }
484 }
485
readMbuint32()486 uint32_t WbxmlParser::readMbuint32()
487 {
488 uint32_t value = 0;
489 uint32_t byte;
490 do {
491 if ((value >> 25) != 0) {
492 // would go overflow. not a valid uint32.
493 longjmp(mJmpbuf, ERROR_INVALID_MBUINT);
494 }
495 byte = readByte();
496 value = (value << 7) | (byte & 0x7f);
497 } while (byte & 0x80);
498 return value;
499 }
500
501 /**
502 * Read STR_I | STR_T | ENTITY and *append* to str.
503 * Yes this looks ugly...
504 */
readString(string & str)505 void WbxmlParser::readString(string & str)
506 {
507 int byte = readByte();
508 switch (byte) {
509 case TOKEN_STR_I:
510 //TODO: assuming UTF-8
511 while ((byte = readByte()) != 0) {
512 str += (char)byte;
513 }
514 break;
515
516 case TOKEN_ENTITY:
517 {
518 uint32_t ch = readMbuint32();
519 //TODO: assuming UTF-8 for now.
520 if (ch <= 0x7f) {
521 str += (char)ch;
522 } else if (ch <= 0x7ff) {
523 str += (char)((ch >> 6) | 0xc0);
524 str += (char)((ch & 0x3f) | 0x80);
525 } else if (ch <= 0xffff) {
526 str += (char)((ch >> 12) | 0xe0);
527 str += (char)(((ch >> 6) & 0x3f) | 0x80);
528 str += (char)((ch & 0x3f) | 0x80);
529 } else if (ch <= 0x10ffff) {
530 // 010000 - 10FFFF
531 str += (char)((ch >> 18) | 0xf0);
532 str += (char)(((ch >> 12) & 0x3f) | 0x80);
533 str += (char)(((ch >> 6) & 0x3f) | 0x80);
534 str += (char)((ch & 0x3f) | 0x80);
535 } else {
536 // not a valid UCS-4 character
537 longjmp(mJmpbuf, ERROR_INVALID_ENTITY);
538 }
539 break;
540 }
541
542 case TOKEN_STR_T:
543 {
544 const char * s = resolveStrTableRef();
545 str.append(s, strlen(s));
546 break;
547 }
548
549 default:
550 // impossible
551 printf ("Unknown token 0x%02x\n", byte);
552 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
553 break;
554 }
555 }
556
resolveStrTableRef(void)557 const char * WbxmlParser::resolveStrTableRef(void)
558 {
559 uint32_t offset = readMbuint32();
560 if (offset >= mStringTable.size()) {
561 longjmp(mJmpbuf, ERROR_INVALID_STRING_TABLE_REFERENCE);
562 }
563 return mStringTable.c_str() + offset;
564 }
565
selectTokenMapping(int publicId)566 bool WbxmlParser::selectTokenMapping(int publicId)
567 {
568 switch (publicId) {
569 case PUBLICID_IMPS_1_3:
570 case PUBLICID_IMPS_1_2:
571 case PUBLICID_IMPS_1_1:
572 mTagPages = csp13TagPages;
573 mNumTagPages = ARRAY_SIZE(csp13TagPages);
574 mAttrPages = csp13AttrPages;
575 mNumAttrPages = ARRAY_SIZE(csp13AttrPages);
576 break;
577
578 #ifdef SUPPORT_SYNCML
579 case PUBLICID_SYNCML_1_0:
580 case PUBLICID_SYNCML_1_1:
581 case PUBLICID_SYNCML_1_2:
582 case PUBLICID_SYNCML_METINF_1_2:
583 mTagPages = syncmlTagPages;
584 mNumTagPages = ARRAY_SIZE(syncmlTagPages);
585 mAttrPages = NULL;
586 mNumAttrPages = 0;
587 break;
588
589 case PUBLICID_SYNCML_DEVINF_1_2:
590 mTagPages = syncmlDevInfTagPages;
591 mNumTagPages = ARRAY_SIZE(syncmlDevInfTagPages);
592 mAttrPages = NULL;
593 mNumAttrPages = 0;
594 break;
595 #endif
596 default:
597 return false;
598 }
599 return true;
600 }
601
lookupTagName(int tag) const602 const char * WbxmlParser::lookupTagName(int tag) const
603 {
604 tag = tag & 0x3f;
605
606 // TODO: optimize this
607 if (mCurrTagPage >= mNumTagPages) {
608 return NULL;
609 }
610 const TagCodePage * page = &mTagPages[mCurrTagPage];
611 if (page == NULL) {
612 return NULL;
613 }
614
615 TokenData t = {tag, NULL};
616 const TokenData * res = (TokenData *)bsearch(&t, page->tags, page->numTokens,
617 sizeof(TokenData), compareTokenData);
618 if (res) {
619 return res->tagName;
620 }
621
622 return NULL;
623 }
624
lookupAttrName(int token,const char ** prefix) const625 const char * WbxmlParser::lookupAttrName(int token, const char **prefix) const
626 {
627 // TODO: optimize this
628 if (mCurrAttrPage >= mNumAttrPages) {
629 return NULL;
630 }
631 const AttrCodePage * page = &mAttrPages[mCurrAttrPage];
632 if (page == NULL) {
633 return NULL;
634 }
635
636 AttrData t = {token, NULL, NULL};
637 const AttrData * res = (AttrData *)bsearch(&t, page->attrs, page->numTokens,
638 sizeof(AttrData), compareAttrData);
639 if (res) {
640 if (prefix) {
641 *prefix = res->attrValuePrefix;
642 }
643 return res->attrName;
644 }
645
646 return NULL;
647 }
648
readAttribute(Attribute * attrib)649 void WbxmlParser::readAttribute(Attribute * attrib)
650 {
651 // attribute start: attrib start token, LITERAL or END
652 int attrStart = readByte();
653 const char * name;
654 const char * valuePrefix = NULL;
655
656 if (attrStart == TOKEN_LITERAL) {
657 name = resolveStrTableRef();
658 } else {
659 if (attrStart == TOKEN_SWITCH_PAGE) {
660 mCurrAttrPage = readByte();
661 attrStart = readByte();
662 }
663 name = lookupAttrName(attrStart, &valuePrefix);
664 }
665 if (name == NULL) {
666 longjmp(mJmpbuf, ERROR_UNRECOGNIZED_ATTR);
667 }
668 attrib->name = name;
669 attrib->value = "";
670 if (valuePrefix != NULL) {
671 attrib->value = valuePrefix;
672 }
673
674 // now attribute value: zero or more value, string, entity or extension tokens
675 for (;;) {
676 int valueToken = peekByte();
677 if (isAttrStart(valueToken) || valueToken == TOKEN_END) {
678 // An attribute start token, a LITERAL token or the END token
679 // indicates the end of an attribute value.
680 return;
681 }
682 switch (valueToken) {
683 case TOKEN_ENTITY:
684 case TOKEN_STR_I:
685 case TOKEN_STR_T:
686 readString(attrib->value);
687 break;
688
689 case TOKEN_EXT_I_0:
690 case TOKEN_EXT_I_1:
691 case TOKEN_EXT_I_2:
692 case TOKEN_EXT_0:
693 case TOKEN_EXT_1:
694 case TOKEN_EXT_2:
695 //TODO: document type specific
696 printf ("Unsupported Token 0x%x\n", valueToken);
697 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
698 break;
699
700 default:
701 //TODO
702 printf ("Unknown Token 0x%x\n", valueToken);
703 longjmp(mJmpbuf, ERROR_NOT_SUPPORTED_YET);
704 break;
705 }
706 }
707 }
708
709