1 /*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
12
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
19 following conditions:
20
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32
33 #include <stddef.h>
34 #include <string.h> /* memcpy */
35 #include <stdbool.h>
36
37 #ifdef _WIN32
38 # include "winconfig.h"
39 #else
40 # ifdef HAVE_EXPAT_CONFIG_H
41 # include <expat_config.h>
42 # endif
43 #endif /* ndef _WIN32 */
44
45 #include "expat_external.h"
46 #include "internal.h"
47 #include "xmltok.h"
48 #include "nametab.h"
49
50 #ifdef XML_DTD
51 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
52 #else
53 # define IGNORE_SECTION_TOK_VTABLE /* as nothing */
54 #endif
55
56 #define VTABLE1 \
57 {PREFIX(prologTok), PREFIX(contentTok), \
58 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
59 {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
60 PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
61 PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
62 PREFIX(updatePosition), PREFIX(isPublicId)
63
64 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
65
/* Test the naming bitmap for the 16-bit character whose high byte is hi and
   whose low byte is lo. pages[hi] selects a block of eight words in
   namingBitmap, the top 3 bits of lo select the word inside that block, and
   the low 5 bits of lo select the bit (0-31) inside that word. The result
   is nonzero when the bit is set. Note that pages and hi are expanded
   without surrounding parentheses.
*/
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))
68
/* A 2-byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 bits of the first byte and the bottom 6 bits of the second
   byte. We need 8 bits to index into pages, 3 bits to add to that index
   and 5 bits to generate the mask.

   byte must point at (at least) two readable unsigned bytes. The result is
   nonzero when the character's bit is set in namingBitmap.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))
77
/* A 3-byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the three bytes. We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.

   byte must point at (at least) three readable unsigned bytes. The result
   is nonzero when the character's bit is set in namingBitmap.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
89
/* Length-dispatching form of the naming lookup: n == 2 uses
   UTF8_GET_NAMING2, n == 3 uses UTF8_GET_NAMING3, and any other length
   evaluates to 0. p is cast to const unsigned char * before indexing.
*/
#define UTF8_GET_NAMING(pages, p, n)                                           \
  ((n) == 2                                                                    \
       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))                   \
       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
94
95 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
96 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
97 with the additional restriction of not allowing the Unicode
98 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
99 Implementation details:
100 (A & 0x80) == 0 means A < 0x80
101 and
102 (A & 0xC0) == 0xC0 means A > 0xBF
103 */
104
/* Nonzero when the 2-byte sequence at p (unsigned bytes) is malformed:
   the lead byte is below 0xC2, or the second byte lies outside 0x80..0xBF.
   Note that the first use of p is expanded as (*p), without parentheses
   around p itself.
*/
#define UTF8_INVALID2(p)                                                       \
  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
107
/* Nonzero when the 3-byte sequence at p (unsigned bytes) is malformed.
   Third byte: must be 0x80..0xBF, narrowed to 0x80..0xBD after the prefix
   EF,BF so that EF,BF,BE and EF,BF,BF are rejected.
   Second byte: 0xA0..0xBF after lead 0xE0, 0x80..0x9F after lead 0xED,
   and 0x80..0xBF after any other lead byte.
*/
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*p) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                          \
                                      : ((p)[2] & 0xC0) == 0xC0)               \
   || ((*p) == 0xE0                                                            \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
116
/* Nonzero when the 4-byte sequence at p (unsigned bytes) is malformed.
   Third and fourth bytes: must each be 0x80..0xBF.
   Second byte: 0x90..0xBF after lead 0xF0, 0x80..0x8F after lead 0xF4,
   and 0x80..0xBF after any other lead byte.
*/
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*p) == 0xF0                                                            \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
124
125 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)126 isNever(const ENCODING *enc, const char *p) {
127 UNUSED_P(enc);
128 UNUSED_P(p);
129 return 0;
130 }
131
132 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)133 utf8_isName2(const ENCODING *enc, const char *p) {
134 UNUSED_P(enc);
135 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
136 }
137
138 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)139 utf8_isName3(const ENCODING *enc, const char *p) {
140 UNUSED_P(enc);
141 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
142 }
143
144 #define utf8_isName4 isNever
145
146 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)147 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
148 UNUSED_P(enc);
149 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
150 }
151
152 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)153 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
154 UNUSED_P(enc);
155 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
156 }
157
158 #define utf8_isNmstrt4 isNever
159
160 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)161 utf8_isInvalid2(const ENCODING *enc, const char *p) {
162 UNUSED_P(enc);
163 return UTF8_INVALID2((const unsigned char *)p);
164 }
165
166 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)167 utf8_isInvalid3(const ENCODING *enc, const char *p) {
168 UNUSED_P(enc);
169 return UTF8_INVALID3((const unsigned char *)p);
170 }
171
172 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)173 utf8_isInvalid4(const ENCODING *enc, const char *p) {
174 UNUSED_P(enc);
175 return UTF8_INVALID4((const unsigned char *)p);
176 }
177
/* An ENCODING extended with a per-byte classification table and the
   per-encoding character predicates used by the tokenizer macros.

   The ENCODING member must stay first: AS_NORMAL_ENCODING() and the
   *_BYTE_TYPE macros cast an ENCODING pointer straight to this struct.
   Instances in this file are built with positional initializers
   (STANDARD_VTABLE / NORMAL_VTABLE / NULL_VTABLE), so the member order
   must not change either.
*/
struct normal_encoding {
  ENCODING enc;
  /* Byte type (a BT_* value) for each possible byte value; indexed with
     the byte cast to unsigned char. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* Function-pointer forms of the BYTE_TYPE, IS_NAME_CHAR_MINBPC,
     IS_NMSTRT_CHAR_MINBPC, BYTE_TO_ASCII and CHAR_MATCHES macros; only
     present in XML_MIN_SIZE builds. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Predicates for 2-, 3- and 4-byte characters, reached through
     IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR. Encodings built
     with NULL_VTABLE leave all nine as NULL. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
198
199 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
200
201 #ifdef XML_MIN_SIZE
202
203 # define STANDARD_VTABLE(E) \
204 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
205
206 #else
207
208 # define STANDARD_VTABLE(E) /* as nothing */
209
210 #endif
211
212 #define NORMAL_VTABLE(E) \
213 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
214 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
215
216 #define NULL_VTABLE \
217 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
218 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
219 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
220
221 static int FASTCALL checkCharRefNumber(int);
222
223 #include "xmltok_impl.h"
224 #include "ascii.h"
225
226 #ifdef XML_MIN_SIZE
227 # define sb_isNameMin isNever
228 # define sb_isNmstrtMin isNever
229 #endif
230
231 #ifdef XML_MIN_SIZE
232 # define MINBPC(enc) ((enc)->minBytesPerChar)
233 #else
234 /* minimum bytes per character */
235 # define MINBPC(enc) 1
236 #endif
237
238 #define SB_BYTE_TYPE(enc, p) \
239 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
240
241 #ifdef XML_MIN_SIZE
242 static int PTRFASTCALL
sb_byteType(const ENCODING * enc,const char * p)243 sb_byteType(const ENCODING *enc, const char *p) {
244 return SB_BYTE_TYPE(enc, p);
245 }
246 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
247 #else
248 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
249 #endif
250
251 #ifdef XML_MIN_SIZE
252 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
253 static int PTRFASTCALL
sb_byteToAscii(const ENCODING * enc,const char * p)254 sb_byteToAscii(const ENCODING *enc, const char *p) {
255 UNUSED_P(enc);
256 return *p;
257 }
258 #else
259 # define BYTE_TO_ASCII(enc, p) (*(p))
260 #endif
261
262 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
263 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
264 #define IS_INVALID_CHAR(enc, p, n) \
265 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
266
267 #ifdef XML_MIN_SIZE
268 # define IS_NAME_CHAR_MINBPC(enc, p) \
269 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
270 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \
271 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
272 #else
273 # define IS_NAME_CHAR_MINBPC(enc, p) (0)
274 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
275 #endif
276
277 #ifdef XML_MIN_SIZE
278 # define CHAR_MATCHES(enc, p, c) \
279 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
280 static int PTRCALL
sb_charMatches(const ENCODING * enc,const char * p,int c)281 sb_charMatches(const ENCODING *enc, const char *p, int c) {
282 UNUSED_P(enc);
283 return *p == c;
284 }
285 #else
286 /* c is an ASCII character */
287 # define CHAR_MATCHES(enc, p, c) (*(p) == c)
288 #endif
289
290 #define PREFIX(ident) normal_##ident
291 #define XML_TOK_IMPL_C
292 #include "xmltok_impl.c"
293 #undef XML_TOK_IMPL_C
294
295 #undef MINBPC
296 #undef BYTE_TYPE
297 #undef BYTE_TO_ASCII
298 #undef CHAR_MATCHES
299 #undef IS_NAME_CHAR
300 #undef IS_NAME_CHAR_MINBPC
301 #undef IS_NMSTRT_CHAR
302 #undef IS_NMSTRT_CHAR_MINBPC
303 #undef IS_INVALID_CHAR
304
/* Marker bits of a UTF-8 lead byte, by sequence length. The converters in
   this file OR these into the payload bits when emitting the first byte
   of a multi-byte sequence. */
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
       UTF8_cval1 = 0x00,
       UTF8_cval2 = 0xc0,
       UTF8_cval3 = 0xe0,
       UTF8_cval4 = 0xf0
};
311
/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end towards from. Bytes that are not lead
   bytes and not ASCII are stepped over and counted. An ASCII byte stops
   the scan. A lead byte stops the scan when enough bytes were counted
   behind it to complete its sequence; otherwise the counter restarts and
   the scan continues past that lead byte, dropping it and whatever
   followed it.
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *cursor = *fromLimRef;
  size_t trailing = 0; /* bytes stepped over since the last reset */

  while (cursor > from) {
    const unsigned char byte = (unsigned char)cursor[-1];
    size_t seqLen = 0; /* 0: not a lead byte */

    if ((byte & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((byte & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    } else if ((byte & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((byte & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    }

    if (seqLen != 0) {
      if (trailing + 1 >= seqLen) {
        /* The sequence is complete: keep it in full and stop. */
        cursor += seqLen - 1;
        break;
      }
      /* Incomplete sequence: restart the count and step past the lead. */
      trailing = 0;
    }

    cursor--;
    trailing++;
  }

  *fromLimRef = cursor;
}
350
351 static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)352 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
353 char **toP, const char *toLim) {
354 bool input_incomplete = false;
355 bool output_exhausted = false;
356
357 /* Avoid copying partial characters (due to limited space). */
358 const ptrdiff_t bytesAvailable = fromLim - *fromP;
359 const ptrdiff_t bytesStorable = toLim - *toP;
360 UNUSED_P(enc);
361 if (bytesAvailable > bytesStorable) {
362 fromLim = *fromP + bytesStorable;
363 output_exhausted = true;
364 }
365
366 /* Avoid copying partial characters (from incomplete input). */
367 {
368 const char *const fromLimBefore = fromLim;
369 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
370 if (fromLim < fromLimBefore) {
371 input_incomplete = true;
372 }
373 }
374
375 {
376 const ptrdiff_t bytesToCopy = fromLim - *fromP;
377 memcpy(*toP, *fromP, bytesToCopy);
378 *fromP += bytesToCopy;
379 *toP += bytesToCopy;
380 }
381
382 if (output_exhausted) /* needs to go first */
383 return XML_CONVERT_OUTPUT_EXHAUSTED;
384 else if (input_incomplete)
385 return XML_CONVERT_INPUT_INCOMPLETE;
386 else
387 return XML_CONVERT_COMPLETED;
388 }
389
390 static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)391 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
392 unsigned short **toP, const unsigned short *toLim) {
393 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
394 unsigned short *to = *toP;
395 const char *from = *fromP;
396 while (from < fromLim && to < toLim) {
397 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
398 case BT_LEAD2:
399 if (fromLim - from < 2) {
400 res = XML_CONVERT_INPUT_INCOMPLETE;
401 goto after;
402 }
403 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
404 from += 2;
405 break;
406 case BT_LEAD3:
407 if (fromLim - from < 3) {
408 res = XML_CONVERT_INPUT_INCOMPLETE;
409 goto after;
410 }
411 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
412 | (from[2] & 0x3f));
413 from += 3;
414 break;
415 case BT_LEAD4: {
416 unsigned long n;
417 if (toLim - to < 2) {
418 res = XML_CONVERT_OUTPUT_EXHAUSTED;
419 goto after;
420 }
421 if (fromLim - from < 4) {
422 res = XML_CONVERT_INPUT_INCOMPLETE;
423 goto after;
424 }
425 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
426 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
427 n -= 0x10000;
428 to[0] = (unsigned short)((n >> 10) | 0xD800);
429 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
430 to += 2;
431 from += 4;
432 } break;
433 default:
434 *to++ = *from++;
435 break;
436 }
437 }
438 if (from < fromLim)
439 res = XML_CONVERT_OUTPUT_EXHAUSTED;
440 after:
441 *fromP = from;
442 *toP = to;
443 return res;
444 }
445
446 #ifdef XML_NS
/* UTF-8 encoding descriptor for XML_NS builds. The byte-type table is
   asciitab.h followed by utf8tab.h, with BT_COLON left as defined
   elsewhere. The three trailing ENCODING initializers (1, 1, 0) are
   presumably minBytesPerChar, isUtf8 and isUtf16 -- confirm against the
   ENCODING declaration. */
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
454 #endif
455
/* UTF-8 encoding descriptor. While asciitab.h is included BT_COLON is
   defined as BT_NMSTRT, so any table entry written as BT_COLON is
   classified as BT_NMSTRT instead. utf8tab.h supplies the rest of the
   table. The trailing ENCODING initializers (1, 1, 0) are presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against the ENCODING
   declaration. */
static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
465
466 #ifdef XML_NS
467
/* "Internal" UTF-8 encoding descriptor for XML_NS builds. It matches
   utf8_encoding_ns except that the first part of the byte-type table comes
   from iasciitab.h instead of asciitab.h (how the two tables differ is not
   visible here). */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
475
476 #endif
477
/* "Internal" UTF-8 encoding descriptor. It matches utf8_encoding except
   that the first part of the byte-type table comes from iasciitab.h
   instead of asciitab.h; BT_COLON is again mapped to BT_NMSTRT while that
   table is included. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
487
488 static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)489 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
490 char **toP, const char *toLim) {
491 UNUSED_P(enc);
492 for (;;) {
493 unsigned char c;
494 if (*fromP == fromLim)
495 return XML_CONVERT_COMPLETED;
496 c = (unsigned char)**fromP;
497 if (c & 0x80) {
498 if (toLim - *toP < 2)
499 return XML_CONVERT_OUTPUT_EXHAUSTED;
500 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
501 *(*toP)++ = (char)((c & 0x3f) | 0x80);
502 (*fromP)++;
503 } else {
504 if (*toP == toLim)
505 return XML_CONVERT_OUTPUT_EXHAUSTED;
506 *(*toP)++ = *(*fromP)++;
507 }
508 }
509 }
510
511 static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)512 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
513 unsigned short **toP, const unsigned short *toLim) {
514 UNUSED_P(enc);
515 while (*fromP < fromLim && *toP < toLim)
516 *(*toP)++ = (unsigned char)*(*fromP)++;
517
518 if ((*toP == toLim) && (*fromP < fromLim))
519 return XML_CONVERT_OUTPUT_EXHAUSTED;
520 else
521 return XML_CONVERT_COMPLETED;
522 }
523
524 #ifdef XML_NS
525
/* Latin-1 encoding descriptor for XML_NS builds. The byte-type table is
   asciitab.h followed by latin1tab.h. The multi-byte predicates are all
   NULL (NULL_VTABLE). */
static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
533
534 #endif
535
/* Latin-1 encoding descriptor. BT_COLON is mapped to BT_NMSTRT while
   asciitab.h is included; latin1tab.h supplies the rest of the table. The
   multi-byte predicates are all NULL (NULL_VTABLE). */
static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
545
546 static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)547 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
548 char **toP, const char *toLim) {
549 UNUSED_P(enc);
550 while (*fromP < fromLim && *toP < toLim)
551 *(*toP)++ = *(*fromP)++;
552
553 if ((*toP == toLim) && (*fromP < fromLim))
554 return XML_CONVERT_OUTPUT_EXHAUSTED;
555 else
556 return XML_CONVERT_COMPLETED;
557 }
558
559 #ifdef XML_NS
560
/* US-ASCII encoding descriptor for XML_NS builds. Only asciitab.h
   contributes to the byte-type table; the remaining entries are
   zero-initialized, which the comment below equates with BT_NONXML.
   UTF-16 output reuses latin1_toUtf16. */
static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
568
569 #endif
570
/* US-ASCII encoding descriptor. BT_COLON is mapped to BT_NMSTRT while
   asciitab.h is included. The table entries not supplied by that header
   are zero-initialized, which the comment below equates with BT_NONXML.
   UTF-16 output reuses latin1_toUtf16. */
static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
580
581 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)582 unicode_byte_type(char hi, char lo) {
583 switch ((unsigned char)hi) {
584 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
585 case 0xD8:
586 case 0xD9:
587 case 0xDA:
588 case 0xDB:
589 return BT_LEAD4;
590 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
591 case 0xDC:
592 case 0xDD:
593 case 0xDE:
594 case 0xDF:
595 return BT_TRAIL;
596 case 0xFF:
597 switch ((unsigned char)lo) {
598 case 0xFF: /* noncharacter-FFFF */
599 case 0xFE: /* noncharacter-FFFE */
600 return BT_NONXML;
601 }
602 break;
603 }
604 return BT_NONASCII;
605 }
606
/* Generate the function E##toUtf8, which converts UTF-16 input to UTF-8.

   GET_LO and GET_HI must be defined at the point of expansion; they fetch
   the low and high byte of the code unit at a pointer and so fix the byte
   order of the generated converter.

   The input length is first rounded down to an even number of bytes. Each
   code unit is then encoded according to its high byte:
     - hi == 0 and lo < 0x80: one output byte;
     - hi in 0x00..0x07 otherwise: two output bytes;
     - hi in 0xD8..0xDB: the unit and the one after it are consumed
       together and four output bytes are written;
     - anything else: three output bytes.

   *toP is advanced as bytes are written; *fromP is only updated when the
   function returns. XML_CONVERT_OUTPUT_EXHAUSTED is returned when the
   next character does not fit (the output check precedes the input check
   in the 0xD8..0xDB case); XML_CONVERT_INPUT_INCOMPLETE when a unit in
   0xD8..0xDB is not followed by another unit before fromLim.

   NOTE(review): the final "from < fromLim" test looks unreachable, since
   the loop only exits normally once from >= fromLim -- confirm.
*/
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
683
/* Generate the function E##toUtf16, which copies UTF-16 input bytes into
   an array of unsigned short code units, assembling each unit as
   (GET_HI << 8) | GET_LO. GET_LO and GET_HI must be defined at the point
   of expansion and fix the byte order of the generated converter.

   The input length is first rounded down to an even number of bytes. If
   the input is larger than the output can hold and its last code unit has
   a high byte in 0xD8..0xDF (a surrogate), that unit is held back and the
   pending result becomes XML_CONVERT_INPUT_INCOMPLETE.

   *fromP and *toP are advanced in place. Returns
   XML_CONVERT_OUTPUT_EXHAUSTED when the output fills up with input still
   remaining; otherwise the pending result (XML_CONVERT_COMPLETED or
   XML_CONVERT_INPUT_INCOMPLETE).
*/
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
704
705 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
706 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
707 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
708
709 DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)710 DEFINE_UTF16_TO_UTF16(little2_)
711
712 #undef SET2
713 #undef GET_LO
714 #undef GET_HI
715
716 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
717 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
718 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
719
720 DEFINE_UTF16_TO_UTF8(big2_)
721 DEFINE_UTF16_TO_UTF16(big2_)
722
723 #undef SET2
724 #undef GET_LO
725 #undef GET_HI
726
727 #define LITTLE2_BYTE_TYPE(enc, p) \
728 ((p)[1] == 0 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
729 : unicode_byte_type((p)[1], (p)[0]))
730 #define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
731 #define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == c)
732 #define LITTLE2_IS_NAME_CHAR_MINBPC(p) \
733 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
734 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p) \
735 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
736
737 #ifdef XML_MIN_SIZE
738
739 static int PTRFASTCALL
740 little2_byteType(const ENCODING *enc, const char *p) {
741 return LITTLE2_BYTE_TYPE(enc, p);
742 }
743
744 static int PTRFASTCALL
little2_byteToAscii(const ENCODING * enc,const char * p)745 little2_byteToAscii(const ENCODING *enc, const char *p) {
746 UNUSED_P(enc);
747 return LITTLE2_BYTE_TO_ASCII(p);
748 }
749
750 static int PTRCALL
little2_charMatches(const ENCODING * enc,const char * p,int c)751 little2_charMatches(const ENCODING *enc, const char *p, int c) {
752 UNUSED_P(enc);
753 return LITTLE2_CHAR_MATCHES(p, c);
754 }
755
756 static int PTRFASTCALL
little2_isNameMin(const ENCODING * enc,const char * p)757 little2_isNameMin(const ENCODING *enc, const char *p) {
758 UNUSED_P(enc);
759 return LITTLE2_IS_NAME_CHAR_MINBPC(p);
760 }
761
762 static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING * enc,const char * p)763 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
764 UNUSED_P(enc);
765 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
766 }
767
768 # undef VTABLE
769 # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
770
771 #else /* not XML_MIN_SIZE */
772
773 # undef PREFIX
774 # define PREFIX(ident) little2_##ident
775 # define MINBPC(enc) 2
776 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
777 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
778 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
779 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
780 # define IS_NAME_CHAR(enc, p, n) 0
781 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
782 # define IS_NMSTRT_CHAR(enc, p, n) (0)
783 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
784
785 # define XML_TOK_IMPL_C
786 # include "xmltok_impl.c"
787 # undef XML_TOK_IMPL_C
788
789 # undef MINBPC
790 # undef BYTE_TYPE
791 # undef BYTE_TO_ASCII
792 # undef CHAR_MATCHES
793 # undef IS_NAME_CHAR
794 # undef IS_NAME_CHAR_MINBPC
795 # undef IS_NMSTRT_CHAR
796 # undef IS_NMSTRT_CHAR_MINBPC
797 # undef IS_INVALID_CHAR
798
799 #endif /* not XML_MIN_SIZE */
800
801 #ifdef XML_NS
802
/* Little-endian 2-byte encoding descriptor for XML_NS builds. The last
   ENCODING initializer is 1 only when BYTEORDER == 1234 (presumably the
   isUtf16 flag, i.e. the input byte order matches the host -- confirm
   against the ENCODING declaration). The byte-type table (asciitab.h +
   latin1tab.h) is used by LITTLE2_BYTE_TYPE for characters whose high
   byte is zero. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
816
817 #endif
818
/* Little-endian 2-byte encoding descriptor. BT_COLON is mapped to
   BT_NMSTRT while asciitab.h is included. The last ENCODING initializer
   is 1 only when BYTEORDER == 1234 (presumably the isUtf16 flag --
   confirm against the ENCODING declaration). */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
834
835 #if BYTEORDER != 4321
836
837 # ifdef XML_NS
838
/* "Internal" little-endian 2-byte encoding descriptor for XML_NS builds;
   only compiled when BYTEORDER != 4321. It uses iasciitab.h instead of
   asciitab.h, and its last ENCODING initializer is always 1. */
static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
846
847 # endif
848
/* "Internal" little-endian 2-byte encoding descriptor; only compiled when
   BYTEORDER != 4321. It uses iasciitab.h (with BT_COLON mapped to
   BT_NMSTRT) instead of asciitab.h, and its last ENCODING initializer is
   always 1. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
858
859 #endif
860
861 #define BIG2_BYTE_TYPE(enc, p) \
862 ((p)[0] == 0 \
863 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
864 : unicode_byte_type((p)[0], (p)[1]))
865 #define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
866 #define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == c)
867 #define BIG2_IS_NAME_CHAR_MINBPC(p) \
868 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
869 #define BIG2_IS_NMSTRT_CHAR_MINBPC(p) \
870 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
871
872 #ifdef XML_MIN_SIZE
873
874 static int PTRFASTCALL
big2_byteType(const ENCODING * enc,const char * p)875 big2_byteType(const ENCODING *enc, const char *p) {
876 return BIG2_BYTE_TYPE(enc, p);
877 }
878
879 static int PTRFASTCALL
big2_byteToAscii(const ENCODING * enc,const char * p)880 big2_byteToAscii(const ENCODING *enc, const char *p) {
881 UNUSED_P(enc);
882 return BIG2_BYTE_TO_ASCII(p);
883 }
884
885 static int PTRCALL
big2_charMatches(const ENCODING * enc,const char * p,int c)886 big2_charMatches(const ENCODING *enc, const char *p, int c) {
887 UNUSED_P(enc);
888 return BIG2_CHAR_MATCHES(p, c);
889 }
890
891 static int PTRFASTCALL
big2_isNameMin(const ENCODING * enc,const char * p)892 big2_isNameMin(const ENCODING *enc, const char *p) {
893 UNUSED_P(enc);
894 return BIG2_IS_NAME_CHAR_MINBPC(p);
895 }
896
897 static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING * enc,const char * p)898 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
899 UNUSED_P(enc);
900 return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
901 }
902
903 # undef VTABLE
904 # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
905
906 #else /* not XML_MIN_SIZE */
907
908 # undef PREFIX
909 # define PREFIX(ident) big2_##ident
910 # define MINBPC(enc) 2
911 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
912 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
913 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
914 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
915 # define IS_NAME_CHAR(enc, p, n) 0
916 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
917 # define IS_NMSTRT_CHAR(enc, p, n) (0)
918 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
919
920 # define XML_TOK_IMPL_C
921 # include "xmltok_impl.c"
922 # undef XML_TOK_IMPL_C
923
924 # undef MINBPC
925 # undef BYTE_TYPE
926 # undef BYTE_TO_ASCII
927 # undef CHAR_MATCHES
928 # undef IS_NAME_CHAR
929 # undef IS_NAME_CHAR_MINBPC
930 # undef IS_NMSTRT_CHAR
931 # undef IS_NMSTRT_CHAR_MINBPC
932 # undef IS_INVALID_CHAR
933
934 #endif /* not XML_MIN_SIZE */
935
936 #ifdef XML_NS
937
/* Big-endian 2-byte encoding descriptor for XML_NS builds. The last
   ENCODING initializer is 1 only when BYTEORDER == 4321 (presumably the
   isUtf16 flag, i.e. the input byte order matches the host -- confirm
   against the ENCODING declaration). The byte-type table (asciitab.h +
   latin1tab.h) is used by BIG2_BYTE_TYPE for characters whose high byte
   is zero. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
951
952 #endif
953
/* Big-endian 2-byte encoding descriptor. BT_COLON is mapped to BT_NMSTRT
   while asciitab.h is included. The last ENCODING initializer is 1 only
   when BYTEORDER == 4321 (presumably the isUtf16 flag -- confirm against
   the ENCODING declaration). */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
969
970 #if BYTEORDER != 1234
971
972 # ifdef XML_NS
973
/* "Internal" big-endian 2-byte encoding descriptor for XML_NS builds; only
   compiled when BYTEORDER != 1234. It uses iasciitab.h instead of
   asciitab.h, and its last ENCODING initializer is always 1. */
static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
981
982 # endif
983
/* "Internal" big-endian 2-byte encoding descriptor; only compiled when
   BYTEORDER != 1234. It uses iasciitab.h (with BT_COLON mapped to
   BT_NMSTRT) instead of asciitab.h, and its last ENCODING initializer is
   always 1. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
993
994 #endif
995
996 #undef PREFIX
997
998 static int FASTCALL
streqci(const char * s1,const char * s2)999 streqci(const char *s1, const char *s2) {
1000 for (;;) {
1001 char c1 = *s1++;
1002 char c2 = *s2++;
1003 if (ASCII_a <= c1 && c1 <= ASCII_z)
1004 c1 += ASCII_A - ASCII_a;
1005 if (ASCII_a <= c2 && c2 <= ASCII_z)
1006 /* The following line will never get executed. streqci() is
1007 * only called from two places, both of which guarantee to put
1008 * upper-case strings into s2.
1009 */
1010 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1011 if (c1 != c2)
1012 return 0;
1013 if (! c1)
1014 break;
1015 }
1016 return 1;
1017 }
1018
1019 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)1020 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1021 POSITION *pos) {
1022 UNUSED_P(enc);
1023 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1024 }
1025
1026 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)1027 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1028 char buf[1];
1029 char *p = buf;
1030 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1031 if (p == buf)
1032 return -1;
1033 else
1034 return buf[0];
1035 }
1036
1037 static int FASTCALL
isSpace(int c)1038 isSpace(int c) {
1039 switch (c) {
1040 case 0x20:
1041 case 0xD:
1042 case 0xA:
1043 case 0x9:
1044 return 1;
1045 }
1046 return 0;
1047 }
1048
1049 /* Return 1 if there's just optional white space or there's an S
1050 followed by name=val.
1051 */
1052 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1053 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1054 const char **namePtr, const char **nameEndPtr,
1055 const char **valPtr, const char **nextTokPtr) {
1056 int c;
1057 char open;
1058 if (ptr == end) {
1059 *namePtr = NULL;
1060 return 1;
1061 }
1062 if (! isSpace(toAscii(enc, ptr, end))) {
1063 *nextTokPtr = ptr;
1064 return 0;
1065 }
1066 do {
1067 ptr += enc->minBytesPerChar;
1068 } while (isSpace(toAscii(enc, ptr, end)));
1069 if (ptr == end) {
1070 *namePtr = NULL;
1071 return 1;
1072 }
1073 *namePtr = ptr;
1074 for (;;) {
1075 c = toAscii(enc, ptr, end);
1076 if (c == -1) {
1077 *nextTokPtr = ptr;
1078 return 0;
1079 }
1080 if (c == ASCII_EQUALS) {
1081 *nameEndPtr = ptr;
1082 break;
1083 }
1084 if (isSpace(c)) {
1085 *nameEndPtr = ptr;
1086 do {
1087 ptr += enc->minBytesPerChar;
1088 } while (isSpace(c = toAscii(enc, ptr, end)));
1089 if (c != ASCII_EQUALS) {
1090 *nextTokPtr = ptr;
1091 return 0;
1092 }
1093 break;
1094 }
1095 ptr += enc->minBytesPerChar;
1096 }
1097 if (ptr == *namePtr) {
1098 *nextTokPtr = ptr;
1099 return 0;
1100 }
1101 ptr += enc->minBytesPerChar;
1102 c = toAscii(enc, ptr, end);
1103 while (isSpace(c)) {
1104 ptr += enc->minBytesPerChar;
1105 c = toAscii(enc, ptr, end);
1106 }
1107 if (c != ASCII_QUOT && c != ASCII_APOS) {
1108 *nextTokPtr = ptr;
1109 return 0;
1110 }
1111 open = (char)c;
1112 ptr += enc->minBytesPerChar;
1113 *valPtr = ptr;
1114 for (;; ptr += enc->minBytesPerChar) {
1115 c = toAscii(enc, ptr, end);
1116 if (c == open)
1117 break;
1118 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1119 && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1120 && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1121 *nextTokPtr = ptr;
1122 return 0;
1123 }
1124 }
1125 *nextTokPtr = ptr + enc->minBytesPerChar;
1126 return 1;
1127 }
1128
/* Keyword strings matched by doParseXmlDecl via XmlNameMatchesAscii.  They
   are spelled with ASCII_* constants rather than string literals (presumably
   so the byte values do not depend on the compiler's execution character
   set -- confirm against the ASCII_* definitions). */

/* Pseudo-attribute name "version". */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

/* Pseudo-attribute name "encoding". */
static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

/* Pseudo-attribute name "standalone". */
static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* Accepted values of the standalone pseudo-attribute: "yes" and "no". */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1142
1143 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1144 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1145 const char *),
1146 int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1147 const char *end, const char **badPtr, const char **versionPtr,
1148 const char **versionEndPtr, const char **encodingName,
1149 const ENCODING **encoding, int *standalone) {
1150 const char *val = NULL;
1151 const char *name = NULL;
1152 const char *nameEnd = NULL;
1153 ptr += 5 * enc->minBytesPerChar;
1154 end -= 2 * enc->minBytesPerChar;
1155 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1156 || ! name) {
1157 *badPtr = ptr;
1158 return 0;
1159 }
1160 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1161 if (! isGeneralTextEntity) {
1162 *badPtr = name;
1163 return 0;
1164 }
1165 } else {
1166 if (versionPtr)
1167 *versionPtr = val;
1168 if (versionEndPtr)
1169 *versionEndPtr = ptr;
1170 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1171 *badPtr = ptr;
1172 return 0;
1173 }
1174 if (! name) {
1175 if (isGeneralTextEntity) {
1176 /* a TextDecl must have an EncodingDecl */
1177 *badPtr = ptr;
1178 return 0;
1179 }
1180 return 1;
1181 }
1182 }
1183 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1184 int c = toAscii(enc, val, end);
1185 if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1186 *badPtr = val;
1187 return 0;
1188 }
1189 if (encodingName)
1190 *encodingName = val;
1191 if (encoding)
1192 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1193 if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1194 *badPtr = ptr;
1195 return 0;
1196 }
1197 if (! name)
1198 return 1;
1199 }
1200 if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1201 || isGeneralTextEntity) {
1202 *badPtr = name;
1203 return 0;
1204 }
1205 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1206 if (standalone)
1207 *standalone = 1;
1208 } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1209 if (standalone)
1210 *standalone = 0;
1211 } else {
1212 *badPtr = val;
1213 return 0;
1214 }
1215 while (isSpace(toAscii(enc, ptr, end)))
1216 ptr += enc->minBytesPerChar;
1217 if (ptr != end) {
1218 *badPtr = ptr;
1219 return 0;
1220 }
1221 return 1;
1222 }
1223
1224 static int FASTCALL
checkCharRefNumber(int result)1225 checkCharRefNumber(int result) {
1226 switch (result >> 8) {
1227 case 0xD8:
1228 case 0xD9:
1229 case 0xDA:
1230 case 0xDB:
1231 case 0xDC:
1232 case 0xDD:
1233 case 0xDE:
1234 case 0xDF:
1235 return -1;
1236 case 0:
1237 if (latin1_encoding.type[result] == BT_NONXML)
1238 return -1;
1239 break;
1240 case 0xFF:
1241 if (result == 0xFFFE || result == 0xFFFF)
1242 return -1;
1243 break;
1244 }
1245 return result;
1246 }
1247
1248 int FASTCALL
XmlUtf8Encode(int c,char * buf)1249 XmlUtf8Encode(int c, char *buf) {
1250 enum {
1251 /* minN is minimum legal resulting value for N byte sequence */
1252 min2 = 0x80,
1253 min3 = 0x800,
1254 min4 = 0x10000
1255 };
1256
1257 if (c < 0)
1258 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1259 if (c < min2) {
1260 buf[0] = (char)(c | UTF8_cval1);
1261 return 1;
1262 }
1263 if (c < min3) {
1264 buf[0] = (char)((c >> 6) | UTF8_cval2);
1265 buf[1] = (char)((c & 0x3f) | 0x80);
1266 return 2;
1267 }
1268 if (c < min4) {
1269 buf[0] = (char)((c >> 12) | UTF8_cval3);
1270 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1271 buf[2] = (char)((c & 0x3f) | 0x80);
1272 return 3;
1273 }
1274 if (c < 0x110000) {
1275 buf[0] = (char)((c >> 18) | UTF8_cval4);
1276 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1277 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1278 buf[3] = (char)((c & 0x3f) | 0x80);
1279 return 4;
1280 }
1281 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1282 }
1283
1284 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1285 XmlUtf16Encode(int charNum, unsigned short *buf) {
1286 if (charNum < 0)
1287 return 0;
1288 if (charNum < 0x10000) {
1289 buf[0] = (unsigned short)charNum;
1290 return 1;
1291 }
1292 if (charNum < 0x110000) {
1293 charNum -= 0x10000;
1294 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1295 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1296 return 2;
1297 }
1298 return 0;
1299 }
1300
/* State for an encoding described by a caller-supplied byte table; filled in
   by XmlInitUnknownEncoding. */
struct unknown_encoding {
  /* Starts as a copy of latin1_encoding; type[] is then rewritten per byte. */
  struct normal_encoding normal;
  /* Callback used for bytes that start a multi-byte sequence. */
  CONVERTER convert;
  /* Passed as the first argument to convert. */
  void *userData;
  /* UTF-16 code unit per byte; 0 means the byte must go through convert. */
  unsigned short utf16[256];
  /* Per byte: [0] is the UTF-8 length, followed by that many bytes; a length
     of 0 means the byte must go through convert. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1310
1311 int
XmlSizeOfUnknownEncoding(void)1312 XmlSizeOfUnknownEncoding(void) {
1313 return sizeof(struct unknown_encoding);
1314 }
1315
1316 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1317 unknown_isName(const ENCODING *enc, const char *p) {
1318 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1319 int c = uenc->convert(uenc->userData, p);
1320 if (c & ~0xFFFF)
1321 return 0;
1322 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1323 }
1324
1325 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1326 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1327 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1328 int c = uenc->convert(uenc->userData, p);
1329 if (c & ~0xFFFF)
1330 return 0;
1331 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1332 }
1333
1334 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1335 unknown_isInvalid(const ENCODING *enc, const char *p) {
1336 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1337 int c = uenc->convert(uenc->userData, p);
1338 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1339 }
1340
1341 static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1342 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1343 char **toP, const char *toLim) {
1344 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1345 char buf[XML_UTF8_ENCODE_MAX];
1346 for (;;) {
1347 const char *utf8;
1348 int n;
1349 if (*fromP == fromLim)
1350 return XML_CONVERT_COMPLETED;
1351 utf8 = uenc->utf8[(unsigned char)**fromP];
1352 n = *utf8++;
1353 if (n == 0) {
1354 int c = uenc->convert(uenc->userData, *fromP);
1355 n = XmlUtf8Encode(c, buf);
1356 if (n > toLim - *toP)
1357 return XML_CONVERT_OUTPUT_EXHAUSTED;
1358 utf8 = buf;
1359 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1360 - (BT_LEAD2 - 2));
1361 } else {
1362 if (n > toLim - *toP)
1363 return XML_CONVERT_OUTPUT_EXHAUSTED;
1364 (*fromP)++;
1365 }
1366 memcpy(*toP, utf8, n);
1367 *toP += n;
1368 }
1369 }
1370
1371 static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1372 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1373 unsigned short **toP, const unsigned short *toLim) {
1374 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1375 while (*fromP < fromLim && *toP < toLim) {
1376 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1377 if (c == 0) {
1378 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1379 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1380 - (BT_LEAD2 - 2));
1381 } else
1382 (*fromP)++;
1383 *(*toP)++ = c;
1384 }
1385
1386 if ((*toP == toLim) && (*fromP < fromLim))
1387 return XML_CONVERT_OUTPUT_EXHAUSTED;
1388 else
1389 return XML_CONVERT_COMPLETED;
1390 }
1391
1392 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)1393 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1394 void *userData) {
1395 int i;
1396 struct unknown_encoding *e = (struct unknown_encoding *)mem;
1397 memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1398 for (i = 0; i < 128; i++)
1399 if (latin1_encoding.type[i] != BT_OTHER
1400 && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1401 return 0;
1402 for (i = 0; i < 256; i++) {
1403 int c = table[i];
1404 if (c == -1) {
1405 e->normal.type[i] = BT_MALFORM;
1406 /* This shouldn't really get used. */
1407 e->utf16[i] = 0xFFFF;
1408 e->utf8[i][0] = 1;
1409 e->utf8[i][1] = 0;
1410 } else if (c < 0) {
1411 if (c < -4)
1412 return 0;
1413 /* Multi-byte sequences need a converter function */
1414 if (! convert)
1415 return 0;
1416 e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1417 e->utf8[i][0] = 0;
1418 e->utf16[i] = 0;
1419 } else if (c < 0x80) {
1420 if (latin1_encoding.type[c] != BT_OTHER
1421 && latin1_encoding.type[c] != BT_NONXML && c != i)
1422 return 0;
1423 e->normal.type[i] = latin1_encoding.type[c];
1424 e->utf8[i][0] = 1;
1425 e->utf8[i][1] = (char)c;
1426 e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1427 } else if (checkCharRefNumber(c) < 0) {
1428 e->normal.type[i] = BT_NONXML;
1429 /* This shouldn't really get used. */
1430 e->utf16[i] = 0xFFFF;
1431 e->utf8[i][0] = 1;
1432 e->utf8[i][1] = 0;
1433 } else {
1434 if (c > 0xFFFF)
1435 return 0;
1436 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1437 e->normal.type[i] = BT_NMSTRT;
1438 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1439 e->normal.type[i] = BT_NAME;
1440 else
1441 e->normal.type[i] = BT_OTHER;
1442 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1443 e->utf16[i] = (unsigned short)c;
1444 }
1445 }
1446 e->userData = userData;
1447 e->convert = convert;
1448 if (convert) {
1449 e->normal.isName2 = unknown_isName;
1450 e->normal.isName3 = unknown_isName;
1451 e->normal.isName4 = unknown_isName;
1452 e->normal.isNmstrt2 = unknown_isNmstrt;
1453 e->normal.isNmstrt3 = unknown_isNmstrt;
1454 e->normal.isNmstrt4 = unknown_isNmstrt;
1455 e->normal.isInvalid2 = unknown_isInvalid;
1456 e->normal.isInvalid3 = unknown_isInvalid;
1457 e->normal.isInvalid4 = unknown_isInvalid;
1458 }
1459 e->normal.enc.utf8Convert = unknown_toUtf8;
1460 e->normal.enc.utf16Convert = unknown_toUtf16;
1461 return &(e->normal.enc);
1462 }
1463
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC = 1,
  UTF_8_ENC = 2,
  UTF_16_ENC = 3,
  UTF_16BE_ENC = 4,
  UTF_16LE_ENC = 5,
  /* must match encodingNames up to here */
  NO_ENC = 6
};
1477
/* Upper-case names of the built-in encodings, spelled with ASCII_* constants.
   getEncodingIndex lists them in the same order as the encoding index
   enumeration. */

/* "ISO-8859-1" */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'};
/* "US-ASCII" */
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I, '\0'};
/* "UTF-8" */
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
/* "UTF-16" */
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
/* "UTF-16BE" */
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
/* "UTF-16LE" */
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1494
1495 static int FASTCALL
getEncodingIndex(const char * name)1496 getEncodingIndex(const char *name) {
1497 static const char *const encodingNames[] = {
1498 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1499 };
1500 int i;
1501 if (name == NULL)
1502 return NO_ENC;
1503 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1504 if (streqci(name, encodingNames[i]))
1505 return i;
1506 return UNKNOWN_ENC;
1507 }
1508
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores index i, narrowed to char.  The argument
   is parenthesized so that an expression argument is cast as a whole rather
   than only its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1515
1516 /* This is what detects the encoding. encodingTable maps from
1517 encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1518 the external (protocol) specified encoding; state is
1519 XML_CONTENT_STATE if we're parsing an external text entity, and
1520 XML_PROLOG_STATE otherwise.
1521 */
1522
1523 static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)1524 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1525 int state, const char *ptr, const char *end, const char **nextTokPtr) {
1526 const ENCODING **encPtr;
1527
1528 if (ptr >= end)
1529 return XML_TOK_NONE;
1530 encPtr = enc->encPtr;
1531 if (ptr + 1 == end) {
1532 /* only a single byte available for auto-detection */
1533 #ifndef XML_DTD /* FIXME */
1534 /* a well-formed document entity must have more than one byte */
1535 if (state != XML_CONTENT_STATE)
1536 return XML_TOK_PARTIAL;
1537 #endif
1538 /* so we're parsing an external text entity... */
1539 /* if UTF-16 was externally specified, then we need at least 2 bytes */
1540 switch (INIT_ENC_INDEX(enc)) {
1541 case UTF_16_ENC:
1542 case UTF_16LE_ENC:
1543 case UTF_16BE_ENC:
1544 return XML_TOK_PARTIAL;
1545 }
1546 switch ((unsigned char)*ptr) {
1547 case 0xFE:
1548 case 0xFF:
1549 case 0xEF: /* possibly first byte of UTF-8 BOM */
1550 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1551 break;
1552 /* fall through */
1553 case 0x00:
1554 case 0x3C:
1555 return XML_TOK_PARTIAL;
1556 }
1557 } else {
1558 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1559 case 0xFEFF:
1560 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1561 break;
1562 *nextTokPtr = ptr + 2;
1563 *encPtr = encodingTable[UTF_16BE_ENC];
1564 return XML_TOK_BOM;
1565 /* 00 3C is handled in the default case */
1566 case 0x3C00:
1567 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1568 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1569 && state == XML_CONTENT_STATE)
1570 break;
1571 *encPtr = encodingTable[UTF_16LE_ENC];
1572 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1573 case 0xFFFE:
1574 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1575 break;
1576 *nextTokPtr = ptr + 2;
1577 *encPtr = encodingTable[UTF_16LE_ENC];
1578 return XML_TOK_BOM;
1579 case 0xEFBB:
1580 /* Maybe a UTF-8 BOM (EF BB BF) */
1581 /* If there's an explicitly specified (external) encoding
1582 of ISO-8859-1 or some flavour of UTF-16
1583 and this is an external text entity,
1584 don't look for the BOM,
1585 because it might be a legal data.
1586 */
1587 if (state == XML_CONTENT_STATE) {
1588 int e = INIT_ENC_INDEX(enc);
1589 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1590 || e == UTF_16_ENC)
1591 break;
1592 }
1593 if (ptr + 2 == end)
1594 return XML_TOK_PARTIAL;
1595 if ((unsigned char)ptr[2] == 0xBF) {
1596 *nextTokPtr = ptr + 3;
1597 *encPtr = encodingTable[UTF_8_ENC];
1598 return XML_TOK_BOM;
1599 }
1600 break;
1601 default:
1602 if (ptr[0] == '\0') {
1603 /* 0 isn't a legal data character. Furthermore a document
1604 entity can only start with ASCII characters. So the only
1605 way this can fail to be big-endian UTF-16 if it it's an
1606 external parsed general entity that's labelled as
1607 UTF-16LE.
1608 */
1609 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1610 break;
1611 *encPtr = encodingTable[UTF_16BE_ENC];
1612 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1613 } else if (ptr[1] == '\0') {
1614 /* We could recover here in the case:
1615 - parsing an external entity
1616 - second byte is 0
1617 - no externally specified encoding
1618 - no encoding declaration
1619 by assuming UTF-16LE. But we don't, because this would mean when
1620 presented just with a single byte, we couldn't reliably determine
1621 whether we needed further bytes.
1622 */
1623 if (state == XML_CONTENT_STATE)
1624 break;
1625 *encPtr = encodingTable[UTF_16LE_ENC];
1626 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1627 }
1628 break;
1629 }
1630 }
1631 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1632 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1633 }
1634
/* First instantiation of xmltok_ns.c: NS() and ns() are identity macros, so
   names wrapped in them are left without a suffix.  XML_TOK_NS_C is defined
   only for the duration of the include. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1642
1643 #ifdef XML_NS
1644
/* Second instantiation of xmltok_ns.c: NS(x) pastes "NS" and ns(x) pastes
   "_ns" onto x, so names wrapped in them get a distinct suffix. */
# define NS(x) x##NS
# define ns(x) x##_ns

# define XML_TOK_NS_C
# include "xmltok_ns.c"
# undef XML_TOK_NS_C

# undef NS
# undef ns
1654
1655 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)1656 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1657 void *userData) {
1658 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1659 if (enc)
1660 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1661 return enc;
1662 }
1663
1664 #endif /* XML_NS */
1665