1 /*
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2002 Greg Stein <gstein@users.sourceforge.net>
13 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
14 Copyright (c) 2005-2009 Steven Solie <ssolie@users.sourceforge.net>
15 Copyright (c) 2016-2021 Sebastian Pipping <sebastian@pipping.org>
16 Copyright (c) 2016 Pascal Cuoq <cuoq@trust-in-soft.com>
17 Copyright (c) 2016 Don Lewis <truckman@apache.org>
18 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
19 Copyright (c) 2017 Alexander Bluhm <alexander.bluhm@gmx.net>
20 Copyright (c) 2017 Benbuck Nason <bnason@netflix.com>
21 Copyright (c) 2017 José Gutiérrez de la Concha <jose@zeroc.com>
22 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
23 Licensed under the MIT license:
24
25 Permission is hereby granted, free of charge, to any person obtaining
26 a copy of this software and associated documentation files (the
27 "Software"), to deal in the Software without restriction, including
28 without limitation the rights to use, copy, modify, merge, publish,
29 distribute, sublicense, and/or sell copies of the Software, and to permit
30 persons to whom the Software is furnished to do so, subject to the
31 following conditions:
32
33 The above copyright notice and this permission notice shall be included
34 in all copies or substantial portions of the Software.
35
36 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
37 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
38 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
39 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
40 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
41 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
42 USE OR OTHER DEALINGS IN THE SOFTWARE.
43 */
44
45 #ifdef _WIN32
46 # include "winconfig.h"
47 #endif
48
49 #include <expat_config.h>
50
51 #include <stddef.h>
52 #include <string.h> /* memcpy */
53 #include <stdbool.h>
54
55 #include "expat_external.h"
56 #include "internal.h"
57 #include "xmltok.h"
58 #include "nametab.h"
59
60 #ifdef XML_DTD
61 # define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
62 #else
63 # define IGNORE_SECTION_TOK_VTABLE /* as nothing */
64 #endif
65
66 #define VTABLE1 \
67 {PREFIX(prologTok), PREFIX(contentTok), \
68 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE}, \
69 {PREFIX(attributeValueTok), PREFIX(entityValueTok)}, \
70 PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS), \
71 PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName), \
72 PREFIX(updatePosition), PREFIX(isPublicId)
73
74 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
75
/* Test one UCS-2 character (high byte hi, low byte lo) against the naming
   bitmap.  pages[hi] selects a group of eight bitmap words, the top three
   bits of lo select the word inside that group and the low five bits of
   lo select the bit.  Evaluates to nonzero when the bit is set.
   Every parameter is parenthesized so that arbitrary expressions may be
   passed; note that lo is evaluated twice.
*/
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
78
/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  The top 3 of those bits index
   into pages, the next 3 bits are added to the (scaled) page value to
   pick a bitmap word, and the last 5 bits generate the mask.
   byte must point at unsigned char data; the result is nonzero when the
   character's bit is set in namingBitmap.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))
87
/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
   byte must point at unsigned char data; the result is nonzero when the
   character's bit is set in namingBitmap.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))
99
/* Length-dispatching naming lookup: n == 2 uses the 2-byte form, n == 3
   the 3-byte form, and any other n evaluates to 0.  p is reinterpreted
   as a pointer to unsigned char before the lookup.
*/
#define UTF8_GET_NAMING(pages, p, n)                                           \
  ((n) == 2                                                                    \
       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))                   \
       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
104
105 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
106 of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
107 with the additional restriction of not allowing the Unicode
108 code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
109 Implementation details:
110 (A & 0x80) == 0 means A < 0x80
111 and
112 (A & 0xC0) == 0xC0 means A > 0xBF
113 */
114
/* Nonzero when the 2-byte sequence at p (pointer to unsigned char) is not
   well-formed UTF-8: the lead byte is below 0xC2 (which excludes the
   overlong leads 0xC0 and 0xC1) or the second byte is not in 0x80..0xBF.
   p is parenthesized at every use so pointer expressions such as
   (buf + 1) are dereferenced correctly.
*/
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
117
/* Nonzero when the 3-byte sequence at p (pointer to unsigned char) is not
   acceptable:
     - the third byte is not in 0x80..0xBF, or, after the lead pair
       EF,BF, is above 0xBD (rejecting U+FFFE and U+FFFF);
     - after lead 0xE0 the second byte is outside 0xA0..0xBF (overlong);
     - after lead 0xED the second byte is outside 0x80..0x9F (surrogates);
     - after any other lead the second byte is outside 0x80..0xBF.
   p is parenthesized at every use so pointer expressions such as
   (buf + 1) are dereferenced correctly.
*/
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
126
/* Nonzero when the 4-byte sequence at p (pointer to unsigned char) is not
   well-formed UTF-8:
     - the third or fourth byte is outside 0x80..0xBF;
     - after lead 0xF0 the second byte is outside 0x90..0xBF (overlong);
     - after lead 0xF4 the second byte is outside 0x80..0x8F (> U+10FFFF);
     - after any other lead the second byte is outside 0x80..0xBF.
   p is parenthesized at every use so pointer expressions such as
   (buf + 1) are dereferenced correctly.
*/
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))
134
135 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)136 isNever(const ENCODING *enc, const char *p) {
137 UNUSED_P(enc);
138 UNUSED_P(p);
139 return 0;
140 }
141
142 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)143 utf8_isName2(const ENCODING *enc, const char *p) {
144 UNUSED_P(enc);
145 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
146 }
147
148 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)149 utf8_isName3(const ENCODING *enc, const char *p) {
150 UNUSED_P(enc);
151 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
152 }
153
154 #define utf8_isName4 isNever
155
156 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)157 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
158 UNUSED_P(enc);
159 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
160 }
161
162 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)163 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
164 UNUSED_P(enc);
165 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
166 }
167
168 #define utf8_isNmstrt4 isNever
169
170 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)171 utf8_isInvalid2(const ENCODING *enc, const char *p) {
172 UNUSED_P(enc);
173 return UTF8_INVALID2((const unsigned char *)p);
174 }
175
176 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)177 utf8_isInvalid3(const ENCODING *enc, const char *p) {
178 UNUSED_P(enc);
179 return UTF8_INVALID3((const unsigned char *)p);
180 }
181
182 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)183 utf8_isInvalid4(const ENCODING *enc, const char *p) {
184 UNUSED_P(enc);
185 return UTF8_INVALID4((const unsigned char *)p);
186 }
187
/* An ENCODING extended with a per-byte type table and the character
   predicates used by the tokenizer macros (BYTE_TYPE, IS_NAME_CHAR,
   IS_NMSTRT_CHAR, IS_INVALID_CHAR, ...).  Member order matters: the
   encoding tables in this file initialize the struct positionally. */
struct normal_encoding {
  /* Must stay the first member: AS_NORMAL_ENCODING() casts an
     ENCODING pointer to a pointer to this struct. */
  ENCODING enc;
  /* Byte type, indexed by the unsigned value of a byte. */
  unsigned char type[256];
#ifdef XML_MIN_SIZE
  /* In XML_MIN_SIZE builds the tokenizer macros dispatch through these
     pointers instead of being specialized per encoding. */
  int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
  int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
  int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
#endif /* XML_MIN_SIZE */
  /* Name-character tests for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
  /* Name-start-character tests for 2-, 3- and 4-byte sequences. */
  int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
  /* Malformed-sequence tests; NULL in encodings built with NULL_VTABLE. */
  int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
  int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
};
208
209 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
210
211 #ifdef XML_MIN_SIZE
212
213 # define STANDARD_VTABLE(E) \
214 E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,
215
216 #else
217
218 # define STANDARD_VTABLE(E) /* as nothing */
219
220 #endif
221
222 #define NORMAL_VTABLE(E) \
223 E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3, \
224 E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4
225
226 #define NULL_VTABLE \
227 /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL, \
228 /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL, \
229 /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL
230
231 static int FASTCALL checkCharRefNumber(int);
232
233 #include "xmltok_impl.h"
234 #include "ascii.h"
235
236 #ifdef XML_MIN_SIZE
237 # define sb_isNameMin isNever
238 # define sb_isNmstrtMin isNever
239 #endif
240
241 #ifdef XML_MIN_SIZE
242 # define MINBPC(enc) ((enc)->minBytesPerChar)
243 #else
244 /* minimum bytes per character */
245 # define MINBPC(enc) 1
246 #endif
247
248 #define SB_BYTE_TYPE(enc, p) \
249 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
250
251 #ifdef XML_MIN_SIZE
252 static int PTRFASTCALL
sb_byteType(const ENCODING * enc,const char * p)253 sb_byteType(const ENCODING *enc, const char *p) {
254 return SB_BYTE_TYPE(enc, p);
255 }
256 # define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
257 #else
258 # define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
259 #endif
260
261 #ifdef XML_MIN_SIZE
262 # define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
263 static int PTRFASTCALL
sb_byteToAscii(const ENCODING * enc,const char * p)264 sb_byteToAscii(const ENCODING *enc, const char *p) {
265 UNUSED_P(enc);
266 return *p;
267 }
268 #else
269 # define BYTE_TO_ASCII(enc, p) (*(p))
270 #endif
271
272 #define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
273 #define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))
274 #ifdef XML_MIN_SIZE
275 # define IS_INVALID_CHAR(enc, p, n) \
276 (AS_NORMAL_ENCODING(enc)->isInvalid##n \
277 && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
278 #else
279 # define IS_INVALID_CHAR(enc, p, n) \
280 (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
281 #endif
282
283 #ifdef XML_MIN_SIZE
284 # define IS_NAME_CHAR_MINBPC(enc, p) \
285 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
286 # define IS_NMSTRT_CHAR_MINBPC(enc, p) \
287 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
288 #else
289 # define IS_NAME_CHAR_MINBPC(enc, p) (0)
290 # define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
291 #endif
292
293 #ifdef XML_MIN_SIZE
294 # define CHAR_MATCHES(enc, p, c) \
295 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
296 static int PTRCALL
sb_charMatches(const ENCODING * enc,const char * p,int c)297 sb_charMatches(const ENCODING *enc, const char *p, int c) {
298 UNUSED_P(enc);
299 return *p == c;
300 }
301 #else
302 /* c is an ASCII character */
303 # define CHAR_MATCHES(enc, p, c) (*(p) == c)
304 #endif
305
306 #define PREFIX(ident) normal_##ident
307 #define XML_TOK_IMPL_C
308 #include "xmltok_impl.c"
309 #undef XML_TOK_IMPL_C
310
311 #undef MINBPC
312 #undef BYTE_TYPE
313 #undef BYTE_TO_ASCII
314 #undef CHAR_MATCHES
315 #undef IS_NAME_CHAR
316 #undef IS_NAME_CHAR_MINBPC
317 #undef IS_NMSTRT_CHAR
318 #undef IS_NMSTRT_CHAR_MINBPC
319 #undef IS_INVALID_CHAR
320
/* UTF8_cvalN is the value of the masked first byte of an N byte UTF-8
   sequence, i.e. the marker bits OR-ed into the lead byte when a
   sequence is generated. */
enum {
  UTF8_cval1 = 0x00, /* 0xxxxxxx */
  UTF8_cval2 = 0xc0, /* 110xxxxx */
  UTF8_cval3 = 0xe0, /* 1110xxxx */
  UTF8_cval4 = 0xf0  /* 11110xxx */
};
327
/* Moves *fromLimRef backwards (never before from) so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   The range is scanned from its end.  An ASCII byte stops the scan.  A
   lead byte stops the scan when enough bytes have been stepped over to
   complete its sequence; in that case the limit is placed right after
   the sequence.  A lead byte without enough following bytes is dropped,
   the step counter restarts and the scan continues.  Any other byte is
   simply stepped over.
*/
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *cursor = *fromLimRef;
  size_t stepped = 0;

  while (cursor > from) {
    const unsigned char byte = (unsigned char)cursor[-1];
    size_t seqLen;

    if ((byte & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    }

    if ((byte & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead byte 0b11110xxx */
    } else if ((byte & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead byte 0b1110xxxx */
    } else if ((byte & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead byte 0b110xxxxx */
    } else {
      seqLen = 0; /* not a lead byte */
    }

    if (seqLen != 0) {
      if (stepped + 1 >= seqLen) {
        /* The sequence is complete: keep all of it. */
        cursor += seqLen - 1;
        break;
      }
      stepped = 0; /* incomplete sequence: drop it and keep scanning */
    }

    cursor--;
    stepped++;
  }

  *fromLimRef = cursor;
}
366
367 static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)368 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
369 char **toP, const char *toLim) {
370 bool input_incomplete = false;
371 bool output_exhausted = false;
372
373 /* Avoid copying partial characters (due to limited space). */
374 const ptrdiff_t bytesAvailable = fromLim - *fromP;
375 const ptrdiff_t bytesStorable = toLim - *toP;
376 UNUSED_P(enc);
377 if (bytesAvailable > bytesStorable) {
378 fromLim = *fromP + bytesStorable;
379 output_exhausted = true;
380 }
381
382 /* Avoid copying partial characters (from incomplete input). */
383 {
384 const char *const fromLimBefore = fromLim;
385 _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
386 if (fromLim < fromLimBefore) {
387 input_incomplete = true;
388 }
389 }
390
391 {
392 const ptrdiff_t bytesToCopy = fromLim - *fromP;
393 memcpy(*toP, *fromP, bytesToCopy);
394 *fromP += bytesToCopy;
395 *toP += bytesToCopy;
396 }
397
398 if (output_exhausted) /* needs to go first */
399 return XML_CONVERT_OUTPUT_EXHAUSTED;
400 else if (input_incomplete)
401 return XML_CONVERT_INPUT_INCOMPLETE;
402 else
403 return XML_CONVERT_COMPLETED;
404 }
405
406 static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)407 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
408 unsigned short **toP, const unsigned short *toLim) {
409 enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
410 unsigned short *to = *toP;
411 const char *from = *fromP;
412 while (from < fromLim && to < toLim) {
413 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
414 case BT_LEAD2:
415 if (fromLim - from < 2) {
416 res = XML_CONVERT_INPUT_INCOMPLETE;
417 goto after;
418 }
419 *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
420 from += 2;
421 break;
422 case BT_LEAD3:
423 if (fromLim - from < 3) {
424 res = XML_CONVERT_INPUT_INCOMPLETE;
425 goto after;
426 }
427 *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
428 | (from[2] & 0x3f));
429 from += 3;
430 break;
431 case BT_LEAD4: {
432 unsigned long n;
433 if (toLim - to < 2) {
434 res = XML_CONVERT_OUTPUT_EXHAUSTED;
435 goto after;
436 }
437 if (fromLim - from < 4) {
438 res = XML_CONVERT_INPUT_INCOMPLETE;
439 goto after;
440 }
441 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
442 | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
443 n -= 0x10000;
444 to[0] = (unsigned short)((n >> 10) | 0xD800);
445 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
446 to += 2;
447 from += 4;
448 } break;
449 default:
450 *to++ = *from++;
451 break;
452 }
453 }
454 if (from < fromLim)
455 res = XML_CONVERT_OUTPUT_EXHAUSTED;
456 after:
457 *fromP = from;
458 *toP = to;
459 return res;
460 }
461
462 #ifdef XML_NS
/* UTF-8 encoding table used when XML_NS is defined.  The byte-type
   table is assembled from asciitab.h and utf8tab.h with BT_COLON left
   as defined, and the multi-byte predicates are the utf8_ functions.
   NOTE(review): the trailing 1, 1, 0 are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against ENCODING in xmltok.h. */
static const struct normal_encoding utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
470 #endif
471
/* UTF-8 encoding table.  While asciitab.h is included BT_COLON is
   redefined as BT_NMSTRT, so entries written as BT_COLON in that header
   become name-start entries here.  The multi-byte predicates are the
   utf8_ functions.
   NOTE(review): the trailing 1, 1, 0 are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against ENCODING in xmltok.h. */
static const struct normal_encoding utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
481
482 #ifdef XML_NS
483
/* Internal UTF-8 encoding table used when XML_NS is defined.  Same
   converters and predicates as the other UTF-8 tables, but the low half
   of the byte-type table comes from iasciitab.h instead of asciitab.h
   (how the two headers differ is not visible in this file). */
static const struct normal_encoding internal_utf8_encoding_ns
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#  include "iasciitab.h"
#  include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
491
492 #endif
493
/* Internal UTF-8 encoding table.  The byte-type table is built from
   iasciitab.h (with BT_COLON temporarily redefined as BT_NMSTRT) and
   utf8tab.h; the multi-byte predicates are the utf8_ functions. */
static const struct normal_encoding internal_utf8_encoding
    = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "iasciitab.h"
#undef BT_COLON
#include "utf8tab.h"
       },
       STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
503
504 static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)505 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
506 char **toP, const char *toLim) {
507 UNUSED_P(enc);
508 for (;;) {
509 unsigned char c;
510 if (*fromP == fromLim)
511 return XML_CONVERT_COMPLETED;
512 c = (unsigned char)**fromP;
513 if (c & 0x80) {
514 if (toLim - *toP < 2)
515 return XML_CONVERT_OUTPUT_EXHAUSTED;
516 *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
517 *(*toP)++ = (char)((c & 0x3f) | 0x80);
518 (*fromP)++;
519 } else {
520 if (*toP == toLim)
521 return XML_CONVERT_OUTPUT_EXHAUSTED;
522 *(*toP)++ = *(*fromP)++;
523 }
524 }
525 }
526
527 static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)528 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
529 unsigned short **toP, const unsigned short *toLim) {
530 UNUSED_P(enc);
531 while (*fromP < fromLim && *toP < toLim)
532 *(*toP)++ = (unsigned char)*(*fromP)++;
533
534 if ((*toP == toLim) && (*fromP < fromLim))
535 return XML_CONVERT_OUTPUT_EXHAUSTED;
536 else
537 return XML_CONVERT_COMPLETED;
538 }
539
540 #ifdef XML_NS
541
/* Latin-1 encoding table used when XML_NS is defined.  The byte-type
   table comes from asciitab.h and latin1tab.h; the multi-byte predicate
   slots are all NULL (NULL_VTABLE).
   NOTE(review): the trailing 1, 0, 0 are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against ENCODING in xmltok.h. */
static const struct normal_encoding latin1_encoding_ns
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
549
550 #endif
551
/* Latin-1 encoding table.  The byte-type table comes from asciitab.h
   (with BT_COLON temporarily redefined as BT_NMSTRT) and latin1tab.h;
   the multi-byte predicate slots are all NULL (NULL_VTABLE). */
static const struct normal_encoding latin1_encoding
    = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
561
562 static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)563 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
564 char **toP, const char *toLim) {
565 UNUSED_P(enc);
566 while (*fromP < fromLim && *toP < toLim)
567 *(*toP)++ = *(*fromP)++;
568
569 if ((*toP == toLim) && (*fromP < fromLim))
570 return XML_CONVERT_OUTPUT_EXHAUSTED;
571 else
572 return XML_CONVERT_COMPLETED;
573 }
574
575 #ifdef XML_NS
576
/* US-ASCII encoding table used when XML_NS is defined.  Only asciitab.h
   contributes byte types; the entries it does not cover are
   zero-initialized, which is BT_NONXML.  UTF-16 output reuses
   latin1_toUtf16; the multi-byte predicate slots are NULL. */
static const struct normal_encoding ascii_encoding_ns
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#  include "asciitab.h"
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
584
585 #endif
586
/* US-ASCII encoding table.  Only asciitab.h (with BT_COLON temporarily
   redefined as BT_NMSTRT) contributes byte types; the entries it does
   not cover are zero-initialized, which is BT_NONXML.  UTF-16 output
   reuses latin1_toUtf16; the multi-byte predicate slots are NULL. */
static const struct normal_encoding ascii_encoding
    = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
           /* BT_NONXML == 0 */
       },
       STANDARD_VTABLE(sb_) NULL_VTABLE};
596
597 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)598 unicode_byte_type(char hi, char lo) {
599 switch ((unsigned char)hi) {
600 /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
601 case 0xD8:
602 case 0xD9:
603 case 0xDA:
604 case 0xDB:
605 return BT_LEAD4;
606 /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
607 case 0xDC:
608 case 0xDD:
609 case 0xDE:
610 case 0xDF:
611 return BT_TRAIL;
612 case 0xFF:
613 switch ((unsigned char)lo) {
614 case 0xFF: /* noncharacter-FFFF */
615 case 0xFE: /* noncharacter-FFFE */
616 return BT_NONXML;
617 }
618 break;
619 }
620 return BT_NONASCII;
621 }
622
/* Generates the function E##toUtf8, which converts UTF-16 input to
   UTF-8.  The byte order is supplied by the GET_LO / GET_HI macros in
   effect where this macro is expanded.

   The input length is first rounded down to an even number of bytes.
   Each code unit is then encoded according to its high byte:
     - 0 with a low byte below 0x80: one output byte;
     - 0x00..0x07 otherwise: two output bytes;
     - 0xD8..0xDB (high surrogate): four output bytes, consuming the
       following code unit as well; requires four input bytes;
     - anything else (this includes 0xDC..0xDF): three output bytes.
   When the output has no room for the next character, *fromP is set to
   that character and XML_CONVERT_OUTPUT_EXHAUSTED is returned; a high
   surrogate without a following unit returns
   XML_CONVERT_INPUT_INCOMPLETE the same way.
   NOTE(review): after the loop `from` is never below the trimmed
   fromLim, so the final XML_CONVERT_INPUT_INCOMPLETE branch looks
   unreachable -- confirm.
*/
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }
699
/* Generates the function E##toUtf16, which copies UTF-16 input (byte
   order given by the GET_LO / GET_HI macros in effect at expansion)
   into an array of unsigned short code units.

   The input length is first rounded down to an even number of bytes.
   If the input holds more code units than the output can take and the
   high byte of the last input code unit is in 0xD8..0xDF, that unit is
   excluded and the pending result becomes XML_CONVERT_INPUT_INCOMPLETE.
   Code units are then copied until the input or the output runs out.
   Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while
   input remains, otherwise the pending result.
*/
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }
720
/* Little-endian byte accessors (low byte first), valid only while the
   little2_ converters are being generated. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])

/* Defines little2_toUtf8 and little2_toUtf16. */
DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)

#undef SET2
#undef GET_LO
#undef GET_HI
731
/* Big-endian byte accessors (high byte first), valid only while the
   big2_ converters are being generated. */
#define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])

/* Defines big2_toUtf8 and big2_toUtf16. */
DEFINE_UTF16_TO_UTF8(big2_)
DEFINE_UTF16_TO_UTF16(big2_)

#undef SET2
#undef GET_LO
#undef GET_HI
742
/* Character helpers for little-endian UTF-16: p points at the low byte
   of a code unit, p[1] is its high byte.  All parameters are
   parenthesized so that arbitrary expressions may be passed.

   LITTLE2_BYTE_TYPE            byte type: the encoding's type table for
                                code units below 0x100, otherwise
                                unicode_byte_type().
   LITTLE2_BYTE_TO_ASCII        the low byte when the high byte is 0,
                                otherwise -1.
   LITTLE2_CHAR_MATCHES         nonzero when the code unit equals c.
   LITTLE2_IS_NAME_CHAR_MINBPC  naming-bitmap lookup via namePages.
   LITTLE2_IS_NMSTRT_CHAR_MINBPC naming-bitmap lookup via nmstrtPages.
*/
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)*(p)]            \
               : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
752
753 #ifdef XML_MIN_SIZE
754
755 static int PTRFASTCALL
756 little2_byteType(const ENCODING *enc, const char *p) {
757 return LITTLE2_BYTE_TYPE(enc, p);
758 }
759
760 static int PTRFASTCALL
little2_byteToAscii(const ENCODING * enc,const char * p)761 little2_byteToAscii(const ENCODING *enc, const char *p) {
762 UNUSED_P(enc);
763 return LITTLE2_BYTE_TO_ASCII(p);
764 }
765
766 static int PTRCALL
little2_charMatches(const ENCODING * enc,const char * p,int c)767 little2_charMatches(const ENCODING *enc, const char *p, int c) {
768 UNUSED_P(enc);
769 return LITTLE2_CHAR_MATCHES(p, c);
770 }
771
772 static int PTRFASTCALL
little2_isNameMin(const ENCODING * enc,const char * p)773 little2_isNameMin(const ENCODING *enc, const char *p) {
774 UNUSED_P(enc);
775 return LITTLE2_IS_NAME_CHAR_MINBPC(p);
776 }
777
778 static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING * enc,const char * p)779 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
780 UNUSED_P(enc);
781 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
782 }
783
784 # undef VTABLE
785 # define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
786
787 #else /* not XML_MIN_SIZE */
788
789 # undef PREFIX
790 # define PREFIX(ident) little2_##ident
791 # define MINBPC(enc) 2
792 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
793 # define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
794 # define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
795 # define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
796 # define IS_NAME_CHAR(enc, p, n) 0
797 # define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
798 # define IS_NMSTRT_CHAR(enc, p, n) (0)
799 # define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
800
801 # define XML_TOK_IMPL_C
802 # include "xmltok_impl.c"
803 # undef XML_TOK_IMPL_C
804
805 # undef MINBPC
806 # undef BYTE_TYPE
807 # undef BYTE_TO_ASCII
808 # undef CHAR_MATCHES
809 # undef IS_NAME_CHAR
810 # undef IS_NAME_CHAR_MINBPC
811 # undef IS_NMSTRT_CHAR
812 # undef IS_NMSTRT_CHAR_MINBPC
813 # undef IS_INVALID_CHAR
814
815 #endif /* not XML_MIN_SIZE */
816
817 #ifdef XML_NS
818
/* Little-endian UTF-16 encoding table used when XML_NS is defined.
   The byte-type table (asciitab.h + latin1tab.h) classifies code units
   whose high byte is zero; the multi-byte predicate slots are NULL.
   The last ENCODING initializer is 1 only when BYTEORDER == 1234.
   NOTE(review): the trailing values are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against ENCODING in xmltok.h. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
832
833 #endif
834
/* Little-endian UTF-16 encoding table.  The byte-type table is built
   from asciitab.h (with BT_COLON temporarily redefined as BT_NMSTRT)
   and latin1tab.h; the multi-byte predicate slots are NULL.  The last
   ENCODING initializer is 1 only when BYTEORDER == 1234. */
static const struct normal_encoding little2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 1234
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
850
851 #if BYTEORDER != 4321
852
853 # ifdef XML_NS
854
/* Internal little-endian UTF-16 encoding table used when XML_NS is
   defined (this section is compiled only when BYTEORDER != 4321).  The
   byte-type table comes from iasciitab.h and latin1tab.h and the last
   ENCODING initializer is always 1. */
static const struct normal_encoding internal_little2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
862
863 # endif
864
/* Internal little-endian UTF-16 encoding table (this section is
   compiled only when BYTEORDER != 4321).  The byte-type table comes
   from iasciitab.h (with BT_COLON temporarily redefined as BT_NMSTRT)
   and latin1tab.h; the last ENCODING initializer is always 1. */
static const struct normal_encoding internal_little2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};
874
875 #endif
876
/* Character helpers for big-endian UTF-16: p points at the high byte of
   a code unit, p[1] is its low byte.  All parameters are parenthesized
   so that arbitrary expressions may be passed.

   BIG2_BYTE_TYPE             byte type: the encoding's type table for
                              code units below 0x100, otherwise
                              unicode_byte_type().
   BIG2_BYTE_TO_ASCII         the low byte when the high byte is 0,
                              otherwise -1.
   BIG2_CHAR_MATCHES          nonzero when the code unit equals c.
   BIG2_IS_NAME_CHAR_MINBPC   naming-bitmap lookup via namePages.
   BIG2_IS_NMSTRT_CHAR_MINBPC naming-bitmap lookup via nmstrtPages.
*/
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0 ? AS_NORMAL_ENCODING(enc)->type[(unsigned char)(p)[1]]          \
               : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
887
888 #ifdef XML_MIN_SIZE
889
890 static int PTRFASTCALL
big2_byteType(const ENCODING * enc,const char * p)891 big2_byteType(const ENCODING *enc, const char *p) {
892 return BIG2_BYTE_TYPE(enc, p);
893 }
894
895 static int PTRFASTCALL
big2_byteToAscii(const ENCODING * enc,const char * p)896 big2_byteToAscii(const ENCODING *enc, const char *p) {
897 UNUSED_P(enc);
898 return BIG2_BYTE_TO_ASCII(p);
899 }
900
901 static int PTRCALL
big2_charMatches(const ENCODING * enc,const char * p,int c)902 big2_charMatches(const ENCODING *enc, const char *p, int c) {
903 UNUSED_P(enc);
904 return BIG2_CHAR_MATCHES(p, c);
905 }
906
907 static int PTRFASTCALL
big2_isNameMin(const ENCODING * enc,const char * p)908 big2_isNameMin(const ENCODING *enc, const char *p) {
909 UNUSED_P(enc);
910 return BIG2_IS_NAME_CHAR_MINBPC(p);
911 }
912
913 static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING * enc,const char * p)914 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
915 UNUSED_P(enc);
916 return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
917 }
918
919 # undef VTABLE
920 # define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
921
922 #else /* not XML_MIN_SIZE */
923
924 # undef PREFIX
925 # define PREFIX(ident) big2_##ident
926 # define MINBPC(enc) 2
927 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
928 # define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
929 # define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
930 # define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
931 # define IS_NAME_CHAR(enc, p, n) 0
932 # define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
933 # define IS_NMSTRT_CHAR(enc, p, n) (0)
934 # define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
935
936 # define XML_TOK_IMPL_C
937 # include "xmltok_impl.c"
938 # undef XML_TOK_IMPL_C
939
940 # undef MINBPC
941 # undef BYTE_TYPE
942 # undef BYTE_TO_ASCII
943 # undef CHAR_MATCHES
944 # undef IS_NAME_CHAR
945 # undef IS_NAME_CHAR_MINBPC
946 # undef IS_NMSTRT_CHAR
947 # undef IS_NMSTRT_CHAR_MINBPC
948 # undef IS_INVALID_CHAR
949
950 #endif /* not XML_MIN_SIZE */
951
952 #ifdef XML_NS
953
/* Big-endian UTF-16 encoding table used when XML_NS is defined.  The
   byte-type table (asciitab.h + latin1tab.h) classifies code units
   whose high byte is zero; the multi-byte predicate slots are NULL.
   The last ENCODING initializer is 1 only when BYTEORDER == 4321.
   NOTE(review): the trailing values are presumably minBytesPerChar,
   isUtf8 and isUtf16 -- confirm against ENCODING in xmltok.h. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
967
968 #endif
969
/* Big-endian UTF-16 encoding table.  The byte-type table is built from
   asciitab.h (with BT_COLON temporarily redefined as BT_NMSTRT) and
   latin1tab.h; the multi-byte predicate slots are NULL.  The last
   ENCODING initializer is 1 only when BYTEORDER == 4321. */
static const struct normal_encoding big2_encoding
    = {{VTABLE, 2, 0,
#if BYTEORDER == 4321
        1
#else
        0
#endif
       },
       {
#define BT_COLON BT_NMSTRT
#include "asciitab.h"
#undef BT_COLON
#include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
985
986 #if BYTEORDER != 1234
987
988 # ifdef XML_NS
989
/* Internal big-endian UTF-16 encoding table used when XML_NS is defined
   (this section is compiled only when BYTEORDER != 1234).  The
   byte-type table comes from iasciitab.h and latin1tab.h and the last
   ENCODING initializer is always 1. */
static const struct normal_encoding internal_big2_encoding_ns
    = {{VTABLE, 2, 0, 1},
       {
#    include "iasciitab.h"
#    include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
997
998 # endif
999
/* Internal big-endian UTF-16 encoding table (this section is compiled
   only when BYTEORDER != 1234).  The byte-type table comes from
   iasciitab.h (with BT_COLON temporarily redefined as BT_NMSTRT) and
   latin1tab.h; the last ENCODING initializer is always 1. */
static const struct normal_encoding internal_big2_encoding
    = {{VTABLE, 2, 0, 1},
       {
#  define BT_COLON BT_NMSTRT
#  include "iasciitab.h"
#  undef BT_COLON
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};
1009
1010 #endif
1011
1012 #undef PREFIX
1013
1014 static int FASTCALL
streqci(const char * s1,const char * s2)1015 streqci(const char *s1, const char *s2) {
1016 for (;;) {
1017 char c1 = *s1++;
1018 char c2 = *s2++;
1019 if (ASCII_a <= c1 && c1 <= ASCII_z)
1020 c1 += ASCII_A - ASCII_a;
1021 if (ASCII_a <= c2 && c2 <= ASCII_z)
1022 /* The following line will never get executed. streqci() is
1023 * only called from two places, both of which guarantee to put
1024 * upper-case strings into s2.
1025 */
1026 c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1027 if (c1 != c2)
1028 return 0;
1029 if (! c1)
1030 break;
1031 }
1032 return 1;
1033 }
1034
1035 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)1036 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1037 POSITION *pos) {
1038 UNUSED_P(enc);
1039 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1040 }
1041
1042 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)1043 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1044 char buf[1];
1045 char *p = buf;
1046 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1047 if (p == buf)
1048 return -1;
1049 else
1050 return buf[0];
1051 }
1052
1053 static int FASTCALL
isSpace(int c)1054 isSpace(int c) {
1055 switch (c) {
1056 case 0x20:
1057 case 0xD:
1058 case 0xA:
1059 case 0x9:
1060 return 1;
1061 }
1062 return 0;
1063 }
1064
/* Parse one pseudo-attribute of the form  S name S? '=' S? quoted-value
   from the range [ptr, end).

   Return 1 if there's just optional white space or there's an S
   followed by name=val; return 0 on a syntax error.

   Outputs:
   - Nothing but white space (or ptr == end): *namePtr is set to NULL and
     the other outputs, including *nextTokPtr, are left untouched.
   - A complete name=val: *namePtr / *nameEndPtr delimit the name, *valPtr
     points at the first character after the opening quote, and
     *nextTokPtr points just past the closing quote.
   - Error: *nextTokPtr points at the offending position.

   The pointer is advanced by enc->minBytesPerChar per character, and every
   character is inspected through toAscii(), which yields -1 when it cannot
   deliver a single byte.
*/
static int
parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
                     const char **namePtr, const char **nameEndPtr,
                     const char **valPtr, const char **nextTokPtr) {
  int c;
  char open;
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  /* Anything present must start with at least one white space character. */
  if (! isSpace(toAscii(enc, ptr, end))) {
    *nextTokPtr = ptr;
    return 0;
  }
  do {
    ptr += enc->minBytesPerChar;
  } while (isSpace(toAscii(enc, ptr, end)));
  /* Only white space was left: report "no attribute" via a NULL name. */
  if (ptr == end) {
    *namePtr = NULL;
    return 1;
  }
  *namePtr = ptr;
  /* Scan the name up to '=' or up to white space that is followed by '='. */
  for (;;) {
    c = toAscii(enc, ptr, end);
    if (c == -1) {
      *nextTokPtr = ptr;
      return 0;
    }
    if (c == ASCII_EQUALS) {
      *nameEndPtr = ptr;
      break;
    }
    if (isSpace(c)) {
      *nameEndPtr = ptr;
      do {
        ptr += enc->minBytesPerChar;
      } while (isSpace(c = toAscii(enc, ptr, end)));
      if (c != ASCII_EQUALS) {
        *nextTokPtr = ptr;
        return 0;
      }
      break;
    }
    ptr += enc->minBytesPerChar;
  }
  /* ptr is on the '=' here; an empty name is an error. */
  if (ptr == *namePtr) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Step over '=' and any white space before the opening quote. */
  ptr += enc->minBytesPerChar;
  c = toAscii(enc, ptr, end);
  while (isSpace(c)) {
    ptr += enc->minBytesPerChar;
    c = toAscii(enc, ptr, end);
  }
  if (c != ASCII_QUOT && c != ASCII_APOS) {
    *nextTokPtr = ptr;
    return 0;
  }
  /* Remember which quote opened the value; the same one must close it. */
  open = (char)c;
  ptr += enc->minBytesPerChar;
  *valPtr = ptr;
  /* The value may only contain letters, digits, '.', '-' and '_'.  A -1
     from toAscii() fails this test too, so the loop cannot run past end. */
  for (;; ptr += enc->minBytesPerChar) {
    c = toAscii(enc, ptr, end);
    if (c == open)
      break;
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
        && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
        && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
      *nextTokPtr = ptr;
      return 0;
    }
  }
  *nextTokPtr = ptr + enc->minBytesPerChar;
  return 1;
}
1144
/* NUL-terminated keyword spellings built from ASCII_* constants.
   doParseXmlDecl() compares pseudo-attribute names and the standalone
   value against them with XmlNameMatchesAscii(). */
static const char KW_version[]
    = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};

static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
                                   ASCII_i, ASCII_n, ASCII_g, '\0'};

static const char KW_standalone[]
    = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
       ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};

/* The two values accepted for the standalone pseudo-attribute. */
static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};

static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1158
/* Parse the pseudo-attributes (version, encoding, standalone) of the
   declaration spanning [ptr, end).

   encodingFinder       called as encodingFinder(enc, val, valEnd) to map
                        the encoding name to an ENCODING; used only when
                        encoding is non-NULL.
   isGeneralTextEntity  when nonzero, version may be omitted, an encoding
                        pseudo-attribute is mandatory and standalone is
                        rejected.
   enc                  encoding used to read the declaration.
   ptr, end             the declaration; the first 5 and the last 2
                        characters are skipped (NOTE(review): presumably
                        the "<?xml" and "?>" delimiters -- confirm with
                        the caller).
   badPtr               receives the error position when 0 is returned;
                        written without a NULL check.
   versionPtr, versionEndPtr, encodingName, encoding, standalone
                        optional outputs, each written only when non-NULL
                        and when the matching pseudo-attribute is present.
                        *versionEndPtr is the position just past the
                        closing quote of the version value; *standalone
                        becomes 1 for "yes" and 0 for "no".

   Returns 1 on success and 0 on a malformed declaration.
*/
static int
doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
                                                 const char *),
               int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
               const char *end, const char **badPtr, const char **versionPtr,
               const char **versionEndPtr, const char **encodingName,
               const ENCODING **encoding, int *standalone) {
  const char *val = NULL;
  const char *name = NULL;
  const char *nameEnd = NULL;
  ptr += 5 * enc->minBytesPerChar;
  end -= 2 * enc->minBytesPerChar;
  /* At least one pseudo-attribute is required. */
  if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
      || ! name) {
    *badPtr = ptr;
    return 0;
  }
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
    /* Only a general text entity may start with something other than
       version; name/val are then re-examined by the encoding check. */
    if (! isGeneralTextEntity) {
      *badPtr = name;
      return 0;
    }
  } else {
    if (versionPtr)
      *versionPtr = val;
    if (versionEndPtr)
      *versionEndPtr = ptr;
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name) {
      if (isGeneralTextEntity) {
        /* a TextDecl must have an EncodingDecl */
        *badPtr = ptr;
        return 0;
      }
      return 1;
    }
  }
  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
    /* The encoding name has to start with an ASCII letter. */
    int c = toAscii(enc, val, end);
    if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
      *badPtr = val;
      return 0;
    }
    if (encodingName)
      *encodingName = val;
    /* ptr is just past the closing quote, so one character back is the
       end of the value. */
    if (encoding)
      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
    if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
      *badPtr = ptr;
      return 0;
    }
    if (! name)
      return 1;
  }
  /* Whatever is left must be standalone, which a text entity cannot have. */
  if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
      || isGeneralTextEntity) {
    *badPtr = name;
    return 0;
  }
  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
    if (standalone)
      *standalone = 1;
  } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
    if (standalone)
      *standalone = 0;
  } else {
    *badPtr = val;
    return 0;
  }
  /* Only white space may follow the standalone pseudo-attribute. */
  while (isSpace(toAscii(enc, ptr, end)))
    ptr += enc->minBytesPerChar;
  if (ptr != end) {
    *badPtr = ptr;
    return 0;
  }
  return 1;
}
1239
1240 static int FASTCALL
checkCharRefNumber(int result)1241 checkCharRefNumber(int result) {
1242 switch (result >> 8) {
1243 case 0xD8:
1244 case 0xD9:
1245 case 0xDA:
1246 case 0xDB:
1247 case 0xDC:
1248 case 0xDD:
1249 case 0xDE:
1250 case 0xDF:
1251 return -1;
1252 case 0:
1253 if (latin1_encoding.type[result] == BT_NONXML)
1254 return -1;
1255 break;
1256 case 0xFF:
1257 if (result == 0xFFFE || result == 0xFFFF)
1258 return -1;
1259 break;
1260 }
1261 return result;
1262 }
1263
1264 int FASTCALL
XmlUtf8Encode(int c,char * buf)1265 XmlUtf8Encode(int c, char *buf) {
1266 enum {
1267 /* minN is minimum legal resulting value for N byte sequence */
1268 min2 = 0x80,
1269 min3 = 0x800,
1270 min4 = 0x10000
1271 };
1272
1273 if (c < 0)
1274 return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1275 if (c < min2) {
1276 buf[0] = (char)(c | UTF8_cval1);
1277 return 1;
1278 }
1279 if (c < min3) {
1280 buf[0] = (char)((c >> 6) | UTF8_cval2);
1281 buf[1] = (char)((c & 0x3f) | 0x80);
1282 return 2;
1283 }
1284 if (c < min4) {
1285 buf[0] = (char)((c >> 12) | UTF8_cval3);
1286 buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1287 buf[2] = (char)((c & 0x3f) | 0x80);
1288 return 3;
1289 }
1290 if (c < 0x110000) {
1291 buf[0] = (char)((c >> 18) | UTF8_cval4);
1292 buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1293 buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1294 buf[3] = (char)((c & 0x3f) | 0x80);
1295 return 4;
1296 }
1297 return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1298 }
1299
1300 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1301 XmlUtf16Encode(int charNum, unsigned short *buf) {
1302 if (charNum < 0)
1303 return 0;
1304 if (charNum < 0x10000) {
1305 buf[0] = (unsigned short)charNum;
1306 return 1;
1307 }
1308 if (charNum < 0x110000) {
1309 charNum -= 0x10000;
1310 buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1311 buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1312 return 2;
1313 }
1314 return 0;
1315 }
1316
/* State of a user-defined encoding, filled in by XmlInitUnknownEncoding().
   The normal_encoding member comes first so that a pointer to the embedded
   ENCODING can be cast back to this structure (see AS_UNKNOWN_ENCODING). */
struct unknown_encoding {
  struct normal_encoding normal;
  /* Callback invoked as convert(userData, p) for bytes that have no cached
     translation; may be null when the table has no multi-byte entries. */
  CONVERTER convert;
  void *userData;
  /* Per input byte: the UTF-16 unit to emit, or 0 meaning "call convert". */
  unsigned short utf16[256];
  /* Per input byte: [0] is the UTF-8 length (0 meaning "call convert"),
     followed by that many UTF-8 bytes. */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1326
1327 int
XmlSizeOfUnknownEncoding(void)1328 XmlSizeOfUnknownEncoding(void) {
1329 return sizeof(struct unknown_encoding);
1330 }
1331
1332 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1333 unknown_isName(const ENCODING *enc, const char *p) {
1334 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1335 int c = uenc->convert(uenc->userData, p);
1336 if (c & ~0xFFFF)
1337 return 0;
1338 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1339 }
1340
1341 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1342 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1343 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1344 int c = uenc->convert(uenc->userData, p);
1345 if (c & ~0xFFFF)
1346 return 0;
1347 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1348 }
1349
1350 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1351 unknown_isInvalid(const ENCODING *enc, const char *p) {
1352 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353 int c = uenc->convert(uenc->userData, p);
1354 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1355 }
1356
1357 static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1358 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1359 char **toP, const char *toLim) {
1360 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1361 char buf[XML_UTF8_ENCODE_MAX];
1362 for (;;) {
1363 const char *utf8;
1364 int n;
1365 if (*fromP == fromLim)
1366 return XML_CONVERT_COMPLETED;
1367 utf8 = uenc->utf8[(unsigned char)**fromP];
1368 n = *utf8++;
1369 if (n == 0) {
1370 int c = uenc->convert(uenc->userData, *fromP);
1371 n = XmlUtf8Encode(c, buf);
1372 if (n > toLim - *toP)
1373 return XML_CONVERT_OUTPUT_EXHAUSTED;
1374 utf8 = buf;
1375 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1376 - (BT_LEAD2 - 2));
1377 } else {
1378 if (n > toLim - *toP)
1379 return XML_CONVERT_OUTPUT_EXHAUSTED;
1380 (*fromP)++;
1381 }
1382 memcpy(*toP, utf8, n);
1383 *toP += n;
1384 }
1385 }
1386
1387 static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1388 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1389 unsigned short **toP, const unsigned short *toLim) {
1390 const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1391 while (*fromP < fromLim && *toP < toLim) {
1392 unsigned short c = uenc->utf16[(unsigned char)**fromP];
1393 if (c == 0) {
1394 c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1395 *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1396 - (BT_LEAD2 - 2));
1397 } else
1398 (*fromP)++;
1399 *(*toP)++ = c;
1400 }
1401
1402 if ((*toP == toLim) && (*fromP < fromLim))
1403 return XML_CONVERT_OUTPUT_EXHAUSTED;
1404 else
1405 return XML_CONVERT_COMPLETED;
1406 }
1407
/* Build a user-defined encoding in the caller-supplied memory block.

   mem       storage for a struct unknown_encoding (see
             XmlSizeOfUnknownEncoding()); it is overwritten even when the
             function fails.
   table     256 entries, one per input byte:
               >= 0       the character number the byte stands for
               -1         the byte is malformed
               -2 .. -4   the byte starts a sequence of 2 .. 4 bytes that
                          has to be translated by convert
   convert   callback for multi-byte sequences; may be null when the table
             contains no -2 .. -4 entries.
   userData  passed through to convert.

   Returns the embedded ENCODING, or 0 when the table is rejected: an entry
   below -4, a multi-byte entry without a converter, a character number
   above 0xFFFF, or a remapping of a byte/character below 0x80 whose
   latin1_encoding type is neither BT_OTHER nor BT_NONXML.
*/
ENCODING *
XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
                       void *userData) {
  int i;
  struct unknown_encoding *e = (struct unknown_encoding *)mem;
  /* Start from latin1_encoding; the per-byte types are overwritten below. */
  memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
  /* Bytes below 0x80 with a type other than BT_OTHER/BT_NONXML must map to
     themselves. */
  for (i = 0; i < 128; i++)
    if (latin1_encoding.type[i] != BT_OTHER
        && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
      return 0;
  for (i = 0; i < 256; i++) {
    int c = table[i];
    if (c == -1) {
      e->normal.type[i] = BT_MALFORM;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else if (c < 0) {
      if (c < -4)
        return 0;
      /* Multi-byte sequences need a converter function */
      if (! convert)
        return 0;
      /* -2 selects BT_LEAD2, -3 and -4 the two values following it. */
      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
      /* Zero in both caches tells the converters to call convert. */
      e->utf8[i][0] = 0;
      e->utf16[i] = 0;
    } else if (c < 0x80) {
      /* The target character must be BT_OTHER/BT_NONXML in latin1_encoding
         unless the byte maps to itself. */
      if (latin1_encoding.type[c] != BT_OTHER
          && latin1_encoding.type[c] != BT_NONXML && c != i)
        return 0;
      e->normal.type[i] = latin1_encoding.type[c];
      e->utf8[i][0] = 1;
      e->utf8[i][1] = (char)c;
      /* 0 is reserved as the "call convert" marker, so character 0 is
         stored as 0xFFFF. */
      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
    } else if (checkCharRefNumber(c) < 0) {
      e->normal.type[i] = BT_NONXML;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    } else {
      if (c > 0xFFFF)
        return 0;
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NMSTRT;
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NAME;
      else
        e->normal.type[i] = BT_OTHER;
      /* Cache the UTF-8 form: length in [0], bytes from [1] onwards. */
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
      e->utf16[i] = (unsigned short)c;
    }
  }
  e->userData = userData;
  e->convert = convert;
  /* With a converter, all multi-byte classification goes through it. */
  if (convert) {
    e->normal.isName2 = unknown_isName;
    e->normal.isName3 = unknown_isName;
    e->normal.isName4 = unknown_isName;
    e->normal.isNmstrt2 = unknown_isNmstrt;
    e->normal.isNmstrt3 = unknown_isNmstrt;
    e->normal.isNmstrt4 = unknown_isNmstrt;
    e->normal.isInvalid2 = unknown_isInvalid;
    e->normal.isInvalid3 = unknown_isInvalid;
    e->normal.isInvalid4 = unknown_isInvalid;
  }
  e->normal.enc.utf8Convert = unknown_toUtf8;
  e->normal.enc.utf16Convert = unknown_toUtf16;
  return &(e->normal.enc);
}
1479
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed. */
enum {
  /* returned by getEncodingIndex() when the name matches no entry */
  UNKNOWN_ENC = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  /* returned by getEncodingIndex() when no name is given (NULL) */
  NO_ENC
};
1493
/* NUL-terminated names of the built-in encodings, built from ASCII_*
   constants.  getEncodingIndex() compares candidate names against them
   with streqci(), i.e. without regard to case. */
static const char KW_ISO_8859_1[]
    = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
       ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
static const char KW_US_ASCII[]
    = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
       ASCII_C, ASCII_I, ASCII_I,     '\0'};
static const char KW_UTF_8[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
static const char KW_UTF_16[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
static const char KW_UTF_16BE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_B, ASCII_E, '\0'};
static const char KW_UTF_16LE[]
    = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
       ASCII_6, ASCII_L, ASCII_E, '\0'};
1510
1511 static int FASTCALL
getEncodingIndex(const char * name)1512 getEncodingIndex(const char *name) {
1513 static const char *const encodingNames[] = {
1514 KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1515 };
1516 int i;
1517 if (name == NULL)
1518 return NO_ENC;
1519 for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1520 if (streqci(name, encodingNames[i]))
1521 return i;
1522 return UNKNOWN_ENC;
1523 }
1524
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The index
   argument is parenthesized so that a compound expression is cast as a
   whole rather than only its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1531
/* This is what detects the encoding.  encodingTable maps from
   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
   the external (protocol) specified encoding; state is
   XML_CONTENT_STATE if we're parsing an external text entity, and
   XML_PROLOG_STATE otherwise.

   Returns XML_TOK_NONE when no input is available, XML_TOK_PARTIAL when
   more bytes are needed before a decision can be made, and XML_TOK_BOM
   (with *nextTokPtr set just past it) when a byte order mark was
   consumed.  In every other case the selected encoding is stored through
   enc->encPtr and the result of XmlTok() on that encoding is returned.
*/

static int
initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
         int state, const char *ptr, const char *end, const char **nextTokPtr) {
  const ENCODING **encPtr;

  if (ptr >= end)
    return XML_TOK_NONE;
  encPtr = enc->encPtr;
  if (ptr + 1 == end) {
    /* only a single byte available for auto-detection */
#ifndef XML_DTD /* FIXME */
    /* a well-formed document entity must have more than one byte */
    if (state != XML_CONTENT_STATE)
      return XML_TOK_PARTIAL;
#endif
    /* so we're parsing an external text entity... */
    /* if UTF-16 was externally specified, then we need at least 2 bytes */
    switch (INIT_ENC_INDEX(enc)) {
    case UTF_16_ENC:
    case UTF_16LE_ENC:
    case UTF_16BE_ENC:
      return XML_TOK_PARTIAL;
    }
    switch ((unsigned char)*ptr) {
    case 0xFE:
    case 0xFF:
    case 0xEF: /* possibly first byte of UTF-8 BOM */
      /* with ISO-8859-1 specified for an external entity these bytes are
         treated as data, so skip the "need more input" answer */
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      /* fall through */
    case 0x00:
    case 0x3C:
      return XML_TOK_PARTIAL;
    }
  } else {
    /* at least two bytes: dispatch on the first two, first byte as the
       high half */
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0xFEFF:
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16BE_ENC];
      return XML_TOK_BOM;
    /* 00 3C is handled in the default case */
    case 0x3C00:
      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
          && state == XML_CONTENT_STATE)
        break;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
        break;
      *nextTokPtr = ptr + 2;
      *encPtr = encodingTable[UTF_16LE_ENC];
      return XML_TOK_BOM;
    case 0xEFBB:
      /* Maybe a UTF-8 BOM (EF BB BF) */
      /* If there's an explicitly specified (external) encoding
         of ISO-8859-1 or some flavour of UTF-16
         and this is an external text entity,
         don't look for the BOM,
         because it might be legal data.
      */
      if (state == XML_CONTENT_STATE) {
        int e = INIT_ENC_INDEX(enc);
        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
            || e == UTF_16_ENC)
          break;
      }
      /* the third BOM byte has not arrived yet */
      if (ptr + 2 == end)
        return XML_TOK_PARTIAL;
      if ((unsigned char)ptr[2] == 0xBF) {
        *nextTokPtr = ptr + 3;
        *encPtr = encodingTable[UTF_8_ENC];
        return XML_TOK_BOM;
      }
      break;
    default:
      if (ptr[0] == '\0') {
        /* 0 isn't a legal data character. Furthermore a document
           entity can only start with ASCII characters.  So the only
           way this can fail to be big-endian UTF-16 is if it's an
           external parsed general entity that's labelled as
           UTF-16LE.
        */
        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
          break;
        *encPtr = encodingTable[UTF_16BE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      } else if (ptr[1] == '\0') {
        /* We could recover here in the case:
            - parsing an external entity
            - second byte is 0
            - no externally specified encoding
            - no encoding declaration
           by assuming UTF-16LE.  But we don't, because this would mean when
           presented just with a single byte, we couldn't reliably determine
           whether we needed further bytes.
        */
        if (state == XML_CONTENT_STATE)
          break;
        *encPtr = encodingTable[UTF_16LE_ENC];
        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
      }
      break;
    }
  }
  /* nothing was detected: use the externally specified encoding */
  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
}
1650
/* First inclusion of xmltok_ns.c: NS() and ns() expand to their argument
   unchanged, so identifiers wrapped in them keep their plain names.
   XML_TOK_NS_C is defined only for the duration of the include. */
#define NS(x) x
#define ns(x) x
#define XML_TOK_NS_C
#include "xmltok_ns.c"
#undef XML_TOK_NS_C
#undef NS
#undef ns
1658
1659 #ifdef XML_NS
1660
/* Second inclusion of xmltok_ns.c: NS() appends "NS" and ns() appends
   "_ns" to the wrapped identifier, so this pass produces a separately
   named set of definitions from the same source. */
# define NS(x) x##NS
# define ns(x) x##_ns

# define XML_TOK_NS_C
# include "xmltok_ns.c"
# undef XML_TOK_NS_C

# undef NS
# undef ns
1670
1671 ENCODING *
XmlInitUnknownEncodingNS(void * mem,int * table,CONVERTER convert,void * userData)1672 XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
1673 void *userData) {
1674 ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1675 if (enc)
1676 ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1677 return enc;
1678 }
1679
1680 #endif /* XML_NS */
1681