/*
                            __  __            _
                         ___\ \/ /_ __   __ _| |_
                        / _ \\  /| '_ \ / _` | __|
                       |  __//  \| |_) | (_| | |_
                        \___/_/\_\ .__/ \__,_|\__|
                                 |_| XML parser

   Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
   Copyright (c) 2000      Clark Cooper <coopercc@users.sourceforge.net>
   Copyright (c) 2001-2003 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
   Copyright (c) 2002      Greg Stein <gstein@users.sourceforge.net>
   Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
   Copyright (c) 2005-2009 Steven Solie <ssolie@users.sourceforge.net>
   Copyright (c) 2016-2021 Sebastian Pipping <sebastian@pipping.org>
   Copyright (c) 2016      Pascal Cuoq <cuoq@trust-in-soft.com>
   Copyright (c) 2016      Don Lewis <truckman@apache.org>
   Copyright (c) 2017      Rhodri James <rhodri@wildebeest.org.uk>
   Copyright (c) 2017      Alexander Bluhm <alexander.bluhm@gmx.net>
   Copyright (c) 2017      Benbuck Nason <bnason@netflix.com>
   Copyright (c) 2017      José Gutiérrez de la Concha <jose@zeroc.com>
   Copyright (c) 2019      David Loffredo <loffredo@steptools.com>
   Licensed under the MIT license:

   Permission is  hereby granted,  free of charge,  to any  person obtaining
   a  copy  of  this  software   and  associated  documentation  files  (the
   "Software"),  to  deal in  the  Software  without restriction,  including
   without  limitation the  rights  to use,  copy,  modify, merge,  publish,
   distribute, sublicense, and/or sell copies of the Software, and to permit
   persons  to whom  the Software  is  furnished to  do so,  subject to  the
   following conditions:

   The above copyright  notice and this permission notice  shall be included
   in all copies or substantial portions of the Software.

   THE  SOFTWARE  IS  PROVIDED  "AS  IS",  WITHOUT  WARRANTY  OF  ANY  KIND,
   EXPRESS  OR IMPLIED,  INCLUDING  BUT  NOT LIMITED  TO  THE WARRANTIES  OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
   NO EVENT SHALL THE AUTHORS OR  COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
   DAMAGES OR  OTHER LIABILITY, WHETHER  IN AN  ACTION OF CONTRACT,  TORT OR
   OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
   USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifdef _WIN32
#  include "winconfig.h"
#endif

#include <expat_config.h>

#include <stddef.h>
#include <string.h> /* memcpy */
#include <stdbool.h>

#include "expat_external.h"
#include "internal.h"
#include "xmltok.h"
#include "nametab.h"

/* Optional extra entry for the first braced group of VTABLE1: when XML_DTD
   is defined it appends PREFIX(ignoreSectionTok); otherwise it expands to
   nothing. */
#ifdef XML_DTD
#  define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
#else
#  define IGNORE_SECTION_TOK_VTABLE /* as nothing */
#endif

/* Initializer fragment naming the PREFIX()-qualified tokenizer functions:
   one braced group of scanners (prolog, content, CDATA section and,
   optionally, IGNORE section), one braced group of literal scanners
   (attribute value, entity value), then eight individual helpers.
   PREFIX must be defined at the point of expansion.  The encoding tables
   in this file start their ENCODING initializer with VTABLE1 and then name
   the toUtf8/toUtf16 converters explicitly. */
#define VTABLE1                                                                \
  {PREFIX(prologTok), PREFIX(contentTok),                                      \
   PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE},                         \
      {PREFIX(attributeValueTok), PREFIX(entityValueTok)},                     \
      PREFIX(nameMatchesAscii), PREFIX(nameLength), PREFIX(skipS),             \
      PREFIX(getAtts), PREFIX(charRefNumber), PREFIX(predefinedEntityName),    \
      PREFIX(updatePosition), PREFIX(isPublicId)

/* VTABLE1 followed by the PREFIX()-qualified toUtf8/toUtf16 converters. */
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)

/* Look up one 16-bit character, given as its high byte `hi` and low byte
   `lo`, in a naming table.  pages[hi] selects a group of eight 32-bit words
   in namingBitmap; the top 3 bits of `lo` pick the word and the bottom
   5 bits pick the bit.  Evaluates to non-zero when that bit is set.
   `lo` is evaluated more than once. */
#define UCS2_GET_NAMING(pages, hi, lo)                                         \
  (namingBitmap[((pages)[(hi)] << 3) + ((lo) >> 5)] & (1u << ((lo)&0x1F)))

/* A 2 byte UTF-8 representation splits the character's 11 bits between
   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
   pages, 3 bits to add to that index and 5 bits to generate the mask.
   `byte` must point to unsigned char and is evaluated more than once.
*/
#define UTF8_GET_NAMING2(pages, byte)                                          \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3)                         \
                + ((((byte)[0]) & 3) << 1) + ((((byte)[1]) >> 5) & 1)]         \
   & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits between
   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
   into pages, 3 bits to add to that index and 5 bits to generate the
   mask.
   `byte` must point to unsigned char and is evaluated more than once.
*/
#define UTF8_GET_NAMING3(pages, byte)                                          \
  (namingBitmap                                                                \
       [((pages)[((((byte)[0]) & 0xF) << 4) + ((((byte)[1]) >> 2) & 0xF)]      \
         << 3)                                                                 \
        + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)]                 \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 byte sequences are looked up
   through the macros above (after casting p to const unsigned char *);
   any other length evaluates to 0. */
#define UTF8_GET_NAMING(pages, p, n)                                           \
  ((n) == 2                                                                    \
       ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p))                   \
       : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))

/* Detection of invalid UTF-8 sequences is based on Table 3.1B
   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
   with the additional restriction of not allowing the Unicode
   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
   Implementation details:
     (A & 0x80) == 0     means A < 0x80
   and
     (A & 0xC0) == 0xC0  means A > 0xBF

   Each macro takes a pointer to unsigned char positioned on the lead byte
   and evaluates to non-zero when the sequence is invalid.  The argument is
   evaluated several times, so it must not have side effects.
*/

/* 2 byte sequence: the lead byte must be at least 0xC2 and the second byte
   must be in 0x80..0xBF. */
#define UTF8_INVALID2(p)                                                       \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)

/* 3 byte sequence: the third byte must be in 0x80..0xBF (0x80..0xBD after
   EF,BF); the second byte must be in 0xA0..0xBF after E0, 0x80..0x9F after
   ED, and 0x80..0xBF otherwise. */
#define UTF8_INVALID3(p)                                                       \
  (((p)[2] & 0x80) == 0                                                        \
   || ((*(p)) == 0xEF && (p)[1] == 0xBF ? (p)[2] > 0xBD                        \
                                        : ((p)[2] & 0xC0) == 0xC0)             \
   || ((*(p)) == 0xE0                                                          \
           ? (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xED ? (p)[1] > 0x9F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

/* 4 byte sequence: the third and fourth bytes must be in 0x80..0xBF; the
   second byte must be in 0x90..0xBF after F0, 0x80..0x8F after F4, and
   0x80..0xBF otherwise. */
#define UTF8_INVALID4(p)                                                       \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 || ((p)[2] & 0x80) == 0     \
   || ((p)[2] & 0xC0) == 0xC0                                                  \
   || ((*(p)) == 0xF0                                                          \
           ? (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0                          \
           : ((p)[1] & 0x80) == 0                                              \
                 || ((*(p)) == 0xF4 ? (p)[1] > 0x8F                            \
                                    : ((p)[1] & 0xC0) == 0xC0)))

135 static int PTRFASTCALL
isNever(const ENCODING * enc,const char * p)136 isNever(const ENCODING *enc, const char *p) {
137   UNUSED_P(enc);
138   UNUSED_P(p);
139   return 0;
140 }
141 
142 static int PTRFASTCALL
utf8_isName2(const ENCODING * enc,const char * p)143 utf8_isName2(const ENCODING *enc, const char *p) {
144   UNUSED_P(enc);
145   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
146 }
147 
148 static int PTRFASTCALL
utf8_isName3(const ENCODING * enc,const char * p)149 utf8_isName3(const ENCODING *enc, const char *p) {
150   UNUSED_P(enc);
151   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
152 }
153 
154 #define utf8_isName4 isNever
155 
156 static int PTRFASTCALL
utf8_isNmstrt2(const ENCODING * enc,const char * p)157 utf8_isNmstrt2(const ENCODING *enc, const char *p) {
158   UNUSED_P(enc);
159   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
160 }
161 
162 static int PTRFASTCALL
utf8_isNmstrt3(const ENCODING * enc,const char * p)163 utf8_isNmstrt3(const ENCODING *enc, const char *p) {
164   UNUSED_P(enc);
165   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
166 }
167 
168 #define utf8_isNmstrt4 isNever
169 
170 static int PTRFASTCALL
utf8_isInvalid2(const ENCODING * enc,const char * p)171 utf8_isInvalid2(const ENCODING *enc, const char *p) {
172   UNUSED_P(enc);
173   return UTF8_INVALID2((const unsigned char *)p);
174 }
175 
176 static int PTRFASTCALL
utf8_isInvalid3(const ENCODING * enc,const char * p)177 utf8_isInvalid3(const ENCODING *enc, const char *p) {
178   UNUSED_P(enc);
179   return UTF8_INVALID3((const unsigned char *)p);
180 }
181 
182 static int PTRFASTCALL
utf8_isInvalid4(const ENCODING * enc,const char * p)183 utf8_isInvalid4(const ENCODING *enc, const char *p) {
184   UNUSED_P(enc);
185   return UTF8_INVALID4((const unsigned char *)p);
186 }
187 
188 struct normal_encoding {
189   ENCODING enc;
190   unsigned char type[256];
191 #ifdef XML_MIN_SIZE
192   int(PTRFASTCALL *byteType)(const ENCODING *, const char *);
193   int(PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
194   int(PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
195   int(PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
196   int(PTRCALL *charMatches)(const ENCODING *, const char *, int);
197 #endif /* XML_MIN_SIZE */
198   int(PTRFASTCALL *isName2)(const ENCODING *, const char *);
199   int(PTRFASTCALL *isName3)(const ENCODING *, const char *);
200   int(PTRFASTCALL *isName4)(const ENCODING *, const char *);
201   int(PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
202   int(PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
203   int(PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
204   int(PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
205   int(PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
206   int(PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
207 };
208 
209 #define AS_NORMAL_ENCODING(enc) ((const struct normal_encoding *)(enc))
210 
/* STANDARD_VTABLE(E): initializers for the XML_MIN_SIZE-only members of
   struct normal_encoding, naming the E-prefixed functions.  The expansion
   ends with a comma so that it can be followed directly by NORMAL_VTABLE()
   or NULL_VTABLE.  Expands to nothing when XML_MIN_SIZE is not defined. */
#ifdef XML_MIN_SIZE

#  define STANDARD_VTABLE(E)                                                   \
    E##byteType, E##isNameMin, E##isNmstrtMin, E##byteToAscii, E##charMatches,

#else

#  define STANDARD_VTABLE(E) /* as nothing */

#endif

/* NORMAL_VTABLE(E): initializers for the nine per-length predicate members
   of struct normal_encoding, naming the E-prefixed functions. */
#define NORMAL_VTABLE(E)                                                       \
  E##isName2, E##isName3, E##isName4, E##isNmstrt2, E##isNmstrt3,              \
      E##isNmstrt4, E##isInvalid2, E##isInvalid3, E##isInvalid4

/* Same nine members, all set to NULL.  Used by the Latin-1 and ASCII
   tables in this file. */
#define NULL_VTABLE                                                            \
  /* isName2 */ NULL, /* isName3 */ NULL, /* isName4 */ NULL,                  \
      /* isNmstrt2 */ NULL, /* isNmstrt3 */ NULL, /* isNmstrt4 */ NULL,        \
      /* isInvalid2 */ NULL, /* isInvalid3 */ NULL, /* isInvalid4 */ NULL

231 static int FASTCALL checkCharRefNumber(int);
232 
233 #include "xmltok_impl.h"
234 #include "ascii.h"
235 
/* In XML_MIN_SIZE builds the single-byte ("sb_") minimum-size name
   predicates are simply isNever. */
#ifdef XML_MIN_SIZE
#  define sb_isNameMin isNever
#  define sb_isNmstrtMin isNever
#endif

/* MINBPC(enc): minimum bytes per character.  Read from the encoding object
   in XML_MIN_SIZE builds; otherwise the constant 1 for the instantiation
   that follows. */
#ifdef XML_MIN_SIZE
#  define MINBPC(enc) ((enc)->minBytesPerChar)
#else
/* minimum bytes per character */
#  define MINBPC(enc) 1
#endif

/* Byte type of the single byte at p: looks *p up (as unsigned char) in the
   type table of the struct normal_encoding that enc points to. */
#define SB_BYTE_TYPE(enc, p)                                                   \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

/* BYTE_TYPE(enc, p): in XML_MIN_SIZE builds this goes through the
   encoding's byteType callback (sb_byteType is the single-byte
   implementation); otherwise it is the table lookup itself. */
#ifdef XML_MIN_SIZE
static int PTRFASTCALL
sb_byteType(const ENCODING *enc, const char *p) {
  return SB_BYTE_TYPE(enc, p);
}
#  define BYTE_TYPE(enc, p) (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
#else
#  define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif

/* BYTE_TO_ASCII(enc, p): value of the character at p.  In XML_MIN_SIZE
   builds this goes through the encoding's byteToAscii callback
   (sb_byteToAscii, the single-byte implementation, returns *p); otherwise
   it is *p directly. */
#ifdef XML_MIN_SIZE
#  define BYTE_TO_ASCII(enc, p) (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
static int PTRFASTCALL
sb_byteToAscii(const ENCODING *enc, const char *p) {
  UNUSED_P(enc);
  return *p;
}
#else
#  define BYTE_TO_ASCII(enc, p) (*(p))
#endif

/* IS_NAME_CHAR / IS_NMSTRT_CHAR(enc, p, n): call the encoding's isName<n> /
   isNmstrt<n> callback for the n byte character at p.  n is pasted into
   the member name, so it must be the literal 2, 3 or 4. */
#define IS_NAME_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isName##n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) (AS_NORMAL_ENCODING(enc)->isNmstrt##n(enc, p))

/* IS_INVALID_CHAR(enc, p, n): call the encoding's isInvalid<n> callback.
   The XML_MIN_SIZE variant first checks that the callback pointer is
   non-NULL and evaluates to 0 when it is NULL; the other variant calls it
   unconditionally. */
#ifdef XML_MIN_SIZE
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n                                     \
     && AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#else
#  define IS_INVALID_CHAR(enc, p, n)                                           \
    (AS_NORMAL_ENCODING(enc)->isInvalid##n(enc, p))
#endif

/* Name / name-start tests for a character of minimum length.  In
   XML_MIN_SIZE builds they go through the encoding's isNameMin /
   isNmstrtMin callbacks; otherwise they are the constant 0 for this
   instantiation. */
#ifdef XML_MIN_SIZE
#  define IS_NAME_CHAR_MINBPC(enc, p)                                          \
    (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
#  define IS_NMSTRT_CHAR_MINBPC(enc, p)                                        \
    (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
#else
#  define IS_NAME_CHAR_MINBPC(enc, p) (0)
#  define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
#endif

/* CHAR_MATCHES(enc, p, c): true when the character at p equals c.  In
   XML_MIN_SIZE builds this goes through the encoding's charMatches
   callback (sb_charMatches, the single-byte implementation, compares *p
   with c); otherwise it is the comparison itself. */
#ifdef XML_MIN_SIZE
#  define CHAR_MATCHES(enc, p, c)                                              \
    (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
static int PTRCALL
sb_charMatches(const ENCODING *enc, const char *p, int c) {
  UNUSED_P(enc);
  return *p == c;
}
#else
/* c is an ASCII character */
#  define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif

306 #define PREFIX(ident) normal_##ident
307 #define XML_TOK_IMPL_C
308 #include "xmltok_impl.c"
309 #undef XML_TOK_IMPL_C
310 
311 #undef MINBPC
312 #undef BYTE_TYPE
313 #undef BYTE_TO_ASCII
314 #undef CHAR_MATCHES
315 #undef IS_NAME_CHAR
316 #undef IS_NAME_CHAR_MINBPC
317 #undef IS_NMSTRT_CHAR
318 #undef IS_NMSTRT_CHAR_MINBPC
319 #undef IS_INVALID_CHAR
320 
/* UTF8_cvalN is the value of the masked first byte of an N byte UTF-8
   sequence, i.e. the marker bits that are OR-ed into the lead byte when a
   sequence of that length is written. */
enum {
  UTF8_cval1 = 0x00,
  UTF8_cval2 = 0xc0,
  UTF8_cval3 = 0xe0,
  UTF8_cval4 = 0xf0
};

/* Move *fromLimRef backwards, if necessary, so that the range
   [from, *fromLimRef) does not end in the middle of a multi-byte UTF-8
   character.

   from        start of the buffer; never read before this address.
   fromLimRef  in: one past the last available byte;
               out: one past the last byte that ends a complete character
               (equal to `from` when no complete character is available).

   The buffer is scanned backwards from its end.  Bytes that are neither
   ASCII nor a lead byte are stepped over and counted; when a lead byte is
   reached, the bytes counted so far (plus the lead byte itself) are
   compared with the length that the lead byte announces. */
void
_INTERNAL_trim_to_complete_utf8_characters(const char *from,
                                           const char **fromLimRef) {
  const char *fromLim = *fromLimRef;
  size_t walked = 0;
  for (; fromLim > from; fromLim--, walked++) {
    const unsigned char prev = (unsigned char)fromLim[-1];
    size_t seqLen;

    if ((prev & 0x80u) == 0x00u) {
      break; /* 1-byte character, matching 0b0xxxxxxx */
    } else if ((prev & 0xf8u) == 0xf0u) {
      seqLen = 4; /* lead by 0b11110xxx byte */
    } else if ((prev & 0xf0u) == 0xe0u) {
      seqLen = 3; /* lead by 0b1110xxxx byte */
    } else if ((prev & 0xe0u) == 0xc0u) {
      seqLen = 2; /* lead by 0b110xxxxx byte */
    } else {
      continue; /* not a lead byte: keep walking backwards */
    }

    if (walked + 1 >= seqLen) {
      /* The character is complete: put the limit right after it. */
      fromLim += seqLen - 1;
      break;
    }
    /* Incomplete character: it is dropped by the loop's fromLim--, and the
       count restarts for whatever precedes it. */
    walked = 0;
  }
  *fromLimRef = fromLim;
}

367 static enum XML_Convert_Result PTRCALL
utf8_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)368 utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
369             char **toP, const char *toLim) {
370   bool input_incomplete = false;
371   bool output_exhausted = false;
372 
373   /* Avoid copying partial characters (due to limited space). */
374   const ptrdiff_t bytesAvailable = fromLim - *fromP;
375   const ptrdiff_t bytesStorable = toLim - *toP;
376   UNUSED_P(enc);
377   if (bytesAvailable > bytesStorable) {
378     fromLim = *fromP + bytesStorable;
379     output_exhausted = true;
380   }
381 
382   /* Avoid copying partial characters (from incomplete input). */
383   {
384     const char *const fromLimBefore = fromLim;
385     _INTERNAL_trim_to_complete_utf8_characters(*fromP, &fromLim);
386     if (fromLim < fromLimBefore) {
387       input_incomplete = true;
388     }
389   }
390 
391   {
392     const ptrdiff_t bytesToCopy = fromLim - *fromP;
393     memcpy(*toP, *fromP, bytesToCopy);
394     *fromP += bytesToCopy;
395     *toP += bytesToCopy;
396   }
397 
398   if (output_exhausted) /* needs to go first */
399     return XML_CONVERT_OUTPUT_EXHAUSTED;
400   else if (input_incomplete)
401     return XML_CONVERT_INPUT_INCOMPLETE;
402   else
403     return XML_CONVERT_COMPLETED;
404 }
405 
406 static enum XML_Convert_Result PTRCALL
utf8_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)407 utf8_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
408              unsigned short **toP, const unsigned short *toLim) {
409   enum XML_Convert_Result res = XML_CONVERT_COMPLETED;
410   unsigned short *to = *toP;
411   const char *from = *fromP;
412   while (from < fromLim && to < toLim) {
413     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
414     case BT_LEAD2:
415       if (fromLim - from < 2) {
416         res = XML_CONVERT_INPUT_INCOMPLETE;
417         goto after;
418       }
419       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
420       from += 2;
421       break;
422     case BT_LEAD3:
423       if (fromLim - from < 3) {
424         res = XML_CONVERT_INPUT_INCOMPLETE;
425         goto after;
426       }
427       *to++ = (unsigned short)(((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6)
428                                | (from[2] & 0x3f));
429       from += 3;
430       break;
431     case BT_LEAD4: {
432       unsigned long n;
433       if (toLim - to < 2) {
434         res = XML_CONVERT_OUTPUT_EXHAUSTED;
435         goto after;
436       }
437       if (fromLim - from < 4) {
438         res = XML_CONVERT_INPUT_INCOMPLETE;
439         goto after;
440       }
441       n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
442           | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
443       n -= 0x10000;
444       to[0] = (unsigned short)((n >> 10) | 0xD800);
445       to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
446       to += 2;
447       from += 4;
448     } break;
449     default:
450       *to++ = *from++;
451       break;
452     }
453   }
454   if (from < fromLim)
455     res = XML_CONVERT_OUTPUT_EXHAUSTED;
456 after:
457   *fromP = from;
458   *toP = to;
459   return res;
460 }
461 
462 #ifdef XML_NS
463 static const struct normal_encoding utf8_encoding_ns
464     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
465        {
466 #  include "asciitab.h"
467 #  include "utf8tab.h"
468        },
469        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
470 #endif
471 
472 static const struct normal_encoding utf8_encoding
473     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
474        {
475 #define BT_COLON BT_NMSTRT
476 #include "asciitab.h"
477 #undef BT_COLON
478 #include "utf8tab.h"
479        },
480        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
481 
482 #ifdef XML_NS
483 
484 static const struct normal_encoding internal_utf8_encoding_ns
485     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
486        {
487 #  include "iasciitab.h"
488 #  include "utf8tab.h"
489        },
490        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
491 
492 #endif
493 
494 static const struct normal_encoding internal_utf8_encoding
495     = {{VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0},
496        {
497 #define BT_COLON BT_NMSTRT
498 #include "iasciitab.h"
499 #undef BT_COLON
500 #include "utf8tab.h"
501        },
502        STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)};
503 
504 static enum XML_Convert_Result PTRCALL
latin1_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)505 latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
506               char **toP, const char *toLim) {
507   UNUSED_P(enc);
508   for (;;) {
509     unsigned char c;
510     if (*fromP == fromLim)
511       return XML_CONVERT_COMPLETED;
512     c = (unsigned char)**fromP;
513     if (c & 0x80) {
514       if (toLim - *toP < 2)
515         return XML_CONVERT_OUTPUT_EXHAUSTED;
516       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
517       *(*toP)++ = (char)((c & 0x3f) | 0x80);
518       (*fromP)++;
519     } else {
520       if (*toP == toLim)
521         return XML_CONVERT_OUTPUT_EXHAUSTED;
522       *(*toP)++ = *(*fromP)++;
523     }
524   }
525 }
526 
527 static enum XML_Convert_Result PTRCALL
latin1_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)528 latin1_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
529                unsigned short **toP, const unsigned short *toLim) {
530   UNUSED_P(enc);
531   while (*fromP < fromLim && *toP < toLim)
532     *(*toP)++ = (unsigned char)*(*fromP)++;
533 
534   if ((*toP == toLim) && (*fromP < fromLim))
535     return XML_CONVERT_OUTPUT_EXHAUSTED;
536   else
537     return XML_CONVERT_COMPLETED;
538 }
539 
540 #ifdef XML_NS
541 
542 static const struct normal_encoding latin1_encoding_ns
543     = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
544        {
545 #  include "asciitab.h"
546 #  include "latin1tab.h"
547        },
548        STANDARD_VTABLE(sb_) NULL_VTABLE};
549 
550 #endif
551 
552 static const struct normal_encoding latin1_encoding
553     = {{VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0},
554        {
555 #define BT_COLON BT_NMSTRT
556 #include "asciitab.h"
557 #undef BT_COLON
558 #include "latin1tab.h"
559        },
560        STANDARD_VTABLE(sb_) NULL_VTABLE};
561 
562 static enum XML_Convert_Result PTRCALL
ascii_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)563 ascii_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
564              char **toP, const char *toLim) {
565   UNUSED_P(enc);
566   while (*fromP < fromLim && *toP < toLim)
567     *(*toP)++ = *(*fromP)++;
568 
569   if ((*toP == toLim) && (*fromP < fromLim))
570     return XML_CONVERT_OUTPUT_EXHAUSTED;
571   else
572     return XML_CONVERT_COMPLETED;
573 }
574 
575 #ifdef XML_NS
576 
577 static const struct normal_encoding ascii_encoding_ns
578     = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
579        {
580 #  include "asciitab.h"
581            /* BT_NONXML == 0 */
582        },
583        STANDARD_VTABLE(sb_) NULL_VTABLE};
584 
585 #endif
586 
587 static const struct normal_encoding ascii_encoding
588     = {{VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0},
589        {
590 #define BT_COLON BT_NMSTRT
591 #include "asciitab.h"
592 #undef BT_COLON
593            /* BT_NONXML == 0 */
594        },
595        STANDARD_VTABLE(sb_) NULL_VTABLE};
596 
597 static int PTRFASTCALL
unicode_byte_type(char hi,char lo)598 unicode_byte_type(char hi, char lo) {
599   switch ((unsigned char)hi) {
600   /* 0xD800-0xDBFF first 16-bit code unit or high surrogate (W1) */
601   case 0xD8:
602   case 0xD9:
603   case 0xDA:
604   case 0xDB:
605     return BT_LEAD4;
606   /* 0xDC00-0xDFFF second 16-bit code unit or low surrogate (W2) */
607   case 0xDC:
608   case 0xDD:
609   case 0xDE:
610   case 0xDF:
611     return BT_TRAIL;
612   case 0xFF:
613     switch ((unsigned char)lo) {
614     case 0xFF: /* noncharacter-FFFF */
615     case 0xFE: /* noncharacter-FFFE */
616       return BT_NONXML;
617     }
618     break;
619   }
620   return BT_NONASCII;
621 }
622 
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   2-byte code units from [*fromP, fromLim) into UTF-8 in [*toP, toLim).
   GET_LO() and GET_HI() must be defined where the macro is expanded; they
   select the low and high byte of a code unit.

   - fromLim is first rounded down so that an odd trailing byte is ignored.
   - Units below 0x80 produce 1 byte, units below 0x800 produce 2 bytes,
     units whose high byte is 0xD8..0xDB are combined with the following
     unit into 4 bytes, and all other units produce 3 bytes.
   - Room in the output is checked before each character is written.  If
     there is not enough, *fromP is set to the unconverted unit and
     XML_CONVERT_OUTPUT_EXHAUSTED is returned.
   - If a unit with high byte 0xD8..0xDB is not followed by another unit,
     *fromP is set to it and XML_CONVERT_INPUT_INCOMPLETE is returned.
   *toP is advanced as bytes are written.  enc is unused. */
#define DEFINE_UTF16_TO_UTF8(E)                                                \
  static enum XML_Convert_Result PTRCALL E##toUtf8(                            \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      char **toP, const char *toLim) {                                         \
    const char *from = *fromP;                                                 \
    UNUSED_P(enc);                                                             \
    fromLim = from + (((fromLim - from) >> 1) << 1); /* shrink to even */      \
    for (; from < fromLim; from += 2) {                                        \
      int plane;                                                               \
      unsigned char lo2;                                                       \
      unsigned char lo = GET_LO(from);                                         \
      unsigned char hi = GET_HI(from);                                         \
      switch (hi) {                                                            \
      case 0:                                                                  \
        if (lo < 0x80) {                                                       \
          if (*toP == toLim) {                                                 \
            *fromP = from;                                                     \
            return XML_CONVERT_OUTPUT_EXHAUSTED;                               \
          }                                                                    \
          *(*toP)++ = lo;                                                      \
          break;                                                               \
        }                                                                      \
        /* fall through */                                                     \
      case 0x1:                                                                \
      case 0x2:                                                                \
      case 0x3:                                                                \
      case 0x4:                                                                \
      case 0x5:                                                                \
      case 0x6:                                                                \
      case 0x7:                                                                \
        if (toLim - *toP < 2) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2);                      \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      default:                                                                 \
        if (toLim - *toP < 3) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        /* 16 bits divided 4, 6, 6 amongst 3 bytes */                          \
        *(*toP)++ = ((hi >> 4) | UTF8_cval3);                                  \
        *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80);                    \
        *(*toP)++ = ((lo & 0x3f) | 0x80);                                      \
        break;                                                                 \
      case 0xD8:                                                               \
      case 0xD9:                                                               \
      case 0xDA:                                                               \
      case 0xDB:                                                               \
        if (toLim - *toP < 4) {                                                \
          *fromP = from;                                                       \
          return XML_CONVERT_OUTPUT_EXHAUSTED;                                 \
        }                                                                      \
        if (fromLim - from < 4) {                                              \
          *fromP = from;                                                       \
          return XML_CONVERT_INPUT_INCOMPLETE;                                 \
        }                                                                      \
        plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1;                   \
        *(*toP)++ = (char)((plane >> 2) | UTF8_cval4);                         \
        *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80);         \
        from += 2;                                                             \
        lo2 = GET_LO(from);                                                    \
        *(*toP)++ = (((lo & 0x3) << 4) | ((GET_HI(from) & 0x3) << 2)           \
                     | (lo2 >> 6) | 0x80);                                     \
        *(*toP)++ = ((lo2 & 0x3f) | 0x80);                                     \
        break;                                                                 \
      }                                                                        \
    }                                                                          \
    *fromP = from;                                                             \
    if (from < fromLim)                                                        \
      return XML_CONVERT_INPUT_INCOMPLETE;                                     \
    else                                                                       \
      return XML_CONVERT_COMPLETED;                                            \
  }

/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   2-byte code units from [*fromP, fromLim) into the unsigned short buffer
   [*toP, toLim), assembling each value as (GET_HI << 8) | GET_LO.
   GET_LO() and GET_HI() must be defined where the macro is expanded.

   - fromLim is first rounded down so that an odd trailing byte is ignored.
   - If the input holds more units than the output has room for and the
     high byte of the last input unit is in 0xD8..0xDF, that unit is left
     out of this call and the result becomes XML_CONVERT_INPUT_INCOMPLETE
     (unless OUTPUT_EXHAUSTED applies).
   - Returns XML_CONVERT_OUTPUT_EXHAUSTED when the output is full while
     input remains.
   *fromP and *toP are advanced as units are copied.  enc is unused. */
#define DEFINE_UTF16_TO_UTF16(E)                                               \
  static enum XML_Convert_Result PTRCALL E##toUtf16(                           \
      const ENCODING *enc, const char **fromP, const char *fromLim,            \
      unsigned short **toP, const unsigned short *toLim) {                     \
    enum XML_Convert_Result res = XML_CONVERT_COMPLETED;                       \
    UNUSED_P(enc);                                                             \
    fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); /* shrink to even */  \
    /* Avoid copying first half only of surrogate */                           \
    if (fromLim - *fromP > ((toLim - *toP) << 1)                               \
        && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) {                             \
      fromLim -= 2;                                                            \
      res = XML_CONVERT_INPUT_INCOMPLETE;                                      \
    }                                                                          \
    for (; *fromP < fromLim && *toP < toLim; *fromP += 2)                      \
      *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP);                      \
    if ((*toP == toLim) && (*fromP < fromLim))                                 \
      return XML_CONVERT_OUTPUT_EXHAUSTED;                                     \
    else                                                                       \
      return res;                                                              \
  }

721 #define SET2(ptr, ch) (((ptr)[0] = ((ch)&0xff)), ((ptr)[1] = ((ch) >> 8)))
722 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
723 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
724 
725 DEFINE_UTF16_TO_UTF8(little2_)
DEFINE_UTF16_TO_UTF16(little2_)726 DEFINE_UTF16_TO_UTF16(little2_)
727 
728 #undef SET2
729 #undef GET_LO
730 #undef GET_HI
731 
732 #define SET2(ptr, ch) (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch)&0xFF)))
733 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
734 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
735 
736 DEFINE_UTF16_TO_UTF8(big2_)
737 DEFINE_UTF16_TO_UTF16(big2_)
738 
739 #undef SET2
740 #undef GET_LO
741 #undef GET_HI
742 
/* Classification helpers for a 2-byte little-endian unit at p
   (p[0] = low byte, p[1] = high byte).

   LITTLE2_BYTE_TYPE: when the high byte is 0 the byte type comes from the
   encoding's own type table, otherwise from unicode_byte_type(hi, lo).
   LITTLE2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   LITTLE2_CHAR_MATCHES: true when the unit is exactly the character c.
   LITTLE2_IS_NAME_CHAR_MINBPC / LITTLE2_IS_NMSTRT_CHAR_MINBPC: look the unit
   up in namePages / nmstrtPages via UCS2_GET_NAMING.

   Every macro argument is parenthesized so that expression arguments expand
   safely, and the encoding is only read, so it is accessed through a
   pointer-to-const. */
#define LITTLE2_BYTE_TYPE(enc, p)                                              \
  ((p)[1] == 0                                                                 \
       ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)]    \
       : unicode_byte_type((p)[1], (p)[0]))
#define LITTLE2_BYTE_TO_ASCII(p) ((p)[1] == 0 ? (p)[0] : -1)
#define LITTLE2_CHAR_MATCHES(p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(p)                                         \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)                                       \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])

753 #ifdef XML_MIN_SIZE
754 
755 static int PTRFASTCALL
756 little2_byteType(const ENCODING *enc, const char *p) {
757   return LITTLE2_BYTE_TYPE(enc, p);
758 }
759 
760 static int PTRFASTCALL
little2_byteToAscii(const ENCODING * enc,const char * p)761 little2_byteToAscii(const ENCODING *enc, const char *p) {
762   UNUSED_P(enc);
763   return LITTLE2_BYTE_TO_ASCII(p);
764 }
765 
766 static int PTRCALL
little2_charMatches(const ENCODING * enc,const char * p,int c)767 little2_charMatches(const ENCODING *enc, const char *p, int c) {
768   UNUSED_P(enc);
769   return LITTLE2_CHAR_MATCHES(p, c);
770 }
771 
772 static int PTRFASTCALL
little2_isNameMin(const ENCODING * enc,const char * p)773 little2_isNameMin(const ENCODING *enc, const char *p) {
774   UNUSED_P(enc);
775   return LITTLE2_IS_NAME_CHAR_MINBPC(p);
776 }
777 
778 static int PTRFASTCALL
little2_isNmstrtMin(const ENCODING * enc,const char * p)779 little2_isNmstrtMin(const ENCODING *enc, const char *p) {
780   UNUSED_P(enc);
781   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(p);
782 }
783 
784 #  undef VTABLE
785 #  define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
786 
787 #else /* not XML_MIN_SIZE */
788 
789 #  undef PREFIX
790 #  define PREFIX(ident) little2_##ident
791 #  define MINBPC(enc) 2
792 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
793 #  define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
794 #  define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(p)
795 #  define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(p, c)
796 #  define IS_NAME_CHAR(enc, p, n) 0
797 #  define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(p)
798 #  define IS_NMSTRT_CHAR(enc, p, n) (0)
799 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(p)
800 
801 #  define XML_TOK_IMPL_C
802 #  include "xmltok_impl.c"
803 #  undef XML_TOK_IMPL_C
804 
805 #  undef MINBPC
806 #  undef BYTE_TYPE
807 #  undef BYTE_TO_ASCII
808 #  undef CHAR_MATCHES
809 #  undef IS_NAME_CHAR
810 #  undef IS_NAME_CHAR_MINBPC
811 #  undef IS_NMSTRT_CHAR
812 #  undef IS_NMSTRT_CHAR_MINBPC
813 #  undef IS_INVALID_CHAR
814 
815 #endif /* not XML_MIN_SIZE */
816 
#ifdef XML_NS

/* Namespace-aware little-endian 2-byte encoding.  The last scalar in the
   first initializer group is 1 only when BYTEORDER == 1234.
   NOTE(review): the scalars presumably are minBytesPerChar, isUtf8 and
   isUtf16 in that order; confirm against the ENCODING declaration. */
static const struct normal_encoding little2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 1234
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(little2_) NULL_VTABLE};

#endif

835 static const struct normal_encoding little2_encoding
836     = {{VTABLE, 2, 0,
837 #if BYTEORDER == 1234
838         1
839 #else
840         0
841 #endif
842        },
843        {
844 #define BT_COLON BT_NMSTRT
845 #include "asciitab.h"
846 #undef BT_COLON
847 #include "latin1tab.h"
848        },
849        STANDARD_VTABLE(little2_) NULL_VTABLE};
850 
851 #if BYTEORDER != 4321
852 
853 #  ifdef XML_NS
854 
855 static const struct normal_encoding internal_little2_encoding_ns
856     = {{VTABLE, 2, 0, 1},
857        {
858 #    include "iasciitab.h"
859 #    include "latin1tab.h"
860        },
861        STANDARD_VTABLE(little2_) NULL_VTABLE};
862 
863 #  endif
864 
865 static const struct normal_encoding internal_little2_encoding
866     = {{VTABLE, 2, 0, 1},
867        {
868 #  define BT_COLON BT_NMSTRT
869 #  include "iasciitab.h"
870 #  undef BT_COLON
871 #  include "latin1tab.h"
872        },
873        STANDARD_VTABLE(little2_) NULL_VTABLE};
874 
875 #endif
876 
/* Classification helpers for a 2-byte big-endian unit at p
   (p[0] = high byte, p[1] = low byte).

   BIG2_BYTE_TYPE: when the high byte is 0 the byte type comes from the
   encoding's own type table, otherwise from unicode_byte_type(hi, lo).
   BIG2_BYTE_TO_ASCII: the low byte when the high byte is 0, else -1.
   BIG2_CHAR_MATCHES: true when the unit is exactly the character c.
   BIG2_IS_NAME_CHAR_MINBPC / BIG2_IS_NMSTRT_CHAR_MINBPC: look the unit up
   in namePages / nmstrtPages via UCS2_GET_NAMING.

   Every macro argument is parenthesized so that expression arguments expand
   safely, and the encoding is only read, so it is accessed through a
   pointer-to-const. */
#define BIG2_BYTE_TYPE(enc, p)                                                 \
  ((p)[0] == 0                                                                 \
       ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]]  \
       : unicode_byte_type((p)[0], (p)[1]))
#define BIG2_BYTE_TO_ASCII(p) ((p)[0] == 0 ? (p)[1] : -1)
#define BIG2_CHAR_MATCHES(p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(p)                                            \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(p)                                          \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])

888 #ifdef XML_MIN_SIZE
889 
890 static int PTRFASTCALL
big2_byteType(const ENCODING * enc,const char * p)891 big2_byteType(const ENCODING *enc, const char *p) {
892   return BIG2_BYTE_TYPE(enc, p);
893 }
894 
895 static int PTRFASTCALL
big2_byteToAscii(const ENCODING * enc,const char * p)896 big2_byteToAscii(const ENCODING *enc, const char *p) {
897   UNUSED_P(enc);
898   return BIG2_BYTE_TO_ASCII(p);
899 }
900 
901 static int PTRCALL
big2_charMatches(const ENCODING * enc,const char * p,int c)902 big2_charMatches(const ENCODING *enc, const char *p, int c) {
903   UNUSED_P(enc);
904   return BIG2_CHAR_MATCHES(p, c);
905 }
906 
907 static int PTRFASTCALL
big2_isNameMin(const ENCODING * enc,const char * p)908 big2_isNameMin(const ENCODING *enc, const char *p) {
909   UNUSED_P(enc);
910   return BIG2_IS_NAME_CHAR_MINBPC(p);
911 }
912 
913 static int PTRFASTCALL
big2_isNmstrtMin(const ENCODING * enc,const char * p)914 big2_isNmstrtMin(const ENCODING *enc, const char *p) {
915   UNUSED_P(enc);
916   return BIG2_IS_NMSTRT_CHAR_MINBPC(p);
917 }
918 
919 #  undef VTABLE
920 #  define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
921 
922 #else /* not XML_MIN_SIZE */
923 
924 #  undef PREFIX
925 #  define PREFIX(ident) big2_##ident
926 #  define MINBPC(enc) 2
927 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
928 #  define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
929 #  define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(p)
930 #  define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(p, c)
931 #  define IS_NAME_CHAR(enc, p, n) 0
932 #  define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(p)
933 #  define IS_NMSTRT_CHAR(enc, p, n) (0)
934 #  define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(p)
935 
936 #  define XML_TOK_IMPL_C
937 #  include "xmltok_impl.c"
938 #  undef XML_TOK_IMPL_C
939 
940 #  undef MINBPC
941 #  undef BYTE_TYPE
942 #  undef BYTE_TO_ASCII
943 #  undef CHAR_MATCHES
944 #  undef IS_NAME_CHAR
945 #  undef IS_NAME_CHAR_MINBPC
946 #  undef IS_NMSTRT_CHAR
947 #  undef IS_NMSTRT_CHAR_MINBPC
948 #  undef IS_INVALID_CHAR
949 
950 #endif /* not XML_MIN_SIZE */
951 
#ifdef XML_NS

/* Namespace-aware big-endian 2-byte encoding.  The last scalar in the first
   initializer group is 1 only when BYTEORDER == 4321.
   NOTE(review): the scalars presumably are minBytesPerChar, isUtf8 and
   isUtf16 in that order; confirm against the ENCODING declaration. */
static const struct normal_encoding big2_encoding_ns
    = {{VTABLE, 2, 0,
#  if BYTEORDER == 4321
        1
#  else
        0
#  endif
       },
       {
#  include "asciitab.h"
#  include "latin1tab.h"
       },
       STANDARD_VTABLE(big2_) NULL_VTABLE};

#endif

970 static const struct normal_encoding big2_encoding
971     = {{VTABLE, 2, 0,
972 #if BYTEORDER == 4321
973         1
974 #else
975         0
976 #endif
977        },
978        {
979 #define BT_COLON BT_NMSTRT
980 #include "asciitab.h"
981 #undef BT_COLON
982 #include "latin1tab.h"
983        },
984        STANDARD_VTABLE(big2_) NULL_VTABLE};
985 
986 #if BYTEORDER != 1234
987 
988 #  ifdef XML_NS
989 
990 static const struct normal_encoding internal_big2_encoding_ns
991     = {{VTABLE, 2, 0, 1},
992        {
993 #    include "iasciitab.h"
994 #    include "latin1tab.h"
995        },
996        STANDARD_VTABLE(big2_) NULL_VTABLE};
997 
998 #  endif
999 
1000 static const struct normal_encoding internal_big2_encoding
1001     = {{VTABLE, 2, 0, 1},
1002        {
1003 #  define BT_COLON BT_NMSTRT
1004 #  include "iasciitab.h"
1005 #  undef BT_COLON
1006 #  include "latin1tab.h"
1007        },
1008        STANDARD_VTABLE(big2_) NULL_VTABLE};
1009 
1010 #endif
1011 
1012 #undef PREFIX
1013 
1014 static int FASTCALL
streqci(const char * s1,const char * s2)1015 streqci(const char *s1, const char *s2) {
1016   for (;;) {
1017     char c1 = *s1++;
1018     char c2 = *s2++;
1019     if (ASCII_a <= c1 && c1 <= ASCII_z)
1020       c1 += ASCII_A - ASCII_a;
1021     if (ASCII_a <= c2 && c2 <= ASCII_z)
1022       /* The following line will never get executed.  streqci() is
1023        * only called from two places, both of which guarantee to put
1024        * upper-case strings into s2.
1025        */
1026       c2 += ASCII_A - ASCII_a; /* LCOV_EXCL_LINE */
1027     if (c1 != c2)
1028       return 0;
1029     if (! c1)
1030       break;
1031   }
1032   return 1;
1033 }
1034 
1035 static void PTRCALL
initUpdatePosition(const ENCODING * enc,const char * ptr,const char * end,POSITION * pos)1036 initUpdatePosition(const ENCODING *enc, const char *ptr, const char *end,
1037                    POSITION *pos) {
1038   UNUSED_P(enc);
1039   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
1040 }
1041 
1042 static int
toAscii(const ENCODING * enc,const char * ptr,const char * end)1043 toAscii(const ENCODING *enc, const char *ptr, const char *end) {
1044   char buf[1];
1045   char *p = buf;
1046   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
1047   if (p == buf)
1048     return -1;
1049   else
1050     return buf[0];
1051 }
1052 
1053 static int FASTCALL
isSpace(int c)1054 isSpace(int c) {
1055   switch (c) {
1056   case 0x20:
1057   case 0xD:
1058   case 0xA:
1059   case 0x9:
1060     return 1;
1061   }
1062   return 0;
1063 }
1064 
1065 /* Return 1 if there's just optional white space or there's an S
1066    followed by name=val.
1067 */
1068 static int
parsePseudoAttribute(const ENCODING * enc,const char * ptr,const char * end,const char ** namePtr,const char ** nameEndPtr,const char ** valPtr,const char ** nextTokPtr)1069 parsePseudoAttribute(const ENCODING *enc, const char *ptr, const char *end,
1070                      const char **namePtr, const char **nameEndPtr,
1071                      const char **valPtr, const char **nextTokPtr) {
1072   int c;
1073   char open;
1074   if (ptr == end) {
1075     *namePtr = NULL;
1076     return 1;
1077   }
1078   if (! isSpace(toAscii(enc, ptr, end))) {
1079     *nextTokPtr = ptr;
1080     return 0;
1081   }
1082   do {
1083     ptr += enc->minBytesPerChar;
1084   } while (isSpace(toAscii(enc, ptr, end)));
1085   if (ptr == end) {
1086     *namePtr = NULL;
1087     return 1;
1088   }
1089   *namePtr = ptr;
1090   for (;;) {
1091     c = toAscii(enc, ptr, end);
1092     if (c == -1) {
1093       *nextTokPtr = ptr;
1094       return 0;
1095     }
1096     if (c == ASCII_EQUALS) {
1097       *nameEndPtr = ptr;
1098       break;
1099     }
1100     if (isSpace(c)) {
1101       *nameEndPtr = ptr;
1102       do {
1103         ptr += enc->minBytesPerChar;
1104       } while (isSpace(c = toAscii(enc, ptr, end)));
1105       if (c != ASCII_EQUALS) {
1106         *nextTokPtr = ptr;
1107         return 0;
1108       }
1109       break;
1110     }
1111     ptr += enc->minBytesPerChar;
1112   }
1113   if (ptr == *namePtr) {
1114     *nextTokPtr = ptr;
1115     return 0;
1116   }
1117   ptr += enc->minBytesPerChar;
1118   c = toAscii(enc, ptr, end);
1119   while (isSpace(c)) {
1120     ptr += enc->minBytesPerChar;
1121     c = toAscii(enc, ptr, end);
1122   }
1123   if (c != ASCII_QUOT && c != ASCII_APOS) {
1124     *nextTokPtr = ptr;
1125     return 0;
1126   }
1127   open = (char)c;
1128   ptr += enc->minBytesPerChar;
1129   *valPtr = ptr;
1130   for (;; ptr += enc->minBytesPerChar) {
1131     c = toAscii(enc, ptr, end);
1132     if (c == open)
1133       break;
1134     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)
1135         && ! (ASCII_0 <= c && c <= ASCII_9) && c != ASCII_PERIOD
1136         && c != ASCII_MINUS && c != ASCII_UNDERSCORE) {
1137       *nextTokPtr = ptr;
1138       return 0;
1139     }
1140   }
1141   *nextTokPtr = ptr + enc->minBytesPerChar;
1142   return 1;
1143 }
1144 
1145 static const char KW_version[]
1146     = {ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'};
1147 
1148 static const char KW_encoding[] = {ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d,
1149                                    ASCII_i, ASCII_n, ASCII_g, '\0'};
1150 
1151 static const char KW_standalone[]
1152     = {ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a,
1153        ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'};
1154 
1155 static const char KW_yes[] = {ASCII_y, ASCII_e, ASCII_s, '\0'};
1156 
1157 static const char KW_no[] = {ASCII_n, ASCII_o, '\0'};
1158 
1159 static int
doParseXmlDecl(const ENCODING * (* encodingFinder)(const ENCODING *,const char *,const char *),int isGeneralTextEntity,const ENCODING * enc,const char * ptr,const char * end,const char ** badPtr,const char ** versionPtr,const char ** versionEndPtr,const char ** encodingName,const ENCODING ** encoding,int * standalone)1160 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *, const char *,
1161                                                  const char *),
1162                int isGeneralTextEntity, const ENCODING *enc, const char *ptr,
1163                const char *end, const char **badPtr, const char **versionPtr,
1164                const char **versionEndPtr, const char **encodingName,
1165                const ENCODING **encoding, int *standalone) {
1166   const char *val = NULL;
1167   const char *name = NULL;
1168   const char *nameEnd = NULL;
1169   ptr += 5 * enc->minBytesPerChar;
1170   end -= 2 * enc->minBytesPerChar;
1171   if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1172       || ! name) {
1173     *badPtr = ptr;
1174     return 0;
1175   }
1176   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1177     if (! isGeneralTextEntity) {
1178       *badPtr = name;
1179       return 0;
1180     }
1181   } else {
1182     if (versionPtr)
1183       *versionPtr = val;
1184     if (versionEndPtr)
1185       *versionEndPtr = ptr;
1186     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1187       *badPtr = ptr;
1188       return 0;
1189     }
1190     if (! name) {
1191       if (isGeneralTextEntity) {
1192         /* a TextDecl must have an EncodingDecl */
1193         *badPtr = ptr;
1194         return 0;
1195       }
1196       return 1;
1197     }
1198   }
1199   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1200     int c = toAscii(enc, val, end);
1201     if (! (ASCII_a <= c && c <= ASCII_z) && ! (ASCII_A <= c && c <= ASCII_Z)) {
1202       *badPtr = val;
1203       return 0;
1204     }
1205     if (encodingName)
1206       *encodingName = val;
1207     if (encoding)
1208       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1209     if (! parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1210       *badPtr = ptr;
1211       return 0;
1212     }
1213     if (! name)
1214       return 1;
1215   }
1216   if (! XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1217       || isGeneralTextEntity) {
1218     *badPtr = name;
1219     return 0;
1220   }
1221   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1222     if (standalone)
1223       *standalone = 1;
1224   } else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1225     if (standalone)
1226       *standalone = 0;
1227   } else {
1228     *badPtr = val;
1229     return 0;
1230   }
1231   while (isSpace(toAscii(enc, ptr, end)))
1232     ptr += enc->minBytesPerChar;
1233   if (ptr != end) {
1234     *badPtr = ptr;
1235     return 0;
1236   }
1237   return 1;
1238 }
1239 
1240 static int FASTCALL
checkCharRefNumber(int result)1241 checkCharRefNumber(int result) {
1242   switch (result >> 8) {
1243   case 0xD8:
1244   case 0xD9:
1245   case 0xDA:
1246   case 0xDB:
1247   case 0xDC:
1248   case 0xDD:
1249   case 0xDE:
1250   case 0xDF:
1251     return -1;
1252   case 0:
1253     if (latin1_encoding.type[result] == BT_NONXML)
1254       return -1;
1255     break;
1256   case 0xFF:
1257     if (result == 0xFFFE || result == 0xFFFF)
1258       return -1;
1259     break;
1260   }
1261   return result;
1262 }
1263 
1264 int FASTCALL
XmlUtf8Encode(int c,char * buf)1265 XmlUtf8Encode(int c, char *buf) {
1266   enum {
1267     /* minN is minimum legal resulting value for N byte sequence */
1268     min2 = 0x80,
1269     min3 = 0x800,
1270     min4 = 0x10000
1271   };
1272 
1273   if (c < 0)
1274     return 0; /* LCOV_EXCL_LINE: this case is always eliminated beforehand */
1275   if (c < min2) {
1276     buf[0] = (char)(c | UTF8_cval1);
1277     return 1;
1278   }
1279   if (c < min3) {
1280     buf[0] = (char)((c >> 6) | UTF8_cval2);
1281     buf[1] = (char)((c & 0x3f) | 0x80);
1282     return 2;
1283   }
1284   if (c < min4) {
1285     buf[0] = (char)((c >> 12) | UTF8_cval3);
1286     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1287     buf[2] = (char)((c & 0x3f) | 0x80);
1288     return 3;
1289   }
1290   if (c < 0x110000) {
1291     buf[0] = (char)((c >> 18) | UTF8_cval4);
1292     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1293     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1294     buf[3] = (char)((c & 0x3f) | 0x80);
1295     return 4;
1296   }
1297   return 0; /* LCOV_EXCL_LINE: this case too is eliminated before calling */
1298 }
1299 
1300 int FASTCALL
XmlUtf16Encode(int charNum,unsigned short * buf)1301 XmlUtf16Encode(int charNum, unsigned short *buf) {
1302   if (charNum < 0)
1303     return 0;
1304   if (charNum < 0x10000) {
1305     buf[0] = (unsigned short)charNum;
1306     return 1;
1307   }
1308   if (charNum < 0x110000) {
1309     charNum -= 0x10000;
1310     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
1311     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
1312     return 2;
1313   }
1314   return 0;
1315 }
1316 
1317 struct unknown_encoding {
1318   struct normal_encoding normal;
1319   CONVERTER convert;
1320   void *userData;
1321   unsigned short utf16[256];
1322   char utf8[256][4];
1323 };
1324 
1325 #define AS_UNKNOWN_ENCODING(enc) ((const struct unknown_encoding *)(enc))
1326 
1327 int
XmlSizeOfUnknownEncoding(void)1328 XmlSizeOfUnknownEncoding(void) {
1329   return sizeof(struct unknown_encoding);
1330 }
1331 
1332 static int PTRFASTCALL
unknown_isName(const ENCODING * enc,const char * p)1333 unknown_isName(const ENCODING *enc, const char *p) {
1334   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1335   int c = uenc->convert(uenc->userData, p);
1336   if (c & ~0xFFFF)
1337     return 0;
1338   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1339 }
1340 
1341 static int PTRFASTCALL
unknown_isNmstrt(const ENCODING * enc,const char * p)1342 unknown_isNmstrt(const ENCODING *enc, const char *p) {
1343   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1344   int c = uenc->convert(uenc->userData, p);
1345   if (c & ~0xFFFF)
1346     return 0;
1347   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1348 }
1349 
1350 static int PTRFASTCALL
unknown_isInvalid(const ENCODING * enc,const char * p)1351 unknown_isInvalid(const ENCODING *enc, const char *p) {
1352   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1353   int c = uenc->convert(uenc->userData, p);
1354   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1355 }
1356 
1357 static enum XML_Convert_Result PTRCALL
unknown_toUtf8(const ENCODING * enc,const char ** fromP,const char * fromLim,char ** toP,const char * toLim)1358 unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim,
1359                char **toP, const char *toLim) {
1360   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1361   char buf[XML_UTF8_ENCODE_MAX];
1362   for (;;) {
1363     const char *utf8;
1364     int n;
1365     if (*fromP == fromLim)
1366       return XML_CONVERT_COMPLETED;
1367     utf8 = uenc->utf8[(unsigned char)**fromP];
1368     n = *utf8++;
1369     if (n == 0) {
1370       int c = uenc->convert(uenc->userData, *fromP);
1371       n = XmlUtf8Encode(c, buf);
1372       if (n > toLim - *toP)
1373         return XML_CONVERT_OUTPUT_EXHAUSTED;
1374       utf8 = buf;
1375       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1376                  - (BT_LEAD2 - 2));
1377     } else {
1378       if (n > toLim - *toP)
1379         return XML_CONVERT_OUTPUT_EXHAUSTED;
1380       (*fromP)++;
1381     }
1382     memcpy(*toP, utf8, n);
1383     *toP += n;
1384   }
1385 }
1386 
1387 static enum XML_Convert_Result PTRCALL
unknown_toUtf16(const ENCODING * enc,const char ** fromP,const char * fromLim,unsigned short ** toP,const unsigned short * toLim)1388 unknown_toUtf16(const ENCODING *enc, const char **fromP, const char *fromLim,
1389                 unsigned short **toP, const unsigned short *toLim) {
1390   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1391   while (*fromP < fromLim && *toP < toLim) {
1392     unsigned short c = uenc->utf16[(unsigned char)**fromP];
1393     if (c == 0) {
1394       c = (unsigned short)uenc->convert(uenc->userData, *fromP);
1395       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1396                  - (BT_LEAD2 - 2));
1397     } else
1398       (*fromP)++;
1399     *(*toP)++ = c;
1400   }
1401 
1402   if ((*toP == toLim) && (*fromP < fromLim))
1403     return XML_CONVERT_OUTPUT_EXHAUSTED;
1404   else
1405     return XML_CONVERT_COMPLETED;
1406 }
1407 
1408 ENCODING *
XmlInitUnknownEncoding(void * mem,int * table,CONVERTER convert,void * userData)1409 XmlInitUnknownEncoding(void *mem, int *table, CONVERTER convert,
1410                        void *userData) {
1411   int i;
1412   struct unknown_encoding *e = (struct unknown_encoding *)mem;
1413   memcpy(mem, &latin1_encoding, sizeof(struct normal_encoding));
1414   for (i = 0; i < 128; i++)
1415     if (latin1_encoding.type[i] != BT_OTHER
1416         && latin1_encoding.type[i] != BT_NONXML && table[i] != i)
1417       return 0;
1418   for (i = 0; i < 256; i++) {
1419     int c = table[i];
1420     if (c == -1) {
1421       e->normal.type[i] = BT_MALFORM;
1422       /* This shouldn't really get used. */
1423       e->utf16[i] = 0xFFFF;
1424       e->utf8[i][0] = 1;
1425       e->utf8[i][1] = 0;
1426     } else if (c < 0) {
1427       if (c < -4)
1428         return 0;
1429       /* Multi-byte sequences need a converter function */
1430       if (! convert)
1431         return 0;
1432       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1433       e->utf8[i][0] = 0;
1434       e->utf16[i] = 0;
1435     } else if (c < 0x80) {
1436       if (latin1_encoding.type[c] != BT_OTHER
1437           && latin1_encoding.type[c] != BT_NONXML && c != i)
1438         return 0;
1439       e->normal.type[i] = latin1_encoding.type[c];
1440       e->utf8[i][0] = 1;
1441       e->utf8[i][1] = (char)c;
1442       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1443     } else if (checkCharRefNumber(c) < 0) {
1444       e->normal.type[i] = BT_NONXML;
1445       /* This shouldn't really get used. */
1446       e->utf16[i] = 0xFFFF;
1447       e->utf8[i][0] = 1;
1448       e->utf8[i][1] = 0;
1449     } else {
1450       if (c > 0xFFFF)
1451         return 0;
1452       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1453         e->normal.type[i] = BT_NMSTRT;
1454       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1455         e->normal.type[i] = BT_NAME;
1456       else
1457         e->normal.type[i] = BT_OTHER;
1458       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1459       e->utf16[i] = (unsigned short)c;
1460     }
1461   }
1462   e->userData = userData;
1463   e->convert = convert;
1464   if (convert) {
1465     e->normal.isName2 = unknown_isName;
1466     e->normal.isName3 = unknown_isName;
1467     e->normal.isName4 = unknown_isName;
1468     e->normal.isNmstrt2 = unknown_isNmstrt;
1469     e->normal.isNmstrt3 = unknown_isNmstrt;
1470     e->normal.isNmstrt4 = unknown_isNmstrt;
1471     e->normal.isInvalid2 = unknown_isInvalid;
1472     e->normal.isInvalid3 = unknown_isInvalid;
1473     e->normal.isInvalid4 = unknown_isInvalid;
1474   }
1475   e->normal.enc.utf8Convert = unknown_toUtf8;
1476   e->normal.enc.utf16Convert = unknown_toUtf16;
1477   return &(e->normal.enc);
1478 }
1479 
/* If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  UNKNOWN_ENC = -1, /* name given but not recognized */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC,
  UTF_8_ENC,
  UTF_16_ENC,
  UTF_16BE_ENC,
  UTF_16LE_ENC,
  /* must match encodingNames up to here */
  NO_ENC /* no encoding name given */
};

1494 static const char KW_ISO_8859_1[]
1495     = {ASCII_I, ASCII_S, ASCII_O,     ASCII_MINUS, ASCII_8, ASCII_8,
1496        ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1,     '\0'};
1497 static const char KW_US_ASCII[]
1498     = {ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S,
1499        ASCII_C, ASCII_I, ASCII_I,     '\0'};
1500 static const char KW_UTF_8[]
1501     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'};
1502 static const char KW_UTF_16[]
1503     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'};
1504 static const char KW_UTF_16BE[]
1505     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1506        ASCII_6, ASCII_B, ASCII_E, '\0'};
1507 static const char KW_UTF_16LE[]
1508     = {ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1,
1509        ASCII_6, ASCII_L, ASCII_E, '\0'};
1510 
1511 static int FASTCALL
getEncodingIndex(const char * name)1512 getEncodingIndex(const char *name) {
1513   static const char *const encodingNames[] = {
1514       KW_ISO_8859_1, KW_US_ASCII, KW_UTF_8, KW_UTF_16, KW_UTF_16BE, KW_UTF_16LE,
1515   };
1516   int i;
1517   if (name == NULL)
1518     return NO_ENC;
1519   for (i = 0; i < (int)(sizeof(encodingNames) / sizeof(encodingNames[0])); i++)
1520     if (streqci(name, encodingNames[i]))
1521       return i;
1522   return UNKNOWN_ENC;
1523 }
1524 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   The index argument of SET_INIT_ENC_INDEX is parenthesized so that the
   cast applies to the whole argument expression.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))

1532 /* This is what detects the encoding.  encodingTable maps from
1533    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1534    the external (protocol) specified encoding; state is
1535    XML_CONTENT_STATE if we're parsing an external text entity, and
1536    XML_PROLOG_STATE otherwise.
1537 */
1538 
1539 static int
initScan(const ENCODING * const * encodingTable,const INIT_ENCODING * enc,int state,const char * ptr,const char * end,const char ** nextTokPtr)1540 initScan(const ENCODING *const *encodingTable, const INIT_ENCODING *enc,
1541          int state, const char *ptr, const char *end, const char **nextTokPtr) {
1542   const ENCODING **encPtr;
1543 
1544   if (ptr >= end)
1545     return XML_TOK_NONE;
1546   encPtr = enc->encPtr;
1547   if (ptr + 1 == end) {
1548     /* only a single byte available for auto-detection */
1549 #ifndef XML_DTD /* FIXME */
1550     /* a well-formed document entity must have more than one byte */
1551     if (state != XML_CONTENT_STATE)
1552       return XML_TOK_PARTIAL;
1553 #endif
1554     /* so we're parsing an external text entity... */
1555     /* if UTF-16 was externally specified, then we need at least 2 bytes */
1556     switch (INIT_ENC_INDEX(enc)) {
1557     case UTF_16_ENC:
1558     case UTF_16LE_ENC:
1559     case UTF_16BE_ENC:
1560       return XML_TOK_PARTIAL;
1561     }
1562     switch ((unsigned char)*ptr) {
1563     case 0xFE:
1564     case 0xFF:
1565     case 0xEF: /* possibly first byte of UTF-8 BOM */
1566       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1567         break;
1568       /* fall through */
1569     case 0x00:
1570     case 0x3C:
1571       return XML_TOK_PARTIAL;
1572     }
1573   } else {
1574     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1575     case 0xFEFF:
1576       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1577         break;
1578       *nextTokPtr = ptr + 2;
1579       *encPtr = encodingTable[UTF_16BE_ENC];
1580       return XML_TOK_BOM;
1581     /* 00 3C is handled in the default case */
1582     case 0x3C00:
1583       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1584            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1585           && state == XML_CONTENT_STATE)
1586         break;
1587       *encPtr = encodingTable[UTF_16LE_ENC];
1588       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1589     case 0xFFFE:
1590       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC && state == XML_CONTENT_STATE)
1591         break;
1592       *nextTokPtr = ptr + 2;
1593       *encPtr = encodingTable[UTF_16LE_ENC];
1594       return XML_TOK_BOM;
1595     case 0xEFBB:
1596       /* Maybe a UTF-8 BOM (EF BB BF) */
1597       /* If there's an explicitly specified (external) encoding
1598          of ISO-8859-1 or some flavour of UTF-16
1599          and this is an external text entity,
1600          don't look for the BOM,
1601          because it might be a legal data.
1602       */
1603       if (state == XML_CONTENT_STATE) {
1604         int e = INIT_ENC_INDEX(enc);
1605         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC
1606             || e == UTF_16_ENC)
1607           break;
1608       }
1609       if (ptr + 2 == end)
1610         return XML_TOK_PARTIAL;
1611       if ((unsigned char)ptr[2] == 0xBF) {
1612         *nextTokPtr = ptr + 3;
1613         *encPtr = encodingTable[UTF_8_ENC];
1614         return XML_TOK_BOM;
1615       }
1616       break;
1617     default:
1618       if (ptr[0] == '\0') {
1619         /* 0 isn't a legal data character. Furthermore a document
1620            entity can only start with ASCII characters.  So the only
1621            way this can fail to be big-endian UTF-16 if it it's an
1622            external parsed general entity that's labelled as
1623            UTF-16LE.
1624         */
1625         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1626           break;
1627         *encPtr = encodingTable[UTF_16BE_ENC];
1628         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1629       } else if (ptr[1] == '\0') {
1630         /* We could recover here in the case:
1631             - parsing an external entity
1632             - second byte is 0
1633             - no externally specified encoding
1634             - no encoding declaration
1635            by assuming UTF-16LE.  But we don't, because this would mean when
1636            presented just with a single byte, we couldn't reliably determine
1637            whether we needed further bytes.
1638         */
1639         if (state == XML_CONTENT_STATE)
1640           break;
1641         *encPtr = encodingTable[UTF_16LE_ENC];
1642         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1643       }
1644       break;
1645     }
1646   }
1647   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1648   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1649 }
1650 
1651 #define NS(x) x
1652 #define ns(x) x
1653 #define XML_TOK_NS_C
1654 #include "xmltok_ns.c"
1655 #undef XML_TOK_NS_C
1656 #undef NS
1657 #undef ns
1658 
#ifdef XML_NS

/* Instantiate xmltok_ns.c a second time, with NS() appending "NS" and ns()
   appending "_ns" to each name. */
#  define NS(x) x##NS
#  define ns(x) x##_ns

#  define XML_TOK_NS_C
#  include "xmltok_ns.c"
#  undef XML_TOK_NS_C

#  undef NS
#  undef ns

/* Namespace-aware variant of XmlInitUnknownEncoding(): same arguments and
   result, but on success the byte type of ':' is set to BT_COLON.  Returns
   whatever XmlInitUnknownEncoding() returned, including its failure
   value. */
ENCODING *
XmlInitUnknownEncodingNS(void *mem, int *table, CONVERTER convert,
                         void *userData) {
  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
  if (enc)
    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
  return enc;
}

#endif /* XML_NS */