1 /* This file is included!
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
12
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
19 following conditions:
20
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32
33 #ifdef XML_TOK_IMPL_C
34
/* Fallback used when the including encoding supplies no multi-byte
   validity check: every n-byte sequence is then treated as valid. */
#if !defined(IS_INVALID_CHAR)
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
38
/* Emits the `case BT_LEAD<n>:` arm of a switch on BYTE_TYPE(enc, ptr) for
   an n-byte character.  Relies on `enc` and `end` being in scope at the
   expansion site.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - IS_INVALID_CHAR rejects the sequence: stores ptr in *nextTokPtr and
     returns XML_TOK_INVALID;
   - otherwise: advances ptr by n bytes and leaves the switch. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                  \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (! IS_INVALID_CHAR(enc, ptr, n)) {                                      \
      ptr += n;                                                                \
      break;                                                                   \
    }                                                                          \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
49
/* Switch arms shared by scanners that only need to reject malformed input:
   the three multi-byte lead types are handled by INVALID_LEAD_CASE, and the
   BT_NONXML / BT_MALFORM / BT_TRAIL byte types store ptr in *nextTokPtr and
   return XML_TOK_INVALID. */
#define INVALID_CASES(ptr, nextTokPtr)                                         \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                        \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                        \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                        \
  case BT_TRAIL:                                                               \
  case BT_MALFORM:                                                             \
  case BT_NONXML:                                                              \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
59
/* Emits the `case BT_LEAD<n>:` arm for an n-byte character that must be a
   name character.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - the sequence is malformed (IS_INVALID_CHAR) or is not a name character
     (IS_NAME_CHAR): stores ptr in *nextTokPtr and returns XML_TOK_INVALID;
   - otherwise: advances ptr by n bytes and leaves the switch.
   The IS_INVALID_CHAR test comes first so that IS_NAME_CHAR is never the
   only check applied to a malformed multi-byte sequence. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                          \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
70
/* Switch arms that consume exactly one name character.
   Multi-byte lead types are delegated to CHECK_NAME_CASE.  BT_NONASCII is
   accepted only when IS_NAME_CHAR_MINBPC approves it (otherwise ptr is
   stored in *nextTokPtr and XML_TOK_INVALID is returned) and then shares the
   advance with the single-unit name byte types: ptr moves forward by
   MINBPC(enc) and the switch is left. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                            \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                                \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                                \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)                                \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_MINUS:                                                               \
  case BT_NAME:                                                                \
  case BT_DIGIT:                                                               \
  case BT_HEX:                                                                 \
  case BT_NMSTRT:                                                              \
    ptr += MINBPC(enc);                                                        \
    break;
88
/* Emits the `case BT_LEAD<n>:` arm for an n-byte character that must be a
   name-start character.
   - fewer than n bytes left: returns XML_TOK_PARTIAL_CHAR;
   - the sequence is malformed (IS_INVALID_CHAR) or is not a name-start
     character (IS_NMSTRT_CHAR): stores ptr in *nextTokPtr and returns
     XML_TOK_INVALID;
   - otherwise: advances ptr by n bytes and leaves the switch.
   The IS_INVALID_CHAR test comes first so that IS_NMSTRT_CHAR is never the
   only check applied to a malformed multi-byte sequence. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;
99
/* Switch arms that consume exactly one name-start character.
   Multi-byte lead types are delegated to CHECK_NMSTRT_CASE.  BT_NONASCII is
   accepted only when IS_NMSTRT_CHAR_MINBPC approves it (otherwise ptr is
   stored in *nextTokPtr and XML_TOK_INVALID is returned) and then shares the
   advance with BT_NMSTRT / BT_HEX: ptr moves forward by MINBPC(enc) and the
   switch is left. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                          \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                              \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                              \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)                              \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_HEX:                                                                 \
  case BT_NMSTRT:                                                              \
    ptr += MINBPC(enc);                                                        \
    break;
114
/* Identity fallback: when the including file defines no PREFIX, function
   names are used as written. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
118
119
/* True when at least `count` minimum-width characters (count * MINBPC(enc)
   bytes) lie between ptr and end.  Every argument is parenthesized so that
   expression arguments such as `1 + 1` are multiplied as a whole. */
#define HAS_CHARS(enc, ptr, end, count)                                        \
  ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one minimum-width character is available. */
#define HAS_CHAR(enc, ptr, end)                                                \
  HAS_CHARS(enc, ptr, end, 1)

/* Makes the enclosing function return XML_TOK_PARTIAL unless `count`
   minimum-width characters are available.  Expands to a braced block, so it
   must not be used as the sole body of an `if` that has an `else`. */
#define REQUIRE_CHARS(enc, ptr, end, count)                                    \
  {                                                                            \
    if (! HAS_CHARS(enc, ptr, end, count)) {                                   \
      return XML_TOK_PARTIAL;                                                  \
    }                                                                          \
  }

/* Single-character form of REQUIRE_CHARS. */
#define REQUIRE_CHAR(enc, ptr, end)                                            \
  REQUIRE_CHARS(enc, ptr, end, 1)
135
136
137 /* ptr points to character following "<!-" */
138
139 static int PTRCALL
PREFIX(scanComment)140 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
141 const char *end, const char **nextTokPtr)
142 {
143 if (HAS_CHAR(enc, ptr, end)) {
144 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
145 *nextTokPtr = ptr;
146 return XML_TOK_INVALID;
147 }
148 ptr += MINBPC(enc);
149 while (HAS_CHAR(enc, ptr, end)) {
150 switch (BYTE_TYPE(enc, ptr)) {
151 INVALID_CASES(ptr, nextTokPtr)
152 case BT_MINUS:
153 ptr += MINBPC(enc);
154 REQUIRE_CHAR(enc, ptr, end);
155 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
156 ptr += MINBPC(enc);
157 REQUIRE_CHAR(enc, ptr, end);
158 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
159 *nextTokPtr = ptr;
160 return XML_TOK_INVALID;
161 }
162 *nextTokPtr = ptr + MINBPC(enc);
163 return XML_TOK_COMMENT;
164 }
165 break;
166 default:
167 ptr += MINBPC(enc);
168 break;
169 }
170 }
171 }
172 return XML_TOK_PARTIAL;
173 }
174
175 /* ptr points to character following "<!" */
176
177 static int PTRCALL
PREFIX(scanDecl)178 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
179 const char *end, const char **nextTokPtr)
180 {
181 REQUIRE_CHAR(enc, ptr, end);
182 switch (BYTE_TYPE(enc, ptr)) {
183 case BT_MINUS:
184 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
185 case BT_LSQB:
186 *nextTokPtr = ptr + MINBPC(enc);
187 return XML_TOK_COND_SECT_OPEN;
188 case BT_NMSTRT:
189 case BT_HEX:
190 ptr += MINBPC(enc);
191 break;
192 default:
193 *nextTokPtr = ptr;
194 return XML_TOK_INVALID;
195 }
196 while (HAS_CHAR(enc, ptr, end)) {
197 switch (BYTE_TYPE(enc, ptr)) {
198 case BT_PERCNT:
199 REQUIRE_CHARS(enc, ptr, end, 2);
200 /* don't allow <!ENTITY% foo "whatever"> */
201 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
202 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
203 *nextTokPtr = ptr;
204 return XML_TOK_INVALID;
205 }
206 /* fall through */
207 case BT_S: case BT_CR: case BT_LF:
208 *nextTokPtr = ptr;
209 return XML_TOK_DECL_OPEN;
210 case BT_NMSTRT:
211 case BT_HEX:
212 ptr += MINBPC(enc);
213 break;
214 default:
215 *nextTokPtr = ptr;
216 return XML_TOK_INVALID;
217 }
218 }
219 return XML_TOK_PARTIAL;
220 }
221
222 static int PTRCALL
PREFIX(checkPiTarget)223 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
224 const char *end, int *tokPtr)
225 {
226 int upper = 0;
227 *tokPtr = XML_TOK_PI;
228 if (end - ptr != MINBPC(enc)*3)
229 return 1;
230 switch (BYTE_TO_ASCII(enc, ptr)) {
231 case ASCII_x:
232 break;
233 case ASCII_X:
234 upper = 1;
235 break;
236 default:
237 return 1;
238 }
239 ptr += MINBPC(enc);
240 switch (BYTE_TO_ASCII(enc, ptr)) {
241 case ASCII_m:
242 break;
243 case ASCII_M:
244 upper = 1;
245 break;
246 default:
247 return 1;
248 }
249 ptr += MINBPC(enc);
250 switch (BYTE_TO_ASCII(enc, ptr)) {
251 case ASCII_l:
252 break;
253 case ASCII_L:
254 upper = 1;
255 break;
256 default:
257 return 1;
258 }
259 if (upper)
260 return 0;
261 *tokPtr = XML_TOK_XML_DECL;
262 return 1;
263 }
264
265 /* ptr points to character following "<?" */
266
267 static int PTRCALL
PREFIX(scanPi)268 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
269 const char *end, const char **nextTokPtr)
270 {
271 int tok;
272 const char *target = ptr;
273 REQUIRE_CHAR(enc, ptr, end);
274 switch (BYTE_TYPE(enc, ptr)) {
275 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
276 default:
277 *nextTokPtr = ptr;
278 return XML_TOK_INVALID;
279 }
280 while (HAS_CHAR(enc, ptr, end)) {
281 switch (BYTE_TYPE(enc, ptr)) {
282 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
283 case BT_S: case BT_CR: case BT_LF:
284 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
285 *nextTokPtr = ptr;
286 return XML_TOK_INVALID;
287 }
288 ptr += MINBPC(enc);
289 while (HAS_CHAR(enc, ptr, end)) {
290 switch (BYTE_TYPE(enc, ptr)) {
291 INVALID_CASES(ptr, nextTokPtr)
292 case BT_QUEST:
293 ptr += MINBPC(enc);
294 REQUIRE_CHAR(enc, ptr, end);
295 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
296 *nextTokPtr = ptr + MINBPC(enc);
297 return tok;
298 }
299 break;
300 default:
301 ptr += MINBPC(enc);
302 break;
303 }
304 }
305 return XML_TOK_PARTIAL;
306 case BT_QUEST:
307 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
308 *nextTokPtr = ptr;
309 return XML_TOK_INVALID;
310 }
311 ptr += MINBPC(enc);
312 REQUIRE_CHAR(enc, ptr, end);
313 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
314 *nextTokPtr = ptr + MINBPC(enc);
315 return tok;
316 }
317 /* fall through */
318 default:
319 *nextTokPtr = ptr;
320 return XML_TOK_INVALID;
321 }
322 }
323 return XML_TOK_PARTIAL;
324 }
325
326 static int PTRCALL
PREFIX(scanCdataSection)327 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
328 const char *end, const char **nextTokPtr)
329 {
330 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
331 ASCII_T, ASCII_A, ASCII_LSQB };
332 int i;
333 /* CDATA[ */
334 REQUIRE_CHARS(enc, ptr, end, 6);
335 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
336 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
337 *nextTokPtr = ptr;
338 return XML_TOK_INVALID;
339 }
340 }
341 *nextTokPtr = ptr;
342 return XML_TOK_CDATA_SECT_OPEN;
343 }
344
345 static int PTRCALL
PREFIX(cdataSectionTok)346 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
347 const char *end, const char **nextTokPtr)
348 {
349 if (ptr >= end)
350 return XML_TOK_NONE;
351 if (MINBPC(enc) > 1) {
352 size_t n = end - ptr;
353 if (n & (MINBPC(enc) - 1)) {
354 n &= ~(MINBPC(enc) - 1);
355 if (n == 0)
356 return XML_TOK_PARTIAL;
357 end = ptr + n;
358 }
359 }
360 switch (BYTE_TYPE(enc, ptr)) {
361 case BT_RSQB:
362 ptr += MINBPC(enc);
363 REQUIRE_CHAR(enc, ptr, end);
364 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
365 break;
366 ptr += MINBPC(enc);
367 REQUIRE_CHAR(enc, ptr, end);
368 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
369 ptr -= MINBPC(enc);
370 break;
371 }
372 *nextTokPtr = ptr + MINBPC(enc);
373 return XML_TOK_CDATA_SECT_CLOSE;
374 case BT_CR:
375 ptr += MINBPC(enc);
376 REQUIRE_CHAR(enc, ptr, end);
377 if (BYTE_TYPE(enc, ptr) == BT_LF)
378 ptr += MINBPC(enc);
379 *nextTokPtr = ptr;
380 return XML_TOK_DATA_NEWLINE;
381 case BT_LF:
382 *nextTokPtr = ptr + MINBPC(enc);
383 return XML_TOK_DATA_NEWLINE;
384 INVALID_CASES(ptr, nextTokPtr)
385 default:
386 ptr += MINBPC(enc);
387 break;
388 }
389 while (HAS_CHAR(enc, ptr, end)) {
390 switch (BYTE_TYPE(enc, ptr)) {
391 #define LEAD_CASE(n) \
392 case BT_LEAD ## n: \
393 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
394 *nextTokPtr = ptr; \
395 return XML_TOK_DATA_CHARS; \
396 } \
397 ptr += n; \
398 break;
399 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
400 #undef LEAD_CASE
401 case BT_NONXML:
402 case BT_MALFORM:
403 case BT_TRAIL:
404 case BT_CR:
405 case BT_LF:
406 case BT_RSQB:
407 *nextTokPtr = ptr;
408 return XML_TOK_DATA_CHARS;
409 default:
410 ptr += MINBPC(enc);
411 break;
412 }
413 }
414 *nextTokPtr = ptr;
415 return XML_TOK_DATA_CHARS;
416 }
417
418 /* ptr points to character following "</" */
419
420 static int PTRCALL
PREFIX(scanEndTag)421 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
422 const char *end, const char **nextTokPtr)
423 {
424 REQUIRE_CHAR(enc, ptr, end);
425 switch (BYTE_TYPE(enc, ptr)) {
426 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
427 default:
428 *nextTokPtr = ptr;
429 return XML_TOK_INVALID;
430 }
431 while (HAS_CHAR(enc, ptr, end)) {
432 switch (BYTE_TYPE(enc, ptr)) {
433 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
434 case BT_S: case BT_CR: case BT_LF:
435 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
436 switch (BYTE_TYPE(enc, ptr)) {
437 case BT_S: case BT_CR: case BT_LF:
438 break;
439 case BT_GT:
440 *nextTokPtr = ptr + MINBPC(enc);
441 return XML_TOK_END_TAG;
442 default:
443 *nextTokPtr = ptr;
444 return XML_TOK_INVALID;
445 }
446 }
447 return XML_TOK_PARTIAL;
448 #ifdef XML_NS
449 case BT_COLON:
450 /* no need to check qname syntax here,
451 since end-tag must match exactly */
452 ptr += MINBPC(enc);
453 break;
454 #endif
455 case BT_GT:
456 *nextTokPtr = ptr + MINBPC(enc);
457 return XML_TOK_END_TAG;
458 default:
459 *nextTokPtr = ptr;
460 return XML_TOK_INVALID;
461 }
462 }
463 return XML_TOK_PARTIAL;
464 }
465
466 /* ptr points to character following "&#X" */
467
468 static int PTRCALL
PREFIX(scanHexCharRef)469 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
470 const char *end, const char **nextTokPtr)
471 {
472 if (HAS_CHAR(enc, ptr, end)) {
473 switch (BYTE_TYPE(enc, ptr)) {
474 case BT_DIGIT:
475 case BT_HEX:
476 break;
477 default:
478 *nextTokPtr = ptr;
479 return XML_TOK_INVALID;
480 }
481 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
482 switch (BYTE_TYPE(enc, ptr)) {
483 case BT_DIGIT:
484 case BT_HEX:
485 break;
486 case BT_SEMI:
487 *nextTokPtr = ptr + MINBPC(enc);
488 return XML_TOK_CHAR_REF;
489 default:
490 *nextTokPtr = ptr;
491 return XML_TOK_INVALID;
492 }
493 }
494 }
495 return XML_TOK_PARTIAL;
496 }
497
498 /* ptr points to character following "&#" */
499
500 static int PTRCALL
PREFIX(scanCharRef)501 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
502 const char *end, const char **nextTokPtr)
503 {
504 if (HAS_CHAR(enc, ptr, end)) {
505 if (CHAR_MATCHES(enc, ptr, ASCII_x))
506 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
507 switch (BYTE_TYPE(enc, ptr)) {
508 case BT_DIGIT:
509 break;
510 default:
511 *nextTokPtr = ptr;
512 return XML_TOK_INVALID;
513 }
514 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
515 switch (BYTE_TYPE(enc, ptr)) {
516 case BT_DIGIT:
517 break;
518 case BT_SEMI:
519 *nextTokPtr = ptr + MINBPC(enc);
520 return XML_TOK_CHAR_REF;
521 default:
522 *nextTokPtr = ptr;
523 return XML_TOK_INVALID;
524 }
525 }
526 }
527 return XML_TOK_PARTIAL;
528 }
529
530 /* ptr points to character following "&" */
531
532 static int PTRCALL
PREFIX(scanRef)533 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
534 const char **nextTokPtr)
535 {
536 REQUIRE_CHAR(enc, ptr, end);
537 switch (BYTE_TYPE(enc, ptr)) {
538 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
539 case BT_NUM:
540 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
541 default:
542 *nextTokPtr = ptr;
543 return XML_TOK_INVALID;
544 }
545 while (HAS_CHAR(enc, ptr, end)) {
546 switch (BYTE_TYPE(enc, ptr)) {
547 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
548 case BT_SEMI:
549 *nextTokPtr = ptr + MINBPC(enc);
550 return XML_TOK_ENTITY_REF;
551 default:
552 *nextTokPtr = ptr;
553 return XML_TOK_INVALID;
554 }
555 }
556 return XML_TOK_PARTIAL;
557 }
558
559 /* ptr points to character following first character of attribute name */
560
561 static int PTRCALL
PREFIX(scanAtts)562 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
563 const char **nextTokPtr)
564 {
565 #ifdef XML_NS
566 int hadColon = 0;
567 #endif
568 while (HAS_CHAR(enc, ptr, end)) {
569 switch (BYTE_TYPE(enc, ptr)) {
570 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
571 #ifdef XML_NS
572 case BT_COLON:
573 if (hadColon) {
574 *nextTokPtr = ptr;
575 return XML_TOK_INVALID;
576 }
577 hadColon = 1;
578 ptr += MINBPC(enc);
579 REQUIRE_CHAR(enc, ptr, end);
580 switch (BYTE_TYPE(enc, ptr)) {
581 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
582 default:
583 *nextTokPtr = ptr;
584 return XML_TOK_INVALID;
585 }
586 break;
587 #endif
588 case BT_S: case BT_CR: case BT_LF:
589 for (;;) {
590 int t;
591
592 ptr += MINBPC(enc);
593 REQUIRE_CHAR(enc, ptr, end);
594 t = BYTE_TYPE(enc, ptr);
595 if (t == BT_EQUALS)
596 break;
597 switch (t) {
598 case BT_S:
599 case BT_LF:
600 case BT_CR:
601 break;
602 default:
603 *nextTokPtr = ptr;
604 return XML_TOK_INVALID;
605 }
606 }
607 /* fall through */
608 case BT_EQUALS:
609 {
610 int open;
611 #ifdef XML_NS
612 hadColon = 0;
613 #endif
614 for (;;) {
615 ptr += MINBPC(enc);
616 REQUIRE_CHAR(enc, ptr, end);
617 open = BYTE_TYPE(enc, ptr);
618 if (open == BT_QUOT || open == BT_APOS)
619 break;
620 switch (open) {
621 case BT_S:
622 case BT_LF:
623 case BT_CR:
624 break;
625 default:
626 *nextTokPtr = ptr;
627 return XML_TOK_INVALID;
628 }
629 }
630 ptr += MINBPC(enc);
631 /* in attribute value */
632 for (;;) {
633 int t;
634 REQUIRE_CHAR(enc, ptr, end);
635 t = BYTE_TYPE(enc, ptr);
636 if (t == open)
637 break;
638 switch (t) {
639 INVALID_CASES(ptr, nextTokPtr)
640 case BT_AMP:
641 {
642 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
643 if (tok <= 0) {
644 if (tok == XML_TOK_INVALID)
645 *nextTokPtr = ptr;
646 return tok;
647 }
648 break;
649 }
650 case BT_LT:
651 *nextTokPtr = ptr;
652 return XML_TOK_INVALID;
653 default:
654 ptr += MINBPC(enc);
655 break;
656 }
657 }
658 ptr += MINBPC(enc);
659 REQUIRE_CHAR(enc, ptr, end);
660 switch (BYTE_TYPE(enc, ptr)) {
661 case BT_S:
662 case BT_CR:
663 case BT_LF:
664 break;
665 case BT_SOL:
666 goto sol;
667 case BT_GT:
668 goto gt;
669 default:
670 *nextTokPtr = ptr;
671 return XML_TOK_INVALID;
672 }
673 /* ptr points to closing quote */
674 for (;;) {
675 ptr += MINBPC(enc);
676 REQUIRE_CHAR(enc, ptr, end);
677 switch (BYTE_TYPE(enc, ptr)) {
678 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
679 case BT_S: case BT_CR: case BT_LF:
680 continue;
681 case BT_GT:
682 gt:
683 *nextTokPtr = ptr + MINBPC(enc);
684 return XML_TOK_START_TAG_WITH_ATTS;
685 case BT_SOL:
686 sol:
687 ptr += MINBPC(enc);
688 REQUIRE_CHAR(enc, ptr, end);
689 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
690 *nextTokPtr = ptr;
691 return XML_TOK_INVALID;
692 }
693 *nextTokPtr = ptr + MINBPC(enc);
694 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
695 default:
696 *nextTokPtr = ptr;
697 return XML_TOK_INVALID;
698 }
699 break;
700 }
701 break;
702 }
703 default:
704 *nextTokPtr = ptr;
705 return XML_TOK_INVALID;
706 }
707 }
708 return XML_TOK_PARTIAL;
709 }
710
711 /* ptr points to character following "<" */
712
713 static int PTRCALL
PREFIX(scanLt)714 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
715 const char **nextTokPtr)
716 {
717 #ifdef XML_NS
718 int hadColon;
719 #endif
720 REQUIRE_CHAR(enc, ptr, end);
721 switch (BYTE_TYPE(enc, ptr)) {
722 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
723 case BT_EXCL:
724 ptr += MINBPC(enc);
725 REQUIRE_CHAR(enc, ptr, end);
726 switch (BYTE_TYPE(enc, ptr)) {
727 case BT_MINUS:
728 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
729 case BT_LSQB:
730 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
731 end, nextTokPtr);
732 }
733 *nextTokPtr = ptr;
734 return XML_TOK_INVALID;
735 case BT_QUEST:
736 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
737 case BT_SOL:
738 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
739 default:
740 *nextTokPtr = ptr;
741 return XML_TOK_INVALID;
742 }
743 #ifdef XML_NS
744 hadColon = 0;
745 #endif
746 /* we have a start-tag */
747 while (HAS_CHAR(enc, ptr, end)) {
748 switch (BYTE_TYPE(enc, ptr)) {
749 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
750 #ifdef XML_NS
751 case BT_COLON:
752 if (hadColon) {
753 *nextTokPtr = ptr;
754 return XML_TOK_INVALID;
755 }
756 hadColon = 1;
757 ptr += MINBPC(enc);
758 REQUIRE_CHAR(enc, ptr, end);
759 switch (BYTE_TYPE(enc, ptr)) {
760 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
761 default:
762 *nextTokPtr = ptr;
763 return XML_TOK_INVALID;
764 }
765 break;
766 #endif
767 case BT_S: case BT_CR: case BT_LF:
768 {
769 ptr += MINBPC(enc);
770 while (HAS_CHAR(enc, ptr, end)) {
771 switch (BYTE_TYPE(enc, ptr)) {
772 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
773 case BT_GT:
774 goto gt;
775 case BT_SOL:
776 goto sol;
777 case BT_S: case BT_CR: case BT_LF:
778 ptr += MINBPC(enc);
779 continue;
780 default:
781 *nextTokPtr = ptr;
782 return XML_TOK_INVALID;
783 }
784 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
785 }
786 return XML_TOK_PARTIAL;
787 }
788 case BT_GT:
789 gt:
790 *nextTokPtr = ptr + MINBPC(enc);
791 return XML_TOK_START_TAG_NO_ATTS;
792 case BT_SOL:
793 sol:
794 ptr += MINBPC(enc);
795 REQUIRE_CHAR(enc, ptr, end);
796 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
797 *nextTokPtr = ptr;
798 return XML_TOK_INVALID;
799 }
800 *nextTokPtr = ptr + MINBPC(enc);
801 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
802 default:
803 *nextTokPtr = ptr;
804 return XML_TOK_INVALID;
805 }
806 }
807 return XML_TOK_PARTIAL;
808 }
809
810 static int PTRCALL
PREFIX(contentTok)811 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
812 const char **nextTokPtr)
813 {
814 if (ptr >= end)
815 return XML_TOK_NONE;
816 if (MINBPC(enc) > 1) {
817 size_t n = end - ptr;
818 if (n & (MINBPC(enc) - 1)) {
819 n &= ~(MINBPC(enc) - 1);
820 if (n == 0)
821 return XML_TOK_PARTIAL;
822 end = ptr + n;
823 }
824 }
825 switch (BYTE_TYPE(enc, ptr)) {
826 case BT_LT:
827 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
828 case BT_AMP:
829 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
830 case BT_CR:
831 ptr += MINBPC(enc);
832 if (! HAS_CHAR(enc, ptr, end))
833 return XML_TOK_TRAILING_CR;
834 if (BYTE_TYPE(enc, ptr) == BT_LF)
835 ptr += MINBPC(enc);
836 *nextTokPtr = ptr;
837 return XML_TOK_DATA_NEWLINE;
838 case BT_LF:
839 *nextTokPtr = ptr + MINBPC(enc);
840 return XML_TOK_DATA_NEWLINE;
841 case BT_RSQB:
842 ptr += MINBPC(enc);
843 if (! HAS_CHAR(enc, ptr, end))
844 return XML_TOK_TRAILING_RSQB;
845 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
846 break;
847 ptr += MINBPC(enc);
848 if (! HAS_CHAR(enc, ptr, end))
849 return XML_TOK_TRAILING_RSQB;
850 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
851 ptr -= MINBPC(enc);
852 break;
853 }
854 *nextTokPtr = ptr;
855 return XML_TOK_INVALID;
856 INVALID_CASES(ptr, nextTokPtr)
857 default:
858 ptr += MINBPC(enc);
859 break;
860 }
861 while (HAS_CHAR(enc, ptr, end)) {
862 switch (BYTE_TYPE(enc, ptr)) {
863 #define LEAD_CASE(n) \
864 case BT_LEAD ## n: \
865 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
866 *nextTokPtr = ptr; \
867 return XML_TOK_DATA_CHARS; \
868 } \
869 ptr += n; \
870 break;
871 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
872 #undef LEAD_CASE
873 case BT_RSQB:
874 if (HAS_CHARS(enc, ptr, end, 2)) {
875 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
876 ptr += MINBPC(enc);
877 break;
878 }
879 if (HAS_CHARS(enc, ptr, end, 3)) {
880 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
881 ptr += MINBPC(enc);
882 break;
883 }
884 *nextTokPtr = ptr + 2*MINBPC(enc);
885 return XML_TOK_INVALID;
886 }
887 }
888 /* fall through */
889 case BT_AMP:
890 case BT_LT:
891 case BT_NONXML:
892 case BT_MALFORM:
893 case BT_TRAIL:
894 case BT_CR:
895 case BT_LF:
896 *nextTokPtr = ptr;
897 return XML_TOK_DATA_CHARS;
898 default:
899 ptr += MINBPC(enc);
900 break;
901 }
902 }
903 *nextTokPtr = ptr;
904 return XML_TOK_DATA_CHARS;
905 }
906
907 /* ptr points to character following "%" */
908
909 static int PTRCALL
PREFIX(scanPercent)910 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
911 const char **nextTokPtr)
912 {
913 REQUIRE_CHAR(enc, ptr, end);
914 switch (BYTE_TYPE(enc, ptr)) {
915 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
916 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
917 *nextTokPtr = ptr;
918 return XML_TOK_PERCENT;
919 default:
920 *nextTokPtr = ptr;
921 return XML_TOK_INVALID;
922 }
923 while (HAS_CHAR(enc, ptr, end)) {
924 switch (BYTE_TYPE(enc, ptr)) {
925 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
926 case BT_SEMI:
927 *nextTokPtr = ptr + MINBPC(enc);
928 return XML_TOK_PARAM_ENTITY_REF;
929 default:
930 *nextTokPtr = ptr;
931 return XML_TOK_INVALID;
932 }
933 }
934 return XML_TOK_PARTIAL;
935 }
936
937 static int PTRCALL
PREFIX(scanPoundName)938 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
939 const char **nextTokPtr)
940 {
941 REQUIRE_CHAR(enc, ptr, end);
942 switch (BYTE_TYPE(enc, ptr)) {
943 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
944 default:
945 *nextTokPtr = ptr;
946 return XML_TOK_INVALID;
947 }
948 while (HAS_CHAR(enc, ptr, end)) {
949 switch (BYTE_TYPE(enc, ptr)) {
950 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
951 case BT_CR: case BT_LF: case BT_S:
952 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
953 *nextTokPtr = ptr;
954 return XML_TOK_POUND_NAME;
955 default:
956 *nextTokPtr = ptr;
957 return XML_TOK_INVALID;
958 }
959 }
960 return -XML_TOK_POUND_NAME;
961 }
962
963 static int PTRCALL
PREFIX(scanLit)964 PREFIX(scanLit)(int open, const ENCODING *enc,
965 const char *ptr, const char *end,
966 const char **nextTokPtr)
967 {
968 while (HAS_CHAR(enc, ptr, end)) {
969 int t = BYTE_TYPE(enc, ptr);
970 switch (t) {
971 INVALID_CASES(ptr, nextTokPtr)
972 case BT_QUOT:
973 case BT_APOS:
974 ptr += MINBPC(enc);
975 if (t != open)
976 break;
977 if (! HAS_CHAR(enc, ptr, end))
978 return -XML_TOK_LITERAL;
979 *nextTokPtr = ptr;
980 switch (BYTE_TYPE(enc, ptr)) {
981 case BT_S: case BT_CR: case BT_LF:
982 case BT_GT: case BT_PERCNT: case BT_LSQB:
983 return XML_TOK_LITERAL;
984 default:
985 return XML_TOK_INVALID;
986 }
987 default:
988 ptr += MINBPC(enc);
989 break;
990 }
991 }
992 return XML_TOK_PARTIAL;
993 }
994
995 static int PTRCALL
PREFIX(prologTok)996 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
997 const char **nextTokPtr)
998 {
999 int tok;
1000 if (ptr >= end)
1001 return XML_TOK_NONE;
1002 if (MINBPC(enc) > 1) {
1003 size_t n = end - ptr;
1004 if (n & (MINBPC(enc) - 1)) {
1005 n &= ~(MINBPC(enc) - 1);
1006 if (n == 0)
1007 return XML_TOK_PARTIAL;
1008 end = ptr + n;
1009 }
1010 }
1011 switch (BYTE_TYPE(enc, ptr)) {
1012 case BT_QUOT:
1013 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1014 case BT_APOS:
1015 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1016 case BT_LT:
1017 {
1018 ptr += MINBPC(enc);
1019 REQUIRE_CHAR(enc, ptr, end);
1020 switch (BYTE_TYPE(enc, ptr)) {
1021 case BT_EXCL:
1022 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1023 case BT_QUEST:
1024 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1025 case BT_NMSTRT:
1026 case BT_HEX:
1027 case BT_NONASCII:
1028 case BT_LEAD2:
1029 case BT_LEAD3:
1030 case BT_LEAD4:
1031 *nextTokPtr = ptr - MINBPC(enc);
1032 return XML_TOK_INSTANCE_START;
1033 }
1034 *nextTokPtr = ptr;
1035 return XML_TOK_INVALID;
1036 }
1037 case BT_CR:
1038 if (ptr + MINBPC(enc) == end) {
1039 *nextTokPtr = end;
1040 /* indicate that this might be part of a CR/LF pair */
1041 return -XML_TOK_PROLOG_S;
1042 }
1043 /* fall through */
1044 case BT_S: case BT_LF:
1045 for (;;) {
1046 ptr += MINBPC(enc);
1047 if (! HAS_CHAR(enc, ptr, end))
1048 break;
1049 switch (BYTE_TYPE(enc, ptr)) {
1050 case BT_S: case BT_LF:
1051 break;
1052 case BT_CR:
1053 /* don't split CR/LF pair */
1054 if (ptr + MINBPC(enc) != end)
1055 break;
1056 /* fall through */
1057 default:
1058 *nextTokPtr = ptr;
1059 return XML_TOK_PROLOG_S;
1060 }
1061 }
1062 *nextTokPtr = ptr;
1063 return XML_TOK_PROLOG_S;
1064 case BT_PERCNT:
1065 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1066 case BT_COMMA:
1067 *nextTokPtr = ptr + MINBPC(enc);
1068 return XML_TOK_COMMA;
1069 case BT_LSQB:
1070 *nextTokPtr = ptr + MINBPC(enc);
1071 return XML_TOK_OPEN_BRACKET;
1072 case BT_RSQB:
1073 ptr += MINBPC(enc);
1074 if (! HAS_CHAR(enc, ptr, end))
1075 return -XML_TOK_CLOSE_BRACKET;
1076 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1077 REQUIRE_CHARS(enc, ptr, end, 2);
1078 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1079 *nextTokPtr = ptr + 2*MINBPC(enc);
1080 return XML_TOK_COND_SECT_CLOSE;
1081 }
1082 }
1083 *nextTokPtr = ptr;
1084 return XML_TOK_CLOSE_BRACKET;
1085 case BT_LPAR:
1086 *nextTokPtr = ptr + MINBPC(enc);
1087 return XML_TOK_OPEN_PAREN;
1088 case BT_RPAR:
1089 ptr += MINBPC(enc);
1090 if (! HAS_CHAR(enc, ptr, end))
1091 return -XML_TOK_CLOSE_PAREN;
1092 switch (BYTE_TYPE(enc, ptr)) {
1093 case BT_AST:
1094 *nextTokPtr = ptr + MINBPC(enc);
1095 return XML_TOK_CLOSE_PAREN_ASTERISK;
1096 case BT_QUEST:
1097 *nextTokPtr = ptr + MINBPC(enc);
1098 return XML_TOK_CLOSE_PAREN_QUESTION;
1099 case BT_PLUS:
1100 *nextTokPtr = ptr + MINBPC(enc);
1101 return XML_TOK_CLOSE_PAREN_PLUS;
1102 case BT_CR: case BT_LF: case BT_S:
1103 case BT_GT: case BT_COMMA: case BT_VERBAR:
1104 case BT_RPAR:
1105 *nextTokPtr = ptr;
1106 return XML_TOK_CLOSE_PAREN;
1107 }
1108 *nextTokPtr = ptr;
1109 return XML_TOK_INVALID;
1110 case BT_VERBAR:
1111 *nextTokPtr = ptr + MINBPC(enc);
1112 return XML_TOK_OR;
1113 case BT_GT:
1114 *nextTokPtr = ptr + MINBPC(enc);
1115 return XML_TOK_DECL_CLOSE;
1116 case BT_NUM:
1117 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1118 #define LEAD_CASE(n) \
1119 case BT_LEAD ## n: \
1120 if (end - ptr < n) \
1121 return XML_TOK_PARTIAL_CHAR; \
1122 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1123 ptr += n; \
1124 tok = XML_TOK_NAME; \
1125 break; \
1126 } \
1127 if (IS_NAME_CHAR(enc, ptr, n)) { \
1128 ptr += n; \
1129 tok = XML_TOK_NMTOKEN; \
1130 break; \
1131 } \
1132 *nextTokPtr = ptr; \
1133 return XML_TOK_INVALID;
1134 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1135 #undef LEAD_CASE
1136 case BT_NMSTRT:
1137 case BT_HEX:
1138 tok = XML_TOK_NAME;
1139 ptr += MINBPC(enc);
1140 break;
1141 case BT_DIGIT:
1142 case BT_NAME:
1143 case BT_MINUS:
1144 #ifdef XML_NS
1145 case BT_COLON:
1146 #endif
1147 tok = XML_TOK_NMTOKEN;
1148 ptr += MINBPC(enc);
1149 break;
1150 case BT_NONASCII:
1151 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1152 ptr += MINBPC(enc);
1153 tok = XML_TOK_NAME;
1154 break;
1155 }
1156 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1157 ptr += MINBPC(enc);
1158 tok = XML_TOK_NMTOKEN;
1159 break;
1160 }
1161 /* fall through */
1162 default:
1163 *nextTokPtr = ptr;
1164 return XML_TOK_INVALID;
1165 }
1166 while (HAS_CHAR(enc, ptr, end)) {
1167 switch (BYTE_TYPE(enc, ptr)) {
1168 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1169 case BT_GT: case BT_RPAR: case BT_COMMA:
1170 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1171 case BT_S: case BT_CR: case BT_LF:
1172 *nextTokPtr = ptr;
1173 return tok;
1174 #ifdef XML_NS
1175 case BT_COLON:
1176 ptr += MINBPC(enc);
1177 switch (tok) {
1178 case XML_TOK_NAME:
1179 REQUIRE_CHAR(enc, ptr, end);
1180 tok = XML_TOK_PREFIXED_NAME;
1181 switch (BYTE_TYPE(enc, ptr)) {
1182 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1183 default:
1184 tok = XML_TOK_NMTOKEN;
1185 break;
1186 }
1187 break;
1188 case XML_TOK_PREFIXED_NAME:
1189 tok = XML_TOK_NMTOKEN;
1190 break;
1191 }
1192 break;
1193 #endif
1194 case BT_PLUS:
1195 if (tok == XML_TOK_NMTOKEN) {
1196 *nextTokPtr = ptr;
1197 return XML_TOK_INVALID;
1198 }
1199 *nextTokPtr = ptr + MINBPC(enc);
1200 return XML_TOK_NAME_PLUS;
1201 case BT_AST:
1202 if (tok == XML_TOK_NMTOKEN) {
1203 *nextTokPtr = ptr;
1204 return XML_TOK_INVALID;
1205 }
1206 *nextTokPtr = ptr + MINBPC(enc);
1207 return XML_TOK_NAME_ASTERISK;
1208 case BT_QUEST:
1209 if (tok == XML_TOK_NMTOKEN) {
1210 *nextTokPtr = ptr;
1211 return XML_TOK_INVALID;
1212 }
1213 *nextTokPtr = ptr + MINBPC(enc);
1214 return XML_TOK_NAME_QUESTION;
1215 default:
1216 *nextTokPtr = ptr;
1217 return XML_TOK_INVALID;
1218 }
1219 }
1220 return -tok;
1221 }
1222
1223 static int PTRCALL
PREFIX(attributeValueTok)1224 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
1225 const char *end, const char **nextTokPtr)
1226 {
1227 const char *start;
1228 if (ptr >= end)
1229 return XML_TOK_NONE;
1230 else if (! HAS_CHAR(enc, ptr, end)) {
1231 /* This line cannot be executed. The incoming data has already
1232 * been tokenized once, so incomplete characters like this have
1233 * already been eliminated from the input. Retaining the paranoia
1234 * check is still valuable, however.
1235 */
1236 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1237 }
1238 start = ptr;
1239 while (HAS_CHAR(enc, ptr, end)) {
1240 switch (BYTE_TYPE(enc, ptr)) {
1241 #define LEAD_CASE(n) \
1242 case BT_LEAD ## n: ptr += n; break;
1243 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1244 #undef LEAD_CASE
1245 case BT_AMP:
1246 if (ptr == start)
1247 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1248 *nextTokPtr = ptr;
1249 return XML_TOK_DATA_CHARS;
1250 case BT_LT:
1251 /* this is for inside entity references */
1252 *nextTokPtr = ptr;
1253 return XML_TOK_INVALID;
1254 case BT_LF:
1255 if (ptr == start) {
1256 *nextTokPtr = ptr + MINBPC(enc);
1257 return XML_TOK_DATA_NEWLINE;
1258 }
1259 *nextTokPtr = ptr;
1260 return XML_TOK_DATA_CHARS;
1261 case BT_CR:
1262 if (ptr == start) {
1263 ptr += MINBPC(enc);
1264 if (! HAS_CHAR(enc, ptr, end))
1265 return XML_TOK_TRAILING_CR;
1266 if (BYTE_TYPE(enc, ptr) == BT_LF)
1267 ptr += MINBPC(enc);
1268 *nextTokPtr = ptr;
1269 return XML_TOK_DATA_NEWLINE;
1270 }
1271 *nextTokPtr = ptr;
1272 return XML_TOK_DATA_CHARS;
1273 case BT_S:
1274 if (ptr == start) {
1275 *nextTokPtr = ptr + MINBPC(enc);
1276 return XML_TOK_ATTRIBUTE_VALUE_S;
1277 }
1278 *nextTokPtr = ptr;
1279 return XML_TOK_DATA_CHARS;
1280 default:
1281 ptr += MINBPC(enc);
1282 break;
1283 }
1284 }
1285 *nextTokPtr = ptr;
1286 return XML_TOK_DATA_CHARS;
1287 }
1288
1289 static int PTRCALL
PREFIX(entityValueTok)1290 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
1291 const char *end, const char **nextTokPtr)
1292 {
1293 const char *start;
1294 if (ptr >= end)
1295 return XML_TOK_NONE;
1296 else if (! HAS_CHAR(enc, ptr, end)) {
1297 /* This line cannot be executed. The incoming data has already
1298 * been tokenized once, so incomplete characters like this have
1299 * already been eliminated from the input. Retaining the paranoia
1300 * check is still valuable, however.
1301 */
1302 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1303 }
1304 start = ptr;
1305 while (HAS_CHAR(enc, ptr, end)) {
1306 switch (BYTE_TYPE(enc, ptr)) {
1307 #define LEAD_CASE(n) \
1308 case BT_LEAD ## n: ptr += n; break;
1309 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1310 #undef LEAD_CASE
1311 case BT_AMP:
1312 if (ptr == start)
1313 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1314 *nextTokPtr = ptr;
1315 return XML_TOK_DATA_CHARS;
1316 case BT_PERCNT:
1317 if (ptr == start) {
1318 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
1319 end, nextTokPtr);
1320 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1321 }
1322 *nextTokPtr = ptr;
1323 return XML_TOK_DATA_CHARS;
1324 case BT_LF:
1325 if (ptr == start) {
1326 *nextTokPtr = ptr + MINBPC(enc);
1327 return XML_TOK_DATA_NEWLINE;
1328 }
1329 *nextTokPtr = ptr;
1330 return XML_TOK_DATA_CHARS;
1331 case BT_CR:
1332 if (ptr == start) {
1333 ptr += MINBPC(enc);
1334 if (! HAS_CHAR(enc, ptr, end))
1335 return XML_TOK_TRAILING_CR;
1336 if (BYTE_TYPE(enc, ptr) == BT_LF)
1337 ptr += MINBPC(enc);
1338 *nextTokPtr = ptr;
1339 return XML_TOK_DATA_NEWLINE;
1340 }
1341 *nextTokPtr = ptr;
1342 return XML_TOK_DATA_CHARS;
1343 default:
1344 ptr += MINBPC(enc);
1345 break;
1346 }
1347 }
1348 *nextTokPtr = ptr;
1349 return XML_TOK_DATA_CHARS;
1350 }
1351
1352 #ifdef XML_DTD
1353
1354 static int PTRCALL
PREFIX(ignoreSectionTok)1355 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
1356 const char *end, const char **nextTokPtr)
1357 {
1358 int level = 0;
1359 if (MINBPC(enc) > 1) {
1360 size_t n = end - ptr;
1361 if (n & (MINBPC(enc) - 1)) {
1362 n &= ~(MINBPC(enc) - 1);
1363 end = ptr + n;
1364 }
1365 }
1366 while (HAS_CHAR(enc, ptr, end)) {
1367 switch (BYTE_TYPE(enc, ptr)) {
1368 INVALID_CASES(ptr, nextTokPtr)
1369 case BT_LT:
1370 ptr += MINBPC(enc);
1371 REQUIRE_CHAR(enc, ptr, end);
1372 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1373 ptr += MINBPC(enc);
1374 REQUIRE_CHAR(enc, ptr, end);
1375 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1376 ++level;
1377 ptr += MINBPC(enc);
1378 }
1379 }
1380 break;
1381 case BT_RSQB:
1382 ptr += MINBPC(enc);
1383 REQUIRE_CHAR(enc, ptr, end);
1384 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1385 ptr += MINBPC(enc);
1386 REQUIRE_CHAR(enc, ptr, end);
1387 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1388 ptr += MINBPC(enc);
1389 if (level == 0) {
1390 *nextTokPtr = ptr;
1391 return XML_TOK_IGNORE_SECT;
1392 }
1393 --level;
1394 }
1395 }
1396 break;
1397 default:
1398 ptr += MINBPC(enc);
1399 break;
1400 }
1401 }
1402 return XML_TOK_PARTIAL;
1403 }
1404
1405 #endif /* XML_DTD */
1406
1407 static int PTRCALL
PREFIX(isPublicId)1408 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1409 const char **badPtr)
1410 {
1411 ptr += MINBPC(enc);
1412 end -= MINBPC(enc);
1413 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1414 switch (BYTE_TYPE(enc, ptr)) {
1415 case BT_DIGIT:
1416 case BT_HEX:
1417 case BT_MINUS:
1418 case BT_APOS:
1419 case BT_LPAR:
1420 case BT_RPAR:
1421 case BT_PLUS:
1422 case BT_COMMA:
1423 case BT_SOL:
1424 case BT_EQUALS:
1425 case BT_QUEST:
1426 case BT_CR:
1427 case BT_LF:
1428 case BT_SEMI:
1429 case BT_EXCL:
1430 case BT_AST:
1431 case BT_PERCNT:
1432 case BT_NUM:
1433 #ifdef XML_NS
1434 case BT_COLON:
1435 #endif
1436 break;
1437 case BT_S:
1438 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1439 *badPtr = ptr;
1440 return 0;
1441 }
1442 break;
1443 case BT_NAME:
1444 case BT_NMSTRT:
1445 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1446 break;
1447 /* fall through */
1448 default:
1449 switch (BYTE_TO_ASCII(enc, ptr)) {
1450 case 0x24: /* $ */
1451 case 0x40: /* @ */
1452 break;
1453 default:
1454 *badPtr = ptr;
1455 return 0;
1456 }
1457 break;
1458 }
1459 }
1460 return 1;
1461 }
1462
1463 /* This must only be called for a well-formed start-tag or empty
1464 element tag. Returns the number of attributes. Pointers to the
1465 first attsMax attributes are stored in atts.
1466 */
1467
1468 static int PTRCALL
PREFIX(getAtts)1469 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1470 int attsMax, ATTRIBUTE *atts)
1471 {
1472 enum { other, inName, inValue } state = inName;
1473 int nAtts = 0;
1474 int open = 0; /* defined when state == inValue;
1475 initialization just to shut up compilers */
1476
1477 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1478 switch (BYTE_TYPE(enc, ptr)) {
1479 #define START_NAME \
1480 if (state == other) { \
1481 if (nAtts < attsMax) { \
1482 atts[nAtts].name = ptr; \
1483 atts[nAtts].normalized = 1; \
1484 } \
1485 state = inName; \
1486 }
1487 #define LEAD_CASE(n) \
1488 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1489 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1490 #undef LEAD_CASE
1491 case BT_NONASCII:
1492 case BT_NMSTRT:
1493 case BT_HEX:
1494 START_NAME
1495 break;
1496 #undef START_NAME
1497 case BT_QUOT:
1498 if (state != inValue) {
1499 if (nAtts < attsMax)
1500 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1501 state = inValue;
1502 open = BT_QUOT;
1503 }
1504 else if (open == BT_QUOT) {
1505 state = other;
1506 if (nAtts < attsMax)
1507 atts[nAtts].valueEnd = ptr;
1508 nAtts++;
1509 }
1510 break;
1511 case BT_APOS:
1512 if (state != inValue) {
1513 if (nAtts < attsMax)
1514 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1515 state = inValue;
1516 open = BT_APOS;
1517 }
1518 else if (open == BT_APOS) {
1519 state = other;
1520 if (nAtts < attsMax)
1521 atts[nAtts].valueEnd = ptr;
1522 nAtts++;
1523 }
1524 break;
1525 case BT_AMP:
1526 if (nAtts < attsMax)
1527 atts[nAtts].normalized = 0;
1528 break;
1529 case BT_S:
1530 if (state == inName)
1531 state = other;
1532 else if (state == inValue
1533 && nAtts < attsMax
1534 && atts[nAtts].normalized
1535 && (ptr == atts[nAtts].valuePtr
1536 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1537 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1538 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1539 atts[nAtts].normalized = 0;
1540 break;
1541 case BT_CR: case BT_LF:
1542 /* This case ensures that the first attribute name is counted
1543 Apart from that we could just change state on the quote. */
1544 if (state == inName)
1545 state = other;
1546 else if (state == inValue && nAtts < attsMax)
1547 atts[nAtts].normalized = 0;
1548 break;
1549 case BT_GT:
1550 case BT_SOL:
1551 if (state != inValue)
1552 return nAtts;
1553 break;
1554 default:
1555 break;
1556 }
1557 }
1558 /* not reached */
1559 }
1560
1561 static int PTRFASTCALL
PREFIX(charRefNumber)1562 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
1563 {
1564 int result = 0;
1565 /* skip &# */
1566 ptr += 2*MINBPC(enc);
1567 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1568 for (ptr += MINBPC(enc);
1569 !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1570 ptr += MINBPC(enc)) {
1571 int c = BYTE_TO_ASCII(enc, ptr);
1572 switch (c) {
1573 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1574 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1575 result <<= 4;
1576 result |= (c - ASCII_0);
1577 break;
1578 case ASCII_A: case ASCII_B: case ASCII_C:
1579 case ASCII_D: case ASCII_E: case ASCII_F:
1580 result <<= 4;
1581 result += 10 + (c - ASCII_A);
1582 break;
1583 case ASCII_a: case ASCII_b: case ASCII_c:
1584 case ASCII_d: case ASCII_e: case ASCII_f:
1585 result <<= 4;
1586 result += 10 + (c - ASCII_a);
1587 break;
1588 }
1589 if (result >= 0x110000)
1590 return -1;
1591 }
1592 }
1593 else {
1594 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1595 int c = BYTE_TO_ASCII(enc, ptr);
1596 result *= 10;
1597 result += (c - ASCII_0);
1598 if (result >= 0x110000)
1599 return -1;
1600 }
1601 }
1602 return checkCharRefNumber(result);
1603 }
1604
1605 static int PTRCALL
PREFIX(predefinedEntityName)1606 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
1607 const char *end)
1608 {
1609 switch ((end - ptr)/MINBPC(enc)) {
1610 case 2:
1611 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1612 switch (BYTE_TO_ASCII(enc, ptr)) {
1613 case ASCII_l:
1614 return ASCII_LT;
1615 case ASCII_g:
1616 return ASCII_GT;
1617 }
1618 }
1619 break;
1620 case 3:
1621 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1622 ptr += MINBPC(enc);
1623 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1624 ptr += MINBPC(enc);
1625 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1626 return ASCII_AMP;
1627 }
1628 }
1629 break;
1630 case 4:
1631 switch (BYTE_TO_ASCII(enc, ptr)) {
1632 case ASCII_q:
1633 ptr += MINBPC(enc);
1634 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1635 ptr += MINBPC(enc);
1636 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1637 ptr += MINBPC(enc);
1638 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1639 return ASCII_QUOT;
1640 }
1641 }
1642 break;
1643 case ASCII_a:
1644 ptr += MINBPC(enc);
1645 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1646 ptr += MINBPC(enc);
1647 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1648 ptr += MINBPC(enc);
1649 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1650 return ASCII_APOS;
1651 }
1652 }
1653 break;
1654 }
1655 }
1656 return 0;
1657 }
1658
1659 static int PTRCALL
PREFIX(nameMatchesAscii)1660 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
1661 const char *end1, const char *ptr2)
1662 {
1663 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1664 if (end1 - ptr1 < MINBPC(enc)) {
1665 /* This line cannot be executed. The incoming data has already
1666 * been tokenized once, so incomplete characters like this have
1667 * already been eliminated from the input. Retaining the
1668 * paranoia check is still valuable, however.
1669 */
1670 return 0; /* LCOV_EXCL_LINE */
1671 }
1672 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1673 return 0;
1674 }
1675 return ptr1 == end1;
1676 }
1677
1678 static int PTRFASTCALL
PREFIX(nameLength)1679 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1680 {
1681 const char *start = ptr;
1682 for (;;) {
1683 switch (BYTE_TYPE(enc, ptr)) {
1684 #define LEAD_CASE(n) \
1685 case BT_LEAD ## n: ptr += n; break;
1686 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1687 #undef LEAD_CASE
1688 case BT_NONASCII:
1689 case BT_NMSTRT:
1690 #ifdef XML_NS
1691 case BT_COLON:
1692 #endif
1693 case BT_HEX:
1694 case BT_DIGIT:
1695 case BT_NAME:
1696 case BT_MINUS:
1697 ptr += MINBPC(enc);
1698 break;
1699 default:
1700 return (int)(ptr - start);
1701 }
1702 }
1703 }
1704
1705 static const char * PTRFASTCALL
PREFIX(skipS)1706 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1707 {
1708 for (;;) {
1709 switch (BYTE_TYPE(enc, ptr)) {
1710 case BT_LF:
1711 case BT_CR:
1712 case BT_S:
1713 ptr += MINBPC(enc);
1714 break;
1715 default:
1716 return ptr;
1717 }
1718 }
1719 }
1720
1721 static void PTRCALL
PREFIX(updatePosition)1722 PREFIX(updatePosition)(const ENCODING *enc,
1723 const char *ptr,
1724 const char *end,
1725 POSITION *pos)
1726 {
1727 while (HAS_CHAR(enc, ptr, end)) {
1728 switch (BYTE_TYPE(enc, ptr)) {
1729 #define LEAD_CASE(n) \
1730 case BT_LEAD ## n: \
1731 ptr += n; \
1732 break;
1733 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1734 #undef LEAD_CASE
1735 case BT_LF:
1736 pos->columnNumber = (XML_Size)-1;
1737 pos->lineNumber++;
1738 ptr += MINBPC(enc);
1739 break;
1740 case BT_CR:
1741 pos->lineNumber++;
1742 ptr += MINBPC(enc);
1743 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1744 ptr += MINBPC(enc);
1745 pos->columnNumber = (XML_Size)-1;
1746 break;
1747 default:
1748 ptr += MINBPC(enc);
1749 break;
1750 }
1751 pos->columnNumber++;
1752 }
1753 }
1754
1755 #undef DO_LEAD_CASE
1756 #undef MULTIBYTE_CASES
1757 #undef INVALID_CASES
1758 #undef CHECK_NAME_CASE
1759 #undef CHECK_NAME_CASES
1760 #undef CHECK_NMSTRT_CASE
1761 #undef CHECK_NMSTRT_CASES
1762
1763 #endif /* XML_TOK_IMPL_C */
1764