1 /* This file is included!
2 __ __ _
3 ___\ \/ /_ __ __ _| |_
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
7 |_| XML parser
8
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000-2017 Expat development team
11 Licensed under the MIT license:
12
13 Permission is hereby granted, free of charge, to any person obtaining
14 a copy of this software and associated documentation files (the
15 "Software"), to deal in the Software without restriction, including
16 without limitation the rights to use, copy, modify, merge, publish,
17 distribute, sublicense, and/or sell copies of the Software, and to permit
18 persons to whom the Software is furnished to do so, subject to the
19 following conditions:
20
21 The above copyright notice and this permission notice shall be included
22 in all copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
27 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
28 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
29 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
30 USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32
33 #ifdef XML_TOK_IMPL_C
34
/* Default validity test for an n-byte character: unless the including file
   has already supplied its own IS_INVALID_CHAR, no sequence is considered
   invalid. */
#  ifndef IS_INVALID_CHAR
#    define IS_INVALID_CHAR(enc, ptr, n) (0)
#  endif

/* Expands to the `case BT_LEAD<n>:` arm of a switch over a byte type.
   - fewer than n bytes left before `end`  -> return XML_TOK_PARTIAL_CHAR
   - IS_INVALID_CHAR reports the sequence  -> *nextTokPtr = ptr,
                                              return XML_TOK_INVALID
   - otherwise                             -> skip the n bytes and `break`
   Besides its arguments, the expansion uses the identifiers `enc` and `end`
   from the scope in which it is expanded.  `n` must be a literal digit
   because it is pasted onto BT_LEAD. */
#  define INVALID_LEAD_CASE(n, ptr, nextTokPtr)                                \
  case BT_LEAD##n:                                                             \
    if (end - (ptr) < (n))                                                     \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, (ptr), (n))) {                                    \
      *(nextTokPtr) = (ptr);                                                   \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    (ptr) += (n);                                                              \
    break;

/* Expands to the switch arms shared by all scanners that accept arbitrary
   character data: the three multi-byte lead cases above, plus an
   XML_TOK_INVALID return (with *nextTokPtr = ptr) for the byte types
   BT_NONXML, BT_MALFORM and BT_TRAIL. */
#  define INVALID_CASES(ptr, nextTokPtr)                                       \
    INVALID_LEAD_CASE(2, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(3, ptr, nextTokPtr)                                      \
    INVALID_LEAD_CASE(4, ptr, nextTokPtr)                                      \
  case BT_NONXML:                                                              \
  case BT_MALFORM:                                                             \
  case BT_TRAIL:                                                               \
    *(nextTokPtr) = (ptr);                                                     \
    return XML_TOK_INVALID;
59
/* Expands to the `case BT_LEAD<n>:` arm used while scanning the interior of
   a name.  An n-byte character is accepted (ptr advanced by n, then `break`)
   only when all n bytes are available, the sequence is not rejected by
   IS_INVALID_CHAR, and IS_NAME_CHAR accepts it.  A truncated character
   returns XML_TOK_PARTIAL_CHAR; anything else stores ptr in *nextTokPtr and
   returns XML_TOK_INVALID.

   The IS_INVALID_CHAR test must come first: IS_NAME_CHAR only classifies the
   character and does not establish that the byte sequence is well-formed. */
#  define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)                        \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) {         \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* Expands to every switch arm that consumes one name character:
   - BT_NONASCII is accepted only if IS_NAME_CHAR_MINBPC agrees, and then
     falls through to the single-unit case;
   - BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS advance ptr by
     MINBPC(enc);
   - BT_LEAD2..BT_LEAD4 are handled by CHECK_NAME_CASE.
   Every accepting arm ends in `break`, leaving the enclosing switch. */
#  define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)                          \
  case BT_NONASCII:                                                            \
    if (! IS_NAME_CHAR_MINBPC(enc, ptr)) {                                     \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
  case BT_DIGIT:                                                               \
  case BT_NAME:                                                                \
  case BT_MINUS:                                                               \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr)                              \
    CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
88
/* Expands to the `case BT_LEAD<n>:` arm used for the first character of a
   name.  An n-byte character is accepted (ptr advanced by n, then `break`)
   only when all n bytes are available, the sequence is not rejected by
   IS_INVALID_CHAR, and IS_NMSTRT_CHAR accepts it.  A truncated character
   returns XML_TOK_PARTIAL_CHAR; anything else stores ptr in *nextTokPtr and
   returns XML_TOK_INVALID.

   The IS_INVALID_CHAR test must come first: IS_NMSTRT_CHAR only classifies
   the character and does not establish that the byte sequence is
   well-formed. */
#  define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)                      \
  case BT_LEAD##n:                                                             \
    if (end - ptr < n)                                                         \
      return XML_TOK_PARTIAL_CHAR;                                             \
    if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) {       \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    ptr += n;                                                                  \
    break;

/* Expands to every switch arm that consumes one name-start character:
   - BT_NONASCII is accepted only if IS_NMSTRT_CHAR_MINBPC agrees, and then
     falls through to the single-unit case;
   - BT_NMSTRT and BT_HEX advance ptr by MINBPC(enc);
   - BT_LEAD2..BT_LEAD4 are handled by CHECK_NMSTRT_CASE.
   Every accepting arm ends in `break`, leaving the enclosing switch. */
#  define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)                        \
  case BT_NONASCII:                                                            \
    if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {                                   \
      *nextTokPtr = ptr;                                                       \
      return XML_TOK_INVALID;                                                  \
    }                                                                          \
    /* fall through */                                                         \
  case BT_NMSTRT:                                                              \
  case BT_HEX:                                                                 \
    ptr += MINBPC(enc);                                                        \
    break;                                                                     \
    CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr)                            \
    CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
114
/* PREFIX decorates the name of every function defined below.  The default
   leaves the name unchanged; an including file may define PREFIX beforehand
   to obtain differently named copies. */
#  ifndef PREFIX
#    define PREFIX(ident) ident
#  endif

/* True when at least `count` minimum-size character units (MINBPC(enc) bytes
   each) lie between ptr and end.  All arguments are parenthesized so that
   expression arguments expand with the intended precedence. */
#  define HAS_CHARS(enc, ptr, end, count)                                      \
    ((end) - (ptr) >= ((count) * MINBPC(enc)))

/* True when at least one character unit lies between ptr and end. */
#  define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)

/* Returns XML_TOK_PARTIAL from the *enclosing function* unless `count`
   character units are available.  Kept as a plain braced block (not
   do/while) so that existing uses keep compiling unchanged. */
#  define REQUIRE_CHARS(enc, ptr, end, count)                                  \
    {                                                                          \
      if (! HAS_CHARS(enc, ptr, end, count)) {                                 \
        return XML_TOK_PARTIAL;                                                \
      }                                                                        \
    }

/* Single-unit form of REQUIRE_CHARS. */
#  define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
131
132 /* ptr points to character following "<!-" */
133
134 static int PTRCALL
PREFIX(scanComment)135 PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
136 const char **nextTokPtr) {
137 if (HAS_CHAR(enc, ptr, end)) {
138 if (! CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
139 *nextTokPtr = ptr;
140 return XML_TOK_INVALID;
141 }
142 ptr += MINBPC(enc);
143 while (HAS_CHAR(enc, ptr, end)) {
144 switch (BYTE_TYPE(enc, ptr)) {
145 INVALID_CASES(ptr, nextTokPtr)
146 case BT_MINUS:
147 ptr += MINBPC(enc);
148 REQUIRE_CHAR(enc, ptr, end);
149 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
150 ptr += MINBPC(enc);
151 REQUIRE_CHAR(enc, ptr, end);
152 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
153 *nextTokPtr = ptr;
154 return XML_TOK_INVALID;
155 }
156 *nextTokPtr = ptr + MINBPC(enc);
157 return XML_TOK_COMMENT;
158 }
159 break;
160 default:
161 ptr += MINBPC(enc);
162 break;
163 }
164 }
165 }
166 return XML_TOK_PARTIAL;
167 }
168
169 /* ptr points to character following "<!" */
170
171 static int PTRCALL
PREFIX(scanDecl)172 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
173 const char **nextTokPtr) {
174 REQUIRE_CHAR(enc, ptr, end);
175 switch (BYTE_TYPE(enc, ptr)) {
176 case BT_MINUS:
177 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
178 case BT_LSQB:
179 *nextTokPtr = ptr + MINBPC(enc);
180 return XML_TOK_COND_SECT_OPEN;
181 case BT_NMSTRT:
182 case BT_HEX:
183 ptr += MINBPC(enc);
184 break;
185 default:
186 *nextTokPtr = ptr;
187 return XML_TOK_INVALID;
188 }
189 while (HAS_CHAR(enc, ptr, end)) {
190 switch (BYTE_TYPE(enc, ptr)) {
191 case BT_PERCNT:
192 REQUIRE_CHARS(enc, ptr, end, 2);
193 /* don't allow <!ENTITY% foo "whatever"> */
194 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
195 case BT_S:
196 case BT_CR:
197 case BT_LF:
198 case BT_PERCNT:
199 *nextTokPtr = ptr;
200 return XML_TOK_INVALID;
201 }
202 /* fall through */
203 case BT_S:
204 case BT_CR:
205 case BT_LF:
206 *nextTokPtr = ptr;
207 return XML_TOK_DECL_OPEN;
208 case BT_NMSTRT:
209 case BT_HEX:
210 ptr += MINBPC(enc);
211 break;
212 default:
213 *nextTokPtr = ptr;
214 return XML_TOK_INVALID;
215 }
216 }
217 return XML_TOK_PARTIAL;
218 }
219
220 static int PTRCALL
PREFIX(checkPiTarget)221 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end,
222 int *tokPtr) {
223 int upper = 0;
224 UNUSED_P(enc);
225 *tokPtr = XML_TOK_PI;
226 if (end - ptr != MINBPC(enc) * 3)
227 return 1;
228 switch (BYTE_TO_ASCII(enc, ptr)) {
229 case ASCII_x:
230 break;
231 case ASCII_X:
232 upper = 1;
233 break;
234 default:
235 return 1;
236 }
237 ptr += MINBPC(enc);
238 switch (BYTE_TO_ASCII(enc, ptr)) {
239 case ASCII_m:
240 break;
241 case ASCII_M:
242 upper = 1;
243 break;
244 default:
245 return 1;
246 }
247 ptr += MINBPC(enc);
248 switch (BYTE_TO_ASCII(enc, ptr)) {
249 case ASCII_l:
250 break;
251 case ASCII_L:
252 upper = 1;
253 break;
254 default:
255 return 1;
256 }
257 if (upper)
258 return 0;
259 *tokPtr = XML_TOK_XML_DECL;
260 return 1;
261 }
262
263 /* ptr points to character following "<?" */
264
265 static int PTRCALL
PREFIX(scanPi)266 PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
267 const char **nextTokPtr) {
268 int tok;
269 const char *target = ptr;
270 REQUIRE_CHAR(enc, ptr, end);
271 switch (BYTE_TYPE(enc, ptr)) {
272 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
273 default:
274 *nextTokPtr = ptr;
275 return XML_TOK_INVALID;
276 }
277 while (HAS_CHAR(enc, ptr, end)) {
278 switch (BYTE_TYPE(enc, ptr)) {
279 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
280 case BT_S:
281 case BT_CR:
282 case BT_LF:
283 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
284 *nextTokPtr = ptr;
285 return XML_TOK_INVALID;
286 }
287 ptr += MINBPC(enc);
288 while (HAS_CHAR(enc, ptr, end)) {
289 switch (BYTE_TYPE(enc, ptr)) {
290 INVALID_CASES(ptr, nextTokPtr)
291 case BT_QUEST:
292 ptr += MINBPC(enc);
293 REQUIRE_CHAR(enc, ptr, end);
294 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
295 *nextTokPtr = ptr + MINBPC(enc);
296 return tok;
297 }
298 break;
299 default:
300 ptr += MINBPC(enc);
301 break;
302 }
303 }
304 return XML_TOK_PARTIAL;
305 case BT_QUEST:
306 if (! PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
307 *nextTokPtr = ptr;
308 return XML_TOK_INVALID;
309 }
310 ptr += MINBPC(enc);
311 REQUIRE_CHAR(enc, ptr, end);
312 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
313 *nextTokPtr = ptr + MINBPC(enc);
314 return tok;
315 }
316 /* fall through */
317 default:
318 *nextTokPtr = ptr;
319 return XML_TOK_INVALID;
320 }
321 }
322 return XML_TOK_PARTIAL;
323 }
324
325 static int PTRCALL
PREFIX(scanCdataSection)326 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
327 const char **nextTokPtr) {
328 static const char CDATA_LSQB[]
329 = {ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB};
330 int i;
331 UNUSED_P(enc);
332 /* CDATA[ */
333 REQUIRE_CHARS(enc, ptr, end, 6);
334 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
335 if (! CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
336 *nextTokPtr = ptr;
337 return XML_TOK_INVALID;
338 }
339 }
340 *nextTokPtr = ptr;
341 return XML_TOK_CDATA_SECT_OPEN;
342 }
343
344 static int PTRCALL
PREFIX(cdataSectionTok)345 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
346 const char **nextTokPtr) {
347 if (ptr >= end)
348 return XML_TOK_NONE;
349 if (MINBPC(enc) > 1) {
350 size_t n = end - ptr;
351 if (n & (MINBPC(enc) - 1)) {
352 n &= ~(MINBPC(enc) - 1);
353 if (n == 0)
354 return XML_TOK_PARTIAL;
355 end = ptr + n;
356 }
357 }
358 switch (BYTE_TYPE(enc, ptr)) {
359 case BT_RSQB:
360 ptr += MINBPC(enc);
361 REQUIRE_CHAR(enc, ptr, end);
362 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
363 break;
364 ptr += MINBPC(enc);
365 REQUIRE_CHAR(enc, ptr, end);
366 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
367 ptr -= MINBPC(enc);
368 break;
369 }
370 *nextTokPtr = ptr + MINBPC(enc);
371 return XML_TOK_CDATA_SECT_CLOSE;
372 case BT_CR:
373 ptr += MINBPC(enc);
374 REQUIRE_CHAR(enc, ptr, end);
375 if (BYTE_TYPE(enc, ptr) == BT_LF)
376 ptr += MINBPC(enc);
377 *nextTokPtr = ptr;
378 return XML_TOK_DATA_NEWLINE;
379 case BT_LF:
380 *nextTokPtr = ptr + MINBPC(enc);
381 return XML_TOK_DATA_NEWLINE;
382 INVALID_CASES(ptr, nextTokPtr)
383 default:
384 ptr += MINBPC(enc);
385 break;
386 }
387 while (HAS_CHAR(enc, ptr, end)) {
388 switch (BYTE_TYPE(enc, ptr)) {
389 # define LEAD_CASE(n) \
390 case BT_LEAD##n: \
391 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
392 *nextTokPtr = ptr; \
393 return XML_TOK_DATA_CHARS; \
394 } \
395 ptr += n; \
396 break;
397 LEAD_CASE(2)
398 LEAD_CASE(3)
399 LEAD_CASE(4)
400 # undef LEAD_CASE
401 case BT_NONXML:
402 case BT_MALFORM:
403 case BT_TRAIL:
404 case BT_CR:
405 case BT_LF:
406 case BT_RSQB:
407 *nextTokPtr = ptr;
408 return XML_TOK_DATA_CHARS;
409 default:
410 ptr += MINBPC(enc);
411 break;
412 }
413 }
414 *nextTokPtr = ptr;
415 return XML_TOK_DATA_CHARS;
416 }
417
418 /* ptr points to character following "</" */
419
420 static int PTRCALL
PREFIX(scanEndTag)421 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
422 const char **nextTokPtr) {
423 REQUIRE_CHAR(enc, ptr, end);
424 switch (BYTE_TYPE(enc, ptr)) {
425 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
426 default:
427 *nextTokPtr = ptr;
428 return XML_TOK_INVALID;
429 }
430 while (HAS_CHAR(enc, ptr, end)) {
431 switch (BYTE_TYPE(enc, ptr)) {
432 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
433 case BT_S:
434 case BT_CR:
435 case BT_LF:
436 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
437 switch (BYTE_TYPE(enc, ptr)) {
438 case BT_S:
439 case BT_CR:
440 case BT_LF:
441 break;
442 case BT_GT:
443 *nextTokPtr = ptr + MINBPC(enc);
444 return XML_TOK_END_TAG;
445 default:
446 *nextTokPtr = ptr;
447 return XML_TOK_INVALID;
448 }
449 }
450 return XML_TOK_PARTIAL;
451 # ifdef XML_NS
452 case BT_COLON:
453 /* no need to check qname syntax here,
454 since end-tag must match exactly */
455 ptr += MINBPC(enc);
456 break;
457 # endif
458 case BT_GT:
459 *nextTokPtr = ptr + MINBPC(enc);
460 return XML_TOK_END_TAG;
461 default:
462 *nextTokPtr = ptr;
463 return XML_TOK_INVALID;
464 }
465 }
466 return XML_TOK_PARTIAL;
467 }
468
469 /* ptr points to character following "&#X" */
470
471 static int PTRCALL
PREFIX(scanHexCharRef)472 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
473 const char **nextTokPtr) {
474 if (HAS_CHAR(enc, ptr, end)) {
475 switch (BYTE_TYPE(enc, ptr)) {
476 case BT_DIGIT:
477 case BT_HEX:
478 break;
479 default:
480 *nextTokPtr = ptr;
481 return XML_TOK_INVALID;
482 }
483 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
484 switch (BYTE_TYPE(enc, ptr)) {
485 case BT_DIGIT:
486 case BT_HEX:
487 break;
488 case BT_SEMI:
489 *nextTokPtr = ptr + MINBPC(enc);
490 return XML_TOK_CHAR_REF;
491 default:
492 *nextTokPtr = ptr;
493 return XML_TOK_INVALID;
494 }
495 }
496 }
497 return XML_TOK_PARTIAL;
498 }
499
500 /* ptr points to character following "&#" */
501
502 static int PTRCALL
PREFIX(scanCharRef)503 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
504 const char **nextTokPtr) {
505 if (HAS_CHAR(enc, ptr, end)) {
506 if (CHAR_MATCHES(enc, ptr, ASCII_x))
507 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
508 switch (BYTE_TYPE(enc, ptr)) {
509 case BT_DIGIT:
510 break;
511 default:
512 *nextTokPtr = ptr;
513 return XML_TOK_INVALID;
514 }
515 for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
516 switch (BYTE_TYPE(enc, ptr)) {
517 case BT_DIGIT:
518 break;
519 case BT_SEMI:
520 *nextTokPtr = ptr + MINBPC(enc);
521 return XML_TOK_CHAR_REF;
522 default:
523 *nextTokPtr = ptr;
524 return XML_TOK_INVALID;
525 }
526 }
527 }
528 return XML_TOK_PARTIAL;
529 }
530
531 /* ptr points to character following "&" */
532
533 static int PTRCALL
PREFIX(scanRef)534 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
535 const char **nextTokPtr) {
536 REQUIRE_CHAR(enc, ptr, end);
537 switch (BYTE_TYPE(enc, ptr)) {
538 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
539 case BT_NUM:
540 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
541 default:
542 *nextTokPtr = ptr;
543 return XML_TOK_INVALID;
544 }
545 while (HAS_CHAR(enc, ptr, end)) {
546 switch (BYTE_TYPE(enc, ptr)) {
547 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
548 case BT_SEMI:
549 *nextTokPtr = ptr + MINBPC(enc);
550 return XML_TOK_ENTITY_REF;
551 default:
552 *nextTokPtr = ptr;
553 return XML_TOK_INVALID;
554 }
555 }
556 return XML_TOK_PARTIAL;
557 }
558
559 /* ptr points to character following first character of attribute name */
560
561 static int PTRCALL
PREFIX(scanAtts)562 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
563 const char **nextTokPtr) {
564 # ifdef XML_NS
565 int hadColon = 0;
566 # endif
567 while (HAS_CHAR(enc, ptr, end)) {
568 switch (BYTE_TYPE(enc, ptr)) {
569 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
570 # ifdef XML_NS
571 case BT_COLON:
572 if (hadColon) {
573 *nextTokPtr = ptr;
574 return XML_TOK_INVALID;
575 }
576 hadColon = 1;
577 ptr += MINBPC(enc);
578 REQUIRE_CHAR(enc, ptr, end);
579 switch (BYTE_TYPE(enc, ptr)) {
580 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
581 default:
582 *nextTokPtr = ptr;
583 return XML_TOK_INVALID;
584 }
585 break;
586 # endif
587 case BT_S:
588 case BT_CR:
589 case BT_LF:
590 for (;;) {
591 int t;
592
593 ptr += MINBPC(enc);
594 REQUIRE_CHAR(enc, ptr, end);
595 t = BYTE_TYPE(enc, ptr);
596 if (t == BT_EQUALS)
597 break;
598 switch (t) {
599 case BT_S:
600 case BT_LF:
601 case BT_CR:
602 break;
603 default:
604 *nextTokPtr = ptr;
605 return XML_TOK_INVALID;
606 }
607 }
608 /* fall through */
609 case BT_EQUALS: {
610 int open;
611 # ifdef XML_NS
612 hadColon = 0;
613 # endif
614 for (;;) {
615 ptr += MINBPC(enc);
616 REQUIRE_CHAR(enc, ptr, end);
617 open = BYTE_TYPE(enc, ptr);
618 if (open == BT_QUOT || open == BT_APOS)
619 break;
620 switch (open) {
621 case BT_S:
622 case BT_LF:
623 case BT_CR:
624 break;
625 default:
626 *nextTokPtr = ptr;
627 return XML_TOK_INVALID;
628 }
629 }
630 ptr += MINBPC(enc);
631 /* in attribute value */
632 for (;;) {
633 int t;
634 REQUIRE_CHAR(enc, ptr, end);
635 t = BYTE_TYPE(enc, ptr);
636 if (t == open)
637 break;
638 switch (t) {
639 INVALID_CASES(ptr, nextTokPtr)
640 case BT_AMP: {
641 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
642 if (tok <= 0) {
643 if (tok == XML_TOK_INVALID)
644 *nextTokPtr = ptr;
645 return tok;
646 }
647 break;
648 }
649 case BT_LT:
650 *nextTokPtr = ptr;
651 return XML_TOK_INVALID;
652 default:
653 ptr += MINBPC(enc);
654 break;
655 }
656 }
657 ptr += MINBPC(enc);
658 REQUIRE_CHAR(enc, ptr, end);
659 switch (BYTE_TYPE(enc, ptr)) {
660 case BT_S:
661 case BT_CR:
662 case BT_LF:
663 break;
664 case BT_SOL:
665 goto sol;
666 case BT_GT:
667 goto gt;
668 default:
669 *nextTokPtr = ptr;
670 return XML_TOK_INVALID;
671 }
672 /* ptr points to closing quote */
673 for (;;) {
674 ptr += MINBPC(enc);
675 REQUIRE_CHAR(enc, ptr, end);
676 switch (BYTE_TYPE(enc, ptr)) {
677 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
678 case BT_S:
679 case BT_CR:
680 case BT_LF:
681 continue;
682 case BT_GT:
683 gt:
684 *nextTokPtr = ptr + MINBPC(enc);
685 return XML_TOK_START_TAG_WITH_ATTS;
686 case BT_SOL:
687 sol:
688 ptr += MINBPC(enc);
689 REQUIRE_CHAR(enc, ptr, end);
690 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
691 *nextTokPtr = ptr;
692 return XML_TOK_INVALID;
693 }
694 *nextTokPtr = ptr + MINBPC(enc);
695 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
696 default:
697 *nextTokPtr = ptr;
698 return XML_TOK_INVALID;
699 }
700 break;
701 }
702 break;
703 }
704 default:
705 *nextTokPtr = ptr;
706 return XML_TOK_INVALID;
707 }
708 }
709 return XML_TOK_PARTIAL;
710 }
711
712 /* ptr points to character following "<" */
713
714 static int PTRCALL
PREFIX(scanLt)715 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
716 const char **nextTokPtr) {
717 # ifdef XML_NS
718 int hadColon;
719 # endif
720 REQUIRE_CHAR(enc, ptr, end);
721 switch (BYTE_TYPE(enc, ptr)) {
722 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
723 case BT_EXCL:
724 ptr += MINBPC(enc);
725 REQUIRE_CHAR(enc, ptr, end);
726 switch (BYTE_TYPE(enc, ptr)) {
727 case BT_MINUS:
728 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
729 case BT_LSQB:
730 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
731 }
732 *nextTokPtr = ptr;
733 return XML_TOK_INVALID;
734 case BT_QUEST:
735 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
736 case BT_SOL:
737 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
738 default:
739 *nextTokPtr = ptr;
740 return XML_TOK_INVALID;
741 }
742 # ifdef XML_NS
743 hadColon = 0;
744 # endif
745 /* we have a start-tag */
746 while (HAS_CHAR(enc, ptr, end)) {
747 switch (BYTE_TYPE(enc, ptr)) {
748 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
749 # ifdef XML_NS
750 case BT_COLON:
751 if (hadColon) {
752 *nextTokPtr = ptr;
753 return XML_TOK_INVALID;
754 }
755 hadColon = 1;
756 ptr += MINBPC(enc);
757 REQUIRE_CHAR(enc, ptr, end);
758 switch (BYTE_TYPE(enc, ptr)) {
759 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
760 default:
761 *nextTokPtr = ptr;
762 return XML_TOK_INVALID;
763 }
764 break;
765 # endif
766 case BT_S:
767 case BT_CR:
768 case BT_LF: {
769 ptr += MINBPC(enc);
770 while (HAS_CHAR(enc, ptr, end)) {
771 switch (BYTE_TYPE(enc, ptr)) {
772 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
773 case BT_GT:
774 goto gt;
775 case BT_SOL:
776 goto sol;
777 case BT_S:
778 case BT_CR:
779 case BT_LF:
780 ptr += MINBPC(enc);
781 continue;
782 default:
783 *nextTokPtr = ptr;
784 return XML_TOK_INVALID;
785 }
786 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
787 }
788 return XML_TOK_PARTIAL;
789 }
790 case BT_GT:
791 gt:
792 *nextTokPtr = ptr + MINBPC(enc);
793 return XML_TOK_START_TAG_NO_ATTS;
794 case BT_SOL:
795 sol:
796 ptr += MINBPC(enc);
797 REQUIRE_CHAR(enc, ptr, end);
798 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
799 *nextTokPtr = ptr;
800 return XML_TOK_INVALID;
801 }
802 *nextTokPtr = ptr + MINBPC(enc);
803 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
804 default:
805 *nextTokPtr = ptr;
806 return XML_TOK_INVALID;
807 }
808 }
809 return XML_TOK_PARTIAL;
810 }
811
812 static int PTRCALL
PREFIX(contentTok)813 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
814 const char **nextTokPtr) {
815 if (ptr >= end)
816 return XML_TOK_NONE;
817 if (MINBPC(enc) > 1) {
818 size_t n = end - ptr;
819 if (n & (MINBPC(enc) - 1)) {
820 n &= ~(MINBPC(enc) - 1);
821 if (n == 0)
822 return XML_TOK_PARTIAL;
823 end = ptr + n;
824 }
825 }
826 switch (BYTE_TYPE(enc, ptr)) {
827 case BT_LT:
828 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
829 case BT_AMP:
830 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
831 case BT_CR:
832 ptr += MINBPC(enc);
833 if (! HAS_CHAR(enc, ptr, end))
834 return XML_TOK_TRAILING_CR;
835 if (BYTE_TYPE(enc, ptr) == BT_LF)
836 ptr += MINBPC(enc);
837 *nextTokPtr = ptr;
838 return XML_TOK_DATA_NEWLINE;
839 case BT_LF:
840 *nextTokPtr = ptr + MINBPC(enc);
841 return XML_TOK_DATA_NEWLINE;
842 case BT_RSQB:
843 ptr += MINBPC(enc);
844 if (! HAS_CHAR(enc, ptr, end))
845 return XML_TOK_TRAILING_RSQB;
846 if (! CHAR_MATCHES(enc, ptr, ASCII_RSQB))
847 break;
848 ptr += MINBPC(enc);
849 if (! HAS_CHAR(enc, ptr, end))
850 return XML_TOK_TRAILING_RSQB;
851 if (! CHAR_MATCHES(enc, ptr, ASCII_GT)) {
852 ptr -= MINBPC(enc);
853 break;
854 }
855 *nextTokPtr = ptr;
856 return XML_TOK_INVALID;
857 INVALID_CASES(ptr, nextTokPtr)
858 default:
859 ptr += MINBPC(enc);
860 break;
861 }
862 while (HAS_CHAR(enc, ptr, end)) {
863 switch (BYTE_TYPE(enc, ptr)) {
864 # define LEAD_CASE(n) \
865 case BT_LEAD##n: \
866 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
867 *nextTokPtr = ptr; \
868 return XML_TOK_DATA_CHARS; \
869 } \
870 ptr += n; \
871 break;
872 LEAD_CASE(2)
873 LEAD_CASE(3)
874 LEAD_CASE(4)
875 # undef LEAD_CASE
876 case BT_RSQB:
877 if (HAS_CHARS(enc, ptr, end, 2)) {
878 if (! CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
879 ptr += MINBPC(enc);
880 break;
881 }
882 if (HAS_CHARS(enc, ptr, end, 3)) {
883 if (! CHAR_MATCHES(enc, ptr + 2 * MINBPC(enc), ASCII_GT)) {
884 ptr += MINBPC(enc);
885 break;
886 }
887 *nextTokPtr = ptr + 2 * MINBPC(enc);
888 return XML_TOK_INVALID;
889 }
890 }
891 /* fall through */
892 case BT_AMP:
893 case BT_LT:
894 case BT_NONXML:
895 case BT_MALFORM:
896 case BT_TRAIL:
897 case BT_CR:
898 case BT_LF:
899 *nextTokPtr = ptr;
900 return XML_TOK_DATA_CHARS;
901 default:
902 ptr += MINBPC(enc);
903 break;
904 }
905 }
906 *nextTokPtr = ptr;
907 return XML_TOK_DATA_CHARS;
908 }
909
910 /* ptr points to character following "%" */
911
912 static int PTRCALL
PREFIX(scanPercent)913 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
914 const char **nextTokPtr) {
915 REQUIRE_CHAR(enc, ptr, end);
916 switch (BYTE_TYPE(enc, ptr)) {
917 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
918 case BT_S:
919 case BT_LF:
920 case BT_CR:
921 case BT_PERCNT:
922 *nextTokPtr = ptr;
923 return XML_TOK_PERCENT;
924 default:
925 *nextTokPtr = ptr;
926 return XML_TOK_INVALID;
927 }
928 while (HAS_CHAR(enc, ptr, end)) {
929 switch (BYTE_TYPE(enc, ptr)) {
930 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
931 case BT_SEMI:
932 *nextTokPtr = ptr + MINBPC(enc);
933 return XML_TOK_PARAM_ENTITY_REF;
934 default:
935 *nextTokPtr = ptr;
936 return XML_TOK_INVALID;
937 }
938 }
939 return XML_TOK_PARTIAL;
940 }
941
942 static int PTRCALL
PREFIX(scanPoundName)943 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
944 const char **nextTokPtr) {
945 REQUIRE_CHAR(enc, ptr, end);
946 switch (BYTE_TYPE(enc, ptr)) {
947 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
948 default:
949 *nextTokPtr = ptr;
950 return XML_TOK_INVALID;
951 }
952 while (HAS_CHAR(enc, ptr, end)) {
953 switch (BYTE_TYPE(enc, ptr)) {
954 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
955 case BT_CR:
956 case BT_LF:
957 case BT_S:
958 case BT_RPAR:
959 case BT_GT:
960 case BT_PERCNT:
961 case BT_VERBAR:
962 *nextTokPtr = ptr;
963 return XML_TOK_POUND_NAME;
964 default:
965 *nextTokPtr = ptr;
966 return XML_TOK_INVALID;
967 }
968 }
969 return -XML_TOK_POUND_NAME;
970 }
971
972 static int PTRCALL
PREFIX(scanLit)973 PREFIX(scanLit)(int open, const ENCODING *enc, const char *ptr, const char *end,
974 const char **nextTokPtr) {
975 while (HAS_CHAR(enc, ptr, end)) {
976 int t = BYTE_TYPE(enc, ptr);
977 switch (t) {
978 INVALID_CASES(ptr, nextTokPtr)
979 case BT_QUOT:
980 case BT_APOS:
981 ptr += MINBPC(enc);
982 if (t != open)
983 break;
984 if (! HAS_CHAR(enc, ptr, end))
985 return -XML_TOK_LITERAL;
986 *nextTokPtr = ptr;
987 switch (BYTE_TYPE(enc, ptr)) {
988 case BT_S:
989 case BT_CR:
990 case BT_LF:
991 case BT_GT:
992 case BT_PERCNT:
993 case BT_LSQB:
994 return XML_TOK_LITERAL;
995 default:
996 return XML_TOK_INVALID;
997 }
998 default:
999 ptr += MINBPC(enc);
1000 break;
1001 }
1002 }
1003 return XML_TOK_PARTIAL;
1004 }
1005
1006 static int PTRCALL
PREFIX(prologTok)1007 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
1008 const char **nextTokPtr) {
1009 int tok;
1010 if (ptr >= end)
1011 return XML_TOK_NONE;
1012 if (MINBPC(enc) > 1) {
1013 size_t n = end - ptr;
1014 if (n & (MINBPC(enc) - 1)) {
1015 n &= ~(MINBPC(enc) - 1);
1016 if (n == 0)
1017 return XML_TOK_PARTIAL;
1018 end = ptr + n;
1019 }
1020 }
1021 switch (BYTE_TYPE(enc, ptr)) {
1022 case BT_QUOT:
1023 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1024 case BT_APOS:
1025 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1026 case BT_LT: {
1027 ptr += MINBPC(enc);
1028 REQUIRE_CHAR(enc, ptr, end);
1029 switch (BYTE_TYPE(enc, ptr)) {
1030 case BT_EXCL:
1031 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1032 case BT_QUEST:
1033 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1034 case BT_NMSTRT:
1035 case BT_HEX:
1036 case BT_NONASCII:
1037 case BT_LEAD2:
1038 case BT_LEAD3:
1039 case BT_LEAD4:
1040 *nextTokPtr = ptr - MINBPC(enc);
1041 return XML_TOK_INSTANCE_START;
1042 }
1043 *nextTokPtr = ptr;
1044 return XML_TOK_INVALID;
1045 }
1046 case BT_CR:
1047 if (ptr + MINBPC(enc) == end) {
1048 *nextTokPtr = end;
1049 /* indicate that this might be part of a CR/LF pair */
1050 return -XML_TOK_PROLOG_S;
1051 }
1052 /* fall through */
1053 case BT_S:
1054 case BT_LF:
1055 for (;;) {
1056 ptr += MINBPC(enc);
1057 if (! HAS_CHAR(enc, ptr, end))
1058 break;
1059 switch (BYTE_TYPE(enc, ptr)) {
1060 case BT_S:
1061 case BT_LF:
1062 break;
1063 case BT_CR:
1064 /* don't split CR/LF pair */
1065 if (ptr + MINBPC(enc) != end)
1066 break;
1067 /* fall through */
1068 default:
1069 *nextTokPtr = ptr;
1070 return XML_TOK_PROLOG_S;
1071 }
1072 }
1073 *nextTokPtr = ptr;
1074 return XML_TOK_PROLOG_S;
1075 case BT_PERCNT:
1076 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1077 case BT_COMMA:
1078 *nextTokPtr = ptr + MINBPC(enc);
1079 return XML_TOK_COMMA;
1080 case BT_LSQB:
1081 *nextTokPtr = ptr + MINBPC(enc);
1082 return XML_TOK_OPEN_BRACKET;
1083 case BT_RSQB:
1084 ptr += MINBPC(enc);
1085 if (! HAS_CHAR(enc, ptr, end))
1086 return -XML_TOK_CLOSE_BRACKET;
1087 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1088 REQUIRE_CHARS(enc, ptr, end, 2);
1089 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1090 *nextTokPtr = ptr + 2 * MINBPC(enc);
1091 return XML_TOK_COND_SECT_CLOSE;
1092 }
1093 }
1094 *nextTokPtr = ptr;
1095 return XML_TOK_CLOSE_BRACKET;
1096 case BT_LPAR:
1097 *nextTokPtr = ptr + MINBPC(enc);
1098 return XML_TOK_OPEN_PAREN;
1099 case BT_RPAR:
1100 ptr += MINBPC(enc);
1101 if (! HAS_CHAR(enc, ptr, end))
1102 return -XML_TOK_CLOSE_PAREN;
1103 switch (BYTE_TYPE(enc, ptr)) {
1104 case BT_AST:
1105 *nextTokPtr = ptr + MINBPC(enc);
1106 return XML_TOK_CLOSE_PAREN_ASTERISK;
1107 case BT_QUEST:
1108 *nextTokPtr = ptr + MINBPC(enc);
1109 return XML_TOK_CLOSE_PAREN_QUESTION;
1110 case BT_PLUS:
1111 *nextTokPtr = ptr + MINBPC(enc);
1112 return XML_TOK_CLOSE_PAREN_PLUS;
1113 case BT_CR:
1114 case BT_LF:
1115 case BT_S:
1116 case BT_GT:
1117 case BT_COMMA:
1118 case BT_VERBAR:
1119 case BT_RPAR:
1120 *nextTokPtr = ptr;
1121 return XML_TOK_CLOSE_PAREN;
1122 }
1123 *nextTokPtr = ptr;
1124 return XML_TOK_INVALID;
1125 case BT_VERBAR:
1126 *nextTokPtr = ptr + MINBPC(enc);
1127 return XML_TOK_OR;
1128 case BT_GT:
1129 *nextTokPtr = ptr + MINBPC(enc);
1130 return XML_TOK_DECL_CLOSE;
1131 case BT_NUM:
1132 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1133 # define LEAD_CASE(n) \
1134 case BT_LEAD##n: \
1135 if (end - ptr < n) \
1136 return XML_TOK_PARTIAL_CHAR; \
1137 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1138 ptr += n; \
1139 tok = XML_TOK_NAME; \
1140 break; \
1141 } \
1142 if (IS_NAME_CHAR(enc, ptr, n)) { \
1143 ptr += n; \
1144 tok = XML_TOK_NMTOKEN; \
1145 break; \
1146 } \
1147 *nextTokPtr = ptr; \
1148 return XML_TOK_INVALID;
1149 LEAD_CASE(2)
1150 LEAD_CASE(3)
1151 LEAD_CASE(4)
1152 # undef LEAD_CASE
1153 case BT_NMSTRT:
1154 case BT_HEX:
1155 tok = XML_TOK_NAME;
1156 ptr += MINBPC(enc);
1157 break;
1158 case BT_DIGIT:
1159 case BT_NAME:
1160 case BT_MINUS:
1161 # ifdef XML_NS
1162 case BT_COLON:
1163 # endif
1164 tok = XML_TOK_NMTOKEN;
1165 ptr += MINBPC(enc);
1166 break;
1167 case BT_NONASCII:
1168 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1169 ptr += MINBPC(enc);
1170 tok = XML_TOK_NAME;
1171 break;
1172 }
1173 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1174 ptr += MINBPC(enc);
1175 tok = XML_TOK_NMTOKEN;
1176 break;
1177 }
1178 /* fall through */
1179 default:
1180 *nextTokPtr = ptr;
1181 return XML_TOK_INVALID;
1182 }
1183 while (HAS_CHAR(enc, ptr, end)) {
1184 switch (BYTE_TYPE(enc, ptr)) {
1185 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1186 case BT_GT:
1187 case BT_RPAR:
1188 case BT_COMMA:
1189 case BT_VERBAR:
1190 case BT_LSQB:
1191 case BT_PERCNT:
1192 case BT_S:
1193 case BT_CR:
1194 case BT_LF:
1195 *nextTokPtr = ptr;
1196 return tok;
1197 # ifdef XML_NS
1198 case BT_COLON:
1199 ptr += MINBPC(enc);
1200 switch (tok) {
1201 case XML_TOK_NAME:
1202 REQUIRE_CHAR(enc, ptr, end);
1203 tok = XML_TOK_PREFIXED_NAME;
1204 switch (BYTE_TYPE(enc, ptr)) {
1205 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1206 default:
1207 tok = XML_TOK_NMTOKEN;
1208 break;
1209 }
1210 break;
1211 case XML_TOK_PREFIXED_NAME:
1212 tok = XML_TOK_NMTOKEN;
1213 break;
1214 }
1215 break;
1216 # endif
1217 case BT_PLUS:
1218 if (tok == XML_TOK_NMTOKEN) {
1219 *nextTokPtr = ptr;
1220 return XML_TOK_INVALID;
1221 }
1222 *nextTokPtr = ptr + MINBPC(enc);
1223 return XML_TOK_NAME_PLUS;
1224 case BT_AST:
1225 if (tok == XML_TOK_NMTOKEN) {
1226 *nextTokPtr = ptr;
1227 return XML_TOK_INVALID;
1228 }
1229 *nextTokPtr = ptr + MINBPC(enc);
1230 return XML_TOK_NAME_ASTERISK;
1231 case BT_QUEST:
1232 if (tok == XML_TOK_NMTOKEN) {
1233 *nextTokPtr = ptr;
1234 return XML_TOK_INVALID;
1235 }
1236 *nextTokPtr = ptr + MINBPC(enc);
1237 return XML_TOK_NAME_QUESTION;
1238 default:
1239 *nextTokPtr = ptr;
1240 return XML_TOK_INVALID;
1241 }
1242 }
1243 return -tok;
1244 }
1245
1246 static int PTRCALL
PREFIX(attributeValueTok)1247 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1248 const char **nextTokPtr) {
1249 const char *start;
1250 if (ptr >= end)
1251 return XML_TOK_NONE;
1252 else if (! HAS_CHAR(enc, ptr, end)) {
1253 /* This line cannot be executed. The incoming data has already
1254 * been tokenized once, so incomplete characters like this have
1255 * already been eliminated from the input. Retaining the paranoia
1256 * check is still valuable, however.
1257 */
1258 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1259 }
1260 start = ptr;
1261 while (HAS_CHAR(enc, ptr, end)) {
1262 switch (BYTE_TYPE(enc, ptr)) {
1263 # define LEAD_CASE(n) \
1264 case BT_LEAD##n: \
1265 ptr += n; \
1266 break;
1267 LEAD_CASE(2)
1268 LEAD_CASE(3)
1269 LEAD_CASE(4)
1270 # undef LEAD_CASE
1271 case BT_AMP:
1272 if (ptr == start)
1273 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1274 *nextTokPtr = ptr;
1275 return XML_TOK_DATA_CHARS;
1276 case BT_LT:
1277 /* this is for inside entity references */
1278 *nextTokPtr = ptr;
1279 return XML_TOK_INVALID;
1280 case BT_LF:
1281 if (ptr == start) {
1282 *nextTokPtr = ptr + MINBPC(enc);
1283 return XML_TOK_DATA_NEWLINE;
1284 }
1285 *nextTokPtr = ptr;
1286 return XML_TOK_DATA_CHARS;
1287 case BT_CR:
1288 if (ptr == start) {
1289 ptr += MINBPC(enc);
1290 if (! HAS_CHAR(enc, ptr, end))
1291 return XML_TOK_TRAILING_CR;
1292 if (BYTE_TYPE(enc, ptr) == BT_LF)
1293 ptr += MINBPC(enc);
1294 *nextTokPtr = ptr;
1295 return XML_TOK_DATA_NEWLINE;
1296 }
1297 *nextTokPtr = ptr;
1298 return XML_TOK_DATA_CHARS;
1299 case BT_S:
1300 if (ptr == start) {
1301 *nextTokPtr = ptr + MINBPC(enc);
1302 return XML_TOK_ATTRIBUTE_VALUE_S;
1303 }
1304 *nextTokPtr = ptr;
1305 return XML_TOK_DATA_CHARS;
1306 default:
1307 ptr += MINBPC(enc);
1308 break;
1309 }
1310 }
1311 *nextTokPtr = ptr;
1312 return XML_TOK_DATA_CHARS;
1313 }
1314
1315 static int PTRCALL
PREFIX(entityValueTok)1316 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1317 const char **nextTokPtr) {
1318 const char *start;
1319 if (ptr >= end)
1320 return XML_TOK_NONE;
1321 else if (! HAS_CHAR(enc, ptr, end)) {
1322 /* This line cannot be executed. The incoming data has already
1323 * been tokenized once, so incomplete characters like this have
1324 * already been eliminated from the input. Retaining the paranoia
1325 * check is still valuable, however.
1326 */
1327 return XML_TOK_PARTIAL; /* LCOV_EXCL_LINE */
1328 }
1329 start = ptr;
1330 while (HAS_CHAR(enc, ptr, end)) {
1331 switch (BYTE_TYPE(enc, ptr)) {
1332 # define LEAD_CASE(n) \
1333 case BT_LEAD##n: \
1334 ptr += n; \
1335 break;
1336 LEAD_CASE(2)
1337 LEAD_CASE(3)
1338 LEAD_CASE(4)
1339 # undef LEAD_CASE
1340 case BT_AMP:
1341 if (ptr == start)
1342 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1343 *nextTokPtr = ptr;
1344 return XML_TOK_DATA_CHARS;
1345 case BT_PERCNT:
1346 if (ptr == start) {
1347 int tok = PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1348 return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
1349 }
1350 *nextTokPtr = ptr;
1351 return XML_TOK_DATA_CHARS;
1352 case BT_LF:
1353 if (ptr == start) {
1354 *nextTokPtr = ptr + MINBPC(enc);
1355 return XML_TOK_DATA_NEWLINE;
1356 }
1357 *nextTokPtr = ptr;
1358 return XML_TOK_DATA_CHARS;
1359 case BT_CR:
1360 if (ptr == start) {
1361 ptr += MINBPC(enc);
1362 if (! HAS_CHAR(enc, ptr, end))
1363 return XML_TOK_TRAILING_CR;
1364 if (BYTE_TYPE(enc, ptr) == BT_LF)
1365 ptr += MINBPC(enc);
1366 *nextTokPtr = ptr;
1367 return XML_TOK_DATA_NEWLINE;
1368 }
1369 *nextTokPtr = ptr;
1370 return XML_TOK_DATA_CHARS;
1371 default:
1372 ptr += MINBPC(enc);
1373 break;
1374 }
1375 }
1376 *nextTokPtr = ptr;
1377 return XML_TOK_DATA_CHARS;
1378 }
1379
1380 # ifdef XML_DTD
1381
1382 static int PTRCALL
PREFIX(ignoreSectionTok)1383 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1384 const char **nextTokPtr) {
1385 int level = 0;
1386 if (MINBPC(enc) > 1) {
1387 size_t n = end - ptr;
1388 if (n & (MINBPC(enc) - 1)) {
1389 n &= ~(MINBPC(enc) - 1);
1390 end = ptr + n;
1391 }
1392 }
1393 while (HAS_CHAR(enc, ptr, end)) {
1394 switch (BYTE_TYPE(enc, ptr)) {
1395 INVALID_CASES(ptr, nextTokPtr)
1396 case BT_LT:
1397 ptr += MINBPC(enc);
1398 REQUIRE_CHAR(enc, ptr, end);
1399 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1400 ptr += MINBPC(enc);
1401 REQUIRE_CHAR(enc, ptr, end);
1402 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1403 ++level;
1404 ptr += MINBPC(enc);
1405 }
1406 }
1407 break;
1408 case BT_RSQB:
1409 ptr += MINBPC(enc);
1410 REQUIRE_CHAR(enc, ptr, end);
1411 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1412 ptr += MINBPC(enc);
1413 REQUIRE_CHAR(enc, ptr, end);
1414 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1415 ptr += MINBPC(enc);
1416 if (level == 0) {
1417 *nextTokPtr = ptr;
1418 return XML_TOK_IGNORE_SECT;
1419 }
1420 --level;
1421 }
1422 }
1423 break;
1424 default:
1425 ptr += MINBPC(enc);
1426 break;
1427 }
1428 }
1429 return XML_TOK_PARTIAL;
1430 }
1431
1432 # endif /* XML_DTD */
1433
1434 static int PTRCALL
PREFIX(isPublicId)1435 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1436 const char **badPtr) {
1437 ptr += MINBPC(enc);
1438 end -= MINBPC(enc);
1439 for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
1440 switch (BYTE_TYPE(enc, ptr)) {
1441 case BT_DIGIT:
1442 case BT_HEX:
1443 case BT_MINUS:
1444 case BT_APOS:
1445 case BT_LPAR:
1446 case BT_RPAR:
1447 case BT_PLUS:
1448 case BT_COMMA:
1449 case BT_SOL:
1450 case BT_EQUALS:
1451 case BT_QUEST:
1452 case BT_CR:
1453 case BT_LF:
1454 case BT_SEMI:
1455 case BT_EXCL:
1456 case BT_AST:
1457 case BT_PERCNT:
1458 case BT_NUM:
1459 # ifdef XML_NS
1460 case BT_COLON:
1461 # endif
1462 break;
1463 case BT_S:
1464 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1465 *badPtr = ptr;
1466 return 0;
1467 }
1468 break;
1469 case BT_NAME:
1470 case BT_NMSTRT:
1471 if (! (BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1472 break;
1473 /* fall through */
1474 default:
1475 switch (BYTE_TO_ASCII(enc, ptr)) {
1476 case 0x24: /* $ */
1477 case 0x40: /* @ */
1478 break;
1479 default:
1480 *badPtr = ptr;
1481 return 0;
1482 }
1483 break;
1484 }
1485 }
1486 return 1;
1487 }
1488
/* This must only be called for a well-formed start-tag or empty
   element tag.  Returns the number of attributes.  Pointers to the
   first attsMax attributes are stored in atts.

   For each stored attribute the fields name, valuePtr, valueEnd and
   normalized are filled in.  The returned count covers every attribute
   found, including those beyond attsMax that could not be stored.

   The loop has no bounds check of its own: it stops only at a BT_GT or
   BT_SOL character found outside an attribute value, which is why the
   input has to be well-formed.
*/

static int PTRCALL
PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
                ATTRIBUTE *atts) {
  /* Scanning starts in inName, so START_NAME (which only fires in state
     `other`) does not record the first name met -- presumably the element
     name itself. */
  enum { other, inName, inValue } state = inName;
  int nAtts = 0;
  int open = 0; /* defined when state == inValue;
                   initialization just to shut up compilers */

  /* The initial increment skips the first character of the tag. */
  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* START_NAME: on the first name character seen in state `other`, record
   where the attribute name begins, set its normalized flag to 1 and
   switch to inName. */
#  define START_NAME                                                           \
    if (state == other) {                                                      \
      if (nAtts < attsMax) {                                                   \
        atts[nAtts].name = ptr;                                                \
        atts[nAtts].normalized = 1;                                            \
      }                                                                        \
      state = inName;                                                          \
    }
/* LEAD_CASE: a multi-byte lead character.  Only (n - MINBPC) is added here
   because the loop increment supplies the remaining MINBPC bytes. */
#  define LEAD_CASE(n)                                                         \
  case BT_LEAD##n:                                                             \
    START_NAME ptr += (n - MINBPC(enc));                                       \
    break;
      LEAD_CASE(2)
      LEAD_CASE(3)
      LEAD_CASE(4)
#  undef LEAD_CASE
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
      START_NAME
      break;
#  undef START_NAME
    case BT_QUOT:
      if (state != inValue) {
        /* Opening double quote: the value starts on the next character. */
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_QUOT;
      } else if (open == BT_QUOT) {
        /* Matching closing quote: the attribute is complete.  A double
           quote inside a value opened with an apostrophe is ignored. */
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      break;
    case BT_APOS:
      /* Same as BT_QUOT, for values delimited by apostrophes. */
      if (state != inValue) {
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_APOS;
      } else if (open == BT_APOS) {
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      break;
    case BT_AMP:
      /* An ampersand clears the normalized flag of the current
         attribute. */
      if (nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_S:
      /* Outside a value, whitespace ends a name.  Inside a value, the
         normalized flag is cleared when the whitespace character is the
         first character of the value, is not a plain space, is followed
         by another space, or is followed by the closing quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax && atts[nAtts].normalized
               && (ptr == atts[nAtts].valuePtr
                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
        atts[nAtts].normalized = 0;
      break;
    case BT_CR:
    case BT_LF:
      /* This case ensures that the first attribute name is counted.
         Apart from that we could just change state on the quote.
         A line break inside a value clears the normalized flag. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_GT:
    case BT_SOL:
      /* '>' or '/' outside a value marks the end of the tag. */
      if (state != inValue)
        return nAtts;
      break;
    default:
      break;
    }
  }
  /* not reached */
}
1586
1587 static int PTRFASTCALL
PREFIX(charRefNumber)1588 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) {
1589 int result = 0;
1590 /* skip &# */
1591 UNUSED_P(enc);
1592 ptr += 2 * MINBPC(enc);
1593 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1594 for (ptr += MINBPC(enc); ! CHAR_MATCHES(enc, ptr, ASCII_SEMI);
1595 ptr += MINBPC(enc)) {
1596 int c = BYTE_TO_ASCII(enc, ptr);
1597 switch (c) {
1598 case ASCII_0:
1599 case ASCII_1:
1600 case ASCII_2:
1601 case ASCII_3:
1602 case ASCII_4:
1603 case ASCII_5:
1604 case ASCII_6:
1605 case ASCII_7:
1606 case ASCII_8:
1607 case ASCII_9:
1608 result <<= 4;
1609 result |= (c - ASCII_0);
1610 break;
1611 case ASCII_A:
1612 case ASCII_B:
1613 case ASCII_C:
1614 case ASCII_D:
1615 case ASCII_E:
1616 case ASCII_F:
1617 result <<= 4;
1618 result += 10 + (c - ASCII_A);
1619 break;
1620 case ASCII_a:
1621 case ASCII_b:
1622 case ASCII_c:
1623 case ASCII_d:
1624 case ASCII_e:
1625 case ASCII_f:
1626 result <<= 4;
1627 result += 10 + (c - ASCII_a);
1628 break;
1629 }
1630 if (result >= 0x110000)
1631 return -1;
1632 }
1633 } else {
1634 for (; ! CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1635 int c = BYTE_TO_ASCII(enc, ptr);
1636 result *= 10;
1637 result += (c - ASCII_0);
1638 if (result >= 0x110000)
1639 return -1;
1640 }
1641 }
1642 return checkCharRefNumber(result);
1643 }
1644
1645 static int PTRCALL
PREFIX(predefinedEntityName)1646 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
1647 const char *end) {
1648 UNUSED_P(enc);
1649 switch ((end - ptr) / MINBPC(enc)) {
1650 case 2:
1651 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1652 switch (BYTE_TO_ASCII(enc, ptr)) {
1653 case ASCII_l:
1654 return ASCII_LT;
1655 case ASCII_g:
1656 return ASCII_GT;
1657 }
1658 }
1659 break;
1660 case 3:
1661 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1662 ptr += MINBPC(enc);
1663 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1664 ptr += MINBPC(enc);
1665 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1666 return ASCII_AMP;
1667 }
1668 }
1669 break;
1670 case 4:
1671 switch (BYTE_TO_ASCII(enc, ptr)) {
1672 case ASCII_q:
1673 ptr += MINBPC(enc);
1674 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1675 ptr += MINBPC(enc);
1676 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1677 ptr += MINBPC(enc);
1678 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1679 return ASCII_QUOT;
1680 }
1681 }
1682 break;
1683 case ASCII_a:
1684 ptr += MINBPC(enc);
1685 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1686 ptr += MINBPC(enc);
1687 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1688 ptr += MINBPC(enc);
1689 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1690 return ASCII_APOS;
1691 }
1692 }
1693 break;
1694 }
1695 }
1696 return 0;
1697 }
1698
1699 static int PTRCALL
PREFIX(nameMatchesAscii)1700 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1701 const char *end1, const char *ptr2) {
1702 UNUSED_P(enc);
1703 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1704 if (end1 - ptr1 < MINBPC(enc)) {
1705 /* This line cannot be executed. The incoming data has already
1706 * been tokenized once, so incomplete characters like this have
1707 * already been eliminated from the input. Retaining the
1708 * paranoia check is still valuable, however.
1709 */
1710 return 0; /* LCOV_EXCL_LINE */
1711 }
1712 if (! CHAR_MATCHES(enc, ptr1, *ptr2))
1713 return 0;
1714 }
1715 return ptr1 == end1;
1716 }
1717
1718 static int PTRFASTCALL
PREFIX(nameLength)1719 PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
1720 const char *start = ptr;
1721 for (;;) {
1722 switch (BYTE_TYPE(enc, ptr)) {
1723 # define LEAD_CASE(n) \
1724 case BT_LEAD##n: \
1725 ptr += n; \
1726 break;
1727 LEAD_CASE(2)
1728 LEAD_CASE(3)
1729 LEAD_CASE(4)
1730 # undef LEAD_CASE
1731 case BT_NONASCII:
1732 case BT_NMSTRT:
1733 # ifdef XML_NS
1734 case BT_COLON:
1735 # endif
1736 case BT_HEX:
1737 case BT_DIGIT:
1738 case BT_NAME:
1739 case BT_MINUS:
1740 ptr += MINBPC(enc);
1741 break;
1742 default:
1743 return (int)(ptr - start);
1744 }
1745 }
1746 }
1747
1748 static const char *PTRFASTCALL
PREFIX(skipS)1749 PREFIX(skipS)(const ENCODING *enc, const char *ptr) {
1750 for (;;) {
1751 switch (BYTE_TYPE(enc, ptr)) {
1752 case BT_LF:
1753 case BT_CR:
1754 case BT_S:
1755 ptr += MINBPC(enc);
1756 break;
1757 default:
1758 return ptr;
1759 }
1760 }
1761 }
1762
1763 static void PTRCALL
PREFIX(updatePosition)1764 PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
1765 POSITION *pos) {
1766 while (HAS_CHAR(enc, ptr, end)) {
1767 switch (BYTE_TYPE(enc, ptr)) {
1768 # define LEAD_CASE(n) \
1769 case BT_LEAD##n: \
1770 ptr += n; \
1771 pos->columnNumber++; \
1772 break;
1773 LEAD_CASE(2)
1774 LEAD_CASE(3)
1775 LEAD_CASE(4)
1776 # undef LEAD_CASE
1777 case BT_LF:
1778 pos->columnNumber = 0;
1779 pos->lineNumber++;
1780 ptr += MINBPC(enc);
1781 break;
1782 case BT_CR:
1783 pos->lineNumber++;
1784 ptr += MINBPC(enc);
1785 if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
1786 ptr += MINBPC(enc);
1787 pos->columnNumber = 0;
1788 break;
1789 default:
1790 ptr += MINBPC(enc);
1791 pos->columnNumber++;
1792 break;
1793 }
1794 }
1795 }
1796
1797 # undef DO_LEAD_CASE
1798 # undef MULTIBYTE_CASES
1799 # undef INVALID_CASES
1800 # undef CHECK_NAME_CASE
1801 # undef CHECK_NAME_CASES
1802 # undef CHECK_NMSTRT_CASE
1803 # undef CHECK_NMSTRT_CASES
1804
1805 #endif /* XML_TOK_IMPL_C */
1806