• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLparser.c : an HTML parser
3  *
4  * References:
5  *   HTML Living Standard
6  *     https://html.spec.whatwg.org/multipage/parsing.html
7  *
8  * Tokenization now conforms to HTML5. Tree construction still follows
9  * a custom, non-standard implementation. See:
10  *
11  *     https://gitlab.gnome.org/GNOME/libxml2/-/issues/211
12  *
13  * See Copyright for the status of this software.
14  *
15  * daniel@veillard.com
16  */
17 
18 #define IN_LIBXML
19 #include "libxml.h"
20 #ifdef LIBXML_HTML_ENABLED
21 
22 #include <string.h>
23 #include <ctype.h>
24 #include <stdlib.h>
25 
26 #include <libxml/HTMLparser.h>
27 #include <libxml/xmlmemory.h>
28 #include <libxml/tree.h>
29 #include <libxml/parser.h>
30 #include <libxml/parserInternals.h>
31 #include <libxml/xmlerror.h>
32 #include <libxml/HTMLtree.h>
33 #include <libxml/entities.h>
34 #include <libxml/encoding.h>
35 #include <libxml/xmlIO.h>
36 #include <libxml/uri.h>
37 
38 #include "private/buf.h"
39 #include "private/dict.h"
40 #include "private/enc.h"
41 #include "private/error.h"
42 #include "private/html.h"
43 #include "private/io.h"
44 #include "private/memory.h"
45 #include "private/parser.h"
46 #include "private/tree.h"
47 
48 #define HTML_MAX_NAMELEN 1000
49 #define HTML_MAX_ATTRS 100000000 /* 100 million */
50 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
51 #define HTML_PARSER_BUFFER_SIZE 100
52 
53 #define IS_WS_HTML(c) \
54     (((c) == 0x20) || \
55      (((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B)))
56 
57 #define IS_HEX_DIGIT(c) \
58     ((IS_ASCII_DIGIT(c)) || \
59      ((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f')))
60 
61 #define IS_UPPER(c) \
62     (((c) >= 'A') && ((c) <= 'Z'))
63 
64 #define IS_ALNUM(c) \
65     (IS_ASCII_LETTER(c) || IS_ASCII_DIGIT(c))
66 
/*
 * Bitmap over the code points 0..63, split into two 32-bit words:
 * element 0 carries the bits for 0..31 and element 1 the bits for
 * 32..63. Each MASK_* table below marks a small set of ASCII
 * delimiters.
 */
typedef const unsigned htmlAsciiMask[2];

/* Bit of character @ch inside its own 32-bit word of the mask. */
#define HTML_MASK_BIT(ch) (1u << ((ch) & 31))

/* double quote */
static htmlAsciiMask MASK_DQ = {
    0,
    HTML_MASK_BIT('"'),
};
/* single quote */
static htmlAsciiMask MASK_SQ = {
    0,
    HTML_MASK_BIT('\''),
};
/* greater-than sign */
static htmlAsciiMask MASK_GT = {
    0,
    HTML_MASK_BIT('>'),
};
/* dash */
static htmlAsciiMask MASK_DASH = {
    0,
    HTML_MASK_BIT('-'),
};
/* tab, line feed, form feed, carriage return, space, or '>' */
static htmlAsciiMask MASK_WS_GT = {
    HTML_MASK_BIT(0x09) | HTML_MASK_BIT(0x0A) |
    HTML_MASK_BIT(0x0C) | HTML_MASK_BIT(0x0D),
    HTML_MASK_BIT(' ') | HTML_MASK_BIT('>'),
};
/* double quote or '>' */
static htmlAsciiMask MASK_DQ_GT = {
    0,
    HTML_MASK_BIT('"') | HTML_MASK_BIT('>'),
};
/* single quote or '>' */
static htmlAsciiMask MASK_SQ_GT = {
    0,
    HTML_MASK_BIT('\'') | HTML_MASK_BIT('>'),
};

#undef HTML_MASK_BIT
97 
98 static int htmlOmittedDefaultValue = 1;
99 
100 static int
101 htmlParseElementInternal(htmlParserCtxtPtr ctxt);
102 
103 /************************************************************************
104  *									*
105  *		Some factorized error routines				*
106  *									*
107  ************************************************************************/
108 
109 /**
110  * htmlErrMemory:
111  * @ctxt:  an HTML parser context
112  * @extra:  extra information
113  *
114  * Handle a redefinition of attribute error
115  */
116 static void
htmlErrMemory(xmlParserCtxtPtr ctxt)117 htmlErrMemory(xmlParserCtxtPtr ctxt)
118 {
119     xmlCtxtErrMemory(ctxt);
120 }
121 
122 /**
123  * htmlParseErr:
124  * @ctxt:  an HTML parser context
125  * @error:  the error number
126  * @msg:  the error message
127  * @str1:  string infor
128  * @str2:  string infor
129  *
130  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131  */
132 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)133 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134              const char *msg, const xmlChar *str1, const xmlChar *str2)
135 {
136     xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
137                str1, str2, NULL, 0, msg, str1, str2);
138 }
139 
140 /************************************************************************
141  *									*
142  *	Parser stacks related functions and macros		*
143  *									*
144  ************************************************************************/
145 
146 /**
147  * htmlnamePush:
148  * @ctxt:  an HTML parser context
149  * @value:  the element name
150  *
151  * Pushes a new element name on top of the name stack
152  *
153  * Returns -1 in case of error, the index in the stack otherwise
154  */
155 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)156 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
157 {
158     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
159         ctxt->html = 3;
160     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
161         ctxt->html = 10;
162     if (ctxt->nameNr >= ctxt->nameMax) {
163         const xmlChar **tmp;
164         int newSize;
165 
166         newSize = xmlGrowCapacity(ctxt->nameMax, sizeof(tmp[0]),
167                                   10, XML_MAX_ITEMS);
168         if (newSize < 0) {
169             htmlErrMemory(ctxt);
170             return (-1);
171         }
172         tmp = xmlRealloc(ctxt->nameTab, newSize * sizeof(tmp[0]));
173         if (tmp == NULL) {
174             htmlErrMemory(ctxt);
175             return(-1);
176         }
177         ctxt->nameTab = tmp;
178         ctxt->nameMax = newSize;
179     }
180     ctxt->nameTab[ctxt->nameNr] = value;
181     ctxt->name = value;
182     return (ctxt->nameNr++);
183 }
184 /**
185  * htmlnamePop:
186  * @ctxt: an HTML parser context
187  *
188  * Pops the top element name from the name stack
189  *
190  * Returns the name just removed
191  */
192 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)193 htmlnamePop(htmlParserCtxtPtr ctxt)
194 {
195     const xmlChar *ret;
196 
197     if (ctxt->nameNr <= 0)
198         return (NULL);
199     ctxt->nameNr--;
200     if (ctxt->nameNr < 0)
201         return (NULL);
202     if (ctxt->nameNr > 0)
203         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
204     else
205         ctxt->name = NULL;
206     ret = ctxt->nameTab[ctxt->nameNr];
207     ctxt->nameTab[ctxt->nameNr] = NULL;
208     return (ret);
209 }
210 
211 /**
212  * htmlNodeInfoPush:
213  * @ctxt:  an HTML parser context
214  * @value:  the node info
215  *
216  * Pushes a new element name on top of the node info stack
217  *
218  * Returns 0 in case of error, the index in the stack otherwise
219  */
220 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)221 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
222 {
223     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
224         xmlParserNodeInfo *tmp;
225         int newSize;
226 
227         newSize = xmlGrowCapacity(ctxt->nodeInfoMax, sizeof(tmp[0]),
228                                   5, XML_MAX_ITEMS);
229         if (newSize < 0) {
230             htmlErrMemory(ctxt);
231             return (0);
232         }
233         tmp = xmlRealloc(ctxt->nodeInfoTab, newSize * sizeof(tmp[0]));
234         if (tmp == NULL) {
235             htmlErrMemory(ctxt);
236             return (0);
237         }
238         ctxt->nodeInfoTab = tmp;
239         ctxt->nodeInfoMax = newSize;
240     }
241     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
242     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
243     return (ctxt->nodeInfoNr++);
244 }
245 
246 /**
247  * htmlNodeInfoPop:
248  * @ctxt:  an HTML parser context
249  *
250  * Pops the top element name from the node info stack
251  *
252  * Returns 0 in case of error, the pointer to NodeInfo otherwise
253  */
254 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)255 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
256 {
257     if (ctxt->nodeInfoNr <= 0)
258         return (NULL);
259     ctxt->nodeInfoNr--;
260     if (ctxt->nodeInfoNr < 0)
261         return (NULL);
262     if (ctxt->nodeInfoNr > 0)
263         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
264     else
265         ctxt->nodeInfo = NULL;
266     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
267 }
268 
269 /*
270  * Macros for accessing the content. Those should be used only by the parser,
271  * and not exported.
272  *
273  * Dirty macros, i.e. one need to make assumption on the context to use them
274  *
275  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
276  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
277  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
278  *           in UNICODE mode. This should be used internally by the parser
279  *           only to compare to ASCII values otherwise it would break when
280  *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
283  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
284  *           it should be used only to compare on ASCII based substring.
285  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
286  *           strings without newlines within the parser.
287  *
288  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
289  *
290  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
291  */
292 
293 #define UPPER (toupper(*ctxt->input->cur))
294 
295 #define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
296 
297 #define NXT(val) ctxt->input->cur[(val)]
298 
299 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
300 
301 #define CUR_PTR ctxt->input->cur
302 #define BASE_PTR ctxt->input->base
303 
304 #define SHRINK \
305     if ((!PARSER_PROGRESSIVE(ctxt)) && \
306         (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
307 	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
308 	xmlParserShrink(ctxt);
309 
310 #define GROW \
311     if ((!PARSER_PROGRESSIVE(ctxt)) && \
312         (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
313 	xmlParserGrow(ctxt);
314 
315 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
316 
317 /* Imported from XML */
318 
319 #define CUR (*ctxt->input->cur)
320 
321 /**
322  * htmlFindEncoding:
323  * @the HTML parser context
324  *
325  * Ty to find and encoding in the current data available in the input
326  * buffer this is needed to try to switch to the proper encoding when
327  * one face a character error.
328  * That's an heuristic, since it's operating outside of parsing it could
329  * try to use a meta which had been commented out, that's the reason it
330  * should only be used in case of error, not as a default.
331  *
332  * Returns an encoding string or NULL if not found, the string need to
333  *   be freed
334  */
335 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)336 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
337     const xmlChar *start, *cur, *end;
338     xmlChar *ret;
339 
340     if ((ctxt == NULL) || (ctxt->input == NULL) ||
341         (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
342         return(NULL);
343     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
344         return(NULL);
345 
346     start = ctxt->input->cur;
347     end = ctxt->input->end;
348     /* we also expect the input buffer to be zero terminated */
349     if (*end != 0)
350         return(NULL);
351 
352     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
353     if (cur == NULL)
354         return(NULL);
355     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
356     if (cur == NULL)
357         return(NULL);
358     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
359     if (cur == NULL)
360         return(NULL);
361     cur += 8;
362     start = cur;
363     while ((IS_ALNUM(*cur)) ||
364            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
365            cur++;
366     if (cur == start)
367         return(NULL);
368     ret = xmlStrndup(start, cur - start);
369     if (ret == NULL)
370         htmlErrMemory(ctxt);
371     return(ret);
372 }
373 
374 static int
htmlMaskMatch(htmlAsciiMask mask,unsigned c)375 htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
376     if (c >= 64)
377         return(0);
378     return((mask[c/32] >> (c & 31)) & 1);
379 }
380 
381 static int
htmlValidateUtf8(xmlParserCtxtPtr ctxt,const xmlChar * str,size_t len,int partial)382 htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len,
383                  int partial) {
384     unsigned c = str[0];
385     int size;
386 
387     if (c < 0xC2) {
388         goto invalid;
389     } else if (c < 0xE0) {
390         if (len < 2)
391             goto incomplete;
392         if ((str[1] & 0xC0) != 0x80)
393             goto invalid;
394         size = 2;
395     } else if (c < 0xF0) {
396         unsigned v;
397 
398         if (len < 3)
399             goto incomplete;
400 
401         v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */
402         v |= c << 16;
403 
404         if (((v & 0x00C0C0) != 0x008080) ||
405             ((v & 0x0F2000) == 0x000000) ||
406             ((v & 0x0F2000) == 0x0D2000))
407             goto invalid;
408 
409         size = 3;
410     } else {
411         unsigned v;
412 
413         if (len < 4)
414             goto incomplete;
415 
416         v = c << 24 | str[1] << 16 | str[2] << 8 | str[3];
417 
418         if (((v & 0x00C0C0C0) != 0x00808080) ||
419             (v < 0xF0900000) || (v >= 0xF4900000))
420             goto invalid;
421 
422         size = 4;
423     }
424 
425     return(size);
426 
427 incomplete:
428     if (partial)
429         return(0);
430 
431 invalid:
432     /* Only report the first error */
433     if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
434         htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
435                      "Invalid bytes in character encoding", NULL, NULL);
436         ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
437     }
438 
439     return(-1);
440 }
441 
442 /**
443  * htmlSkipBlankChars:
444  * @ctxt:  the HTML parser context
445  *
446  * skip all blanks character found at that point in the input streams.
447  *
448  * Returns the number of space chars skipped
449  */
450 
451 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)452 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
453     const xmlChar *cur = ctxt->input->cur;
454     size_t avail = ctxt->input->end - cur;
455     int res = 0;
456     int line = ctxt->input->line;
457     int col = ctxt->input->col;
458 
459     while (!PARSER_STOPPED(ctxt)) {
460         if (avail == 0) {
461             ctxt->input->cur = cur;
462             GROW;
463             cur = ctxt->input->cur;
464             avail = ctxt->input->end - cur;
465 
466             if (avail == 0)
467                 break;
468         }
469 
470         if (*cur == '\n') {
471             line++;
472             col = 1;
473         } else if (IS_WS_HTML(*cur)) {
474             col++;
475         } else {
476             break;
477         }
478 
479         cur += 1;
480         avail -= 1;
481 
482 	if (res < INT_MAX)
483 	    res++;
484     }
485 
486     ctxt->input->cur = cur;
487     ctxt->input->line = line;
488     ctxt->input->col = col;
489 
490     if (res > 8)
491         GROW;
492 
493     return(res);
494 }
495 
496 
497 
498 /************************************************************************
499  *									*
500  *	The list of HTML elements and their properties		*
501  *									*
502  ************************************************************************/
503 
504 /*
505  *  Start Tag: 1 means the start tag can be omitted
506  *  End Tag:   1 means the end tag can be omitted
507  *             2 means it's forbidden (empty elements)
508  *             3 means the tag is stylistic and should be closed easily
509  *  Depr:      this element is deprecated
510  *  DTD:       1 means that this element is valid only in the Loose DTD
511  *             2 means that this element is valid only in the Frameset DTD
512  *
513  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
514  */
515 
516 #define DATA_RCDATA         1
517 #define DATA_RAWTEXT        2
518 #define DATA_PLAINTEXT      3
519 #define DATA_SCRIPT         4
520 #define DATA_SCRIPT_ESC1    5
521 #define DATA_SCRIPT_ESC2    6
522 
523 static const htmlElemDesc
524 html40ElementTable[] = {
525 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
526 	NULL, NULL, NULL, NULL, NULL,
527 	0
528 },
529 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
530 	NULL, NULL, NULL, NULL, NULL,
531 	0
532 },
533 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
534 	NULL, NULL, NULL, NULL, NULL,
535 	0
536 },
537 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
538 	NULL, NULL, NULL, NULL, NULL,
539 	0
540 },
541 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
542 	NULL, NULL, NULL, NULL, NULL,
543 	0
544 },
545 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
546 	NULL, NULL, NULL, NULL, NULL,
547 	0
548 },
549 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
550 	NULL, NULL, NULL, NULL, NULL,
551 	0
552 },
553 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
554 	NULL, NULL, NULL, NULL, NULL,
555 	0
556 },
557 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
558 	NULL, NULL, NULL, NULL, NULL,
559 	0
560 },
561 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
562 	NULL, NULL, NULL, NULL, NULL,
563 	0
564 },
565 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
566 	NULL, NULL, NULL, NULL, NULL,
567 	0
568 },
569 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
570 	NULL, NULL, NULL, NULL, NULL,
571 	0
572 },
573 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
574 	NULL, NULL, NULL, NULL, NULL,
575 	0
576 },
577 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
578 	NULL, NULL, NULL, NULL, NULL,
579 	0
580 },
581 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
582 	NULL, NULL, NULL, NULL, NULL,
583 	0
584 },
585 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
586 	NULL, NULL, NULL, NULL, NULL,
587 	0
588 },
589 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
590 	NULL, NULL, NULL, NULL, NULL,
591 	0
592 },
593 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
594 	NULL, NULL, NULL, NULL, NULL,
595 	0
596 },
597 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
598 	NULL, NULL, NULL, NULL, NULL,
599 	0
600 },
601 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
602 	NULL, NULL, NULL, NULL, NULL,
603 	0
604 },
605 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
606 	NULL, NULL, NULL, NULL, NULL,
607 	0
608 },
609 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
610 	NULL, NULL, NULL, NULL, NULL,
611 	0
612 },
613 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
614 	NULL, NULL, NULL, NULL, NULL,
615 	0
616 },
617 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
618 	NULL, NULL, NULL, NULL, NULL,
619 	0
620 },
621 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
622 	NULL, NULL, NULL, NULL, NULL,
623 	0
624 },
625 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
626 	NULL, NULL, NULL, NULL, NULL,
627 	0
628 },
629 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
630 	NULL, NULL, NULL, NULL, NULL,
631 	0
632 },
633 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
634 	NULL, NULL, NULL, NULL, NULL,
635 	0
636 },
637 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
638 	NULL, NULL, NULL, NULL, NULL,
639 	0
640 },
641 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
642 	NULL, NULL, NULL, NULL, NULL,
643 	0
644 },
645 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
646 	NULL, NULL, NULL, NULL, NULL,
647 	0
648 },
649 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
650 	NULL, NULL, NULL, NULL, NULL,
651 	0
652 },
653 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
654 	NULL, NULL, NULL, NULL, NULL,
655 	0
656 },
657 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
658 	NULL, NULL, NULL, NULL, NULL,
659 	0
660 },
661 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
662 	NULL, NULL, NULL, NULL, NULL,
663 	0
664 },
665 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
666 	NULL, NULL, NULL, NULL, NULL,
667 	0
668 },
669 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
670 	NULL, NULL, NULL, NULL, NULL,
671 	0
672 },
673 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
674 	NULL, NULL, NULL, NULL, NULL,
675 	0
676 },
677 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
678 	NULL, NULL, NULL, NULL, NULL,
679 	0
680 },
681 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
682 	NULL, NULL, NULL, NULL, NULL,
683 	0
684 },
685 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
686 	NULL, NULL, NULL, NULL, NULL,
687 	0
688 },
689 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
690 	NULL, NULL, NULL, NULL, NULL,
691 	0
692 },
693 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
694 	NULL, NULL, NULL, NULL, NULL,
695 	0
696 },
697 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
698 	NULL, NULL, NULL, NULL, NULL,
699 	0
700 },
701 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
702 	NULL, NULL, NULL, NULL, NULL,
703 	0
704 },
705 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
706 	NULL, NULL, NULL, NULL, NULL,
707 	DATA_RAWTEXT
708 },
709 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
710 	NULL, NULL, NULL, NULL, NULL,
711 	0
712 },
713 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
714 	NULL, NULL, NULL, NULL, NULL,
715 	0
716 },
717 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
718 	NULL, NULL, NULL, NULL, NULL,
719 	0
720 },
721 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
722 	NULL, NULL, NULL, NULL, NULL,
723 	0
724 },
725 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
726 	NULL, NULL, NULL, NULL, NULL,
727 	0
728 },
729 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
730 	NULL, NULL, NULL, NULL, NULL,
731 	0
732 },
733 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
734 	NULL, NULL, NULL, NULL, NULL,
735 	0
736 },
737 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
738 	NULL, NULL, NULL, NULL, NULL,
739 	0
740 },
741 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
742 	NULL, NULL, NULL, NULL, NULL,
743 	0
744 },
745 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
746 	NULL, NULL, NULL, NULL, NULL,
747 	0
748 },
749 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
750 	NULL, NULL, NULL, NULL, NULL,
751 	0
752 },
753 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
754 	NULL, NULL, NULL, NULL, NULL,
755 	0
756 },
757 { "noembed",	0, 0, 0, 0, 0, 0, 0, "",
758 	NULL, NULL, NULL, NULL, NULL,
759 	DATA_RAWTEXT
760 },
761 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
762 	NULL, NULL, NULL, NULL, NULL,
763 	DATA_RAWTEXT
764 },
765 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
766 	NULL, NULL, NULL, NULL, NULL,
767 	0
768 },
769 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
770 	NULL, NULL, NULL, NULL, NULL,
771 	0
772 },
773 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
774 	NULL, NULL, NULL, NULL, NULL,
775 	0
776 },
777 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
778 	NULL, NULL, NULL, NULL, NULL,
779 	0
780 },
781 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
782 	NULL, NULL, NULL, NULL, NULL,
783 	0
784 },
785 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
786 	NULL, NULL, NULL, NULL, NULL,
787 	0
788 },
789 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
790 	NULL, NULL, NULL, NULL, NULL,
791 	0
792 },
793 { "plaintext",	0, 0, 0, 0, 0, 0, 0, "",
794 	NULL, NULL, NULL, NULL, NULL,
795 	DATA_PLAINTEXT
796 },
797 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
798 	NULL, NULL, NULL, NULL, NULL,
799 	0
800 },
801 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
802 	NULL, NULL, NULL, NULL, NULL,
803 	0
804 },
805 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
806 	NULL, NULL, NULL, NULL, NULL,
807 	0
808 },
809 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
810 	NULL, NULL, NULL, NULL, NULL,
811 	0
812 },
813 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
814 	NULL, NULL, NULL, NULL, NULL,
815 	DATA_SCRIPT
816 },
817 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
818 	NULL, NULL, NULL, NULL, NULL,
819 	0
820 },
821 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
822 	NULL, NULL, NULL, NULL, NULL,
823 	0
824 },
825 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
826 	NULL, NULL, NULL, NULL, NULL,
827 	0
828 },
829 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
830 	NULL, NULL, NULL, NULL, NULL,
831 	0
832 },
833 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
834 	NULL, NULL, NULL, NULL, NULL,
835 	0
836 },
837 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
838 	NULL, NULL, NULL, NULL, NULL,
839 	DATA_RAWTEXT
840 },
841 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
842 	NULL, NULL, NULL, NULL, NULL,
843 	0
844 },
845 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
846 	NULL, NULL, NULL, NULL, NULL,
847 	0
848 },
849 { "table",	0, 0, 0, 0, 0, 0, 0, "",
850 	NULL, NULL, NULL, NULL, NULL,
851 	0
852 },
853 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
854 	NULL, NULL, NULL, NULL, NULL,
855 	0
856 },
857 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
858 	NULL, NULL, NULL, NULL, NULL,
859 	0
860 },
861 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
862 	NULL, NULL, NULL, NULL, NULL,
863 	DATA_RCDATA
864 },
865 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
866 	NULL, NULL, NULL, NULL, NULL,
867 	0
868 },
869 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
870 	NULL, NULL, NULL, NULL, NULL,
871 	0
872 },
873 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
874 	NULL, NULL, NULL, NULL, NULL,
875 	0
876 },
877 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
878 	NULL, NULL, NULL, NULL, NULL,
879 	DATA_RCDATA
880 },
881 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
882 	NULL, NULL, NULL, NULL, NULL,
883 	0
884 },
885 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
886 	NULL, NULL, NULL, NULL, NULL,
887 	0
888 },
889 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
890 	NULL, NULL, NULL, NULL, NULL,
891 	0
892 },
893 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
894 	NULL, NULL, NULL, NULL, NULL,
895 	0
896 },
897 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
898 	NULL, NULL, NULL, NULL, NULL,
899 	0
900 },
901 { "xmp",	0, 0, 0, 0, 0, 0, 1, "",
902 	NULL, NULL, NULL, NULL, NULL,
903 	DATA_RAWTEXT
904 }
905 };
906 
/*
 * A pair of element names: one element and a start tag that implies
 * its end.
 */
typedef struct {
    /* name of the element that gets closed (presumably the open one) */
    const char *oldTag;
    /* name of the start tag that triggers the close */
    const char *newTag;
} htmlStartCloseEntry;
911 
912 /*
913  * start tags that imply the end of current element
914  */
915 static const htmlStartCloseEntry htmlStartClose[] = {
916     { "a", "a" },
917     { "a", "fieldset" },
918     { "a", "table" },
919     { "a", "td" },
920     { "a", "th" },
921     { "address", "dd" },
922     { "address", "dl" },
923     { "address", "dt" },
924     { "address", "form" },
925     { "address", "li" },
926     { "address", "ul" },
927     { "b", "center" },
928     { "b", "p" },
929     { "b", "td" },
930     { "b", "th" },
931     { "big", "p" },
932     { "caption", "col" },
933     { "caption", "colgroup" },
934     { "caption", "tbody" },
935     { "caption", "tfoot" },
936     { "caption", "thead" },
937     { "caption", "tr" },
938     { "col", "col" },
939     { "col", "colgroup" },
940     { "col", "tbody" },
941     { "col", "tfoot" },
942     { "col", "thead" },
943     { "col", "tr" },
944     { "colgroup", "colgroup" },
945     { "colgroup", "tbody" },
946     { "colgroup", "tfoot" },
947     { "colgroup", "thead" },
948     { "colgroup", "tr" },
949     { "dd", "dt" },
950     { "dir", "dd" },
951     { "dir", "dl" },
952     { "dir", "dt" },
953     { "dir", "form" },
954     { "dir", "ul" },
955     { "dl", "form" },
956     { "dl", "li" },
957     { "dt", "dd" },
958     { "dt", "dl" },
959     { "font", "center" },
960     { "font", "td" },
961     { "font", "th" },
962     { "form", "form" },
963     { "h1", "fieldset" },
964     { "h1", "form" },
965     { "h1", "li" },
966     { "h1", "p" },
967     { "h1", "table" },
968     { "h2", "fieldset" },
969     { "h2", "form" },
970     { "h2", "li" },
971     { "h2", "p" },
972     { "h2", "table" },
973     { "h3", "fieldset" },
974     { "h3", "form" },
975     { "h3", "li" },
976     { "h3", "p" },
977     { "h3", "table" },
978     { "h4", "fieldset" },
979     { "h4", "form" },
980     { "h4", "li" },
981     { "h4", "p" },
982     { "h4", "table" },
983     { "h5", "fieldset" },
984     { "h5", "form" },
985     { "h5", "li" },
986     { "h5", "p" },
987     { "h5", "table" },
988     { "h6", "fieldset" },
989     { "h6", "form" },
990     { "h6", "li" },
991     { "h6", "p" },
992     { "h6", "table" },
993     { "head", "a" },
994     { "head", "abbr" },
995     { "head", "acronym" },
996     { "head", "address" },
997     { "head", "b" },
998     { "head", "bdo" },
999     { "head", "big" },
1000     { "head", "blockquote" },
1001     { "head", "body" },
1002     { "head", "br" },
1003     { "head", "center" },
1004     { "head", "cite" },
1005     { "head", "code" },
1006     { "head", "dd" },
1007     { "head", "dfn" },
1008     { "head", "dir" },
1009     { "head", "div" },
1010     { "head", "dl" },
1011     { "head", "dt" },
1012     { "head", "em" },
1013     { "head", "fieldset" },
1014     { "head", "font" },
1015     { "head", "form" },
1016     { "head", "frameset" },
1017     { "head", "h1" },
1018     { "head", "h2" },
1019     { "head", "h3" },
1020     { "head", "h4" },
1021     { "head", "h5" },
1022     { "head", "h6" },
1023     { "head", "hr" },
1024     { "head", "i" },
1025     { "head", "iframe" },
1026     { "head", "img" },
1027     { "head", "kbd" },
1028     { "head", "li" },
1029     { "head", "listing" },
1030     { "head", "map" },
1031     { "head", "menu" },
1032     { "head", "ol" },
1033     { "head", "p" },
1034     { "head", "pre" },
1035     { "head", "q" },
1036     { "head", "s" },
1037     { "head", "samp" },
1038     { "head", "small" },
1039     { "head", "span" },
1040     { "head", "strike" },
1041     { "head", "strong" },
1042     { "head", "sub" },
1043     { "head", "sup" },
1044     { "head", "table" },
1045     { "head", "tt" },
1046     { "head", "u" },
1047     { "head", "ul" },
1048     { "head", "var" },
1049     { "head", "xmp" },
1050     { "hr", "form" },
1051     { "i", "center" },
1052     { "i", "p" },
1053     { "i", "td" },
1054     { "i", "th" },
1055     { "legend", "fieldset" },
1056     { "li", "li" },
1057     { "link", "body" },
1058     { "link", "frameset" },
1059     { "listing", "dd" },
1060     { "listing", "dl" },
1061     { "listing", "dt" },
1062     { "listing", "fieldset" },
1063     { "listing", "form" },
1064     { "listing", "li" },
1065     { "listing", "table" },
1066     { "listing", "ul" },
1067     { "menu", "dd" },
1068     { "menu", "dl" },
1069     { "menu", "dt" },
1070     { "menu", "form" },
1071     { "menu", "ul" },
1072     { "ol", "form" },
1073     { "option", "optgroup" },
1074     { "option", "option" },
1075     { "p", "address" },
1076     { "p", "blockquote" },
1077     { "p", "body" },
1078     { "p", "caption" },
1079     { "p", "center" },
1080     { "p", "col" },
1081     { "p", "colgroup" },
1082     { "p", "dd" },
1083     { "p", "dir" },
1084     { "p", "div" },
1085     { "p", "dl" },
1086     { "p", "dt" },
1087     { "p", "fieldset" },
1088     { "p", "form" },
1089     { "p", "frameset" },
1090     { "p", "h1" },
1091     { "p", "h2" },
1092     { "p", "h3" },
1093     { "p", "h4" },
1094     { "p", "h5" },
1095     { "p", "h6" },
1096     { "p", "head" },
1097     { "p", "hr" },
1098     { "p", "li" },
1099     { "p", "listing" },
1100     { "p", "menu" },
1101     { "p", "ol" },
1102     { "p", "p" },
1103     { "p", "pre" },
1104     { "p", "table" },
1105     { "p", "tbody" },
1106     { "p", "td" },
1107     { "p", "tfoot" },
1108     { "p", "th" },
1109     { "p", "title" },
1110     { "p", "tr" },
1111     { "p", "ul" },
1112     { "p", "xmp" },
1113     { "pre", "dd" },
1114     { "pre", "dl" },
1115     { "pre", "dt" },
1116     { "pre", "fieldset" },
1117     { "pre", "form" },
1118     { "pre", "li" },
1119     { "pre", "table" },
1120     { "pre", "ul" },
1121     { "s", "p" },
1122     { "script", "noscript" },
1123     { "small", "p" },
1124     { "span", "td" },
1125     { "span", "th" },
1126     { "strike", "p" },
1127     { "style", "body" },
1128     { "style", "frameset" },
1129     { "tbody", "tbody" },
1130     { "tbody", "tfoot" },
1131     { "td", "tbody" },
1132     { "td", "td" },
1133     { "td", "tfoot" },
1134     { "td", "th" },
1135     { "td", "tr" },
1136     { "tfoot", "tbody" },
1137     { "th", "tbody" },
1138     { "th", "td" },
1139     { "th", "tfoot" },
1140     { "th", "th" },
1141     { "th", "tr" },
1142     { "thead", "tbody" },
1143     { "thead", "tfoot" },
1144     { "title", "body" },
1145     { "title", "frameset" },
1146     { "tr", "tbody" },
1147     { "tr", "tfoot" },
1148     { "tr", "tr" },
1149     { "tt", "p" },
1150     { "u", "p" },
1151     { "u", "td" },
1152     { "u", "th" },
1153     { "ul", "address" },
1154     { "ul", "form" },
1155     { "ul", "menu" },
1156     { "ul", "pre" },
1157     { "xmp", "dd" },
1158     { "xmp", "dl" },
1159     { "xmp", "dt" },
1160     { "xmp", "fieldset" },
1161     { "xmp", "form" },
1162     { "xmp", "li" },
1163     { "xmp", "table" },
1164     { "xmp", "ul" }
1165 };
1166 
/*
 * Elements which are not supposed to hold character data directly.
 * htmlCheckParagraph() walks this NULL-terminated list and implies a
 * p element when the current element is one of the entries.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL    /* end of list marker */
};
1179 
/*
 * Names of the HTML attributes whose value is of content type %Script;
 *
 * NOTE: htmlIsScriptAttribute() rejects every name that does not start
 *       with "on" before it looks at this table, so any entry added
 *       here has to keep that prefix.
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document, focus and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1205 
/*
 * End tag priorities, used to recover from broken HTML pages.
 *
 * Each element gets a priority and an extra end tag is only allowed to
 * close open elements whose priority is lower than or equal to its own.
 * The table ends with an entry whose name is NULL; its priority is the
 * value htmlGetEndPriority() reports for every element not listed.
 */

typedef struct {
    const char *name;   /* element name, NULL for the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1233 
1234 /************************************************************************
1235  *									*
1236  *	functions to handle HTML specific data			*
1237  *									*
1238  ************************************************************************/
1239 
1240 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)1241 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
1242     /*
1243      * Capture end position and add node
1244      */
1245     if ( ctxt->node != NULL && ctxt->record_info ) {
1246        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
1247                                 (CUR_PTR - ctxt->input->base);
1248        ctxt->nodeInfo->end_line = ctxt->input->line;
1249        ctxt->nodeInfo->node = ctxt->node;
1250        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
1251        htmlNodeInfoPop(ctxt);
1252     }
1253 }
1254 
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The function takes no argument, returns nothing and has an empty body.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1263 
1264 static int
htmlCompareTags(const void * key,const void * member)1265 htmlCompareTags(const void *key, const void *member) {
1266     const xmlChar *tag = (const xmlChar *) key;
1267     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1268 
1269     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1270 }
1271 
1272 /**
1273  * htmlTagLookup:
1274  * @tag:  The tag name in lowercase
1275  *
1276  * Lookup the HTML tag in the ElementTable
1277  *
1278  * Returns the related htmlElemDescPtr or NULL if not found.
1279  */
1280 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1281 htmlTagLookup(const xmlChar *tag) {
1282     if (tag == NULL)
1283         return(NULL);
1284 
1285     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1286                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1287                 sizeof(htmlElemDesc), htmlCompareTags));
1288 }
1289 
1290 /**
1291  * htmlGetEndPriority:
1292  * @name: The name of the element to look up the priority for.
1293  *
1294  * Return value: The "endtag" priority.
1295  **/
1296 static int
htmlGetEndPriority(const xmlChar * name)1297 htmlGetEndPriority (const xmlChar *name) {
1298     int i = 0;
1299 
1300     while ((htmlEndPriority[i].name != NULL) &&
1301 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1302 	i++;
1303 
1304     return(htmlEndPriority[i].priority);
1305 }
1306 
1307 
1308 static int
htmlCompareStartClose(const void * vkey,const void * member)1309 htmlCompareStartClose(const void *vkey, const void *member) {
1310     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1311     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1312     int ret;
1313 
1314     ret = strcmp(key->oldTag, entry->oldTag);
1315     if (ret == 0)
1316         ret = strcmp(key->newTag, entry->newTag);
1317 
1318     return(ret);
1319 }
1320 
1321 /**
1322  * htmlCheckAutoClose:
1323  * @newtag:  The new tag name
1324  * @oldtag:  The old tag name
1325  *
1326  * Checks whether the new tag is one of the registered valid tags for
1327  * closing old.
1328  *
1329  * Returns 0 if no, 1 if yes.
1330  */
1331 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1332 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1333 {
1334     htmlStartCloseEntry key;
1335     void *res;
1336 
1337     key.oldTag = (const char *) oldtag;
1338     key.newTag = (const char *) newtag;
1339     res = bsearch(&key, htmlStartClose,
1340             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1341             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1342     return(res != NULL);
1343 }
1344 
1345 /**
1346  * htmlAutoCloseOnClose:
1347  * @ctxt:  an HTML parser context
1348  * @newtag:  The new tag name
1349  * @force:  force the tag closure
1350  *
1351  * The HTML DTD allows an ending tag to implicitly close other tags.
1352  */
1353 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1354 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1355 {
1356     const htmlElemDesc *info;
1357     int i, priority;
1358 
1359     if (ctxt->options & HTML_PARSE_HTML5)
1360         return;
1361 
1362     priority = htmlGetEndPriority(newtag);
1363 
1364     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1365 
1366         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1367             break;
1368         /*
1369          * A misplaced endtag can only close elements with lower
1370          * or equal priority, so if we find an element with higher
1371          * priority before we find an element with
1372          * matching name, we just ignore this endtag
1373          */
1374         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1375             return;
1376     }
1377     if (i < 0)
1378         return;
1379 
1380     while (!xmlStrEqual(newtag, ctxt->name)) {
1381         info = htmlTagLookup(ctxt->name);
1382         if ((info != NULL) && (info->endTag == 3)) {
1383             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1384 	                 "Opening and ending tag mismatch: %s and %s\n",
1385 			 newtag, ctxt->name);
1386         }
1387 	htmlParserFinishElementParsing(ctxt);
1388         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1389             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1390 	htmlnamePop(ctxt);
1391     }
1392 }
1393 
1394 /**
1395  * htmlAutoCloseOnEnd:
1396  * @ctxt:  an HTML parser context
1397  *
1398  * Close all remaining tags at the end of the stream
1399  */
1400 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1401 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1402 {
1403     int i;
1404 
1405     if (ctxt->options & HTML_PARSE_HTML5)
1406         return;
1407 
1408     if (ctxt->nameNr == 0)
1409         return;
1410     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1411 	htmlParserFinishElementParsing(ctxt);
1412         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1413             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1414 	htmlnamePop(ctxt);
1415     }
1416 }
1417 
1418 /**
1419  * htmlAutoClose:
1420  * @ctxt:  an HTML parser context
1421  * @newtag:  The new tag name or NULL
1422  *
1423  * The HTML DTD allows a tag to implicitly close other tags.
1424  * The list is kept in htmlStartClose array. This function is
1425  * called when a new tag has been detected and generates the
1426  * appropriates closes if possible/needed.
1427  * If newtag is NULL this mean we are at the end of the resource
1428  * and we should check
1429  */
1430 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1431 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1432 {
1433     if (ctxt->options & HTML_PARSE_HTML5)
1434         return;
1435 
1436     if (newtag == NULL)
1437         return;
1438 
1439     while ((ctxt->name != NULL) &&
1440            (htmlCheckAutoClose(newtag, ctxt->name))) {
1441 	htmlParserFinishElementParsing(ctxt);
1442         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1443             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1444 	htmlnamePop(ctxt);
1445     }
1446 }
1447 
1448 /**
1449  * htmlAutoCloseTag:
1450  * @doc:  the HTML document
1451  * @name:  The tag name
1452  * @elem:  the HTML element
1453  *
1454  * DEPRECATED: Internal function, don't use.
1455  *
1456  * The HTML DTD allows a tag to implicitly close other tags.
1457  * The list is kept in htmlStartClose array. This function checks
1458  * if the element or one of it's children would autoclose the
1459  * given tag.
1460  *
1461  * Returns 1 if autoclose, 0 otherwise
1462  */
1463 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1464 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1465     htmlNodePtr child;
1466 
1467     if (elem == NULL) return(1);
1468     if (xmlStrEqual(name, elem->name)) return(0);
1469     if (htmlCheckAutoClose(elem->name, name)) return(1);
1470     child = elem->children;
1471     while (child != NULL) {
1472         if (htmlAutoCloseTag(doc, name, child)) return(1);
1473 	child = child->next;
1474     }
1475     return(0);
1476 }
1477 
1478 /**
1479  * htmlIsAutoClosed:
1480  * @doc:  the HTML document
1481  * @elem:  the HTML element
1482  *
1483  * DEPRECATED: Internal function, don't use.
1484  *
1485  * The HTML DTD allows a tag to implicitly close other tags.
1486  * The list is kept in htmlStartClose array. This function checks
1487  * if a tag is autoclosed by one of it's child
1488  *
1489  * Returns 1 if autoclosed, 0 otherwise
1490  */
1491 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1492 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1493     htmlNodePtr child;
1494 
1495     if (elem == NULL) return(1);
1496     child = elem->children;
1497     while (child != NULL) {
1498 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1499 	child = child->next;
1500     }
1501     return(0);
1502 }
1503 
1504 /**
1505  * htmlCheckImplied:
1506  * @ctxt:  an HTML parser context
1507  * @newtag:  The new tag name
1508  *
1509  * The HTML DTD allows a tag to exists only implicitly
1510  * called when a new tag has been detected and generates the
1511  * appropriates implicit tags if missing
1512  */
1513 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1514 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1515     int i;
1516 
1517     if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1518         return;
1519     if (!htmlOmittedDefaultValue)
1520 	return;
1521     if (xmlStrEqual(newtag, BAD_CAST"html"))
1522 	return;
1523     if (ctxt->nameNr <= 0) {
1524 	htmlnamePush(ctxt, BAD_CAST"html");
1525 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1526 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1527     }
1528     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1529         return;
1530     if ((ctxt->nameNr <= 1) &&
1531         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1532 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1533 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1534 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1535 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1536 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1537         if (ctxt->html >= 3) {
1538             /* we already saw or generated an <head> before */
1539             return;
1540         }
1541         /*
1542          * dropped OBJECT ... i you put it first BODY will be
1543          * assumed !
1544          */
1545         htmlnamePush(ctxt, BAD_CAST"head");
1546         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1547             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1548     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1549 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1550 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1551         if (ctxt->html >= 10) {
1552             /* we already saw or generated a <body> before */
1553             return;
1554         }
1555 	for (i = 0;i < ctxt->nameNr;i++) {
1556 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1557 		return;
1558 	    }
1559 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1560 		return;
1561 	    }
1562 	}
1563 
1564 	htmlnamePush(ctxt, BAD_CAST"body");
1565 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1567     }
1568 }
1569 
1570 /**
1571  * htmlCheckParagraph
1572  * @ctxt:  an HTML parser context
1573  *
1574  * Check whether a p element need to be implied before inserting
1575  * characters in the current element.
1576  *
1577  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1578  *         in case of error.
1579  */
1580 
1581 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1582 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1583     const xmlChar *tag;
1584     int i;
1585 
1586     if (ctxt == NULL)
1587 	return(-1);
1588     if (ctxt->options & HTML_PARSE_HTML5)
1589         return(0);
1590 
1591     tag = ctxt->name;
1592     if (tag == NULL) {
1593 	htmlAutoClose(ctxt, BAD_CAST"p");
1594 	htmlCheckImplied(ctxt, BAD_CAST"p");
1595 	htmlnamePush(ctxt, BAD_CAST"p");
1596 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1597 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1598 	return(1);
1599     }
1600     if (!htmlOmittedDefaultValue)
1601 	return(0);
1602     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1603 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1604 	    htmlAutoClose(ctxt, BAD_CAST"p");
1605 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1606 	    htmlnamePush(ctxt, BAD_CAST"p");
1607 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1608 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1609 	    return(1);
1610 	}
1611     }
1612     return(0);
1613 }
1614 
1615 /**
1616  * htmlIsScriptAttribute:
1617  * @name:  an attribute name
1618  *
1619  * Check if an attribute is of content type Script
1620  *
1621  * Returns 1 is the attribute is a script 0 otherwise
1622  */
1623 int
htmlIsScriptAttribute(const xmlChar * name)1624 htmlIsScriptAttribute(const xmlChar *name) {
1625     unsigned int i;
1626 
1627     if (name == NULL)
1628       return(0);
1629     /*
1630      * all script attributes start with 'on'
1631      */
1632     if ((name[0] != 'o') || (name[1] != 'n'))
1633       return(0);
1634     for (i = 0;
1635 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1636 	 i++) {
1637 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1638 	    return(1);
1639     }
1640     return(0);
1641 }
1642 
1643 /************************************************************************
1644  *									*
1645  *	The list of HTML predefined entities			*
1646  *									*
1647  ************************************************************************/
1648 
1649 
1650 static const htmlEntityDesc  html40EntitiesTable[] = {
1651 /*
1652  * the 4 absolute ones, plus apostrophe.
1653  */
1654 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1655 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
1656 { 39,	"apos",	"single quote" },
1657 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
1658 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1659 
1660 /*
1661  * A bunch still in the 128-255 range
1662  * Replacing them depend really on the charset used.
1663  */
1664 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1665 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1666 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1667 { 163,	"pound","pound sign, U+00A3 ISOnum" },
1668 { 164,	"curren","currency sign, U+00A4 ISOnum" },
1669 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1670 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1671 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
1672 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1673 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1674 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1675 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1676 { 172,	"not",	"not sign, U+00AC ISOnum" },
1677 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1678 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1679 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1680 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1681 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1682 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1683 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1684 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1685 { 181,	"micro","micro sign, U+00B5 ISOnum" },
1686 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1687 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1688 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1689 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1690 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1691 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1692 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1693 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1694 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1695 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1696 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1697 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1698 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1699 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1700 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1701 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1702 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1703 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1704 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1705 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1706 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1707 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1708 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1709 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1710 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1711 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1712 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1713 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1714 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1715 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1716 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1717 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1718 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1719 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
1720 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1721 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1722 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1723 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1724 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1725 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1726 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1727 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1728 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1729 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1730 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1731 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1732 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1733 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1734 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1735 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1736 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1737 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1738 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1739 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1740 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1741 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1742 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1743 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1744 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1745 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1746 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1747 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1748 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1749 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1750 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1751 { 247,	"divide","division sign, U+00F7 ISOnum" },
1752 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1753 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1754 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1755 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1756 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1757 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1758 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1759 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1760 
1761 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1762 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1763 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1764 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1765 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1766 
1767 /*
1768  * Anything below should really be kept as entities references
1769  */
1770 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1771 
1772 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1773 { 732,	"tilde","small tilde, U+02DC ISOdia" },
1774 
1775 { 913,	"Alpha","greek capital letter alpha, U+0391" },
1776 { 914,	"Beta",	"greek capital letter beta, U+0392" },
1777 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1778 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1779 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1780 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1781 { 919,	"Eta",	"greek capital letter eta, U+0397" },
1782 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1783 { 921,	"Iota",	"greek capital letter iota, U+0399" },
1784 { 922,	"Kappa","greek capital letter kappa, U+039A" },
1785 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1786 { 924,	"Mu",	"greek capital letter mu, U+039C" },
1787 { 925,	"Nu",	"greek capital letter nu, U+039D" },
1788 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1789 { 927,	"Omicron","greek capital letter omicron, U+039F" },
1790 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1791 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
1792 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1793 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
1794 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1795 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1796 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
1797 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1798 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1799 
1800 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1801 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1802 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1803 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1804 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1805 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1806 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1807 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1808 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1809 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1810 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1811 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1812 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1813 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1814 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1815 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1816 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1817 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1818 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1819 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1820 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1821 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1822 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1823 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1824 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1825 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1826 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1827 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1828 
1829 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
1830 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
1831 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
1832 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1833 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1834 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1835 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1836 { 8211,	"ndash","en dash, U+2013 ISOpub" },
1837 { 8212,	"mdash","em dash, U+2014 ISOpub" },
1838 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1839 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1840 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1841 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1842 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1843 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1844 { 8224,	"dagger","dagger, U+2020 ISOpub" },
1845 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1846 
1847 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1848 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1849 
1850 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
1851 
1852 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1853 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1854 
1855 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1856 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1857 
1858 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1859 { 8260,	"frasl","fraction slash, U+2044 NEW" },
1860 
1861 { 8364,	"euro",	"euro sign, U+20AC NEW" },
1862 
1863 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1864 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1865 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1866 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1867 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1868 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1869 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1870 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1871 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1872 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1873 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1874 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1875 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1876 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1877 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1878 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1879 
1880 { 8704,	"forall","for all, U+2200 ISOtech" },
1881 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
1882 { 8707,	"exist","there exists, U+2203 ISOtech" },
1883 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1884 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1885 { 8712,	"isin",	"element of, U+2208 ISOtech" },
1886 { 8713,	"notin","not an element of, U+2209 ISOtech" },
1887 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
1888 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1889 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1890 { 8722,	"minus","minus sign, U+2212 ISOtech" },
1891 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1892 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1893 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
1894 { 8734,	"infin","infinity, U+221E ISOtech" },
1895 { 8736,	"ang",	"angle, U+2220 ISOamso" },
1896 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1897 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1898 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1899 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
1900 { 8747,	"int",	"integral, U+222B ISOtech" },
1901 { 8756,	"there4","therefore, U+2234 ISOtech" },
1902 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1903 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1904 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1905 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1906 { 8801,	"equiv","identical to, U+2261 ISOtech" },
1907 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1908 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1909 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
1910 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
1911 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1912 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1913 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1914 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1915 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1916 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1917 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1918 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1919 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1920 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1921 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
1922 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1923 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1924 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1925 
1926 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
1927 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1928 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1929 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1930 
1931 };
1932 
1933 /************************************************************************
1934  *									*
1935  *		Commodity functions to handle entities			*
1936  *									*
1937  ************************************************************************/
1938 
1939 /**
1940  * htmlEntityLookup:
1941  * @name: the entity name
1942  *
1943  * Lookup the given entity in EntitiesTable
1944  *
1945  * TODO: the linear scan is really ugly, an hash table is really needed.
1946  *
1947  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1948  */
1949 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)1950 htmlEntityLookup(const xmlChar *name) {
1951     unsigned int i;
1952 
1953     for (i = 0;i < (sizeof(html40EntitiesTable)/
1954                     sizeof(html40EntitiesTable[0]));i++) {
1955         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1956             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1957 	}
1958     }
1959     return(NULL);
1960 }
1961 
1962 static int
htmlCompareEntityDesc(const void * vkey,const void * vdesc)1963 htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
1964     const unsigned *key = vkey;
1965     const htmlEntityDesc *desc = vdesc;
1966 
1967     return((int) *key - (int) desc->value);
1968 }
1969 
1970 /**
1971  * htmlEntityValueLookup:
1972  * @value: the entity's unicode value
1973  *
1974  * Lookup the given entity in EntitiesTable
1975  *
1976  * TODO: the linear scan is really ugly, an hash table is really needed.
1977  *
1978  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1979  */
1980 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)1981 htmlEntityValueLookup(unsigned int value) {
1982     const htmlEntityDesc *desc;
1983     size_t nmemb;
1984 
1985     nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
1986     desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
1987                    htmlCompareEntityDesc);
1988 
1989     return(desc);
1990 }
1991 
1992 /**
1993  * UTF8ToHtml:
1994  * @out:  a pointer to an array of bytes to store the result
1995  * @outlen:  the length of @out
1996  * @in:  a pointer to an array of UTF-8 chars
1997  * @inlen:  the length of @in
1998  *
1999  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2000  * plus HTML entities block of chars out.
2001  *
2002  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2003  * The value of @inlen after return is the number of octets consumed
2004  *     as the return value is positive, else unpredictable.
2005  * The value of @outlen after return is the number of octets consumed.
2006  */
2007 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2008 UTF8ToHtml(unsigned char* out, int *outlen,
2009            const unsigned char* in, int *inlen) {
2010     const unsigned char* instart = in;
2011     const unsigned char* inend;
2012     unsigned char* outstart = out;
2013     unsigned char* outend;
2014     int ret = XML_ENC_ERR_SPACE;
2015 
2016     if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
2017         return(XML_ENC_ERR_INTERNAL);
2018 
2019     if (in == NULL) {
2020         /*
2021 	 * initialization nothing to do
2022 	 */
2023 	*outlen = 0;
2024 	*inlen = 0;
2025 	return(XML_ENC_ERR_SUCCESS);
2026     }
2027 
2028     inend = in + *inlen;
2029     outend = out + *outlen;
2030     while (in < inend) {
2031         const htmlEntityDesc *ent;
2032         const char *cp;
2033         char nbuf[16];
2034         unsigned c, d;
2035         int seqlen, len, i;
2036 
2037 	d = *in;
2038 
2039 	if (d < 0x80) {
2040             if (out >= outend)
2041                 goto done;
2042             *out++ = d;
2043             in += 1;
2044             continue;
2045         }
2046 
2047         if (d < 0xE0)      { c = d & 0x1F; seqlen = 2; }
2048         else if (d < 0xF0) { c = d & 0x0F; seqlen = 3; }
2049         else               { c = d & 0x07; seqlen = 4; }
2050 
2051 	if (inend - in < seqlen)
2052 	    break;
2053 
2054 	for (i = 1; i < seqlen; i++) {
2055 	    d = in[i];
2056 	    c <<= 6;
2057 	    c |= d & 0x3F;
2058 	}
2059 
2060         /*
2061          * Try to lookup a predefined HTML entity for it
2062          */
2063         ent = htmlEntityValueLookup(c);
2064 
2065         if (ent == NULL) {
2066           snprintf(nbuf, sizeof(nbuf), "#%u", c);
2067           cp = nbuf;
2068         } else {
2069           cp = ent->name;
2070         }
2071 
2072         len = strlen(cp);
2073         if (outend - out < len + 2)
2074             goto done;
2075 
2076         *out++ = '&';
2077         memcpy(out, cp, len);
2078         out += len;
2079         *out++ = ';';
2080 
2081         in += seqlen;
2082     }
2083 
2084     ret = out - outstart;
2085 
2086 done:
2087     *outlen = out - outstart;
2088     *inlen = in - instart;
2089     return(ret);
2090 }
2091 
2092 /**
2093  * htmlEncodeEntities:
2094  * @out:  a pointer to an array of bytes to store the result
2095  * @outlen:  the length of @out
2096  * @in:  a pointer to an array of UTF-8 chars
2097  * @inlen:  the length of @in
2098  * @quoteChar: the quote character to escape (' or ") or zero.
2099  *
2100  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2101  * plus HTML entities block of chars out.
2102  *
2103  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2104  * The value of @inlen after return is the number of octets consumed
2105  *     as the return value is positive, else unpredictable.
2106  * The value of @outlen after return is the number of octets consumed.
2107  */
2108 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2109 htmlEncodeEntities(unsigned char* out, int *outlen,
2110 		   const unsigned char* in, int *inlen, int quoteChar) {
2111     const unsigned char* processed = in;
2112     const unsigned char* outend;
2113     const unsigned char* outstart = out;
2114     const unsigned char* instart = in;
2115     const unsigned char* inend;
2116     unsigned int c, d;
2117     int trailing;
2118 
2119     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2120         return(-1);
2121     outend = out + (*outlen);
2122     inend = in + (*inlen);
2123     while (in < inend) {
2124 	d = *in++;
2125 	if      (d < 0x80)  { c= d; trailing= 0; }
2126 	else if (d < 0xC0) {
2127 	    /* trailing byte in leading position */
2128 	    *outlen = out - outstart;
2129 	    *inlen = processed - instart;
2130 	    return(-2);
2131         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2132         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2133         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2134 	else {
2135 	    /* no chance for this in Ascii */
2136 	    *outlen = out - outstart;
2137 	    *inlen = processed - instart;
2138 	    return(-2);
2139 	}
2140 
2141 	if (inend - in < trailing)
2142 	    break;
2143 
2144 	while (trailing--) {
2145 	    if (((d= *in++) & 0xC0) != 0x80) {
2146 		*outlen = out - outstart;
2147 		*inlen = processed - instart;
2148 		return(-2);
2149 	    }
2150 	    c <<= 6;
2151 	    c |= d & 0x3F;
2152 	}
2153 
2154 	/* assertion: c is a single UTF-4 value */
2155 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2156 	    (c != '&') && (c != '<') && (c != '>')) {
2157 	    if (out >= outend)
2158 		break;
2159 	    *out++ = c;
2160 	} else {
2161 	    const htmlEntityDesc * ent;
2162 	    const char *cp;
2163 	    char nbuf[16];
2164 	    int len;
2165 
2166 	    /*
2167 	     * Try to lookup a predefined HTML entity for it
2168 	     */
2169 	    ent = htmlEntityValueLookup(c);
2170 	    if (ent == NULL) {
2171 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2172 		cp = nbuf;
2173 	    }
2174 	    else
2175 		cp = ent->name;
2176 	    len = strlen(cp);
2177 	    if (outend - out < len + 2)
2178 		break;
2179 	    *out++ = '&';
2180 	    memcpy(out, cp, len);
2181 	    out += len;
2182 	    *out++ = ';';
2183 	}
2184 	processed = in;
2185     }
2186     *outlen = out - outstart;
2187     *inlen = processed - instart;
2188     return(0);
2189 }
2190 
2191 /************************************************************************
2192  *									*
2193  *		Commodity functions, cleanup needed ?			*
2194  *									*
2195  ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The table is never modified, so both the strings and the array of
 * pointers are const.
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2210 
2211 /**
2212  * areBlanks:
2213  * @ctxt:  an HTML parser context
2214  * @str:  a xmlChar *
2215  * @len:  the size of @str
2216  *
2217  * Is this a sequence of blank chars that one can ignore ?
2218  *
2219  * Returns 1 if ignorable 0 if whitespace, -1 otherwise.
2220  */
2221 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2222 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2223     unsigned int i;
2224     int j;
2225     xmlNodePtr lastChild;
2226     xmlDtdPtr dtd;
2227 
2228     for (j = 0;j < len;j++)
2229         if (!(IS_WS_HTML(str[j]))) return(-1);
2230 
2231     if (CUR == 0) return(1);
2232     if (CUR != '<') return(0);
2233     if (ctxt->name == NULL)
2234 	return(1);
2235     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2236 	return(1);
2237     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2238 	return(1);
2239 
2240     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2241     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2242         dtd = xmlGetIntSubset(ctxt->myDoc);
2243         if (dtd != NULL && dtd->ExternalID != NULL) {
2244             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2245                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2246                 return(1);
2247         }
2248     }
2249 
2250     if (ctxt->node == NULL) return(0);
2251     lastChild = xmlGetLastChild(ctxt->node);
2252     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2253 	lastChild = lastChild->prev;
2254     if (lastChild == NULL) {
2255         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2256             (ctxt->node->content != NULL)) return(0);
2257 	/* keep ws in constructs like ...<b> </b>...
2258 	   for all tags "b" allowing PCDATA */
2259 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2260 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2261 		return(0);
2262 	    }
2263 	}
2264     } else if (xmlNodeIsText(lastChild)) {
2265         return(0);
2266     } else {
2267 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2268 	   for all tags "p" allowing PCDATA */
2269 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2270 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2271 		return(0);
2272 	    }
2273 	}
2274     }
2275     return(1);
2276 }
2277 
2278 /**
2279  * htmlNewDocNoDtD:
2280  * @URI:  URI for the dtd, or NULL
2281  * @ExternalID:  the external ID of the DTD, or NULL
2282  *
2283  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2284  * are NULL
2285  *
2286  * Returns a new document, do not initialize the DTD if not provided
2287  */
2288 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2289 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2290     xmlDocPtr cur;
2291 
2292     /*
2293      * Allocate a new document and fill the fields.
2294      */
2295     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2296     if (cur == NULL)
2297 	return(NULL);
2298     memset(cur, 0, sizeof(xmlDoc));
2299 
2300     cur->type = XML_HTML_DOCUMENT_NODE;
2301     cur->version = NULL;
2302     cur->intSubset = NULL;
2303     cur->doc = cur;
2304     cur->name = NULL;
2305     cur->children = NULL;
2306     cur->extSubset = NULL;
2307     cur->oldNs = NULL;
2308     cur->encoding = NULL;
2309     cur->standalone = 1;
2310     cur->compression = 0;
2311     cur->ids = NULL;
2312     cur->refs = NULL;
2313     cur->_private = NULL;
2314     cur->charset = XML_CHAR_ENCODING_UTF8;
2315     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2316     if ((ExternalID != NULL) ||
2317 	(URI != NULL)) {
2318         xmlDtdPtr intSubset;
2319 
2320 	intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2321         if (intSubset == NULL) {
2322             xmlFree(cur);
2323             return(NULL);
2324         }
2325     }
2326     if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2327 	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2328     return(cur);
2329 }
2330 
2331 /**
2332  * htmlNewDoc:
2333  * @URI:  URI for the dtd, or NULL
2334  * @ExternalID:  the external ID of the DTD, or NULL
2335  *
2336  * Creates a new HTML document
2337  *
2338  * Returns a new document
2339  */
2340 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2341 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2342     if ((URI == NULL) && (ExternalID == NULL))
2343 	return(htmlNewDocNoDtD(
2344 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2345 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2346 
2347     return(htmlNewDocNoDtD(URI, ExternalID));
2348 }
2349 
2350 
2351 /************************************************************************
2352  *									*
2353  *			The parser itself				*
2354  *	Relates to http://www.w3.org/TR/html40				*
2355  *									*
2356  ************************************************************************/
2357 
2358 /************************************************************************
2359  *									*
2360  *			The parser itself				*
2361  *									*
2362  ************************************************************************/
2363 
2364 /**
2365  * htmlParseHTMLName:
2366  * @ctxt:  an HTML parser context
2367  *
2368  * parse an HTML tag or attribute name, note that we convert it to lowercase
2369  * since HTML names are not case-sensitive.
2370  *
2371  * Returns the Tag Name parsed or NULL
2372  */
2373 
2374 static xmlHashedString
htmlParseHTMLName(htmlParserCtxtPtr ctxt,int attr)2375 htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
2376     xmlHashedString ret;
2377     xmlChar buf[HTML_PARSER_BUFFER_SIZE];
2378     const xmlChar *in;
2379     size_t avail;
2380     int eof = PARSER_PROGRESSIVE(ctxt);
2381     int nbchar = 0;
2382     int stop = attr ? '=' : ' ';
2383 
2384     in = ctxt->input->cur;
2385     avail = ctxt->input->end - in;
2386 
2387     while (1) {
2388         int c, size;
2389 
2390         if ((!eof) && (avail < 32)) {
2391             size_t oldAvail = avail;
2392 
2393             ctxt->input->cur = in;
2394 
2395             SHRINK;
2396             xmlParserGrow(ctxt);
2397 
2398             in = ctxt->input->cur;
2399             avail = ctxt->input->end - in;
2400 
2401             if (oldAvail == avail)
2402                 eof = 1;
2403         }
2404 
2405         if (avail == 0)
2406             break;
2407 
2408         c = *in;
2409         size = 1;
2410 
2411         if ((nbchar != 0) &&
2412             ((c == '/') || (c == '>') || (c == stop) ||
2413              (IS_WS_HTML(c))))
2414             break;
2415 
2416         if (c == 0) {
2417             if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2418                 buf[nbchar++] = 0xEF;
2419                 buf[nbchar++] = 0xBF;
2420                 buf[nbchar++] = 0xBD;
2421             }
2422         } else if (c < 0x80) {
2423             if (nbchar < HTML_PARSER_BUFFER_SIZE) {
2424                 if (IS_UPPER(c))
2425                     c += 0x20;
2426                 buf[nbchar++] = c;
2427             }
2428         } else {
2429             size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2430 
2431             if (size > 0) {
2432                 if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
2433                     memcpy(buf + nbchar, in, size);
2434                     nbchar += size;
2435                 }
2436             } else {
2437                 size = 1;
2438 
2439                 if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2440                     buf[nbchar++] = 0xEF;
2441                     buf[nbchar++] = 0xBF;
2442                     buf[nbchar++] = 0xBD;
2443                 }
2444             }
2445         }
2446 
2447         in += size;
2448         avail -= size;
2449     }
2450 
2451     ctxt->input->cur = in;
2452 
2453     SHRINK;
2454 
2455     ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar);
2456     if (ret.name == NULL)
2457         htmlErrMemory(ctxt);
2458 
2459     return(ret);
2460 }
2461 
/*
 * Replacement code points for the range 0x80-0x9F, indexed by
 * (code point - 0x80). Used by htmlCodePointToUtf8.
 */
static const short htmlC1Remap[32] = {
    /* 0x80 */ 0x20AC, 0x0081, 0x201A, 0x0192,
    /* 0x84 */ 0x201E, 0x2026, 0x2020, 0x2021,
    /* 0x88 */ 0x02C6, 0x2030, 0x0160, 0x2039,
    /* 0x8C */ 0x0152, 0x008D, 0x017D, 0x008F,
    /* 0x90 */ 0x0090, 0x2018, 0x2019, 0x201C,
    /* 0x94 */ 0x201D, 0x2022, 0x2013, 0x2014,
    /* 0x98 */ 0x02DC, 0x2122, 0x0161, 0x203A,
    /* 0x9C */ 0x0153, 0x009D, 0x017E, 0x0178
};
2468 
2469 static const xmlChar *
htmlCodePointToUtf8(int c,xmlChar * out,int * osize)2470 htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
2471     int i = 0;
2472     int bits, hi;
2473 
2474     if ((c >= 0x80) && (c < 0xA0)) {
2475         c = htmlC1Remap[c - 0x80];
2476     } else if ((c <= 0) ||
2477                ((c >= 0xD800) && (c < 0xE000)) ||
2478                (c > 0x10FFFF)) {
2479         c = 0xFFFD;
2480     }
2481 
2482     if      (c <    0x80) { bits =  0; hi = 0x00; }
2483     else if (c <   0x800) { bits =  6; hi = 0xC0; }
2484     else if (c < 0x10000) { bits = 12; hi = 0xE0; }
2485     else                  { bits = 18; hi = 0xF0; }
2486 
2487     out[i++] = (c >> bits) | hi;
2488 
2489     while (bits > 0) {
2490         bits -= 6;
2491         out[i++] = ((c >> bits) & 0x3F) | 0x80;
2492     }
2493 
2494     *osize = i;
2495     return(out);
2496 }
2497 
2498 #include "html5ent.inc"
2499 
2500 #define ENT_F_SEMICOLON 0x80u
2501 #define ENT_F_SUBTABLE  0x40u
2502 #define ENT_F_ALL       0xC0u
2503 
2504 static const xmlChar *
htmlFindEntityPrefix(const xmlChar * string,size_t slen,int isAttr,int * nlen,int * rlen)2505 htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
2506                      int *nlen, int *rlen) {
2507     const xmlChar *match = NULL;
2508     unsigned left, right;
2509     int first = string[0];
2510     size_t matchLen = 0;
2511     size_t soff = 1;
2512 
2513     if (slen < 2)
2514         return(NULL);
2515     if (!IS_ASCII_LETTER(first))
2516         return(NULL);
2517 
2518     /*
2519      * Look up range by first character
2520      */
2521     first &= 63;
2522     left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8;
2523     right = left + htmlEntAlpha[first*3+2];
2524 
2525     /*
2526      * Binary search
2527      */
2528     while (left < right) {
2529         const xmlChar *bytes;
2530         unsigned mid;
2531         size_t len;
2532         int cmp;
2533 
2534         mid = left + (right - left) / 2;
2535         bytes = htmlEntStrings + htmlEntValues[mid];
2536         len = bytes[0] & ~ENT_F_ALL;
2537 
2538         cmp = string[soff] - bytes[1];
2539 
2540         if (cmp == 0) {
2541             if (slen < len) {
2542                 cmp = strncmp((const char *) string + soff + 1,
2543                               (const char *) bytes + 2,
2544                               slen - 1);
2545                 /* Prefix can never match */
2546                 if (cmp == 0)
2547                     break;
2548             } else {
2549                 cmp = strncmp((const char *) string + soff + 1,
2550                               (const char *) bytes + 2,
2551                               len - 1);
2552             }
2553         }
2554 
2555         if (cmp < 0) {
2556             right = mid;
2557         } else if (cmp > 0) {
2558             left = mid + 1;
2559         } else {
2560             int term = soff + len < slen ? string[soff + len] : 0;
2561             int isAlnum, isTerm;
2562 
2563             isAlnum = IS_ALNUM(term);
2564             isTerm = ((term == ';') ||
2565                       ((bytes[0] & ENT_F_SEMICOLON) &&
2566                        ((!isAttr) ||
2567                         ((!isAlnum) && (term != '=')))));
2568 
2569             if (isTerm) {
2570                 match = bytes + len + 1;
2571                 matchLen = soff + len;
2572                 if (term == ';')
2573                     matchLen += 1;
2574             }
2575 
2576             if (bytes[0] & ENT_F_SUBTABLE) {
2577                 if (isTerm)
2578                     match += 2;
2579 
2580                 if ((isAlnum) && (soff + len < slen)) {
2581                     left = mid + bytes[len + 1];
2582                     right = left + bytes[len + 2];
2583                     soff += len;
2584                     continue;
2585                 }
2586             }
2587 
2588             break;
2589         }
2590     }
2591 
2592     if (match == NULL)
2593         return(NULL);
2594 
2595     *nlen = matchLen;
2596     *rlen = match[0];
2597     return(match + 1);
2598 }
2599 
2600 /**
2601  * htmlParseData:
2602  * @ctxt:  an HTML parser context
2603  * @mask:  mask of terminating characters
2604  * @comment:  true if parsing a comment
2605  * @refs:  true if references are allowed
2606  * @maxLength:  maximum output length
2607  *
2608  * Parse data until terminator is reached.
2609  *
2610  * Returns the parsed string or NULL in case of errors.
2611  */
2612 
2613 static xmlChar *
htmlParseData(htmlParserCtxtPtr ctxt,htmlAsciiMask mask,int comment,int refs,int maxLength)2614 htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
2615               int comment, int refs, int maxLength) {
2616     xmlParserInputPtr input = ctxt->input;
2617     xmlChar *ret = NULL;
2618     xmlChar *buffer;
2619     xmlChar utf8Char[4];
2620     size_t buffer_size;
2621     size_t used;
2622     int eof = PARSER_PROGRESSIVE(ctxt);
2623     int line, col;
2624     int termSkip = -1;
2625 
2626     used = 0;
2627     buffer_size = ctxt->spaceMax;
2628     buffer = (xmlChar *) ctxt->spaceTab;
2629     if (buffer == NULL) {
2630         buffer_size = 500;
2631         buffer = xmlMalloc(buffer_size + 1);
2632         if (buffer == NULL) {
2633             htmlErrMemory(ctxt);
2634             return(NULL);
2635         }
2636     }
2637 
2638     line = input->line;
2639     col = input->col;
2640 
2641     while (!PARSER_STOPPED(ctxt)) {
2642         const xmlChar *chunk, *in, *repl;
2643         size_t avail, chunkSize, extraSize;
2644         int replSize;
2645         int skip = 0;
2646         int ncr = 0;
2647         int ncrSize = 0;
2648         int cp = 0;
2649 
2650         chunk = input->cur;
2651         avail = input->end - chunk;
2652         in = chunk;
2653 
2654         repl = BAD_CAST "";
2655         replSize = 0;
2656 
2657         while (!PARSER_STOPPED(ctxt)) {
2658             size_t j;
2659             int cur, size;
2660 
2661             if ((!eof) && (avail <= 64)) {
2662                 size_t oldAvail = avail;
2663                 size_t off = in - chunk;
2664 
2665                 input->cur = in;
2666 
2667                 xmlParserGrow(ctxt);
2668 
2669                 in = input->cur;
2670                 chunk = in - off;
2671                 input->cur = chunk;
2672                 avail = input->end - in;
2673 
2674                 if (oldAvail == avail)
2675                     eof = 1;
2676             }
2677 
2678             if (avail == 0) {
2679                 termSkip = 0;
2680                 break;
2681             }
2682 
2683             cur = *in;
2684             size = 1;
2685             col += 1;
2686 
2687             if (htmlMaskMatch(mask, cur)) {
2688                 if (comment) {
2689                     if (avail < 2) {
2690                         termSkip = 1;
2691                     } else if (in[1] == '-') {
2692                         if  (avail < 3) {
2693                             termSkip = 2;
2694                         } else if (in[2] == '>') {
2695                             termSkip = 3;
2696                         } else if (in[2] == '!') {
2697                             if (avail < 4)
2698                                 termSkip = 3;
2699                             else if (in[3] == '>')
2700                                 termSkip = 4;
2701                         }
2702                     }
2703 
2704                     if (termSkip >= 0)
2705                         break;
2706                 } else {
2707                     termSkip = 0;
2708                     break;
2709                 }
2710             }
2711 
2712             if (ncr) {
2713                 int lc = cur | 0x20;
2714                 int digit;
2715 
2716                 if ((cur >= '0') && (cur <= '9')) {
2717                     digit = cur - '0';
2718                 } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
2719                     digit = (lc - 'a') + 10;
2720                 } else {
2721                     if (cur == ';') {
2722                         in += 1;
2723                         size += 1;
2724                         ncrSize += 1;
2725                     }
2726                     goto next_chunk;
2727                 }
2728 
2729                 cp = cp * ncr + digit;
2730                 if (cp >= 0x110000)
2731                     cp = 0x110000;
2732 
2733                 ncrSize += 1;
2734 
2735                 goto next_char;
2736             }
2737 
2738             switch (cur) {
2739             case '&':
2740                 if (!refs)
2741                     break;
2742 
2743                 j = 1;
2744 
2745                 if ((j < avail) && (in[j] == '#')) {
2746                     j += 1;
2747                     if (j < avail) {
2748                         if ((in[j] | 0x20) == 'x') {
2749                             j += 1;
2750                             if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
2751                                 ncr = 16;
2752                                 size = 3;
2753                                 ncrSize = 3;
2754                                 cp = 0;
2755                             }
2756                         } else if (IS_ASCII_DIGIT(in[j])) {
2757                             ncr = 10;
2758                             size = 2;
2759                             ncrSize = 2;
2760                             cp = 0;
2761                         }
2762                     }
2763                 } else {
2764                     repl = htmlFindEntityPrefix(in + j,
2765                                                 avail - j,
2766                                                 /* isAttr */ 1,
2767                                                 &skip, &replSize);
2768                     if (repl != NULL) {
2769                         skip += 1;
2770                         goto next_chunk;
2771                     }
2772 
2773                     skip = 0;
2774                 }
2775 
2776                 break;
2777 
2778             case '\0':
2779                 skip = 1;
2780                 repl = BAD_CAST "\xEF\xBF\xBD";
2781                 replSize = 3;
2782                 goto next_chunk;
2783 
2784             case '\n':
2785                 line += 1;
2786                 col = 1;
2787                 break;
2788 
2789             case '\r':
2790                 skip = 1;
2791                 if (in[1] != 0x0A) {
2792                     repl = BAD_CAST "\x0A";
2793                     replSize = 1;
2794                 }
2795                 goto next_chunk;
2796 
2797             default:
2798                 if (cur < 0x80)
2799                     break;
2800 
2801                 if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
2802                     xmlChar * guess;
2803 
2804 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2805                     guess = NULL;
2806 #else
2807                     guess = htmlFindEncoding(ctxt);
2808 #endif
2809                     if (guess == NULL) {
2810                         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
2811                     } else {
2812                         xmlSwitchEncodingName(ctxt, (const char *) guess);
2813                         xmlFree(guess);
2814                     }
2815                     input->flags |= XML_INPUT_HAS_ENCODING;
2816 
2817                     goto restart;
2818                 }
2819 
2820                 size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2821 
2822                 if (size <= 0) {
2823                     skip = 1;
2824                     repl = BAD_CAST "\xEF\xBF\xBD";
2825                     replSize = 3;
2826                     goto next_chunk;
2827                 }
2828 
2829                 break;
2830             }
2831 
2832 next_char:
2833             in += size;
2834             avail -= size;
2835         }
2836 
2837 next_chunk:
2838         if (ncrSize > 0) {
2839             skip = ncrSize;
2840             in -= ncrSize;
2841 
2842             repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
2843         }
2844 
2845         chunkSize = in - chunk;
2846         extraSize = chunkSize + replSize;
2847 
2848         if (extraSize > maxLength - used) {
2849             htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
2850                          "value too long\n", NULL, NULL);
2851             goto error;
2852         }
2853 
2854         if (extraSize > buffer_size - used) {
2855             size_t newSize = (used + extraSize) * 2;
2856             xmlChar *tmp = xmlRealloc(buffer, newSize + 1);
2857 
2858             if (tmp == NULL) {
2859                 htmlErrMemory(ctxt);
2860                 goto error;
2861             }
2862             buffer = tmp;
2863             buffer_size = newSize;
2864         }
2865 
2866         if (chunkSize > 0) {
2867             input->cur += chunkSize;
2868             memcpy(buffer + used, chunk, chunkSize);
2869             used += chunkSize;
2870         }
2871 
2872         input->cur += skip;
2873         if (replSize > 0) {
2874             memcpy(buffer + used, repl, replSize);
2875             used += replSize;
2876         }
2877 
2878         SHRINK;
2879 
2880         if (termSkip >= 0)
2881             break;
2882 
2883 restart:
2884         ;
2885     }
2886 
2887     if (termSkip > 0) {
2888         input->cur += termSkip;
2889         col += termSkip;
2890     }
2891 
2892     input->line = line;
2893     input->col = col;
2894 
2895     ret = xmlMalloc(used + 1);
2896     if (ret == NULL) {
2897         htmlErrMemory(ctxt);
2898     } else {
2899         memcpy(ret, buffer, used);
2900         ret[used] = 0;
2901     }
2902 
2903 error:
2904     ctxt->spaceTab = (void *) buffer;
2905     ctxt->spaceMax = buffer_size;
2906 
2907     return(ret);
2908 }
2909 
2910 /**
2911  * htmlParseEntityRef:
2912  * @ctxt:  an HTML parser context
2913  * @str:  location to store the entity name
2914  *
2915  * DEPRECATED: Internal function, don't use.
2916  *
2917  * Returns NULL.
2918  */
2919 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,const xmlChar ** str ATTRIBUTE_UNUSED)2920 htmlParseEntityRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED,
2921                    const xmlChar **str ATTRIBUTE_UNUSED) {
2922     return(NULL);
2923 }
2924 
2925 /**
2926  * htmlParseAttValue:
2927  * @ctxt:  an HTML parser context
2928  *
2929  * parse a value for an attribute
2930  * Note: the parser won't do substitution of entities here, this
2931  * will be handled later in xmlStringGetNodeList, unless it was
2932  * asked for ctxt->replaceEntities != 0
2933  *
2934  * Returns the AttValue parsed or NULL.
2935  */
2936 
2937 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2938 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2939     xmlChar *ret = NULL;
2940     int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
2941                     XML_MAX_HUGE_LENGTH :
2942                     XML_MAX_TEXT_LENGTH;
2943 
2944     if (CUR == '"') {
2945         SKIP(1);
2946 	ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength);
2947         if (CUR == '"')
2948             SKIP(1);
2949     } else if (CUR == '\'') {
2950         SKIP(1);
2951 	ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength);
2952         if (CUR == '\'')
2953             SKIP(1);
2954     } else {
2955 	ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength);
2956     }
2957     return(ret);
2958 }
2959 
2960 static void
htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt,const xmlChar * buf,int size,int mode)2961 htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
2962                         int size, int mode) {
2963     if ((ctxt->sax == NULL) || (ctxt->disableSAX))
2964         return;
2965 
2966     if ((mode == 0) || (mode == DATA_RCDATA) ||
2967         (ctxt->sax->cdataBlock == NULL)) {
2968         int blank = areBlanks(ctxt, buf, size);
2969 
2970         if ((mode == 0) && (blank > 0) && (!ctxt->keepBlanks)) {
2971             if (ctxt->sax->ignorableWhitespace != NULL)
2972                 ctxt->sax->ignorableWhitespace(ctxt->userData,
2973                                                buf, size);
2974         } else {
2975             if ((mode == 0) && (blank < 0))
2976                 htmlCheckParagraph(ctxt);
2977 
2978             if (ctxt->sax->characters != NULL)
2979                 ctxt->sax->characters(ctxt->userData, buf, size);
2980         }
2981     } else {
2982         /*
2983          * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2984          */
2985         ctxt->sax->cdataBlock(ctxt->userData, buf, size);
2986     }
2987 }
2988 
/**
 * htmlParseCharData:
 * @ctxt:  an HTML parser context
 * @partial: true if the input buffer is incomplete
 *
 * Parse character data and references.
 *
 * The input is scanned in chunks. A chunk is a run of bytes that can
 * be passed to the SAX callback unchanged; it ends whenever a byte
 * sequence has to be replaced (character reference, NUL, CR, invalid
 * UTF-8) or when the data ends. The content mode is taken from
 * ctxt->endCheckState on entry and written back on exit, or reset
 * to 0 once the end of the data was found.
 *
 * When @partial is set and the remaining bytes could be the start of
 * a longer construct, scanning stops early so the construct can be
 * parsed again when more input is available.
 *
 * Returns 1 if the end of the character data was found ('<' in mode 0,
 * or a matching end tag in the other modes), 0 otherwise.
 */

static int
htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
    xmlParserInputPtr input = ctxt->input;
    xmlChar utf8Char[4];
    int complete = 0;
    int done = 0;
    int mode;
    /* When set, xmlParserGrow is never called; starts set in progressive mode */
    int eof = PARSER_PROGRESSIVE(ctxt);
    int line, col;

    mode = ctxt->endCheckState;

    line = input->line;
    col = input->col;

    /* Outer loop: one iteration per chunk */
    while (!PARSER_STOPPED(ctxt)) {
        const xmlChar *chunk, *in, *repl;
        size_t avail;
        int replSize;
        int skip = 0;     /* input bytes consumed but not copied to output */
        int ncr = 0;      /* radix (10 or 16) while inside a numeric char ref */
        int ncrSize = 0;  /* bytes of the numeric char ref scanned so far */
        int cp = 0;       /* code point accumulated from the digits */

        chunk = input->cur;
        avail = input->end - chunk;
        in = chunk;

        repl = BAD_CAST "";
        replSize = 0;

        /* Inner loop: advance 'in' until the chunk has to be flushed */
        while (!PARSER_STOPPED(ctxt)) {
            size_t j;
            int cur, size;

            if (avail <= 64) {
                if (!eof) {
                    size_t oldAvail = avail;
                    size_t off = in - chunk;

                    /*
                     * Grow from the current scan position, then restore
                     * the chunk start relative to the new position.
                     */
                    input->cur = in;

                    xmlParserGrow(ctxt);

                    in = input->cur;
                    chunk = in - off;
                    input->cur = chunk;
                    avail = input->end - in;

                    /* No new data arrived: stop trying to grow */
                    if (oldAvail == avail)
                        eof = 1;
                }

                if (avail == 0) {
                    /*
                     * Input ended inside a numeric character reference
                     * and more data may follow: back up to its start
                     * and leave it unconsumed.
                     */
                    if ((partial) && (ncr)) {
                        in -= ncrSize;
                        ncrSize = 0;
                    }

                    done = 1;
                    break;
                }
            }

            /* Accelerator */
            if (!ncr) {
                while (avail > 0) {
                    /*
                     * 256-bit bitmap of bytes that need handling below:
                     * 0x00, 0x0A, 0x0D, '&', '-', '<' and all bytes
                     * >= 0x80. Every other byte is skipped here.
                     */
                    static const unsigned mask[8] = {
                        0x00002401, 0x10002040,
                        0x00000000, 0x00000000,
                        0xFFFFFFFF, 0xFFFFFFFF,
                        0xFFFFFFFF, 0xFFFFFFFF
                    };
                    cur = *in;
                    if ((1u << (cur & 0x1F)) & mask[cur >> 5])
                        break;
                    col += 1;
                    in += 1;
                    avail -= 1;
                }

                /* Go back to the top to refill or to handle end of input */
                if ((!eof) && (avail <= 64))
                    continue;
                if (avail == 0)
                    continue;
            }

            cur = *in;
            size = 1;
            col += 1;

            if (ncr) {
                /* Inside a numeric character reference: accumulate digits */
                int lc = cur | 0x20;
                int digit;

                if ((cur >= '0') && (cur <= '9')) {
                    digit = cur - '0';
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
                    digit = (lc - 'a') + 10;
                } else {
                    /* End of digits; a terminating ';' is part of the ref */
                    if (cur == ';') {
                        in += 1;
                        size += 1;
                        ncrSize += 1;
                    }
                    goto next_chunk;
                }

                /* Clamp at 0x110000 so the accumulator cannot keep growing */
                cp = cp * ncr + digit;
                if (cp >= 0x110000)
                    cp = 0x110000;

                ncrSize += 1;

                goto next_char;
            }

            switch (cur) {
            case '<':
                /* In mode 0 any '<' ends the character data */
                if (mode == 0) {
                    done = 1;
                    complete = 1;
                    goto next_chunk;
                }
                /* In DATA_PLAINTEXT '<' is ordinary text */
                if (mode == DATA_PLAINTEXT)
                    break;

                j = 1;
                if (j < avail) {
                    if ((mode == DATA_SCRIPT) && (in[j] == '!')) {
                        /* Check for comment start */

                        j += 1;
                        if ((j < avail) && (in[j] == '-')) {
                            j += 1;
                            if ((j < avail) && (in[j] == '-'))
                                mode = DATA_SCRIPT_ESC1;
                        }
                    } else {
                        int i = 0;
                        int solidus = 0;

                        /* Check for tag */

                        if (in[j] == '/') {
                            j += 1;
                            solidus = 1;
                        }

                        /*
                         * Compare against the current element name,
                         * folding input letters with | 0x20.
                         * NOTE(review): assumes ctxt->name is stored in
                         * lower case -- confirm.
                         */
                        if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
                            while ((j < avail) &&
                                   (ctxt->name[i] != 0) &&
                                   (ctxt->name[i] == (in[j] | 0x20))) {
                                i += 1;
                                j += 1;
                            }

                            if ((ctxt->name[i] == 0) && (j < avail)) {
                                int c = in[j];

                                /* The name must be followed by '>', '/' or whitespace */
                                if ((c == '>') || (c == '/') ||
                                    (IS_WS_HTML(c))) {
                                    if ((mode == DATA_SCRIPT_ESC1) &&
                                        (!solidus)) {
                                        /* Start tag inside "<!--" */
                                        mode = DATA_SCRIPT_ESC2;
                                    } else if (mode == DATA_SCRIPT_ESC2) {
                                        /* End tag while in ESC2: back to ESC1 */
                                        mode = DATA_SCRIPT_ESC1;
                                    } else {
                                        /* Matching end tag: data is finished */
                                        complete = 1;
                                        done = 1;
                                        goto next_chunk;
                                    }
                                }
                            }
                        }
                    }
                }

                /* Ran out of lookahead: wait for more input */
                if ((partial) && (j >= avail)) {
                    done = 1;
                    goto next_chunk;
                }

                break;

            case '-':
                if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2))
                    break;

                /* Check for comment end */

                j = 1;
                if ((j < avail) && (in[j] == '-')) {
                    j += 1;
                    if ((j < avail) && (in[j] == '>'))
                        mode = DATA_SCRIPT;
                }

                /* Ran out of lookahead: wait for more input */
                if ((partial) && (j >= avail)) {
                    done = 1;
                    goto next_chunk;
                }

                break;

            case '&':
                /* References are only decoded in mode 0 and DATA_RCDATA */
                if ((mode != 0) && (mode != DATA_RCDATA))
                    break;

                j = 1;

                if ((j < avail) && (in[j] == '#')) {
                    /*
                     * Numeric reference: "&#x" followed by a hex digit or
                     * "&#" followed by a decimal digit. The prefix is
                     * consumed here, the digits on later iterations.
                     */
                    j += 1;
                    if (j < avail) {
                        if ((in[j] | 0x20) == 'x') {
                            j += 1;
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
                                ncr = 16;
                                size = 3;
                                ncrSize = 3;
                                cp = 0;
                            }
                        } else if (IS_ASCII_DIGIT(in[j])) {
                            ncr = 10;
                            size = 2;
                            ncrSize = 2;
                            cp = 0;
                        }
                    }
                } else {
                    if (partial) {
                        int terminated = 0;
                        size_t i;

                        /*
                         * &CounterClockwiseContourIntegral; has 33 bytes.
                         */
                        /*
                         * The name counts as terminated once a non-letter
                         * is seen or 32 bytes have been scanned. Otherwise
                         * wait for more input.
                         */
                        for (i = 1; i < avail; i++) {
                            if ((i >= 32) || !IS_ASCII_LETTER(in[i])) {
                                terminated = 1;
                                break;
                            }
                        }

                        if (!terminated) {
                            done = 1;
                            goto next_chunk;
                        }
                    }

                    repl = htmlFindEntityPrefix(in + j,
                                                avail - j,
                                                /* isAttr */ 0,
                                                &skip, &replSize);
                    if (repl != NULL) {
                        /* Also skip the '&' itself */
                        skip += 1;
                        goto next_chunk;
                    }

                    /* No entity matched: the '&' is kept as plain text */
                    skip = 0;
                }

                /* Ran out of lookahead: wait for more input */
                if ((partial) && (j >= avail)) {
                    done = 1;
                    goto next_chunk;
                }

                break;

            case '\0':
                /* NUL is replaced by U+FFFD (UTF-8: EF BF BD) */
                skip = 1;
                repl = BAD_CAST "\xEF\xBF\xBD";
                replSize = 3;
                goto next_chunk;

            case '\n':
                line += 1;
                col = 1;
                break;

            case '\r':
                /* Need the next byte to tell CR LF from a lone CR */
                if (partial && avail < 2) {
                    done = 1;
                    goto next_chunk;
                }

                /* Drop the CR; a lone CR is replaced by LF */
                skip = 1;
                if (in[1] != 0x0A) {
                    repl = BAD_CAST "\x0A";
                    replSize = 1;
                }
                goto next_chunk;

            default:
                if (cur < 0x80)
                    break;

                /* First non-ASCII byte and no encoding selected yet */
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
                    xmlChar * guess;

                    /* Flush the text scanned so far before switching */
                    if (in > chunk)
                        goto next_chunk;

#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
                    guess = NULL;
#else
                    guess = htmlFindEncoding(ctxt);
#endif
                    /* Fall back to ISO-8859-1 when no encoding was found */
                    if (guess == NULL) {
                        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
                    } else {
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
                        xmlFree(guess);
                    }
                    input->flags |= XML_INPUT_HAS_ENCODING;

                    /* Start a new chunk from input->cur after the switch */
                    goto restart;
                }

                size = htmlValidateUtf8(ctxt, in, avail, partial);

                /*
                 * NOTE(review): a result of 0 in partial mode presumably
                 * means an incomplete sequence -- confirm against
                 * htmlValidateUtf8. Scanning stops to wait for more input.
                 */
                if ((partial) && (size == 0)) {
                    done = 1;
                    goto next_chunk;
                }

                /* Otherwise skip one byte and emit U+FFFD instead */
                if (size <= 0) {
                    skip = 1;
                    repl = BAD_CAST "\xEF\xBF\xBD";
                    replSize = 3;
                    goto next_chunk;
                }

                break;
            }

next_char:
            in += size;
            avail -= size;
        }

next_chunk:
        /*
         * A numeric reference was scanned: exclude its bytes from the
         * chunk, skip them, and emit the encoded code point instead.
         */
        if (ncrSize > 0) {
            skip = ncrSize;
            in -= ncrSize;

            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
        }

        /* Report the unmodified text first ... */
        if (in > chunk) {
            input->cur += in - chunk;
            htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);
        }

        /* ... then consume the replaced bytes and report the replacement */
        input->cur += skip;
        if (replSize > 0)
            htmlCharDataSAXCallback(ctxt, repl, replSize, mode);

        SHRINK;

        if (done)
            break;

restart:
        ;
    }

    input->line = line;
    input->col = col;

    /* Keep the mode for the next call unless the data is finished */
    if (complete)
        ctxt->endCheckState = 0;
    else
        ctxt->endCheckState = mode;

    return(complete);
}
3376 
3377 /**
3378  * htmlParseComment:
3379  * @ctxt:  an HTML parser context
3380  * @bogus:  true if this is a bogus comment
3381  *
3382  * Parse an HTML comment
3383  */
3384 static void
htmlParseComment(htmlParserCtxtPtr ctxt,int bogus)3385 htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
3386     const xmlChar *comment = BAD_CAST "";
3387     xmlChar *buf = NULL;
3388     int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3389                     XML_MAX_HUGE_LENGTH :
3390                     XML_MAX_TEXT_LENGTH;
3391 
3392     if (bogus) {
3393         buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength);
3394         if (CUR == '>')
3395             SKIP(1);
3396         comment = buf;
3397     } else {
3398         if (CUR == '>') {
3399             SKIP(1);
3400         } else if ((CUR == '-') && (NXT(1) == '>')) {
3401             SKIP(2);
3402         } else {
3403             buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength);
3404             comment = buf;
3405         }
3406     }
3407 
3408     if (comment == NULL)
3409         return;
3410 
3411     if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3412         (!ctxt->disableSAX))
3413         ctxt->sax->comment(ctxt->userData, comment);
3414 
3415     xmlFree(buf);
3416 }
3417 
3418 /**
3419  * htmlParseCharRef:
3420  * @ctxt:  an HTML parser context
3421  *
3422  * DEPRECATED: Internal function, don't use.
3423  *
3424  * Returns 0
3425  */
3426 int
htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED)3427 htmlParseCharRef(htmlParserCtxtPtr ctxt ATTRIBUTE_UNUSED) {
3428     return(0);
3429 }
3430 
3431 
3432 /**
3433  * htmlParseDoctypeLiteral:
3434  * @ctxt:  an HTML parser context
3435  *
3436  * Parse a DOCTYPE SYTSTEM or PUBLIC literal.
3437  *
3438  * Returns the literal or NULL in case of error.
3439  */
3440 
3441 static xmlChar *
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt)3442 htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
3443     xmlChar *ret;
3444     int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3445                     XML_MAX_TEXT_LENGTH :
3446                     XML_MAX_NAME_LENGTH;
3447 
3448     if (CUR == '"') {
3449         SKIP(1);
3450         ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength);
3451         if (CUR == '"')
3452             SKIP(1);
3453     } else if (CUR == '\'') {
3454         SKIP(1);
3455         ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength);
3456         if (CUR == '\'')
3457             SKIP(1);
3458     } else {
3459         return(NULL);
3460     }
3461 
3462     return(ret);
3463 }
3464 
3465 static void
htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt)3466 htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) {
3467     const xmlChar *in;
3468     size_t avail;
3469     int eof = PARSER_PROGRESSIVE(ctxt);
3470     int line, col;
3471 
3472     line = ctxt->input->line;
3473     col = ctxt->input->col;
3474 
3475     in = ctxt->input->cur;
3476     avail = ctxt->input->end - in;
3477 
3478     while (!PARSER_STOPPED(ctxt)) {
3479         int cur;
3480 
3481         if ((!eof) && (avail <= 64)) {
3482             size_t oldAvail = avail;
3483 
3484             ctxt->input->cur = in;
3485 
3486             xmlParserGrow(ctxt);
3487 
3488             in = ctxt->input->cur;
3489             avail = ctxt->input->end - in;
3490 
3491             if (oldAvail == avail)
3492                 eof = 1;
3493         }
3494 
3495         if (avail == 0)
3496             break;
3497 
3498         col += 1;
3499 
3500         cur = *in;
3501         if (cur == '>') {
3502             in += 1;
3503             break;
3504         } else if (cur == 0x0A) {
3505             line += 1;
3506             col = 1;
3507         }
3508 
3509         in += 1;
3510         avail -= 1;
3511 
3512         SHRINK;
3513     }
3514 
3515     ctxt->input->cur = in;
3516     ctxt->input->line = line;
3517     ctxt->input->col = col;
3518 }
3519 
3520 /**
3521  * htmlParseDocTypeDecl:
3522  * @ctxt:  an HTML parser context
3523  *
3524  * Parse a DOCTYPE declaration.
3525  */
3526 
3527 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3528 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3529     xmlChar *name = NULL;
3530     xmlChar *publicId = NULL;
3531     xmlChar *URI = NULL;
3532     int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3533                     XML_MAX_TEXT_LENGTH :
3534                     XML_MAX_NAME_LENGTH;
3535 
3536     /*
3537      * We know that '<!DOCTYPE' has been detected.
3538      */
3539     SKIP(9);
3540 
3541     SKIP_BLANKS;
3542 
3543     if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) {
3544         name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength);
3545 
3546         if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) {
3547             xmlChar *cur;
3548 
3549             for (cur = name; *cur; cur++) {
3550                 if (IS_UPPER(*cur))
3551                     *cur += 0x20;
3552             }
3553         }
3554 
3555         SKIP_BLANKS;
3556     }
3557 
3558     /*
3559      * Check for SystemID and publicId
3560      */
3561     if ((UPPER == 'P') && (UPP(1) == 'U') &&
3562 	(UPP(2) == 'B') && (UPP(3) == 'L') &&
3563 	(UPP(4) == 'I') && (UPP(5) == 'C')) {
3564         SKIP(6);
3565         SKIP_BLANKS;
3566 	publicId = htmlParseDoctypeLiteral(ctxt);
3567 	if (publicId == NULL)
3568             goto bogus;
3569         SKIP_BLANKS;
3570 	URI = htmlParseDoctypeLiteral(ctxt);
3571     } else if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3572                (UPP(2) == 'S') && (UPP(3) == 'T') &&
3573 	       (UPP(4) == 'E') && (UPP(5) == 'M')) {
3574         SKIP(6);
3575         SKIP_BLANKS;
3576 	URI = htmlParseDoctypeLiteral(ctxt);
3577     }
3578 
3579 bogus:
3580     htmlSkipBogusDoctype(ctxt);
3581 
3582     /*
3583      * Create or update the document accordingly to the DOCTYPE
3584      */
3585     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3586 	(!ctxt->disableSAX))
3587 	ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI);
3588 
3589     xmlFree(name);
3590     xmlFree(URI);
3591     xmlFree(publicId);
3592 }
3593 
3594 /**
3595  * htmlParseAttribute:
3596  * @ctxt:  an HTML parser context
3597  * @value:  a xmlChar ** used to store the value of the attribute
3598  *
3599  * parse an attribute
3600  *
3601  * [41] Attribute ::= Name Eq AttValue
3602  *
3603  * [25] Eq ::= S? '=' S?
3604  *
3605  * With namespace:
3606  *
3607  * [NS 11] Attribute ::= QName Eq AttValue
3608  *
3609  * Also the case QName == xmlns:??? is handled independently as a namespace
3610  * definition.
3611  *
3612  * Returns the attribute name, and the value in *value.
3613  */
3614 
3615 static xmlHashedString
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3616 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3617     xmlHashedString hname;
3618     xmlChar *val = NULL;
3619 
3620     *value = NULL;
3621     hname = htmlParseHTMLName(ctxt, 1);
3622     if (hname.name == NULL)
3623         return(hname);
3624 
3625     /*
3626      * read the value
3627      */
3628     SKIP_BLANKS;
3629     if (CUR == '=') {
3630         SKIP(1);
3631 	SKIP_BLANKS;
3632 	val = htmlParseAttValue(ctxt);
3633     }
3634 
3635     *value = val;
3636     return(hname);
3637 }
3638 
3639 /**
3640  * htmlCheckEncoding:
3641  * @ctxt:  an HTML parser context
3642  * @attvalue: the attribute value
3643  *
3644  * Checks an http-equiv attribute from a Meta tag to detect
3645  * the encoding
3646  * If a new encoding is detected the parser is switched to decode
3647  * it and pass UTF8
3648  */
3649 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3650 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3651     const xmlChar *encoding;
3652     xmlChar *copy;
3653 
3654     if (!attvalue)
3655 	return;
3656 
3657     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3658     if (encoding != NULL) {
3659 	encoding += 7;
3660     }
3661     /*
3662      * skip blank
3663      */
3664     if (encoding && IS_WS_HTML(*encoding))
3665 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3666     if (encoding && *encoding == '=') {
3667 	encoding ++;
3668         copy = xmlStrdup(encoding);
3669         if (copy == NULL)
3670             htmlErrMemory(ctxt);
3671 	xmlSetDeclaredEncoding(ctxt, copy);
3672     }
3673 }
3674 
3675 /**
3676  * htmlCheckMeta:
3677  * @ctxt:  an HTML parser context
3678  * @atts:  the attributes values
3679  *
3680  * Checks an attributes from a Meta tag
3681  */
3682 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3683 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3684     int i;
3685     const xmlChar *att, *value;
3686     int http = 0;
3687     const xmlChar *content = NULL;
3688 
3689     if ((ctxt == NULL) || (atts == NULL))
3690 	return;
3691 
3692     i = 0;
3693     att = atts[i++];
3694     while (att != NULL) {
3695 	value = atts[i++];
3696         if (value != NULL) {
3697             if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3698                 (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3699                 http = 1;
3700             } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3701                 xmlChar *copy;
3702 
3703                 copy = xmlStrdup(value);
3704                 if (copy == NULL)
3705                     htmlErrMemory(ctxt);
3706                 xmlSetDeclaredEncoding(ctxt, copy);
3707             } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3708                 content = value;
3709             }
3710         }
3711 	att = atts[i++];
3712     }
3713     if ((http) && (content != NULL))
3714 	htmlCheckEncoding(ctxt, content);
3715 
3716 }
3717 
3718 /**
3719  * htmlAttrHashInsert:
3720  * @ctxt: parser context
3721  * @size: size of the hash table
3722  * @name: attribute name
3723  * @hashValue: hash value of name
3724  * @aindex: attribute index (this is a multiple of 5)
3725  *
3726  * Inserts a new attribute into the hash table.
3727  *
3728  * Returns INT_MAX if no existing attribute was found, the attribute
3729  * index if an attribute was found, -1 if a memory allocation failed.
3730  */
3731 static int
htmlAttrHashInsert(xmlParserCtxtPtr ctxt,unsigned size,const xmlChar * name,unsigned hashValue,int aindex)3732 htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name,
3733                    unsigned hashValue, int aindex) {
3734     xmlAttrHashBucket *table = ctxt->attrHash;
3735     xmlAttrHashBucket *bucket;
3736     unsigned hindex;
3737 
3738     hindex = hashValue & (size - 1);
3739     bucket = &table[hindex];
3740 
3741     while (bucket->index >= 0) {
3742         const xmlChar **atts = &ctxt->atts[bucket->index];
3743 
3744         if (name == atts[0])
3745             return(bucket->index);
3746 
3747         hindex++;
3748         bucket++;
3749         if (hindex >= size) {
3750             hindex = 0;
3751             bucket = table;
3752         }
3753     }
3754 
3755     bucket->index = aindex;
3756 
3757     return(INT_MAX);
3758 }
3759 
3760 /**
3761  * htmlParseStartTag:
3762  * @ctxt:  an HTML parser context
3763  *
3764  * parse a start of tag either for rule element or
3765  * EmptyElement. In both case we don't parse the tag closing chars.
3766  *
3767  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3768  *
3769  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3770  *
3771  * With namespace:
3772  *
3773  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3774  *
3775  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3776  *
3777  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3778  */
3779 
3780 static void
htmlParseStartTag(htmlParserCtxtPtr ctxt)3781 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3782     const xmlChar *name;
3783     const xmlChar *attname;
3784     xmlChar *attvalue;
3785     const xmlChar **atts;
3786     int nbatts = 0;
3787     int maxatts;
3788     int meta = 0;
3789     int i;
3790     int discardtag = 0;
3791 
3792     ctxt->endCheckState = 0;
3793 
3794     SKIP(1);
3795 
3796     atts = ctxt->atts;
3797     maxatts = ctxt->maxatts;
3798 
3799     GROW;
3800     name = htmlParseHTMLName(ctxt, 0).name;
3801     if (name == NULL)
3802         return;
3803     if (xmlStrEqual(name, BAD_CAST"meta"))
3804 	meta = 1;
3805 
3806     if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
3807         /*
3808          * Check for auto-closure of HTML elements.
3809          */
3810         htmlAutoClose(ctxt, name);
3811 
3812         /*
3813          * Check for implied HTML elements.
3814          */
3815         htmlCheckImplied(ctxt, name);
3816 
3817         /*
3818          * Avoid html at any level > 0, head at any level != 1
3819          * or any attempt to recurse body
3820          */
3821         if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3822             htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3823                          "htmlParseStartTag: misplaced <html> tag\n",
3824                          name, NULL);
3825             discardtag = 1;
3826             ctxt->depth++;
3827         }
3828         if ((ctxt->nameNr != 1) &&
3829             (xmlStrEqual(name, BAD_CAST"head"))) {
3830             htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3831                          "htmlParseStartTag: misplaced <head> tag\n",
3832                          name, NULL);
3833             discardtag = 1;
3834             ctxt->depth++;
3835         }
3836         if (xmlStrEqual(name, BAD_CAST"body")) {
3837             int indx;
3838             for (indx = 0;indx < ctxt->nameNr;indx++) {
3839                 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3840                     htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3841                                  "htmlParseStartTag: misplaced <body> tag\n",
3842                                  name, NULL);
3843                     discardtag = 1;
3844                     ctxt->depth++;
3845                 }
3846             }
3847         }
3848     }
3849 
3850     /*
3851      * Now parse the attributes, it ends up with the ending
3852      *
3853      * (S Attribute)* S?
3854      */
3855     SKIP_BLANKS;
3856     while ((ctxt->input->cur < ctxt->input->end) &&
3857            (CUR != '>') &&
3858 	   ((CUR != '/') || (NXT(1) != '>')) &&
3859            (PARSER_STOPPED(ctxt) == 0)) {
3860         xmlHashedString hattname;
3861 
3862         /*  unexpected-solidus-in-tag */
3863         if (CUR == '/') {
3864             SKIP(1);
3865             SKIP_BLANKS;
3866             continue;
3867         }
3868 	GROW;
3869 	hattname = htmlParseAttribute(ctxt, &attvalue);
3870         attname = hattname.name;
3871 
3872         if (attname != NULL) {
3873 	    /*
3874 	     * Add the pair to atts
3875 	     */
3876 	    if (nbatts + 4 > maxatts) {
3877 	        const xmlChar **tmp;
3878                 unsigned *utmp;
3879                 int newSize;
3880 
3881                 newSize = xmlGrowCapacity(maxatts,
3882                                           sizeof(tmp[0]) * 2 + sizeof(utmp[0]),
3883                                           11, HTML_MAX_ATTRS);
3884 		if (newSize < 0) {
3885 		    htmlErrMemory(ctxt);
3886 		    goto failed;
3887 		}
3888 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3889                 if (newSize < 2)
3890                     newSize = 2;
3891 #endif
3892 	        tmp = xmlRealloc(atts, newSize * sizeof(tmp[0]) * 2);
3893 		if (tmp == NULL) {
3894 		    htmlErrMemory(ctxt);
3895 		    goto failed;
3896 		}
3897                 atts = tmp;
3898 		ctxt->atts = tmp;
3899 
3900 	        utmp = xmlRealloc(ctxt->attallocs, newSize * sizeof(utmp[0]));
3901 		if (utmp == NULL) {
3902 		    htmlErrMemory(ctxt);
3903 		    goto failed;
3904 		}
3905                 ctxt->attallocs = utmp;
3906 
3907                 maxatts = newSize * 2;
3908 		ctxt->maxatts = maxatts;
3909 	    }
3910 
3911             ctxt->attallocs[nbatts/2] = hattname.hashValue;
3912 	    atts[nbatts++] = attname;
3913 	    atts[nbatts++] = attvalue;
3914 
3915             attvalue = NULL;
3916 	}
3917 
3918 failed:
3919         if (attvalue != NULL)
3920             xmlFree(attvalue);
3921 
3922 	SKIP_BLANKS;
3923     }
3924 
3925     if (ctxt->input->cur >= ctxt->input->end) {
3926         discardtag = 1;
3927         goto done;
3928     }
3929 
3930     /*
3931      * Verify that attribute names are unique.
3932      */
3933     if (nbatts > 2) {
3934         unsigned attrHashSize;
3935         int j, k;
3936 
3937         attrHashSize = 4;
3938         while (attrHashSize / 2 < (unsigned) nbatts / 2)
3939             attrHashSize *= 2;
3940 
3941         if (attrHashSize > ctxt->attrHashMax) {
3942             xmlAttrHashBucket *tmp;
3943 
3944             tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0]));
3945             if (tmp == NULL) {
3946                 htmlErrMemory(ctxt);
3947                 goto done;
3948             }
3949 
3950             ctxt->attrHash = tmp;
3951             ctxt->attrHashMax = attrHashSize;
3952         }
3953 
3954         memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0]));
3955 
3956         for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) {
3957             unsigned hashValue;
3958             int res;
3959 
3960             attname = atts[i];
3961             hashValue = ctxt->attallocs[k] | 0x80000000;
3962 
3963             res = htmlAttrHashInsert(ctxt, attrHashSize, attname,
3964                                     hashValue, j);
3965             if (res < 0)
3966                 continue;
3967 
3968             if (res == INT_MAX) {
3969                 atts[j] = atts[i];
3970                 atts[j+1] = atts[i+1];
3971                 j += 2;
3972             } else {
3973                 xmlFree((xmlChar *) atts[i+1]);
3974             }
3975         }
3976 
3977         nbatts = j;
3978     }
3979 
3980     if (nbatts > 0) {
3981         atts[nbatts] = NULL;
3982         atts[nbatts + 1] = NULL;
3983 
3984     /*
3985      * Apple's new libiconv is so broken that you routinely run into
3986      * issues when fuzz testing (by accident with an uninstrumented
3987      * libiconv). Here's a harmless (?) example:
3988      *
3989      * printf '>'             | iconv -f shift_jis -t utf-8 | hexdump -C
3990      * printf '\xfc\x00\x00'  | iconv -f shift_jis -t utf-8 | hexdump -C
3991      * printf '>\xfc\x00\x00' | iconv -f shift_jis -t utf-8 | hexdump -C
3992      *
3993      * The last command fails to detect the illegal sequence.
3994      */
3995 #if !defined(__APPLE__) || \
3996     !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
3997         /*
3998          * Handle specific association to the META tag
3999          */
4000         if (meta)
4001             htmlCheckMeta(ctxt, atts);
4002 #endif
4003     }
4004 
4005     /*
4006      * SAX: Start of Element !
4007      */
4008     if (!discardtag) {
4009         if (ctxt->options & HTML_PARSE_HTML5) {
4010             if (ctxt->nameNr > 0)
4011                 htmlnamePop(ctxt);
4012         }
4013 
4014 	htmlnamePush(ctxt, name);
4015 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4016 	    if (nbatts != 0)
4017 		ctxt->sax->startElement(ctxt->userData, name, atts);
4018 	    else
4019 		ctxt->sax->startElement(ctxt->userData, name, NULL);
4020 	}
4021     }
4022 
4023 done:
4024     if (atts != NULL) {
4025         for (i = 1;i < nbatts;i += 2) {
4026 	    if (atts[i] != NULL)
4027 		xmlFree((xmlChar *) atts[i]);
4028 	}
4029     }
4030 }
4031 
4032 /**
4033  * htmlParseEndTag:
4034  * @ctxt:  an HTML parser context
4035  *
4036  * parse an end of tag
4037  *
4038  * [42] ETag ::= '</' Name S? '>'
4039  *
4040  * With namespace
4041  *
4042  * [NS 9] ETag ::= '</' QName S? '>'
4043  *
4044  * Returns 1 if the current level should be closed.
4045  */
4046 
4047 static void
htmlParseEndTag(htmlParserCtxtPtr ctxt)4048 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4049 {
4050     const xmlChar *name;
4051     const xmlChar *oldname;
4052     int i;
4053 
4054     ctxt->endCheckState = 0;
4055 
4056     SKIP(2);
4057 
4058     if (ctxt->input->cur >= ctxt->input->end) {
4059         htmlCheckParagraph(ctxt);
4060         if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4061             (ctxt->sax->characters != NULL))
4062             ctxt->sax->characters(ctxt->userData,
4063                                   BAD_CAST "</", 2);
4064         return;
4065     }
4066 
4067     if (CUR == '>') {
4068         SKIP(1);
4069         return;
4070     }
4071 
4072     if (!IS_ASCII_LETTER(CUR)) {
4073         htmlParseComment(ctxt, /* bogus */ 1);
4074         return;
4075     }
4076 
4077     name = htmlParseHTMLName(ctxt, 0).name;
4078     if (name == NULL)
4079         return;
4080 
4081     /*
4082      * Parse and ignore attributes.
4083      */
4084     SKIP_BLANKS;
4085     while ((ctxt->input->cur < ctxt->input->end) &&
4086            (CUR != '>') &&
4087 	   ((CUR != '/') || (NXT(1) != '>')) &&
4088            (ctxt->instate != XML_PARSER_EOF)) {
4089         xmlChar *attvalue = NULL;
4090 
4091         /*  unexpected-solidus-in-tag */
4092         if (CUR == '/') {
4093             SKIP(1);
4094             SKIP_BLANKS;
4095             continue;
4096         }
4097 	GROW;
4098 	htmlParseAttribute(ctxt, &attvalue);
4099         if (attvalue != NULL)
4100             xmlFree(attvalue);
4101 
4102 	SKIP_BLANKS;
4103     }
4104 
4105     if (CUR == '>') {
4106         SKIP(1);
4107     } else if ((CUR == '/') && (NXT(1) == '>')) {
4108         SKIP(2);
4109     } else {
4110         return;
4111     }
4112 
4113     if (ctxt->options & HTML_PARSE_HTML5) {
4114         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4115             ctxt->sax->endElement(ctxt->userData, name);
4116         return;
4117     }
4118 
4119     /*
4120      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4121      * out now.
4122      */
4123     if ((ctxt->depth > 0) &&
4124         (xmlStrEqual(name, BAD_CAST "html") ||
4125          xmlStrEqual(name, BAD_CAST "body") ||
4126 	 xmlStrEqual(name, BAD_CAST "head"))) {
4127 	ctxt->depth--;
4128 	return;
4129     }
4130 
4131     /*
4132      * If the name read is not one of the element in the parsing stack
4133      * then return, it's just an error.
4134      */
4135     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4136         if (xmlStrEqual(name, ctxt->nameTab[i]))
4137             break;
4138     }
4139     if (i < 0) {
4140         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4141 	             "Unexpected end tag : %s\n", name, NULL);
4142         return;
4143     }
4144 
4145 
4146     /*
4147      * Check for auto-closure of HTML elements.
4148      */
4149 
4150     htmlAutoCloseOnClose(ctxt, name);
4151 
4152     /*
4153      * Well formedness constraints, opening and closing must match.
4154      * With the exception that the autoclose may have popped stuff out
4155      * of the stack.
4156      */
4157     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4158         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4159                      "Opening and ending tag mismatch: %s and %s\n",
4160                      name, ctxt->name);
4161     }
4162 
4163     /*
4164      * SAX: End of Tag
4165      */
4166     oldname = ctxt->name;
4167     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4168 	htmlParserFinishElementParsing(ctxt);
4169         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4170             ctxt->sax->endElement(ctxt->userData, name);
4171         htmlnamePop(ctxt);
4172     }
4173 }
4174 
4175 /**
4176  * htmlParseContent:
4177  * @ctxt:  an HTML parser context
4178  *
4179  * Parse a content: comment, sub-element, reference or text.
4180  * New version for non recursive htmlParseElementInternal
4181  */
4182 
4183 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4184 htmlParseContent(htmlParserCtxtPtr ctxt) {
4185     GROW;
4186 
4187     while ((PARSER_STOPPED(ctxt) == 0) &&
4188            (ctxt->input->cur < ctxt->input->end)) {
4189         int mode;
4190 
4191         mode = ctxt->endCheckState;
4192 
4193         if ((mode == 0) && (CUR == '<')) {
4194             if (NXT(1) == '/') {
4195 	        htmlParseEndTag(ctxt);
4196             } else if (NXT(1) == '!') {
4197                 /*
4198                  * Sometimes DOCTYPE arrives in the middle of the document
4199                  */
4200                 if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4201                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
4202                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4203                     (UPP(8) == 'E')) {
4204                     htmlParseDocTypeDecl(ctxt);
4205                 } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4206                     SKIP(4);
4207                     htmlParseComment(ctxt, /* bogus */ 0);
4208                 } else {
4209                     SKIP(2);
4210                     htmlParseComment(ctxt, /* bogus */ 1);
4211                 }
4212             } else if (NXT(1) == '?') {
4213                 SKIP(1);
4214                 htmlParseComment(ctxt, /* bogus */ 1);
4215             } else if (IS_ASCII_LETTER(NXT(1))) {
4216                 htmlParseElementInternal(ctxt);
4217             } else {
4218                 htmlCheckParagraph(ctxt);
4219                 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4220                     (ctxt->sax->characters != NULL))
4221                     ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4222                 SKIP(1);
4223             }
4224         } else {
4225             htmlParseCharData(ctxt, /* partial */ 0);
4226         }
4227 
4228         SHRINK;
4229         GROW;
4230     }
4231 
4232     if (ctxt->input->cur >= ctxt->input->end)
4233         htmlAutoCloseOnEnd(ctxt);
4234 }
4235 
4236 /**
4237  * htmlParseElementInternal:
4238  * @ctxt:  an HTML parser context
4239  *
4240  * parse an HTML element, new version, non recursive
4241  *
4242  * [39] element ::= EmptyElemTag | STag content ETag
4243  *
4244  * [41] Attribute ::= Name Eq AttValue
4245  */
4246 
4247 static int
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4248 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4249     const xmlChar *name;
4250     const htmlElemDesc * info;
4251     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4252 
4253     if ((ctxt == NULL) || (ctxt->input == NULL))
4254 	return(0);
4255 
4256     /* Capture start position */
4257     if (ctxt->record_info) {
4258         node_info.begin_pos = ctxt->input->consumed +
4259                           (CUR_PTR - ctxt->input->base);
4260 	node_info.begin_line = ctxt->input->line;
4261     }
4262 
4263     htmlParseStartTag(ctxt);
4264     name = ctxt->name;
4265     if (name == NULL)
4266         return(0);
4267 
4268     if (ctxt->record_info)
4269         htmlNodeInfoPush(ctxt, &node_info);
4270 
4271     /*
4272      * Check for an Empty Element labeled the XML/SGML way
4273      */
4274     if ((CUR == '/') && (NXT(1) == '>')) {
4275         SKIP(2);
4276         htmlParserFinishElementParsing(ctxt);
4277         if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4278             if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4279                 ctxt->sax->endElement(ctxt->userData, name);
4280         }
4281 	htmlnamePop(ctxt);
4282 	return(0);
4283     }
4284 
4285     if (CUR != '>')
4286         return(0);
4287     SKIP(1);
4288 
4289     /*
4290      * Lookup the info for that element.
4291      */
4292     info = htmlTagLookup(name);
4293 
4294     /*
4295      * Check for an Empty Element from DTD definition
4296      */
4297     if ((info != NULL) && (info->empty)) {
4298         htmlParserFinishElementParsing(ctxt);
4299         if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4300             if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4301                 ctxt->sax->endElement(ctxt->userData, name);
4302         }
4303 	htmlnamePop(ctxt);
4304 	return(0);
4305     }
4306 
4307     if (info != NULL)
4308         ctxt->endCheckState = info->dataMode;
4309 
4310     return(1);
4311 }
4312 
4313 /**
4314  * htmlParseElement:
4315  * @ctxt:  an HTML parser context
4316  *
4317  * DEPRECATED: Internal function, don't use.
4318  *
4319  * parse an HTML element, this is highly recursive
4320  * this is kept for compatibility with previous code versions
4321  *
4322  * [39] element ::= EmptyElemTag | STag content ETag
4323  *
4324  * [41] Attribute ::= Name Eq AttValue
4325  */
4326 
4327 void
htmlParseElement(htmlParserCtxtPtr ctxt)4328 htmlParseElement(htmlParserCtxtPtr ctxt) {
4329     const xmlChar *oldptr;
4330     int depth;
4331 
4332     if ((ctxt == NULL) || (ctxt->input == NULL))
4333 	return;
4334 
4335     if (htmlParseElementInternal(ctxt) == 0)
4336         return;
4337 
4338     /*
4339      * Parse the content of the element:
4340      */
4341     depth = ctxt->nameNr;
4342     while (CUR != 0) {
4343 	oldptr = ctxt->input->cur;
4344 	htmlParseContent(ctxt);
4345 	if (oldptr==ctxt->input->cur) break;
4346 	if (ctxt->nameNr < depth) break;
4347     }
4348 
4349     if (CUR == 0) {
4350 	htmlAutoCloseOnEnd(ctxt);
4351     }
4352 }
4353 
4354 /**
4355  * htmlCtxtParseContentInternal:
4356  * @ctxt:  parser context
4357  * @input:  parser input
4358  *
4359  * Returns a node list.
4360  */
4361 xmlNodePtr
htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)4362 htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) {
4363     xmlNodePtr root;
4364     xmlNodePtr list = NULL;
4365     xmlChar *rootName = BAD_CAST "#root";
4366 
4367     root = xmlNewDocNode(ctxt->myDoc, NULL, rootName, NULL);
4368     if (root == NULL) {
4369         htmlErrMemory(ctxt);
4370         return(NULL);
4371     }
4372 
4373     if (xmlCtxtPushInput(ctxt, input) < 0) {
4374         xmlFreeNode(root);
4375         return(NULL);
4376     }
4377 
4378     htmlnamePush(ctxt, rootName);
4379     nodePush(ctxt, root);
4380 
4381     htmlParseContent(ctxt);
4382 
4383     /* TODO: Use xmlCtxtIsCatastrophicError */
4384     if (ctxt->errNo != XML_ERR_NO_MEMORY) {
4385         xmlNodePtr cur;
4386 
4387         /*
4388          * Unlink newly created node list.
4389          */
4390         list = root->children;
4391         root->children = NULL;
4392         root->last = NULL;
4393         for (cur = list; cur != NULL; cur = cur->next)
4394             cur->parent = NULL;
4395     }
4396 
4397     nodePop(ctxt);
4398     htmlnamePop(ctxt);
4399 
4400     xmlCtxtPopInput(ctxt);
4401 
4402     xmlFreeNode(root);
4403     return(list);
4404 }
4405 
4406 /**
4407  * htmlParseDocument:
4408  * @ctxt:  an HTML parser context
4409  *
4410  * Parse an HTML document and invoke the SAX handlers. This is useful
4411  * if you're only interested in custom SAX callbacks. If you want a
4412  * document tree, use htmlCtxtParseDocument.
4413  *
4414  * Returns 0, -1 in case of error.
4415  */
4416 
4417 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4418 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4419     xmlDtdPtr dtd;
4420 
4421     if ((ctxt == NULL) || (ctxt->input == NULL))
4422 	return(-1);
4423 
4424     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4425         ctxt->sax->setDocumentLocator(ctxt->userData,
4426                 (xmlSAXLocator *) &xmlDefaultSAXLocator);
4427     }
4428 
4429     xmlDetectEncoding(ctxt);
4430 
4431     /*
4432      * TODO: Implement HTML5 prescan algorithm
4433      */
4434 
4435     /*
4436      * This is wrong but matches long-standing behavior. In most
4437      * cases, a document starting with an XML declaration will
4438      * specify UTF-8. The HTML5 prescan algorithm handles
4439      * XML declarations in a better way.
4440      */
4441     if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4442         (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4443         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4444 
4445     /*
4446      * Wipe out everything which is before the first '<'
4447      */
4448     SKIP_BLANKS;
4449 
4450     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4451 	ctxt->sax->startDocument(ctxt->userData);
4452 
4453     /*
4454      * Parse possible comments and PIs before any content
4455      */
4456     while (CUR == '<') {
4457         if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4458             SKIP(4);
4459             htmlParseComment(ctxt, /* bogus */ 0);
4460         } else if (NXT(1) == '?') {
4461             SKIP(1);
4462             htmlParseComment(ctxt, /* bogus */ 1);
4463         } else {
4464             break;
4465         }
4466 	SKIP_BLANKS;
4467     }
4468 
4469     /*
4470      * Then possibly doc type declaration(s) and more Misc
4471      * (doctypedecl Misc*)?
4472      */
4473     if ((CUR == '<') && (NXT(1) == '!') &&
4474 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4475 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4476 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4477 	(UPP(8) == 'E')) {
4478         ctxt->instate = XML_PARSER_MISC;
4479 	htmlParseDocTypeDecl(ctxt);
4480     }
4481     SKIP_BLANKS;
4482 
4483     /*
4484      * Parse possible comments and PIs before any content
4485      */
4486     ctxt->instate = XML_PARSER_PROLOG;
4487     while (CUR == '<') {
4488         if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4489             SKIP(4);
4490             htmlParseComment(ctxt, /* bogus */ 0);
4491         } else if (NXT(1) == '?') {
4492             SKIP(1);
4493             htmlParseComment(ctxt, /* bogus */ 1);
4494         } else {
4495             break;
4496         }
4497 	SKIP_BLANKS;
4498     }
4499 
4500     /*
4501      * Time to start parsing the tree itself
4502      */
4503     ctxt->instate = XML_PARSER_CONTENT;
4504     htmlParseContent(ctxt);
4505 
4506     /*
4507      * autoclose
4508      */
4509     if (CUR == 0)
4510 	htmlAutoCloseOnEnd(ctxt);
4511 
4512 
4513     /*
4514      * SAX: end of the document processing.
4515      */
4516     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4517         ctxt->sax->endDocument(ctxt->userData);
4518 
4519     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4520 	dtd = xmlGetIntSubset(ctxt->myDoc);
4521 	if (dtd == NULL) {
4522 	    ctxt->myDoc->intSubset =
4523 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4524 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4525 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4526             if (ctxt->myDoc->intSubset == NULL)
4527                 htmlErrMemory(ctxt);
4528         }
4529     }
4530     if (! ctxt->wellFormed) return(-1);
4531     return(0);
4532 }
4533 
4534 
4535 /************************************************************************
4536  *									*
4537  *			Parser contexts handling			*
4538  *									*
4539  ************************************************************************/
4540 
4541 /**
4542  * htmlInitParserCtxt:
4543  * @ctxt:  an HTML parser context
4544  * @sax:  SAX handler
4545  * @userData:  user data
4546  *
4547  * Initialize a parser context
4548  *
4549  * Returns 0 in case of success and -1 in case of error
4550  */
4551 
4552 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4553 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4554                    void *userData)
4555 {
4556     if (ctxt == NULL) return(-1);
4557     memset(ctxt, 0, sizeof(htmlParserCtxt));
4558 
4559     ctxt->dict = xmlDictCreate();
4560     if (ctxt->dict == NULL)
4561 	return(-1);
4562 
4563     if (ctxt->sax == NULL)
4564         ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4565     if (ctxt->sax == NULL)
4566 	return(-1);
4567     if (sax == NULL) {
4568         memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4569         xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4570         ctxt->userData = ctxt;
4571     } else {
4572         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4573         ctxt->userData = userData ? userData : ctxt;
4574     }
4575 
4576     /* Allocate the Input stack */
4577     ctxt->inputTab = (htmlParserInputPtr *)
4578                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4579     if (ctxt->inputTab == NULL)
4580 	return(-1);
4581     ctxt->inputNr = 0;
4582     ctxt->inputMax = 5;
4583     ctxt->input = NULL;
4584     ctxt->version = NULL;
4585     ctxt->encoding = NULL;
4586     ctxt->standalone = -1;
4587     ctxt->instate = XML_PARSER_START;
4588 
4589     /* Allocate the Node stack */
4590     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4591     if (ctxt->nodeTab == NULL)
4592 	return(-1);
4593     ctxt->nodeNr = 0;
4594     ctxt->nodeMax = 10;
4595     ctxt->node = NULL;
4596 
4597     /* Allocate the Name stack */
4598     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4599     if (ctxt->nameTab == NULL)
4600 	return(-1);
4601     ctxt->nameNr = 0;
4602     ctxt->nameMax = 10;
4603     ctxt->name = NULL;
4604 
4605     ctxt->nodeInfoTab = NULL;
4606     ctxt->nodeInfoNr  = 0;
4607     ctxt->nodeInfoMax = 0;
4608 
4609     ctxt->myDoc = NULL;
4610     ctxt->wellFormed = 1;
4611     ctxt->replaceEntities = 0;
4612     ctxt->linenumbers = xmlLineNumbersDefaultValue;
4613     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4614     ctxt->html = 1;
4615     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4616     ctxt->vctxt.userData = ctxt;
4617     ctxt->vctxt.error = xmlParserValidityError;
4618     ctxt->vctxt.warning = xmlParserValidityWarning;
4619     ctxt->record_info = 0;
4620     ctxt->validate = 0;
4621     ctxt->checkIndex = 0;
4622     ctxt->catalogs = NULL;
4623     xmlInitNodeInfoSeq(&ctxt->node_seq);
4624     return(0);
4625 }
4626 
4627 /**
4628  * htmlFreeParserCtxt:
4629  * @ctxt:  an HTML parser context
4630  *
4631  * Free all the memory used by a parser context. However the parsed
4632  * document in ctxt->myDoc is not freed.
4633  */
4634 
4635 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)4636 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4637 {
4638     xmlFreeParserCtxt(ctxt);
4639 }
4640 
4641 /**
4642  * htmlNewParserCtxt:
4643  *
4644  * Allocate and initialize a new HTML parser context.
4645  *
4646  * This can be used to parse HTML documents into DOM trees with
4647  * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4648  *
4649  * See htmlCtxtUseOptions for parser options.
4650  *
4651  * See xmlCtxtSetErrorHandler for advanced error handling.
4652  *
4653  * See htmlNewSAXParserCtxt for custom SAX parsers.
4654  *
4655  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4656  */
4657 
4658 htmlParserCtxtPtr
htmlNewParserCtxt(void)4659 htmlNewParserCtxt(void)
4660 {
4661     return(htmlNewSAXParserCtxt(NULL, NULL));
4662 }
4663 
4664 /**
4665  * htmlNewSAXParserCtxt:
4666  * @sax:  SAX handler
4667  * @userData:  user data
4668  *
4669  * Allocate and initialize a new HTML SAX parser context. If userData
4670  * is NULL, the parser context will be passed as user data.
4671  *
4672  * Available since 2.11.0. If you want support older versions,
4673  * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4674  * struct assignment.
4675  *
4676  * Also see htmlNewParserCtxt.
4677  *
4678  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4679  */
4680 
4681 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)4682 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
4683 {
4684     xmlParserCtxtPtr ctxt;
4685 
4686     xmlInitParser();
4687 
4688     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4689     if (ctxt == NULL)
4690 	return(NULL);
4691     memset(ctxt, 0, sizeof(xmlParserCtxt));
4692     if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
4693         htmlFreeParserCtxt(ctxt);
4694 	return(NULL);
4695     }
4696     return(ctxt);
4697 }
4698 
4699 static htmlParserCtxtPtr
htmlCreateMemoryParserCtxtInternal(const char * url,const char * buffer,size_t size,const char * encoding)4700 htmlCreateMemoryParserCtxtInternal(const char *url,
4701                                    const char *buffer, size_t size,
4702                                    const char *encoding) {
4703     xmlParserCtxtPtr ctxt;
4704     xmlParserInputPtr input;
4705 
4706     if (buffer == NULL)
4707 	return(NULL);
4708 
4709     ctxt = htmlNewParserCtxt();
4710     if (ctxt == NULL)
4711 	return(NULL);
4712 
4713     input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, 0);
4714     if (input == NULL) {
4715 	xmlFreeParserCtxt(ctxt);
4716         return(NULL);
4717     }
4718 
4719     if (xmlCtxtPushInput(ctxt, input) < 0) {
4720         xmlFreeInputStream(input);
4721         xmlFreeParserCtxt(ctxt);
4722         return(NULL);
4723     }
4724 
4725     return(ctxt);
4726 }
4727 
4728 /**
4729  * htmlCreateMemoryParserCtxt:
4730  * @buffer:  a pointer to a char array
4731  * @size:  the size of the array
4732  *
4733  * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
4734  *
4735  * Create a parser context for an HTML in-memory document. The input
4736  * buffer must not contain any terminating null bytes.
4737  *
4738  * Returns the new parser context or NULL
4739  */
4740 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)4741 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4742     if (size <= 0)
4743 	return(NULL);
4744 
4745     return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
4746 }
4747 
4748 /**
4749  * htmlCreateDocParserCtxt:
4750  * @str:  a pointer to an array of xmlChar
4751  * @encoding:  encoding (optional)
4752  *
4753  * Create a parser context for a null-terminated string.
4754  *
4755  * Returns the new parser context or NULL if a memory allocation failed.
4756  */
4757 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * url,const char * encoding)4758 htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
4759                         const char *encoding) {
4760     xmlParserCtxtPtr ctxt;
4761     xmlParserInputPtr input;
4762 
4763     if (str == NULL)
4764 	return(NULL);
4765 
4766     ctxt = htmlNewParserCtxt();
4767     if (ctxt == NULL)
4768 	return(NULL);
4769 
4770     input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str,
4771                                       encoding, 0);
4772     if (input == NULL) {
4773 	xmlFreeParserCtxt(ctxt);
4774 	return(NULL);
4775     }
4776 
4777     if (xmlCtxtPushInput(ctxt, input) < 0) {
4778         xmlFreeInputStream(input);
4779         xmlFreeParserCtxt(ctxt);
4780         return(NULL);
4781     }
4782 
4783     return(ctxt);
4784 }
4785 
4786 #ifdef LIBXML_PUSH_ENABLED
4787 /************************************************************************
4788  *									*
4789  *	Progressive parsing interfaces				*
4790  *									*
4791  ************************************************************************/
4792 
/*
 * Scanner states used by htmlParseLookupGt while it looks for the '>'
 * that ends a tag. The state is kept in ctxt->endCheckState, which is
 * reset to 0, so LSTATE_TAG_NAME must keep the value 0.
 */
typedef enum {
    LSTATE_TAG_NAME            = 0, /* inside the tag name */
    LSTATE_BEFORE_ATTR_NAME    = 1, /* whitespace before an attribute name */
    LSTATE_ATTR_NAME           = 2, /* inside an attribute name */
    LSTATE_AFTER_ATTR_NAME     = 3, /* whitespace after an attribute name */
    LSTATE_BEFORE_ATTR_VALUE   = 4, /* after '=', before the value */
    LSTATE_ATTR_VALUE_DQUOTED  = 5, /* inside a "..." value */
    LSTATE_ATTR_VALUE_SQUOTED  = 6, /* inside a '...' value */
    LSTATE_ATTR_VALUE_UNQUOTED = 7  /* inside an unquoted value */
} xmlLookupStates;
4803 
4804 /**
4805  * htmlParseLookupGt:
4806  * @ctxt:  an HTML parser context
4807  *
4808  * Check whether there's enough data in the input buffer to finish parsing
4809  * a tag. This has to take quotes into account.
4810  */
4811 static int
htmlParseLookupGt(xmlParserCtxtPtr ctxt)4812 htmlParseLookupGt(xmlParserCtxtPtr ctxt) {
4813     const xmlChar *cur;
4814     const xmlChar *end = ctxt->input->end;
4815     int state = ctxt->endCheckState;
4816     size_t index;
4817 
4818     if (ctxt->checkIndex == 0)
4819         cur = ctxt->input->cur + 2; /* Skip '<a' or '</' */
4820     else
4821         cur = ctxt->input->cur + ctxt->checkIndex;
4822 
4823     while (cur < end) {
4824         int c = *cur++;
4825 
4826         if (state != LSTATE_ATTR_VALUE_SQUOTED &&
4827             state != LSTATE_ATTR_VALUE_DQUOTED) {
4828             if (c == '/' &&
4829                 state != LSTATE_BEFORE_ATTR_VALUE &&
4830                 state != LSTATE_ATTR_VALUE_UNQUOTED) {
4831                 state = LSTATE_BEFORE_ATTR_NAME;
4832                 continue;
4833             } else if (c == '>') {
4834                 ctxt->checkIndex = 0;
4835                 ctxt->endCheckState = 0;
4836                 return(0);
4837             }
4838         }
4839 
4840         switch (state) {
4841             case LSTATE_TAG_NAME:
4842                 if (IS_WS_HTML(c))
4843                     state = LSTATE_BEFORE_ATTR_NAME;
4844                 break;
4845 
4846             case LSTATE_BEFORE_ATTR_NAME:
4847                 if (!IS_WS_HTML(c))
4848                     state = LSTATE_ATTR_NAME;
4849                 break;
4850 
4851             case LSTATE_ATTR_NAME:
4852                 if (c == '=')
4853                     state = LSTATE_BEFORE_ATTR_VALUE;
4854                 else if (IS_WS_HTML(c))
4855                     state = LSTATE_AFTER_ATTR_NAME;
4856                 break;
4857 
4858             case LSTATE_AFTER_ATTR_NAME:
4859                 if (c == '=')
4860                     state = LSTATE_BEFORE_ATTR_VALUE;
4861                 else if (!IS_WS_HTML(c))
4862                     state = LSTATE_ATTR_NAME;
4863                 break;
4864 
4865             case LSTATE_BEFORE_ATTR_VALUE:
4866                 if (c == '"')
4867                     state = LSTATE_ATTR_VALUE_DQUOTED;
4868                 else if (c == '\'')
4869                     state = LSTATE_ATTR_VALUE_SQUOTED;
4870                 else if (!IS_WS_HTML(c))
4871                     state = LSTATE_ATTR_VALUE_UNQUOTED;
4872                 break;
4873 
4874             case LSTATE_ATTR_VALUE_DQUOTED:
4875                 if (c == '"')
4876                     state = LSTATE_BEFORE_ATTR_NAME;
4877                 break;
4878 
4879             case LSTATE_ATTR_VALUE_SQUOTED:
4880                 if (c == '\'')
4881                     state = LSTATE_BEFORE_ATTR_NAME;
4882                 break;
4883 
4884             case LSTATE_ATTR_VALUE_UNQUOTED:
4885                 if (IS_WS_HTML(c))
4886                     state = LSTATE_BEFORE_ATTR_NAME;
4887                 break;
4888         }
4889     }
4890 
4891     index = cur - ctxt->input->cur;
4892     if (index > LONG_MAX) {
4893         ctxt->checkIndex = 0;
4894         ctxt->endCheckState = 0;
4895         return(0);
4896     }
4897     ctxt->checkIndex = index;
4898     ctxt->endCheckState = state;
4899     return(-1);
4900 }
4901 
4902 /**
4903  * htmlParseLookupString:
4904  * @ctxt:  an XML parser context
4905  * @startDelta: delta to apply at the start
4906  * @str:  string
4907  * @strLen:  length of string
4908  *
4909  * Check whether the input buffer contains a string.
4910  */
4911 static int
htmlParseLookupString(xmlParserCtxtPtr ctxt,size_t startDelta,const char * str,size_t strLen,size_t extraLen)4912 htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
4913                       const char *str, size_t strLen, size_t extraLen) {
4914     const xmlChar *end = ctxt->input->end;
4915     const xmlChar *cur, *term;
4916     size_t index, rescan;
4917     int ret;
4918 
4919     if (ctxt->checkIndex == 0) {
4920         cur = ctxt->input->cur + startDelta;
4921     } else {
4922         cur = ctxt->input->cur + ctxt->checkIndex;
4923     }
4924 
4925     term = BAD_CAST strstr((const char *) cur, str);
4926     if ((term != NULL) &&
4927         ((size_t) (ctxt->input->end - term) >= extraLen + 1)) {
4928         ctxt->checkIndex = 0;
4929 
4930         if (term - ctxt->input->cur > INT_MAX / 2)
4931             ret = INT_MAX / 2;
4932         else
4933             ret = term - ctxt->input->cur;
4934 
4935         return(ret);
4936     }
4937 
4938     /* Rescan (strLen + extraLen - 1) characters. */
4939     rescan = strLen + extraLen - 1;
4940     if ((size_t) (end - cur) <= rescan)
4941         end = cur;
4942     else
4943         end -= rescan;
4944     index = end - ctxt->input->cur;
4945     if (index > INT_MAX / 2) {
4946         ctxt->checkIndex = 0;
4947         ret = INT_MAX / 2;
4948     } else {
4949         ctxt->checkIndex = index;
4950         ret = -1;
4951     }
4952 
4953     return(ret);
4954 }
4955 
4956 /**
4957  * htmlParseLookupCommentEnd:
4958  * @ctxt: an HTML parser context
4959  *
4960  * Try to find a comment end tag in the input stream
4961  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
4962  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
4963  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4964  * to avoid rescanning sequences of bytes, it DOES change the state of the
4965  * parser, do not use liberally.
4966  *
4967  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
4968  */
4969 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)4970 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
4971 {
4972     int mark = 0;
4973     int offset;
4974 
4975     while (1) {
4976 	mark = htmlParseLookupString(ctxt, 2, "--", 2, 0);
4977 	if (mark < 0)
4978             break;
4979         /*
4980          * <!-->    is a complete comment, but
4981          * <!--!>   is not
4982          * <!---!>  is not
4983          * <!----!> is
4984          */
4985         if ((NXT(mark+2) == '>') ||
4986 	    ((mark >= 4) && (NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
4987             ctxt->checkIndex = 0;
4988 	    break;
4989 	}
4990         offset = (NXT(mark+2) == '!') ? 3 : 2;
4991         if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
4992 	    ctxt->checkIndex = mark;
4993             return(-1);
4994         }
4995 	ctxt->checkIndex = mark + 1;
4996     }
4997     return mark;
4998 }
4999 
5000 
5001 /**
5002  * htmlParseTryOrFinish:
5003  * @ctxt:  an HTML parser context
5004  * @terminate:  last chunk indicator
5005  *
5006  * Try to progress on parsing
5007  *
5008  * Returns zero if no parsing was possible
5009  */
5010 static void
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5011 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5012     while (PARSER_STOPPED(ctxt) == 0) {
5013         htmlParserInputPtr in;
5014         size_t avail;
5015 
5016 	in = ctxt->input;
5017 	if (in == NULL) break;
5018 	avail = in->end - in->cur;
5019 
5020         switch (ctxt->instate) {
5021             case XML_PARSER_EOF:
5022 	        /*
5023 		 * Document parsing is done !
5024 		 */
5025 	        return;
5026 
5027             case XML_PARSER_START:
5028                 /*
5029                  * Very first chars read from the document flow.
5030                  */
5031                 if ((!terminate) && (avail < 4))
5032                     return;
5033 
5034                 xmlDetectEncoding(ctxt);
5035 
5036                 /*
5037                  * TODO: Implement HTML5 prescan algorithm
5038                  */
5039 
5040                 /*
5041                  * This is wrong but matches long-standing behavior. In most
5042                  * cases, a document starting with an XML declaration will
5043                  * specify UTF-8. The HTML5 prescan algorithm handles
5044                  * XML declarations in a better way.
5045                  */
5046                 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5047                     (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5048                     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5049                 }
5050 
5051                 /* fall through */
5052 
5053             case XML_PARSER_XML_DECL:
5054                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
5055                     ctxt->sax->setDocumentLocator(ctxt->userData,
5056                             (xmlSAXLocator *) &xmlDefaultSAXLocator);
5057                 }
5058 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5059 	            (!ctxt->disableSAX))
5060 		    ctxt->sax->startDocument(ctxt->userData);
5061 
5062                 /* Allow callback to modify state for tests */
5063                 if ((ctxt->instate == XML_PARSER_START) ||
5064                     (ctxt->instate == XML_PARSER_XML_DECL))
5065                     ctxt->instate = XML_PARSER_MISC;
5066 		break;
5067 
5068             case XML_PARSER_START_TAG:
5069 		if ((!terminate) &&
5070 		    (htmlParseLookupGt(ctxt) < 0))
5071 		    return;
5072 
5073                 htmlParseElementInternal(ctxt);
5074 
5075 		ctxt->instate = XML_PARSER_CONTENT;
5076                 break;
5077 
5078             case XML_PARSER_MISC:
5079             case XML_PARSER_PROLOG:
5080             case XML_PARSER_CONTENT: {
5081                 int mode;
5082 
5083                 if ((ctxt->instate == XML_PARSER_MISC) ||
5084                     (ctxt->instate == XML_PARSER_PROLOG)) {
5085                     SKIP_BLANKS;
5086                     avail = in->end - in->cur;
5087                 }
5088 
5089 		if (avail < 1)
5090 		    return;
5091                 /*
5092                  * Note that endCheckState is also used by
5093                  * xmlParseLookupGt.
5094                  */
5095                 mode = ctxt->endCheckState;
5096 
5097                 if (mode != 0) {
5098                     if (htmlParseCharData(ctxt, !terminate) == 0)
5099                         return;
5100 		} else if (in->cur[0] == '<') {
5101                     int next;
5102 
5103                     if (avail < 2) {
5104                         if (!terminate)
5105                             return;
5106                         next = ' ';
5107                     } else {
5108                         next = in->cur[1];
5109                     }
5110 
5111                     if (next == '!') {
5112                         if ((!terminate) && (avail < 4))
5113                             return;
5114                         if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5115                             if ((!terminate) &&
5116                                 (htmlParseLookupCommentEnd(ctxt) < 0))
5117                                 return;
5118                             SKIP(4);
5119                             htmlParseComment(ctxt, /* bogus */ 0);
5120                             /* don't change state */
5121                             break;
5122                         }
5123 
5124                         if ((!terminate) && (avail < 9))
5125                             return;
5126                         if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5127                             (UPP(4) == 'C') && (UPP(5) == 'T') &&
5128                             (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5129                             (UPP(8) == 'E')) {
5130                             if ((!terminate) &&
5131                                 (htmlParseLookupString(ctxt, 9, ">", 1,
5132                                                        0) < 0))
5133                                 return;
5134                             htmlParseDocTypeDecl(ctxt);
5135                             if (ctxt->instate == XML_PARSER_MISC)
5136                                 ctxt->instate = XML_PARSER_PROLOG;
5137                             else
5138                                 ctxt->instate = XML_PARSER_CONTENT;
5139                         } else {
5140                             ctxt->instate = XML_PARSER_CONTENT;
5141                             if ((!terminate) &&
5142                                 (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5143                                 return;
5144                             SKIP(2);
5145                             htmlParseComment(ctxt, /* bogus */ 1);
5146                         }
5147                     } else if (next == '?') {
5148                         if ((!terminate) &&
5149                             (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5150                             return;
5151                         SKIP(1);
5152                         htmlParseComment(ctxt, /* bogus */ 1);
5153                         /* don't change state */
5154                     } else if (next == '/') {
5155                         ctxt->instate = XML_PARSER_END_TAG;
5156                         ctxt->checkIndex = 0;
5157                     } else if (IS_ASCII_LETTER(next)) {
5158                         ctxt->instate = XML_PARSER_START_TAG;
5159                         ctxt->checkIndex = 0;
5160                     } else {
5161                         ctxt->instate = XML_PARSER_CONTENT;
5162                         htmlCheckParagraph(ctxt);
5163                         if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5164                             (ctxt->sax->characters != NULL))
5165                             ctxt->sax->characters(ctxt->userData,
5166                                                   BAD_CAST "<", 1);
5167                         SKIP(1);
5168                     }
5169                 } else {
5170                     ctxt->instate = XML_PARSER_CONTENT;
5171                     /*
5172                      * We follow the logic of the XML push parser
5173                      */
5174 		    if (avail < HTML_PARSER_BIG_BUFFER_SIZE) {
5175                         if ((!terminate) &&
5176                             (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
5177                             return;
5178                     }
5179                     ctxt->checkIndex = 0;
5180                     if (htmlParseCharData(ctxt, !terminate) == 0)
5181                         return;
5182 		}
5183 
5184 		break;
5185 	    }
5186 
5187             case XML_PARSER_END_TAG:
5188 		if ((!terminate) &&
5189 		    (htmlParseLookupGt(ctxt) < 0))
5190 		    return;
5191 		htmlParseEndTag(ctxt);
5192 		ctxt->instate = XML_PARSER_CONTENT;
5193 		ctxt->checkIndex = 0;
5194 	        break;
5195 
5196 	    default:
5197 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5198 			     "HPP: internal error\n", NULL, NULL);
5199 		ctxt->instate = XML_PARSER_EOF;
5200 		break;
5201 	}
5202     }
5203 }
5204 
5205 /**
5206  * htmlParseChunk:
5207  * @ctxt:  an HTML parser context
5208  * @chunk:  chunk of memory
5209  * @size:  size of chunk in bytes
5210  * @terminate:  last chunk indicator
5211  *
5212  * Parse a chunk of memory in push parser mode.
5213  *
5214  * Assumes that the parser context was initialized with
5215  * htmlCreatePushParserCtxt.
5216  *
5217  * The last chunk, which will often be empty, must be marked with
5218  * the @terminate flag. With the default SAX callbacks, the resulting
5219  * document will be available in ctxt->myDoc. This pointer will not
5220  * be freed by the library.
5221  *
5222  * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5223  *
5224  * Returns an xmlParserErrors code (0 on success).
5225  */
5226 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5227 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5228               int terminate) {
5229     if ((ctxt == NULL) || (ctxt->input == NULL))
5230 	return(XML_ERR_ARGUMENT);
5231     if (PARSER_STOPPED(ctxt) != 0)
5232         return(ctxt->errNo);
5233     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5234         (ctxt->input->buf != NULL))  {
5235 	size_t pos = ctxt->input->cur - ctxt->input->base;
5236 	int res;
5237 
5238 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5239         xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5240 	if (res < 0) {
5241             htmlParseErr(ctxt, ctxt->input->buf->error,
5242                          "xmlParserInputBufferPush failed", NULL, NULL);
5243             xmlHaltParser(ctxt);
5244 	    return (ctxt->errNo);
5245 	}
5246     }
5247 
5248     htmlParseTryOrFinish(ctxt, terminate);
5249 
5250     if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
5251         htmlAutoCloseOnEnd(ctxt);
5252 
5253         if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5254             ctxt->sax->endDocument(ctxt->userData);
5255 
5256         if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) &&
5257             (ctxt->myDoc != NULL)) {
5258             xmlDtdPtr dtd;
5259             dtd = xmlGetIntSubset(ctxt->myDoc);
5260             if (dtd == NULL) {
5261                 ctxt->myDoc->intSubset =
5262                     xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5263                         BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5264                         BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5265                 if (ctxt->myDoc->intSubset == NULL)
5266                     htmlErrMemory(ctxt);
5267             }
5268         }
5269 
5270 	ctxt->instate = XML_PARSER_EOF;
5271     }
5272 
5273     return((xmlParserErrors) ctxt->errNo);
5274 }
5275 
5276 /************************************************************************
5277  *									*
5278  *			User entry points				*
5279  *									*
5280  ************************************************************************/
5281 
5282 /**
5283  * htmlCreatePushParserCtxt:
5284  * @sax:  a SAX handler (optional)
5285  * @user_data:  The user data returned on SAX callbacks (optional)
5286  * @chunk:  a pointer to an array of chars (optional)
5287  * @size:  number of chars in the array
5288  * @filename:  only used for error reporting (optional)
5289  * @enc:  encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5290  *
5291  * Create a parser context for using the HTML parser in push mode.
5292  *
5293  * Returns the new parser context or NULL if a memory allocation
5294  * failed.
5295  */
5296 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5297 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5298                          const char *chunk, int size, const char *filename,
5299 			 xmlCharEncoding enc) {
5300     htmlParserCtxtPtr ctxt;
5301     htmlParserInputPtr input;
5302     const char *encoding;
5303 
5304     ctxt = htmlNewSAXParserCtxt(sax, user_data);
5305     if (ctxt == NULL)
5306 	return(NULL);
5307 
5308     encoding = xmlGetCharEncodingName(enc);
5309     input = xmlNewPushInput(filename, chunk, size);
5310     if (input == NULL) {
5311 	htmlFreeParserCtxt(ctxt);
5312 	return(NULL);
5313     }
5314 
5315     if (xmlCtxtPushInput(ctxt, input) < 0) {
5316         xmlFreeInputStream(input);
5317         xmlFreeParserCtxt(ctxt);
5318         return(NULL);
5319     }
5320 
5321     if (encoding != NULL)
5322         xmlSwitchEncodingName(ctxt, encoding);
5323 
5324     return(ctxt);
5325 }
5326 #endif /* LIBXML_PUSH_ENABLED */
5327 
5328 /**
5329  * htmlSAXParseDoc:
5330  * @cur:  a pointer to an array of xmlChar
5331  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5332  * @sax:  the SAX handler block
5333  * @userData: if using SAX, this pointer will be provided on callbacks.
5334  *
5335  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5336  *
5337  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5338  * to handle parse events. If sax is NULL, fallback to the default DOM
5339  * behavior and return a tree.
5340  *
5341  * Returns the resulting document tree unless SAX is NULL or the document is
5342  *     not well formed.
5343  */
5344 
5345 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5346 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5347                 htmlSAXHandlerPtr sax, void *userData) {
5348     htmlDocPtr ret;
5349     htmlParserCtxtPtr ctxt;
5350 
5351     if (cur == NULL)
5352         return(NULL);
5353 
5354     ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5355     if (ctxt == NULL)
5356         return(NULL);
5357 
5358     if (sax != NULL) {
5359         *ctxt->sax = *sax;
5360         ctxt->userData = userData;
5361     }
5362 
5363     htmlParseDocument(ctxt);
5364     ret = ctxt->myDoc;
5365     htmlFreeParserCtxt(ctxt);
5366 
5367     return(ret);
5368 }
5369 
5370 /**
5371  * htmlParseDoc:
5372  * @cur:  a pointer to an array of xmlChar
5373  * @encoding:  the encoding (optional)
5374  *
5375  * DEPRECATED: Use htmlReadDoc.
5376  *
5377  * Parse an HTML in-memory document and build a tree.
5378  *
5379  * This function uses deprecated global parser options.
5380  *
5381  * Returns the resulting document tree
5382  */
5383 
5384 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)5385 htmlParseDoc(const xmlChar *cur, const char *encoding) {
5386     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5387 }
5388 
5389 
5390 /**
5391  * htmlCreateFileParserCtxt:
5392  * @filename:  the filename
5393  * @encoding:  optional encoding
5394  *
5395  * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5396  *
5397  * Create a parser context to read from a file.
5398  *
5399  * A non-NULL encoding overrides encoding declarations in the document.
5400  *
5401  * Automatic support for ZLIB/Compress compressed document is provided
5402  * by default if found at compile-time.
5403  *
5404  * Returns the new parser context or NULL if a memory allocation failed.
5405  */
5406 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)5407 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5408 {
5409     htmlParserCtxtPtr ctxt;
5410     htmlParserInputPtr input;
5411 
5412     if (filename == NULL)
5413         return(NULL);
5414 
5415     ctxt = htmlNewParserCtxt();
5416     if (ctxt == NULL) {
5417 	return(NULL);
5418     }
5419 
5420     input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5421     if (input == NULL) {
5422 	xmlFreeParserCtxt(ctxt);
5423 	return(NULL);
5424     }
5425     if (xmlCtxtPushInput(ctxt, input) < 0) {
5426         xmlFreeInputStream(input);
5427         xmlFreeParserCtxt(ctxt);
5428         return(NULL);
5429     }
5430 
5431     return(ctxt);
5432 }
5433 
5434 /**
5435  * htmlSAXParseFile:
5436  * @filename:  the filename
5437  * @encoding:  encoding (optional)
5438  * @sax:  the SAX handler block
5439  * @userData: if using SAX, this pointer will be provided on callbacks.
5440  *
5441  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5442  *
5443  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5444  * compressed document is provided by default if found at compile-time.
5445  * It use the given SAX function block to handle the parsing callback.
5446  * If sax is NULL, fallback to the default DOM tree building routines.
5447  *
5448  * Returns the resulting document tree unless SAX is NULL or the document is
5449  *     not well formed.
5450  */
5451 
5452 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)5453 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5454                  void *userData) {
5455     htmlDocPtr ret;
5456     htmlParserCtxtPtr ctxt;
5457     htmlSAXHandlerPtr oldsax = NULL;
5458 
5459     ctxt = htmlCreateFileParserCtxt(filename, encoding);
5460     if (ctxt == NULL) return(NULL);
5461     if (sax != NULL) {
5462 	oldsax = ctxt->sax;
5463         ctxt->sax = sax;
5464         ctxt->userData = userData;
5465     }
5466 
5467     htmlParseDocument(ctxt);
5468 
5469     ret = ctxt->myDoc;
5470     if (sax != NULL) {
5471         ctxt->sax = oldsax;
5472         ctxt->userData = NULL;
5473     }
5474     htmlFreeParserCtxt(ctxt);
5475 
5476     return(ret);
5477 }
5478 
5479 /**
5480  * htmlParseFile:
5481  * @filename:  the filename
5482  * @encoding:  encoding (optional)
5483  *
5484  * Parse an HTML file and build a tree.
5485  *
5486  * Returns the resulting document tree
5487  */
5488 
5489 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)5490 htmlParseFile(const char *filename, const char *encoding) {
5491     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5492 }
5493 
5494 /**
5495  * htmlHandleOmittedElem:
5496  * @val:  int 0 or 1
5497  *
5498  * DEPRECATED: Use HTML_PARSE_NOIMPLIED
5499  *
5500  * Set and return the previous value for handling HTML omitted tags.
5501  *
5502  * Returns the last value for 0 for no handling, 1 for auto insertion.
5503  */
5504 
5505 int
htmlHandleOmittedElem(int val)5506 htmlHandleOmittedElem(int val) {
5507     int old = htmlOmittedDefaultValue;
5508 
5509     htmlOmittedDefaultValue = val;
5510     return(old);
5511 }
5512 
5513 /**
5514  * htmlElementAllowedHere:
5515  * @parent: HTML parent element
5516  * @elt: HTML element
5517  *
5518  * DEPRECATED: Don't use.
5519  *
5520  * Returns 1
5521  */
5522 int
htmlElementAllowedHere(const htmlElemDesc * parent ATTRIBUTE_UNUSED,const xmlChar * elt ATTRIBUTE_UNUSED)5523 htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5524                        const xmlChar* elt ATTRIBUTE_UNUSED) {
5525     return(1);
5526 }
5527 
5528 /**
5529  * htmlElementStatusHere:
5530  * @parent: HTML parent element
5531  * @elt: HTML element
5532  *
5533  * DEPRECATED: Don't use.
5534  *
5535  * Returns HTML_VALID
5536  */
5537 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent ATTRIBUTE_UNUSED,const htmlElemDesc * elt ATTRIBUTE_UNUSED)5538 htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5539                       const htmlElemDesc* elt ATTRIBUTE_UNUSED) {
5540     return(HTML_VALID);
5541 }
5542 
5543 /**
5544  * htmlAttrAllowed:
5545  * @elt: HTML element
5546  * @attr: HTML attribute
5547  * @legacy: whether to allow deprecated attributes
5548  *
5549  * DEPRECATED: Don't use.
5550  *
5551  * Returns HTML_VALID
5552  */
5553 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt ATTRIBUTE_UNUSED,const xmlChar * attr ATTRIBUTE_UNUSED,int legacy ATTRIBUTE_UNUSED)5554 htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED,
5555                 const xmlChar* attr ATTRIBUTE_UNUSED,
5556                 int legacy ATTRIBUTE_UNUSED) {
5557     return(HTML_VALID);
5558 }
5559 
5560 /**
5561  * htmlNodeStatus:
5562  * @node: an htmlNodePtr in a tree
5563  * @legacy: whether to allow deprecated elements (YES is faster here
5564  *	for Element nodes)
5565  *
5566  * DEPRECATED: Don't use.
5567  *
5568  * Returns HTML_VALID
5569  */
5570 htmlStatus
htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED,int legacy ATTRIBUTE_UNUSED)5571 htmlNodeStatus(htmlNodePtr node ATTRIBUTE_UNUSED,
5572                int legacy ATTRIBUTE_UNUSED) {
5573     return(HTML_VALID);
5574 }
5575 
5576 /************************************************************************
5577  *									*
5578  *	New set (2.6.0) of simpler and more flexible APIs		*
5579  *									*
5580  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored. A variable named "dict" must
 * be visible where the macro is used.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * a single statement and can safely be used as the body of an
 * if/else. Invoke it with a trailing semicolon.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
		(xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
		xmlFree((char *)(str));				\
	} while (0)
5592 
5593 /**
5594  * htmlCtxtReset:
5595  * @ctxt: an HTML parser context
5596  *
5597  * Reset a parser context
5598  */
5599 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)5600 htmlCtxtReset(htmlParserCtxtPtr ctxt)
5601 {
5602     xmlParserInputPtr input;
5603     xmlDictPtr dict;
5604 
5605     if (ctxt == NULL)
5606         return;
5607 
5608     dict = ctxt->dict;
5609 
5610     while ((input = xmlCtxtPopInput(ctxt)) != NULL) { /* Non consuming */
5611         xmlFreeInputStream(input);
5612     }
5613     ctxt->inputNr = 0;
5614     ctxt->input = NULL;
5615 
5616     ctxt->spaceNr = 0;
5617     if (ctxt->spaceTab != NULL) {
5618 	ctxt->spaceTab[0] = -1;
5619 	ctxt->space = &ctxt->spaceTab[0];
5620     } else {
5621 	ctxt->space = NULL;
5622     }
5623 
5624 
5625     ctxt->nodeNr = 0;
5626     ctxt->node = NULL;
5627 
5628     ctxt->nameNr = 0;
5629     ctxt->name = NULL;
5630 
5631     ctxt->nsNr = 0;
5632 
5633     DICT_FREE(ctxt->version);
5634     ctxt->version = NULL;
5635     DICT_FREE(ctxt->encoding);
5636     ctxt->encoding = NULL;
5637     DICT_FREE(ctxt->extSubURI);
5638     ctxt->extSubURI = NULL;
5639     DICT_FREE(ctxt->extSubSystem);
5640     ctxt->extSubSystem = NULL;
5641 
5642     if (ctxt->directory != NULL) {
5643         xmlFree(ctxt->directory);
5644         ctxt->directory = NULL;
5645     }
5646 
5647     if (ctxt->myDoc != NULL)
5648         xmlFreeDoc(ctxt->myDoc);
5649     ctxt->myDoc = NULL;
5650 
5651     ctxt->standalone = -1;
5652     ctxt->hasExternalSubset = 0;
5653     ctxt->hasPErefs = 0;
5654     ctxt->html = 1;
5655     ctxt->instate = XML_PARSER_START;
5656 
5657     ctxt->wellFormed = 1;
5658     ctxt->nsWellFormed = 1;
5659     ctxt->disableSAX = 0;
5660     ctxt->valid = 1;
5661     ctxt->vctxt.userData = ctxt;
5662     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5663     ctxt->vctxt.error = xmlParserValidityError;
5664     ctxt->vctxt.warning = xmlParserValidityWarning;
5665     ctxt->record_info = 0;
5666     ctxt->checkIndex = 0;
5667     ctxt->endCheckState = 0;
5668     ctxt->inSubset = 0;
5669     ctxt->errNo = XML_ERR_OK;
5670     ctxt->depth = 0;
5671     ctxt->catalogs = NULL;
5672     xmlInitNodeInfoSeq(&ctxt->node_seq);
5673 
5674     if (ctxt->attsDefault != NULL) {
5675         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
5676         ctxt->attsDefault = NULL;
5677     }
5678     if (ctxt->attsSpecial != NULL) {
5679         xmlHashFree(ctxt->attsSpecial, NULL);
5680         ctxt->attsSpecial = NULL;
5681     }
5682 
5683     ctxt->nbErrors = 0;
5684     ctxt->nbWarnings = 0;
5685     if (ctxt->lastError.code != XML_ERR_OK)
5686         xmlResetError(&ctxt->lastError);
5687 }
5688 
5689 static int
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt,int options,int keepMask)5690 htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
5691 {
5692     int allMask;
5693 
5694     if (ctxt == NULL)
5695         return(-1);
5696 
5697     allMask = HTML_PARSE_RECOVER |
5698               HTML_PARSE_HTML5 |
5699               HTML_PARSE_NODEFDTD |
5700               HTML_PARSE_NOERROR |
5701               HTML_PARSE_NOWARNING |
5702               HTML_PARSE_PEDANTIC |
5703               HTML_PARSE_NOBLANKS |
5704               HTML_PARSE_NONET |
5705               HTML_PARSE_NOIMPLIED |
5706               HTML_PARSE_COMPACT |
5707               HTML_PARSE_HUGE |
5708               HTML_PARSE_IGNORE_ENC |
5709               HTML_PARSE_BIG_LINES;
5710 
5711     ctxt->options = (ctxt->options & keepMask) | (options & allMask);
5712 
5713     /*
5714      * For some options, struct members are historically the source
5715      * of truth. See xmlCtxtSetOptionsInternal.
5716      */
5717     ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
5718 
5719     /*
5720      * Changing SAX callbacks is a bad idea. This should be fixed.
5721      */
5722     if (options & HTML_PARSE_NOBLANKS) {
5723         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5724     }
5725     if (options & HTML_PARSE_HUGE) {
5726         if (ctxt->dict != NULL)
5727             xmlDictSetLimit(ctxt->dict, 0);
5728     }
5729 
5730     /*
5731      * It would be useful to allow this feature.
5732      */
5733     ctxt->dictNames = 0;
5734 
5735     ctxt->linenumbers = 1;
5736 
5737     return(options & ~allMask);
5738 }
5739 
5740 /**
5741  * htmlCtxtSetOptions:
5742  * @ctxt: an HTML parser context
5743  * @options:  a bitmask of xmlParserOption values
5744  *
5745  * Applies the options to the parser context. Unset options are
5746  * cleared.
5747  *
5748  * Available since 2.14.0. With older versions, you can use
5749  * htmlCtxtUseOptions.
5750  *
5751  * HTML_PARSE_RECOVER
5752  *
5753  * No effect as of 2.14.0.
5754  *
5755  * HTML_PARSE_HTML5
5756  *
5757  * Make the tokenizer emit a SAX callback for each token. This results
5758  * in unbalanced invocations of startElement and endElement.
5759  *
5760  * For now, this is only usable with custom SAX callbacks.
5761  *
5762  * HTML_PARSE_NODEFDTD
5763  *
5764  * Do not default to a doctype if none was found.
5765  *
5766  * HTML_PARSE_NOERROR
5767  *
5768  * Disable error and warning reports to the error handlers.
5769  * Errors are still accessible with xmlCtxtGetLastError.
5770  *
5771  * HTML_PARSE_NOWARNING
5772  *
5773  * Disable warning reports.
5774  *
5775  * HTML_PARSE_PEDANTIC
5776  *
5777  * No effect.
5778  *
5779  * HTML_PARSE_NOBLANKS
5780  *
5781  * Remove some text nodes containing only whitespace from the
5782  * result document. Which nodes are removed depends on a conservative
5783  * heuristic. The reindenting feature of the serialization code relies
5784  * on this option to be set when parsing. Use of this option is
5785  * DISCOURAGED.
5786  *
5787  * HTML_PARSE_NONET
5788  *
5789  * No effect.
5790  *
5791  * HTML_PARSE_NOIMPLIED
5792  *
5793  * Do not add implied html, head or body elements.
5794  *
5795  * HTML_PARSE_COMPACT
5796  *
5797  * Store small strings directly in the node struct to save
5798  * memory.
5799  *
5800  * HTML_PARSE_HUGE
5801  *
5802  * Relax some internal limits.
5803  *
5804  * Available since 2.14.0. Use XML_PARSE_HUGE works with older
5805  * versions.
5806  *
5807  * Maximum size of text nodes, tags, comments, CDATA sections
5808  *
5809  * normal: 10M
5810  * huge:    1B
5811  *
5812  * Maximum size of names, system literals, pubid literals
5813  *
5814  * normal: 50K
5815  * huge:   10M
5816  *
5817  * Maximum nesting depth of elements
5818  *
5819  * normal:  256
5820  * huge:   2048
5821  *
5822  * HTML_PARSE_IGNORE_ENC
5823  *
5824  * Ignore the encoding in the HTML declaration. This option is
5825  * mostly unneeded these days. The only effect is to enforce
5826  * UTF-8 decoding of ASCII-like data.
5827  *
5828  * HTML_PARSE_BIG_LINES
5829  *
5830  * Enable reporting of line numbers larger than 65535.
5831  *
5832  * Available since 2.14.0.
5833  *
5834  * Returns 0 in case of success, the set of unknown or unimplemented options
5835  *         in case of error.
5836  */
5837 int
htmlCtxtSetOptions(xmlParserCtxtPtr ctxt,int options)5838 htmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options)
5839 {
5840     return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
5841 }
5842 
5843 /**
5844  * htmlCtxtUseOptions:
5845  * @ctxt: an HTML parser context
5846  * @options:  a combination of htmlParserOption(s)
5847  *
5848  * DEPRECATED: Use htmlCtxtSetOptions.
5849  *
5850  * Applies the options to the parser context. The following options
5851  * are never cleared and can only be enabled:
5852  *
5853  * HTML_PARSE_NODEFDTD
5854  * HTML_PARSE_NOERROR
5855  * HTML_PARSE_NOWARNING
5856  * HTML_PARSE_NOIMPLIED
5857  * HTML_PARSE_COMPACT
5858  * HTML_PARSE_HUGE
5859  * HTML_PARSE_IGNORE_ENC
5860  * HTML_PARSE_BIG_LINES
5861  *
5862  * Returns 0 in case of success, the set of unknown or unimplemented options
5863  *         in case of error.
5864  */
5865 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)5866 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5867 {
5868     int keepMask;
5869 
5870     /*
5871      * For historic reasons, some options can only be enabled.
5872      */
5873     keepMask = HTML_PARSE_NODEFDTD |
5874                HTML_PARSE_NOERROR |
5875                HTML_PARSE_NOWARNING |
5876                HTML_PARSE_NOIMPLIED |
5877                HTML_PARSE_COMPACT |
5878                HTML_PARSE_HUGE |
5879                HTML_PARSE_IGNORE_ENC |
5880                HTML_PARSE_BIG_LINES;
5881 
5882     return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
5883 }
5884 
5885 /**
5886  * htmlCtxtParseDocument:
5887  * @ctxt:  an HTML parser context
5888  * @input:  parser input
5889  *
5890  * Parse an HTML document and return the resulting document tree.
5891  *
5892  * Available since 2.13.0.
5893  *
5894  * Returns the resulting document tree or NULL
5895  */
5896 htmlDocPtr
htmlCtxtParseDocument(htmlParserCtxtPtr ctxt,xmlParserInputPtr input)5897 htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
5898 {
5899     htmlDocPtr ret;
5900 
5901     if ((ctxt == NULL) || (input == NULL)) {
5902         xmlFatalErr(ctxt, XML_ERR_ARGUMENT, NULL);
5903         xmlFreeInputStream(input);
5904         return(NULL);
5905     }
5906 
5907     /* assert(ctxt->inputNr == 0); */
5908     while (ctxt->inputNr > 0)
5909         xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5910 
5911     if (xmlCtxtPushInput(ctxt, input) < 0) {
5912         xmlFreeInputStream(input);
5913         return(NULL);
5914     }
5915 
5916     ctxt->html = 1;
5917     htmlParseDocument(ctxt);
5918 
5919     if (ctxt->errNo != XML_ERR_NO_MEMORY) {
5920         ret = ctxt->myDoc;
5921     } else {
5922         ret = NULL;
5923         xmlFreeDoc(ctxt->myDoc);
5924     }
5925     ctxt->myDoc = NULL;
5926 
5927     /* assert(ctxt->inputNr == 1); */
5928     while (ctxt->inputNr > 0)
5929         xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5930 
5931     return(ret);
5932 }
5933 
5934 /**
5935  * htmlReadDoc:
5936  * @str:  a pointer to a zero terminated string
5937  * @url:  only used for error reporting (optoinal)
5938  * @encoding:  the document encoding (optional)
5939  * @options:  a combination of htmlParserOptions
5940  *
5941  * Convenience function to parse an HTML document from a zero-terminated
5942  * string.
5943  *
5944  * See htmlCtxtReadDoc for details.
5945  *
5946  * Returns the resulting document tree.
5947  */
5948 htmlDocPtr
htmlReadDoc(const xmlChar * str,const char * url,const char * encoding,int options)5949 htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
5950             int options)
5951 {
5952     htmlParserCtxtPtr ctxt;
5953     xmlParserInputPtr input;
5954     htmlDocPtr doc = NULL;
5955 
5956     ctxt = htmlNewParserCtxt();
5957     if (ctxt == NULL)
5958         return(NULL);
5959 
5960     htmlCtxtUseOptions(ctxt, options);
5961 
5962     input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding,
5963                                       XML_INPUT_BUF_STATIC);
5964 
5965     if (input != NULL)
5966         doc = htmlCtxtParseDocument(ctxt, input);
5967 
5968     htmlFreeParserCtxt(ctxt);
5969     return(doc);
5970 }
5971 
5972 /**
5973  * htmlReadFile:
5974  * @filename:  a file or URL
5975  * @encoding:  the document encoding (optional)
5976  * @options:  a combination of htmlParserOptions
5977  *
5978  * Convenience function to parse an HTML file from the filesystem,
5979  * the network or a global user-defined resource loader.
5980  *
5981  * See htmlCtxtReadFile for details.
5982  *
5983  * Returns the resulting document tree.
5984  */
5985 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)5986 htmlReadFile(const char *filename, const char *encoding, int options)
5987 {
5988     htmlParserCtxtPtr ctxt;
5989     xmlParserInputPtr input;
5990     htmlDocPtr doc = NULL;
5991 
5992     ctxt = htmlNewParserCtxt();
5993     if (ctxt == NULL)
5994         return(NULL);
5995 
5996     htmlCtxtUseOptions(ctxt, options);
5997 
5998     input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5999 
6000     if (input != NULL)
6001         doc = htmlCtxtParseDocument(ctxt, input);
6002 
6003     htmlFreeParserCtxt(ctxt);
6004     return(doc);
6005 }
6006 
6007 /**
6008  * htmlReadMemory:
6009  * @buffer:  a pointer to a char array
6010  * @size:  the size of the array
6011  * @url:  only used for error reporting (optional)
6012  * @encoding:  the document encoding, or NULL
6013  * @options:  a combination of htmlParserOption(s)
6014  *
6015  * Convenience function to parse an HTML document from memory.
6016  * The input buffer must not contain any terminating null bytes.
6017  *
6018  * See htmlCtxtReadMemory for details.
6019  *
6020  * Returns the resulting document tree
6021  */
6022 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * url,const char * encoding,int options)6023 htmlReadMemory(const char *buffer, int size, const char *url,
6024                const char *encoding, int options)
6025 {
6026     htmlParserCtxtPtr ctxt;
6027     xmlParserInputPtr input;
6028     htmlDocPtr doc = NULL;
6029 
6030     if (size < 0)
6031 	return(NULL);
6032 
6033     ctxt = htmlNewParserCtxt();
6034     if (ctxt == NULL)
6035         return(NULL);
6036 
6037     htmlCtxtUseOptions(ctxt, options);
6038 
6039     input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding,
6040                                       XML_INPUT_BUF_STATIC);
6041 
6042     if (input != NULL)
6043         doc = htmlCtxtParseDocument(ctxt, input);
6044 
6045     htmlFreeParserCtxt(ctxt);
6046     return(doc);
6047 }
6048 
6049 /**
6050  * htmlReadFd:
6051  * @fd:  an open file descriptor
6052  * @url:  only used for error reporting (optional)
6053  * @encoding:  the document encoding, or NULL
6054  * @options:  a combination of htmlParserOptions
6055  *
6056  * Convenience function to parse an HTML document from a
6057  * file descriptor.
6058  *
6059  * NOTE that the file descriptor will not be closed when the
6060  * context is freed or reset.
6061  *
6062  * See htmlCtxtReadFd for details.
6063  *
6064  * Returns the resulting document tree
6065  */
6066 htmlDocPtr
htmlReadFd(int fd,const char * url,const char * encoding,int options)6067 htmlReadFd(int fd, const char *url, const char *encoding, int options)
6068 {
6069     htmlParserCtxtPtr ctxt;
6070     xmlParserInputPtr input;
6071     htmlDocPtr doc = NULL;
6072 
6073     ctxt = htmlNewParserCtxt();
6074     if (ctxt == NULL)
6075         return(NULL);
6076 
6077     htmlCtxtUseOptions(ctxt, options);
6078 
6079     input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0);
6080 
6081     if (input != NULL)
6082         doc = htmlCtxtParseDocument(ctxt, input);
6083 
6084     htmlFreeParserCtxt(ctxt);
6085     return(doc);
6086 }
6087 
6088 /**
6089  * htmlReadIO:
6090  * @ioread:  an I/O read function
6091  * @ioclose:  an I/O close function (optional)
6092  * @ioctx:  an I/O handler
6093  * @url:  only used for error reporting (optional)
6094  * @encoding:  the document encoding (optional)
6095  * @options:  a combination of htmlParserOption(s)
6096  *
6097  * Convenience function to parse an HTML document from I/O functions
6098  * and context.
6099  *
6100  * See htmlCtxtReadIO for details.
6101  *
6102  * Returns the resulting document tree
6103  */
6104 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * url,const char * encoding,int options)6105 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6106           void *ioctx, const char *url, const char *encoding, int options)
6107 {
6108     htmlParserCtxtPtr ctxt;
6109     xmlParserInputPtr input;
6110     htmlDocPtr doc = NULL;
6111 
6112     ctxt = htmlNewParserCtxt();
6113     if (ctxt == NULL)
6114         return (NULL);
6115 
6116     htmlCtxtUseOptions(ctxt, options);
6117 
6118     input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx,
6119                                   encoding, 0);
6120 
6121     if (input != NULL)
6122         doc = htmlCtxtParseDocument(ctxt, input);
6123 
6124     htmlFreeParserCtxt(ctxt);
6125     return(doc);
6126 }
6127 
6128 /**
6129  * htmlCtxtReadDoc:
6130  * @ctxt:  an HTML parser context
6131  * @str:  a pointer to a zero terminated string
6132  * @URL:  only used for error reporting (optional)
6133  * @encoding:  the document encoding (optional)
6134  * @options:  a combination of htmlParserOptions
6135  *
6136  * Parse an HTML in-memory document and build a tree.
6137  *
6138  * See htmlCtxtUseOptions for details.
6139  *
6140  * Returns the resulting document tree
6141  */
6142 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6143 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6144                 const char *URL, const char *encoding, int options)
6145 {
6146     xmlParserInputPtr input;
6147 
6148     if (ctxt == NULL)
6149         return (NULL);
6150 
6151     htmlCtxtReset(ctxt);
6152     htmlCtxtUseOptions(ctxt, options);
6153 
6154     input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str,
6155                                       encoding, 0);
6156     if (input == NULL)
6157         return(NULL);
6158 
6159     return(htmlCtxtParseDocument(ctxt, input));
6160 }
6161 
6162 /**
6163  * htmlCtxtReadFile:
6164  * @ctxt:  an HTML parser context
6165  * @filename:  a file or URL
6166  * @encoding:  the document encoding (optional)
6167  * @options:  a combination of htmlParserOptions
6168  *
6169  * Parse an HTML file from the filesystem, the network or a
6170  * user-defined resource loader.
6171  *
6172  * See htmlCtxtUseOptions for details.
6173  *
6174  * Returns the resulting document tree
6175  */
6176 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6177 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6178                 const char *encoding, int options)
6179 {
6180     xmlParserInputPtr input;
6181 
6182     if (ctxt == NULL)
6183         return (NULL);
6184 
6185     htmlCtxtReset(ctxt);
6186     htmlCtxtUseOptions(ctxt, options);
6187 
6188     input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
6189     if (input == NULL)
6190         return(NULL);
6191 
6192     return(htmlCtxtParseDocument(ctxt, input));
6193 }
6194 
6195 /**
6196  * htmlCtxtReadMemory:
6197  * @ctxt:  an HTML parser context
6198  * @buffer:  a pointer to a char array
6199  * @size:  the size of the array
6200  * @URL:  only used for error reporting (optional)
6201  * @encoding:  the document encoding (optinal)
6202  * @options:  a combination of htmlParserOptions
6203  *
6204  * Parse an HTML in-memory document and build a tree. The input buffer must
6205  * not contain any terminating null bytes.
6206  *
6207  * See htmlCtxtUseOptions for details.
6208  *
6209  * Returns the resulting document tree
6210  */
6211 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6212 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6213                   const char *URL, const char *encoding, int options)
6214 {
6215     xmlParserInputPtr input;
6216 
6217     if ((ctxt == NULL) || (size < 0))
6218         return (NULL);
6219 
6220     htmlCtxtReset(ctxt);
6221     htmlCtxtUseOptions(ctxt, options);
6222 
6223     input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding,
6224                                       XML_INPUT_BUF_STATIC);
6225     if (input == NULL)
6226         return(NULL);
6227 
6228     return(htmlCtxtParseDocument(ctxt, input));
6229 }
6230 
6231 /**
6232  * htmlCtxtReadFd:
6233  * @ctxt:  an HTML parser context
6234  * @fd:  an open file descriptor
6235  * @URL:  only used for error reporting (optional)
6236  * @encoding:  the document encoding (optinal)
6237  * @options:  a combination of htmlParserOptions
6238  *
6239  * Parse an HTML from a file descriptor and build a tree.
6240  *
6241  * See htmlCtxtUseOptions for details.
6242  *
6243  * NOTE that the file descriptor will not be closed when the
6244  * context is freed or reset.
6245  *
6246  * Returns the resulting document tree
6247  */
6248 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6249 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6250               const char *URL, const char *encoding, int options)
6251 {
6252     xmlParserInputPtr input;
6253 
6254     if (ctxt == NULL)
6255         return(NULL);
6256 
6257     htmlCtxtReset(ctxt);
6258     htmlCtxtUseOptions(ctxt, options);
6259 
6260     input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0);
6261     if (input == NULL)
6262         return(NULL);
6263 
6264     return(htmlCtxtParseDocument(ctxt, input));
6265 }
6266 
6267 /**
6268  * htmlCtxtReadIO:
6269  * @ctxt:  an HTML parser context
6270  * @ioread:  an I/O read function
6271  * @ioclose:  an I/O close function
6272  * @ioctx:  an I/O handler
6273  * @URL:  the base URL to use for the document
6274  * @encoding:  the document encoding, or NULL
6275  * @options:  a combination of htmlParserOption(s)
6276  *
6277  * Parse an HTML document from I/O functions and source and build a tree.
6278  *
6279  * See htmlCtxtUseOptions for details.
6280  *
6281  * Returns the resulting document tree
6282  */
6283 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6284 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6285               xmlInputCloseCallback ioclose, void *ioctx,
6286 	      const char *URL,
6287               const char *encoding, int options)
6288 {
6289     xmlParserInputPtr input;
6290 
6291     if (ctxt == NULL)
6292         return (NULL);
6293 
6294     htmlCtxtReset(ctxt);
6295     htmlCtxtUseOptions(ctxt, options);
6296 
6297     input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx,
6298                                   encoding, 0);
6299     if (input == NULL)
6300         return(NULL);
6301 
6302     return(htmlCtxtParseDocument(ctxt, input));
6303 }
6304 
6305 #endif /* LIBXML_HTML_ENABLED */
6306