• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12 
13 #include <string.h>
14 #include <ctype.h>
15 #include <stdlib.h>
16 
17 #include <libxml/HTMLparser.h>
18 #include <libxml/xmlmemory.h>
19 #include <libxml/tree.h>
20 #include <libxml/parser.h>
21 #include <libxml/parserInternals.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/HTMLtree.h>
24 #include <libxml/entities.h>
25 #include <libxml/encoding.h>
26 #include <libxml/xmlIO.h>
27 #include <libxml/uri.h>
28 
29 #include "private/buf.h"
30 #include "private/enc.h"
31 #include "private/error.h"
32 #include "private/html.h"
33 #include "private/io.h"
34 #include "private/parser.h"
35 #include "private/tree.h"
36 
/*
 * Size limits used by the HTML parser.
 * NOTE(review): the code consuming these limits is not visible in this
 * part of the file; the names suggest a name-length cap and two scratch
 * buffer sizes -- confirm at the use sites.
 */
#define HTML_MAX_NAMELEN            1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE     100

/* File-local flag, initialised to 1; readers are outside this view. */
static int htmlOmittedDefaultValue = 1;
42 
43 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44 			     xmlChar end, xmlChar  end2, xmlChar end3);
45 static void htmlParseComment(htmlParserCtxtPtr ctxt);
46 
47 /************************************************************************
48  *									*
49  *		Some factorized error routines				*
50  *									*
51  ************************************************************************/
52 
53 /**
54  * htmlErrMemory:
55  * @ctxt:  an HTML parser context
56  * @extra:  extra information
57  *
58  * Handle a redefinition of attribute error
59  */
60 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)61 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
62 {
63     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
64         (ctxt->instate == XML_PARSER_EOF))
65 	return;
66     if (ctxt != NULL) {
67         ctxt->errNo = XML_ERR_NO_MEMORY;
68         ctxt->instate = XML_PARSER_EOF;
69         ctxt->disableSAX = 1;
70     }
71     if (extra)
72         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
73                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
74                         NULL, NULL, 0, 0,
75                         "Memory allocation failed : %s\n", extra);
76     else
77         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
78                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
79                         NULL, NULL, 0, 0, "Memory allocation failed\n");
80 }
81 
82 /**
83  * htmlParseErr:
84  * @ctxt:  an HTML parser context
85  * @error:  the error number
86  * @msg:  the error message
87  * @str1:  string infor
88  * @str2:  string infor
89  *
90  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
91  */
92 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)93 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
94              const char *msg, const xmlChar *str1, const xmlChar *str2)
95 {
96     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
97         (ctxt->instate == XML_PARSER_EOF))
98 	return;
99     if (ctxt != NULL)
100 	ctxt->errNo = error;
101     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
102                     XML_ERR_ERROR, NULL, 0,
103 		    (const char *) str1, (const char *) str2,
104 		    NULL, 0, 0,
105 		    msg, str1, str2);
106     if (ctxt != NULL)
107 	ctxt->wellFormed = 0;
108 }
109 
110 /**
111  * htmlParseErrInt:
112  * @ctxt:  an HTML parser context
113  * @error:  the error number
114  * @msg:  the error message
115  * @val:  integer info
116  *
117  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
118  */
119 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)120 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
121              const char *msg, int val)
122 {
123     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
124         (ctxt->instate == XML_PARSER_EOF))
125 	return;
126     if (ctxt != NULL)
127 	ctxt->errNo = error;
128     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
129                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
130 		    NULL, val, 0, msg, val);
131     if (ctxt != NULL)
132 	ctxt->wellFormed = 0;
133 }
134 
135 /************************************************************************
136  *									*
137  *	Parser stacks related functions and macros		*
138  *									*
139  ************************************************************************/
140 
141 /**
142  * htmlnamePush:
143  * @ctxt:  an HTML parser context
144  * @value:  the element name
145  *
146  * Pushes a new element name on top of the name stack
147  *
148  * Returns -1 in case of error, the index in the stack otherwise
149  */
150 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)151 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
152 {
153     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
154         ctxt->html = 3;
155     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
156         ctxt->html = 10;
157     if (ctxt->nameNr >= ctxt->nameMax) {
158         size_t newSize = ctxt->nameMax * 2;
159         const xmlChar **tmp;
160 
161         tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
162                          newSize * sizeof(ctxt->nameTab[0]));
163         if (tmp == NULL) {
164             htmlErrMemory(ctxt, NULL);
165             return (-1);
166         }
167         ctxt->nameTab = tmp;
168         ctxt->nameMax = newSize;
169     }
170     ctxt->nameTab[ctxt->nameNr] = value;
171     ctxt->name = value;
172     return (ctxt->nameNr++);
173 }
174 /**
175  * htmlnamePop:
176  * @ctxt: an HTML parser context
177  *
178  * Pops the top element name from the name stack
179  *
180  * Returns the name just removed
181  */
182 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)183 htmlnamePop(htmlParserCtxtPtr ctxt)
184 {
185     const xmlChar *ret;
186 
187     if (ctxt->nameNr <= 0)
188         return (NULL);
189     ctxt->nameNr--;
190     if (ctxt->nameNr < 0)
191         return (NULL);
192     if (ctxt->nameNr > 0)
193         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
194     else
195         ctxt->name = NULL;
196     ret = ctxt->nameTab[ctxt->nameNr];
197     ctxt->nameTab[ctxt->nameNr] = NULL;
198     return (ret);
199 }
200 
201 /**
202  * htmlNodeInfoPush:
203  * @ctxt:  an HTML parser context
204  * @value:  the node info
205  *
206  * Pushes a new element name on top of the node info stack
207  *
208  * Returns 0 in case of error, the index in the stack otherwise
209  */
210 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)211 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
212 {
213     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
214         if (ctxt->nodeInfoMax == 0)
215                 ctxt->nodeInfoMax = 5;
216         ctxt->nodeInfoMax *= 2;
217         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
218                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
219                                     ctxt->nodeInfoMax *
220                                     sizeof(ctxt->nodeInfoTab[0]));
221         if (ctxt->nodeInfoTab == NULL) {
222             htmlErrMemory(ctxt, NULL);
223             return (0);
224         }
225     }
226     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
227     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
228     return (ctxt->nodeInfoNr++);
229 }
230 
231 /**
232  * htmlNodeInfoPop:
233  * @ctxt:  an HTML parser context
234  *
235  * Pops the top element name from the node info stack
236  *
237  * Returns 0 in case of error, the pointer to NodeInfo otherwise
238  */
239 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)240 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
241 {
242     if (ctxt->nodeInfoNr <= 0)
243         return (NULL);
244     ctxt->nodeInfoNr--;
245     if (ctxt->nodeInfoNr < 0)
246         return (NULL);
247     if (ctxt->nodeInfoNr > 0)
248         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
249     else
250         ctxt->nodeInfo = NULL;
251     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
252 }
253 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` in scope.
 *
 * Byte-level macros: they look at raw bytes of the input buffer and should
 * only be used to compare against or skip over ASCII data.
 *
 *   CUR_PTR   the current input pointer.
 *   BASE_PTR  the base of the current input buffer.
 *   CUR       the byte at the current input pointer.
 *   RAW       -1 while ctxt->token is set, otherwise the same as CUR.
 *   NXT(n)    the byte n positions after the current one.
 *   UPPER     the current byte passed through toupper().
 *   UPP(n)    the n'th next byte passed through toupper().
 *   SKIP(n)   advance the input pointer and the column counter by n; the
 *             line counter is not touched, so the skipped bytes must not
 *             contain a newline.
 *
 * Buffer management:
 *
 *   SHRINK    call xmlParserShrink() once more than 2 * INPUT_CHUNK bytes
 *             were consumed and fewer than 2 * INPUT_CHUNK remain.
 *   GROW      call xmlParserGrow() when not in progressive mode and fewer
 *             than INPUT_CHUNK bytes remain.
 *
 * Character-level macros:
 *
 *   SKIP_BLANKS  skip blanks via htmlSkipBlankChars().
 *   NEXT         advance by one character via xmlNextChar().
 *   CUR_CHAR(l)  current character via htmlCurrentChar(); its length in
 *                bytes is stored in l.
 *   NEXTL(l)     advance by l bytes, updating line/column from the byte
 *                being left and clearing ctxt->token.
 *   COPY_BUF(l,b,i,v)  append character v of encoded length l to buffer b
 *                at index i and advance i.
 *
 * SHRINK, GROW, NEXTL and COPY_BUF expand to a single do/while(0)
 * statement and must be followed by a semicolon.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) \
    (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserShrink(ctxt);						\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserGrow(ctxt);						\
  } while (0)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
        ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (v);					\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
328 
329 /**
330  * htmlFindEncoding:
331  * @the HTML parser context
332  *
333  * Ty to find and encoding in the current data available in the input
334  * buffer this is needed to try to switch to the proper encoding when
335  * one face a character error.
336  * That's an heuristic, since it's operating outside of parsing it could
337  * try to use a meta which had been commented out, that's the reason it
338  * should only be used in case of error, not as a default.
339  *
340  * Returns an encoding string or NULL if not found, the string need to
341  *   be freed
342  */
343 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)344 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
345     const xmlChar *start, *cur, *end;
346 
347     if ((ctxt == NULL) || (ctxt->input == NULL) ||
348         (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
349         return(NULL);
350     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
351         return(NULL);
352 
353     start = ctxt->input->cur;
354     end = ctxt->input->end;
355     /* we also expect the input buffer to be zero terminated */
356     if (*end != 0)
357         return(NULL);
358 
359     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
360     if (cur == NULL)
361         return(NULL);
362     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
363     if (cur == NULL)
364         return(NULL);
365     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
366     if (cur == NULL)
367         return(NULL);
368     cur += 8;
369     start = cur;
370     while (((*cur >= 'A') && (*cur <= 'Z')) ||
371            ((*cur >= 'a') && (*cur <= 'z')) ||
372            ((*cur >= '0') && (*cur <= '9')) ||
373            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
374            cur++;
375     if (cur == start)
376         return(NULL);
377     return(xmlStrndup(start, cur - start));
378 }
379 
380 /**
381  * htmlCurrentChar:
382  * @ctxt:  the HTML parser context
383  * @len:  pointer to the length of the char read
384  *
385  * The current char value, if using UTF-8 this may actually span multiple
386  * bytes in the input buffer. Implement the end of line normalization:
387  * 2.11 End-of-Line Handling
388  * If the encoding is unspecified, in the case we find an ISO-Latin-1
389  * char, then the encoding converter is plugged in automatically.
390  *
391  * Returns the current char value and its length
392  */
393 
394 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)395 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
396     const unsigned char *cur;
397     unsigned char c;
398     unsigned int val;
399 
400     if (ctxt->instate == XML_PARSER_EOF)
401 	return(0);
402 
403     if (ctxt->token != 0) {
404 	*len = 0;
405 	return(ctxt->token);
406     }
407 
408     if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
409         xmlParserGrow(ctxt);
410         if (ctxt->instate == XML_PARSER_EOF)
411             return(0);
412     }
413 
414     if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
415         xmlChar * guess;
416         xmlCharEncodingHandlerPtr handler;
417 
418         /*
419          * Assume it's a fixed length encoding (1) with
420          * a compatible encoding for the ASCII set, since
421          * HTML constructs only use < 128 chars
422          */
423         if (*ctxt->input->cur < 0x80) {
424             *len = 1;
425             if ((*ctxt->input->cur == 0) &&
426                 (ctxt->input->cur < ctxt->input->end)) {
427                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
428                                 "Char 0x%X out of allowed range\n", 0);
429                 return(' ');
430             }
431             return(*ctxt->input->cur);
432         }
433 
434         /*
435          * Humm this is bad, do an automatic flow conversion
436          */
437         guess = htmlFindEncoding(ctxt);
438         if (guess == NULL) {
439             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
440         } else {
441             handler = xmlFindCharEncodingHandler((const char *) guess);
442             if (handler != NULL) {
443                 /*
444                  * Don't use UTF-8 encoder which isn't required and
445                  * can produce invalid UTF-8.
446                  */
447                 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
448                     xmlSwitchToEncoding(ctxt, handler);
449             } else {
450                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
451                              "Unsupported encoding %s", guess, NULL);
452             }
453             xmlFree(guess);
454         }
455         ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
456     }
457 
458     /*
459      * We are supposed to handle UTF8, check it's valid
460      * From rfc2044: encoding of the Unicode values on UTF-8:
461      *
462      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
463      * 0000 0000-0000 007F   0xxxxxxx
464      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
465      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
466      *
467      * Check for the 0x110000 limit too
468      */
469     cur = ctxt->input->cur;
470     c = *cur;
471     if (c & 0x80) {
472         size_t avail;
473 
474         if ((c & 0x40) == 0)
475             goto encoding_error;
476 
477         avail = ctxt->input->end - ctxt->input->cur;
478 
479         if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
480             goto encoding_error;
481         if ((c & 0xe0) == 0xe0) {
482             if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
483                 goto encoding_error;
484             if ((c & 0xf0) == 0xf0) {
485                 if (((c & 0xf8) != 0xf0) ||
486                     (avail < 4) || ((cur[3] & 0xc0) != 0x80))
487                     goto encoding_error;
488                 /* 4-byte code */
489                 *len = 4;
490                 val = (cur[0] & 0x7) << 18;
491                 val |= (cur[1] & 0x3f) << 12;
492                 val |= (cur[2] & 0x3f) << 6;
493                 val |= cur[3] & 0x3f;
494                 if (val < 0x10000)
495                     goto encoding_error;
496             } else {
497               /* 3-byte code */
498                 *len = 3;
499                 val = (cur[0] & 0xf) << 12;
500                 val |= (cur[1] & 0x3f) << 6;
501                 val |= cur[2] & 0x3f;
502                 if (val < 0x800)
503                     goto encoding_error;
504             }
505         } else {
506           /* 2-byte code */
507             *len = 2;
508             val = (cur[0] & 0x1f) << 6;
509             val |= cur[1] & 0x3f;
510             if (val < 0x80)
511                 goto encoding_error;
512         }
513         if (!IS_CHAR(val)) {
514             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
515                             "Char 0x%X out of allowed range\n", val);
516         }
517         return(val);
518     } else {
519         if ((*ctxt->input->cur == 0) &&
520             (ctxt->input->cur < ctxt->input->end)) {
521             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
522                             "Char 0x%X out of allowed range\n", 0);
523             *len = 1;
524             return(' ');
525         }
526         /* 1-byte code */
527         *len = 1;
528         return(*ctxt->input->cur);
529     }
530 
531 encoding_error:
532     {
533         char buffer[150];
534 
535 	if (ctxt->input->end - ctxt->input->cur >= 4) {
536 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
537 			    ctxt->input->cur[0], ctxt->input->cur[1],
538 			    ctxt->input->cur[2], ctxt->input->cur[3]);
539 	} else {
540 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
541 	}
542 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
543 		     "Input is not proper UTF-8, indicate encoding !\n",
544 		     BAD_CAST buffer, NULL);
545     }
546 
547     if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
548         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
549     *len = 1;
550     return(*ctxt->input->cur);
551 }
552 
553 /**
554  * htmlSkipBlankChars:
555  * @ctxt:  the HTML parser context
556  *
557  * skip all blanks character found at that point in the input streams.
558  *
559  * Returns the number of space chars skipped
560  */
561 
562 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)563 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
564     int res = 0;
565 
566     while (IS_BLANK_CH(*(ctxt->input->cur))) {
567         if (*(ctxt->input->cur) == '\n') {
568             ctxt->input->line++; ctxt->input->col = 1;
569         } else ctxt->input->col++;
570         ctxt->input->cur++;
571         if (*ctxt->input->cur == 0)
572             xmlParserGrow(ctxt);
573 	if (res < INT_MAX)
574 	    res++;
575     }
576     return(res);
577 }
578 
579 
580 
581 /************************************************************************
582  *									*
583  *	The list of HTML elements and their properties		*
584  *									*
585  ************************************************************************/
586 
587 /*
588  *  Start Tag: 1 means the start tag can be omitted
589  *  End Tag:   1 means the end tag can be omitted
590  *             2 means it's forbidden (empty elements)
591  *             3 means the tag is stylistic and should be closed easily
592  *  Depr:      this element is deprecated
593  *  DTD:       1 means that this element is valid only in the Loose DTD
594  *             2 means that this element is valid only in the Frameset DTD
595  *
596  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
597 	, subElements , impliedsubelt , Attributes, userdata
598  */
599 
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each group macro expands to a comma separated list of element names,
 * ready to be dropped into an array initializer. The matching NB_xxx
 * macro is the number of names in that list; composite counts are
 * parenthesized so that they can be used inside larger expressions.
 * PCDATA and MODIFIER expand to nothing.
 */

#define PCDATA
#define NB_PCDATA 0
#define MODIFIER
#define NB_MODIFIER 0

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4

#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE \
    (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)

#define EMPTY NULL
625 
626 
627 static const char* const html_flow[] = { FLOW, NULL } ;
628 static const char* const html_inline[] = { INLINE, NULL } ;
629 
630 /* placeholders: elts with content but no subelements */
631 static const char* const html_pcdata[] = { NULL } ;
632 #define html_cdata html_pcdata
633 
634 
/*
 * ... and for HTML Attributes.
 *
 * Same scheme as for the elements: each group macro expands to a comma
 * separated list of attribute names and NB_xxx is the number of names.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
649 
650 static const char* const html_attrs[] = { ATTRS, NULL } ;
651 static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
652 static const char* const core_attrs[] = { COREATTRS, NULL } ;
653 static const char* const i18n_attrs[] = { I18N, NULL } ;
654 
655 
656 /* Other declarations that should go inline ... */
657 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
658 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
659 	"tabindex", "onfocus", "onblur", NULL } ;
660 static const char* const target_attr[] = { "target", NULL } ;
661 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
662 static const char* const alt_attr[] = { "alt", NULL } ;
663 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
664 static const char* const href_attrs[] = { "href", NULL } ;
665 static const char* const clear_attrs[] = { "clear", NULL } ;
666 static const char* const inline_p[] = { INLINE, "p", NULL } ;
667 
668 static const char* const flow_param[] = { FLOW, "param", NULL } ;
669 static const char* const applet_attrs[] = { COREATTRS , "codebase",
670 		"archive", "alt", "name", "height", "width", "align",
671 		"hspace", "vspace", NULL } ;
672 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
673 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
674 static const char* const basefont_attrs[] =
675 	{ "id", "size", "color", "face", NULL } ;
676 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
677 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
678 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
679 static const char* const body_depr[] = { "background", "bgcolor", "text",
680 	"link", "vlink", "alink", NULL } ;
681 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
682 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
683 
684 
685 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
686 static const char* const col_elt[] = { "col", NULL } ;
687 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
688 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
689 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
690 static const char* const compact_attr[] = { "compact", NULL } ;
691 static const char* const label_attr[] = { "label", NULL } ;
692 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
693 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
694 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
695 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
696 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
697 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
698 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
699 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
700 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
701 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
702 static const char* const version_attr[] = { "version", NULL } ;
703 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
704 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
705 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
706 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
707 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
708 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
709 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
710 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
711 static const char* const align_attr[] = { "align", NULL } ;
712 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
713 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
714 static const char* const name_attr[] = { "name", NULL } ;
715 static const char* const action_attr[] = { "action", NULL } ;
716 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
717 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
718 static const char* const content_attr[] = { "content", NULL } ;
719 static const char* const type_attr[] = { "type", NULL } ;
720 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
721 static const char* const object_contents[] = { FLOW, "param", NULL } ;
722 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
723 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
724 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
725 static const char* const option_elt[] = { "option", NULL } ;
726 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
727 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
728 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
729 static const char* const width_attr[] = { "width", NULL } ;
730 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
731 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
732 static const char* const language_attr[] = { "language", NULL } ;
733 static const char* const select_content[] = { "optgroup", "option", NULL } ;
734 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
735 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
736 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
737 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
738 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
739 static const char* const tr_elt[] = { "tr", NULL } ;
740 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
741 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
742 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
743 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
744 static const char* const tr_contents[] = { "th", "td", NULL } ;
745 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
746 static const char* const li_elt[] = { "li", NULL } ;
747 static const char* const ul_depr[] = { "type", "compact", NULL} ;
748 static const char* const dir_attr[] = { "dir", NULL} ;
749 
/* Cast prefix turning a `const char *const []` name list into `const char **`. */
#define DECL (const char **)
751 
752 static const htmlElemDesc
753 html40ElementTable[] = {
754 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
755 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
756 },
757 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
758 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
759 },
760 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
761 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
762 },
763 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
764 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
765 },
766 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
767 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
768 },
769 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
770 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
771 },
772 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
773 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
774 },
775 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
776 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
777 },
778 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
779 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
780 },
781 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
782 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
783 },
784 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
785 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
786 },
787 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
788 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
789 },
790 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
791 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
792 },
793 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
794 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
795 },
796 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
797 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
798 },
799 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
800 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
801 },
802 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
803 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
804 },
805 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
806 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
807 },
808 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
809 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
810 },
811 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
812 	EMPTY , NULL , DECL col_attrs , NULL, NULL
813 },
814 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
815 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
816 },
817 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
818 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
819 },
820 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
821 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
822 },
823 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
824 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
825 },
826 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
827 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
828 },
829 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
830 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
831 },
832 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
833 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
834 },
835 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
836 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837 },
838 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
839 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840 },
841 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
842 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
843 },
844 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
845 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
846 },
847 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
848 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
849 },
850 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
851 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
852 },
853 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
854 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
855 },
856 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
857 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
858 },
859 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
860 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
861 },
862 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
863 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
864 },
865 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
866 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
867 },
868 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
869 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
870 },
871 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
872 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
873 },
874 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
875 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
876 },
877 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
878 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
879 },
880 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
881 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
882 },
883 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
884 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
885 },
886 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
887 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
888 },
889 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
890 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
891 },
892 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
893 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
894 },
895 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
896 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
897 },
898 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
899 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
900 },
901 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
902 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
903 },
904 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
905 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
906 },
907 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
908 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
909 },
910 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
911 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
912 },
913 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
914 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
915 },
916 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
917 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
918 },
919 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
920 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
921 },
922 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
923 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
924 },
925 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
926 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
927 },
928 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
929 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
930 },
931 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
932 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
933 },
934 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
935 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
936 },
937 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
938 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
939 },
940 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
941 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
942 },
943 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
944 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
945 },
946 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
947 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
948 },
949 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
950 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
951 },
952 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
953 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
954 },
955 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
956 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
957 },
958 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
959 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
960 },
961 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
962 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
963 },
964 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
965 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
966 },
967 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
968 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
969 },
970 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
971 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
972 },
973 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
974 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975 },
976 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
977 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
978 },
979 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
980 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
981 },
982 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
983 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
984 },
985 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
986 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
987 },
988 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
989 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
990 },
991 { "table",	0, 0, 0, 0, 0, 0, 0, "",
992 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
993 },
994 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
995 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
996 },
997 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
998 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
999 },
1000 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1001 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1002 },
1003 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
1004 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1005 },
1006 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
1007 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1008 },
1009 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
1010 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1011 },
1012 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
1013 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1014 },
1015 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
1016 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1017 },
1018 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1019 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1020 },
1021 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
1022 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1023 },
1024 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
1025 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1026 },
1027 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1028 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1029 }
1030 };
1031 
/*
 * One auto-close rule: a start tag named newTag closes a currently open
 * element named oldTag (see htmlCheckAutoClose() and htmlAutoClose()).
 */
1032 typedef struct {
1033     const char *oldTag; /* name of the element that gets closed */
1034     const char *newTag; /* name of the start tag that closes it */
1035 } htmlStartCloseEntry;
1036 
1037 /*
 * Start tags that imply the end of the current element: an entry
 * { oldTag, newTag } means that a start tag newTag closes an open
 * oldTag element.
 *
 * The entries must stay sorted by oldTag, then by newTag, in strcmp()
 * order: htmlCheckAutoClose() searches this array with bsearch() and
 * htmlCompareStartClose().
 */
1040 static const htmlStartCloseEntry htmlStartClose[] = {
1041     { "a", "a" },
1042     { "a", "fieldset" },
1043     { "a", "table" },
1044     { "a", "td" },
1045     { "a", "th" },
1046     { "address", "dd" },
1047     { "address", "dl" },
1048     { "address", "dt" },
1049     { "address", "form" },
1050     { "address", "li" },
1051     { "address", "ul" },
1052     { "b", "center" },
1053     { "b", "p" },
1054     { "b", "td" },
1055     { "b", "th" },
1056     { "big", "p" },
1057     { "caption", "col" },
1058     { "caption", "colgroup" },
1059     { "caption", "tbody" },
1060     { "caption", "tfoot" },
1061     { "caption", "thead" },
1062     { "caption", "tr" },
1063     { "col", "col" },
1064     { "col", "colgroup" },
1065     { "col", "tbody" },
1066     { "col", "tfoot" },
1067     { "col", "thead" },
1068     { "col", "tr" },
1069     { "colgroup", "colgroup" },
1070     { "colgroup", "tbody" },
1071     { "colgroup", "tfoot" },
1072     { "colgroup", "thead" },
1073     { "colgroup", "tr" },
1074     { "dd", "dt" },
1075     { "dir", "dd" },
1076     { "dir", "dl" },
1077     { "dir", "dt" },
1078     { "dir", "form" },
1079     { "dir", "ul" },
1080     { "dl", "form" },
1081     { "dl", "li" },
1082     { "dt", "dd" },
1083     { "dt", "dl" },
1084     { "font", "center" },
1085     { "font", "td" },
1086     { "font", "th" },
1087     { "form", "form" },
1088     { "h1", "fieldset" },
1089     { "h1", "form" },
1090     { "h1", "li" },
1091     { "h1", "p" },
1092     { "h1", "table" },
1093     { "h2", "fieldset" },
1094     { "h2", "form" },
1095     { "h2", "li" },
1096     { "h2", "p" },
1097     { "h2", "table" },
1098     { "h3", "fieldset" },
1099     { "h3", "form" },
1100     { "h3", "li" },
1101     { "h3", "p" },
1102     { "h3", "table" },
1103     { "h4", "fieldset" },
1104     { "h4", "form" },
1105     { "h4", "li" },
1106     { "h4", "p" },
1107     { "h4", "table" },
1108     { "h5", "fieldset" },
1109     { "h5", "form" },
1110     { "h5", "li" },
1111     { "h5", "p" },
1112     { "h5", "table" },
1113     { "h6", "fieldset" },
1114     { "h6", "form" },
1115     { "h6", "li" },
1116     { "h6", "p" },
1117     { "h6", "table" },
1118     { "head", "a" },
1119     { "head", "abbr" },
1120     { "head", "acronym" },
1121     { "head", "address" },
1122     { "head", "b" },
1123     { "head", "bdo" },
1124     { "head", "big" },
1125     { "head", "blockquote" },
1126     { "head", "body" },
1127     { "head", "br" },
1128     { "head", "center" },
1129     { "head", "cite" },
1130     { "head", "code" },
1131     { "head", "dd" },
1132     { "head", "dfn" },
1133     { "head", "dir" },
1134     { "head", "div" },
1135     { "head", "dl" },
1136     { "head", "dt" },
1137     { "head", "em" },
1138     { "head", "fieldset" },
1139     { "head", "font" },
1140     { "head", "form" },
1141     { "head", "frameset" },
1142     { "head", "h1" },
1143     { "head", "h2" },
1144     { "head", "h3" },
1145     { "head", "h4" },
1146     { "head", "h5" },
1147     { "head", "h6" },
1148     { "head", "hr" },
1149     { "head", "i" },
1150     { "head", "iframe" },
1151     { "head", "img" },
1152     { "head", "kbd" },
1153     { "head", "li" },
1154     { "head", "listing" },
1155     { "head", "map" },
1156     { "head", "menu" },
1157     { "head", "ol" },
1158     { "head", "p" },
1159     { "head", "pre" },
1160     { "head", "q" },
1161     { "head", "s" },
1162     { "head", "samp" },
1163     { "head", "small" },
1164     { "head", "span" },
1165     { "head", "strike" },
1166     { "head", "strong" },
1167     { "head", "sub" },
1168     { "head", "sup" },
1169     { "head", "table" },
1170     { "head", "tt" },
1171     { "head", "u" },
1172     { "head", "ul" },
1173     { "head", "var" },
1174     { "head", "xmp" },
1175     { "hr", "form" },
1176     { "i", "center" },
1177     { "i", "p" },
1178     { "i", "td" },
1179     { "i", "th" },
1180     { "legend", "fieldset" },
1181     { "li", "li" },
1182     { "link", "body" },
1183     { "link", "frameset" },
1184     { "listing", "dd" },
1185     { "listing", "dl" },
1186     { "listing", "dt" },
1187     { "listing", "fieldset" },
1188     { "listing", "form" },
1189     { "listing", "li" },
1190     { "listing", "table" },
1191     { "listing", "ul" },
1192     { "menu", "dd" },
1193     { "menu", "dl" },
1194     { "menu", "dt" },
1195     { "menu", "form" },
1196     { "menu", "ul" },
1197     { "ol", "form" },
1198     { "option", "optgroup" },
1199     { "option", "option" },
1200     { "p", "address" },
1201     { "p", "blockquote" },
1202     { "p", "body" },
1203     { "p", "caption" },
1204     { "p", "center" },
1205     { "p", "col" },
1206     { "p", "colgroup" },
1207     { "p", "dd" },
1208     { "p", "dir" },
1209     { "p", "div" },
1210     { "p", "dl" },
1211     { "p", "dt" },
1212     { "p", "fieldset" },
1213     { "p", "form" },
1214     { "p", "frameset" },
1215     { "p", "h1" },
1216     { "p", "h2" },
1217     { "p", "h3" },
1218     { "p", "h4" },
1219     { "p", "h5" },
1220     { "p", "h6" },
1221     { "p", "head" },
1222     { "p", "hr" },
1223     { "p", "li" },
1224     { "p", "listing" },
1225     { "p", "menu" },
1226     { "p", "ol" },
1227     { "p", "p" },
1228     { "p", "pre" },
1229     { "p", "table" },
1230     { "p", "tbody" },
1231     { "p", "td" },
1232     { "p", "tfoot" },
1233     { "p", "th" },
1234     { "p", "title" },
1235     { "p", "tr" },
1236     { "p", "ul" },
1237     { "p", "xmp" },
1238     { "pre", "dd" },
1239     { "pre", "dl" },
1240     { "pre", "dt" },
1241     { "pre", "fieldset" },
1242     { "pre", "form" },
1243     { "pre", "li" },
1244     { "pre", "table" },
1245     { "pre", "ul" },
1246     { "s", "p" },
1247     { "script", "noscript" },
1248     { "small", "p" },
1249     { "span", "td" },
1250     { "span", "th" },
1251     { "strike", "p" },
1252     { "style", "body" },
1253     { "style", "frameset" },
1254     { "tbody", "tbody" },
1255     { "tbody", "tfoot" },
1256     { "td", "tbody" },
1257     { "td", "td" },
1258     { "td", "tfoot" },
1259     { "td", "th" },
1260     { "td", "tr" },
1261     { "tfoot", "tbody" },
1262     { "th", "tbody" },
1263     { "th", "td" },
1264     { "th", "tfoot" },
1265     { "th", "th" },
1266     { "th", "tr" },
1267     { "thead", "tbody" },
1268     { "thead", "tfoot" },
1269     { "title", "body" },
1270     { "title", "frameset" },
1271     { "tr", "tbody" },
1272     { "tr", "tfoot" },
1273     { "tr", "tr" },
1274     { "tt", "p" },
1275     { "u", "p" },
1276     { "u", "td" },
1277     { "u", "th" },
1278     { "ul", "address" },
1279     { "ul", "form" },
1280     { "ul", "menu" },
1281     { "ul", "pre" },
1282     { "xmp", "dd" },
1283     { "xmp", "dl" },
1284     { "xmp", "dt" },
1285     { "xmp", "fieldset" },
1286     { "xmp", "form" },
1287     { "xmp", "li" },
1288     { "xmp", "table" },
1289     { "xmp", "ul" }
1290 };
1291 
1292 /*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 *
 * The list is NULL-terminated; htmlCheckParagraph() compares the name of
 * the current element against each entry.
 */
1299 static const char *const htmlNoContentElements[] = {
1300     "html",
1301     "head",
1302     NULL
1303 };
1304 
1305 /*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * This array has no NULL terminator: htmlIsScriptAttribute() derives the
 * number of entries with sizeof.
 */
1310 static const char *const htmlScriptAttributes[] = {
1311     "onclick",
1312     "ondblclick",
1313     "onmousedown",
1314     "onmouseup",
1315     "onmouseover",
1316     "onmousemove",
1317     "onmouseout",
1318     "onkeypress",
1319     "onkeydown",
1320     "onkeyup",
1321     "onload",
1322     "onunload",
1323     "onfocus",
1324     "onblur",
1325     "onsubmit",
1326     "onreset",
1327     "onchange",
1328     "onselect"
1329 };
1330 
1331 /*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 *
 * htmlGetEndPriority() scans the table linearly; the final entry with a
 * NULL name stops the scan and supplies the priority returned for every
 * element name that is not listed.
 */
1338 
/* Associates an element name with its end-tag priority. */
1339 typedef struct {
1340     const char *name;
1341     int priority;
1342 } elementPriority;
1343 
1344 static const elementPriority htmlEndPriority[] = {
1345     {"div",   150},
1346     {"td",    160},
1347     {"th",    160},
1348     {"tr",    170},
1349     {"thead", 180},
1350     {"tbody", 180},
1351     {"tfoot", 180},
1352     {"table", 190},
1353     {"head",  200},
1354     {"body",  200},
1355     {"html",  220},
1356     {NULL,    100} /* Default priority */
1357 };
1358 
1359 /************************************************************************
1360  *									*
1361  *	functions to handle HTML specific data			*
1362  *									*
1363  ************************************************************************/
1364 
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The body is empty, so calling this function has no effect.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1373 
1374 static int
htmlCompareTags(const void * key,const void * member)1375 htmlCompareTags(const void *key, const void *member) {
1376     const xmlChar *tag = (const xmlChar *) key;
1377     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1378 
1379     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1380 }
1381 
1382 /**
1383  * htmlTagLookup:
1384  * @tag:  The tag name in lowercase
1385  *
1386  * Lookup the HTML tag in the ElementTable
1387  *
1388  * Returns the related htmlElemDescPtr or NULL if not found.
1389  */
1390 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1391 htmlTagLookup(const xmlChar *tag) {
1392     if (tag == NULL)
1393         return(NULL);
1394 
1395     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1396                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1397                 sizeof(htmlElemDesc), htmlCompareTags));
1398 }
1399 
1400 /**
1401  * htmlGetEndPriority:
1402  * @name: The name of the element to look up the priority for.
1403  *
1404  * Return value: The "endtag" priority.
1405  **/
1406 static int
htmlGetEndPriority(const xmlChar * name)1407 htmlGetEndPriority (const xmlChar *name) {
1408     int i = 0;
1409 
1410     while ((htmlEndPriority[i].name != NULL) &&
1411 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1412 	i++;
1413 
1414     return(htmlEndPriority[i].priority);
1415 }
1416 
1417 
1418 static int
htmlCompareStartClose(const void * vkey,const void * member)1419 htmlCompareStartClose(const void *vkey, const void *member) {
1420     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1421     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1422     int ret;
1423 
1424     ret = strcmp(key->oldTag, entry->oldTag);
1425     if (ret == 0)
1426         ret = strcmp(key->newTag, entry->newTag);
1427 
1428     return(ret);
1429 }
1430 
1431 /**
1432  * htmlCheckAutoClose:
1433  * @newtag:  The new tag name
1434  * @oldtag:  The old tag name
1435  *
1436  * Checks whether the new tag is one of the registered valid tags for
1437  * closing old.
1438  *
1439  * Returns 0 if no, 1 if yes.
1440  */
1441 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1442 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1443 {
1444     htmlStartCloseEntry key;
1445     void *res;
1446 
1447     key.oldTag = (const char *) oldtag;
1448     key.newTag = (const char *) newtag;
1449     res = bsearch(&key, htmlStartClose,
1450             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1451             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1452     return(res != NULL);
1453 }
1454 
1455 /**
1456  * htmlAutoCloseOnClose:
1457  * @ctxt:  an HTML parser context
1458  * @newtag:  The new tag name
1459  * @force:  force the tag closure
1460  *
1461  * The HTML DTD allows an ending tag to implicitly close other tags.
1462  */
1463 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1464 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1465 {
1466     const htmlElemDesc *info;
1467     int i, priority;
1468 
1469     priority = htmlGetEndPriority(newtag);
1470 
1471     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1472 
1473         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1474             break;
1475         /*
1476          * A misplaced endtag can only close elements with lower
1477          * or equal priority, so if we find an element with higher
1478          * priority before we find an element with
1479          * matching name, we just ignore this endtag
1480          */
1481         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1482             return;
1483     }
1484     if (i < 0)
1485         return;
1486 
1487     while (!xmlStrEqual(newtag, ctxt->name)) {
1488         info = htmlTagLookup(ctxt->name);
1489         if ((info != NULL) && (info->endTag == 3)) {
1490             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1491 	                 "Opening and ending tag mismatch: %s and %s\n",
1492 			 newtag, ctxt->name);
1493         }
1494         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1495             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1496 	htmlnamePop(ctxt);
1497     }
1498 }
1499 
1500 /**
1501  * htmlAutoCloseOnEnd:
1502  * @ctxt:  an HTML parser context
1503  *
1504  * Close all remaining tags at the end of the stream
1505  */
1506 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1507 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1508 {
1509     int i;
1510 
1511     if (ctxt->nameNr == 0)
1512         return;
1513     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1514         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1515             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1516 	htmlnamePop(ctxt);
1517     }
1518 }
1519 
1520 /**
1521  * htmlAutoClose:
1522  * @ctxt:  an HTML parser context
1523  * @newtag:  The new tag name or NULL
1524  *
1525  * The HTML DTD allows a tag to implicitly close other tags.
1526  * The list is kept in htmlStartClose array. This function is
1527  * called when a new tag has been detected and generates the
1528  * appropriates closes if possible/needed.
1529  * If newtag is NULL this mean we are at the end of the resource
1530  * and we should check
1531  */
1532 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1533 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1534 {
1535     if (newtag == NULL)
1536         return;
1537 
1538     while ((ctxt->name != NULL) &&
1539            (htmlCheckAutoClose(newtag, ctxt->name))) {
1540         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1541             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1542 	htmlnamePop(ctxt);
1543     }
1544 }
1545 
1546 /**
1547  * htmlAutoCloseTag:
1548  * @doc:  the HTML document
1549  * @name:  The tag name
1550  * @elem:  the HTML element
1551  *
1552  * The HTML DTD allows a tag to implicitly close other tags.
1553  * The list is kept in htmlStartClose array. This function checks
1554  * if the element or one of it's children would autoclose the
1555  * given tag.
1556  *
1557  * Returns 1 if autoclose, 0 otherwise
1558  */
1559 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1560 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1561     htmlNodePtr child;
1562 
1563     if (elem == NULL) return(1);
1564     if (xmlStrEqual(name, elem->name)) return(0);
1565     if (htmlCheckAutoClose(elem->name, name)) return(1);
1566     child = elem->children;
1567     while (child != NULL) {
1568         if (htmlAutoCloseTag(doc, name, child)) return(1);
1569 	child = child->next;
1570     }
1571     return(0);
1572 }
1573 
1574 /**
1575  * htmlIsAutoClosed:
1576  * @doc:  the HTML document
1577  * @elem:  the HTML element
1578  *
1579  * The HTML DTD allows a tag to implicitly close other tags.
1580  * The list is kept in htmlStartClose array. This function checks
1581  * if a tag is autoclosed by one of it's child
1582  *
1583  * Returns 1 if autoclosed, 0 otherwise
1584  */
1585 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1586 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1587     htmlNodePtr child;
1588 
1589     if (elem == NULL) return(1);
1590     child = elem->children;
1591     while (child != NULL) {
1592 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1593 	child = child->next;
1594     }
1595     return(0);
1596 }
1597 
1598 /**
1599  * htmlCheckImplied:
1600  * @ctxt:  an HTML parser context
1601  * @newtag:  The new tag name
1602  *
1603  * The HTML DTD allows a tag to exists only implicitly
1604  * called when a new tag has been detected and generates the
1605  * appropriates implicit tags if missing
1606  */
1607 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1608 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1609     int i;
1610 
1611     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1612         return;
1613     if (!htmlOmittedDefaultValue)
1614 	return;
1615     if (xmlStrEqual(newtag, BAD_CAST"html"))
1616 	return;
1617     if (ctxt->nameNr <= 0) {
1618 	htmlnamePush(ctxt, BAD_CAST"html");
1619 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1620 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1621     }
1622     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1623         return;
1624     if ((ctxt->nameNr <= 1) &&
1625         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1626 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1627 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1628 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1629 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1630 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1631         if (ctxt->html >= 3) {
1632             /* we already saw or generated an <head> before */
1633             return;
1634         }
1635         /*
1636          * dropped OBJECT ... i you put it first BODY will be
1637          * assumed !
1638          */
1639         htmlnamePush(ctxt, BAD_CAST"head");
1640         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1641             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1642     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1643 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1644 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1645         if (ctxt->html >= 10) {
1646             /* we already saw or generated a <body> before */
1647             return;
1648         }
1649 	for (i = 0;i < ctxt->nameNr;i++) {
1650 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1651 		return;
1652 	    }
1653 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1654 		return;
1655 	    }
1656 	}
1657 
1658 	htmlnamePush(ctxt, BAD_CAST"body");
1659 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1660 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1661     }
1662 }
1663 
1664 /**
1665  * htmlCheckParagraph
1666  * @ctxt:  an HTML parser context
1667  *
1668  * Check whether a p element need to be implied before inserting
1669  * characters in the current element.
1670  *
1671  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1672  *         in case of error.
1673  */
1674 
1675 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1676 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1677     const xmlChar *tag;
1678     int i;
1679 
1680     if (ctxt == NULL)
1681 	return(-1);
1682     tag = ctxt->name;
1683     if (tag == NULL) {
1684 	htmlAutoClose(ctxt, BAD_CAST"p");
1685 	htmlCheckImplied(ctxt, BAD_CAST"p");
1686 	htmlnamePush(ctxt, BAD_CAST"p");
1687 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1688 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1689 	return(1);
1690     }
1691     if (!htmlOmittedDefaultValue)
1692 	return(0);
1693     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1694 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1695 	    htmlAutoClose(ctxt, BAD_CAST"p");
1696 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1697 	    htmlnamePush(ctxt, BAD_CAST"p");
1698 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1699 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1700 	    return(1);
1701 	}
1702     }
1703     return(0);
1704 }
1705 
1706 /**
1707  * htmlIsScriptAttribute:
1708  * @name:  an attribute name
1709  *
1710  * Check if an attribute is of content type Script
1711  *
1712  * Returns 1 is the attribute is a script 0 otherwise
1713  */
1714 int
htmlIsScriptAttribute(const xmlChar * name)1715 htmlIsScriptAttribute(const xmlChar *name) {
1716     unsigned int i;
1717 
1718     if (name == NULL)
1719       return(0);
1720     /*
1721      * all script attributes start with 'on'
1722      */
1723     if ((name[0] != 'o') || (name[1] != 'n'))
1724       return(0);
1725     for (i = 0;
1726 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1727 	 i++) {
1728 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1729 	    return(1);
1730     }
1731     return(0);
1732 }
1733 
1734 /************************************************************************
1735  *									*
1736  *	The list of HTML predefined entities			*
1737  *									*
1738  ************************************************************************/
1739 
1740 
/*
 * html40EntitiesTable:
 *
 * Table of the predefined HTML entities. Each entry holds the numeric
 * value of the character, the entity name and a human-readable
 * description string.
 *
 * The entries are listed in ascending order of value.
 * htmlEntityValueLookup() stops scanning at the first entry whose value
 * is greater than the one requested, so any new entry must be inserted
 * at the position that keeps this order.
 */
1741 static const htmlEntityDesc  html40EntitiesTable[] = {
1742 /*
1743  * the 4 absolute ones, plus apostrophe.
1744  */
1745 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1746 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
1747 { 39,	"apos",	"single quote" },
1748 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
1749 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1750 
1751 /*
1752  * A bunch still in the 128-255 range
1753  * Replacing them really depends on the charset used.
1754  */
1755 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1756 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1757 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1758 { 163,	"pound","pound sign, U+00A3 ISOnum" },
1759 { 164,	"curren","currency sign, U+00A4 ISOnum" },
1760 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1761 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1762 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
1763 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1764 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1765 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1766 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1767 { 172,	"not",	"not sign, U+00AC ISOnum" },
1768 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1769 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1770 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1771 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1772 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1773 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1774 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1775 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1776 { 181,	"micro","micro sign, U+00B5 ISOnum" },
1777 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1778 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1779 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1780 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1781 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1782 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1783 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1784 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1785 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1786 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1787 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1788 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1789 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1790 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1791 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1792 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1793 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1794 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1795 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1796 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1797 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1798 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1799 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1800 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1801 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1802 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1803 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1804 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1805 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1806 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1807 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1808 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1809 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1810 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
1811 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1812 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1813 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1814 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1815 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1816 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1817 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1818 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1819 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1820 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1821 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1822 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1823 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1824 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1825 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1826 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1827 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1828 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1829 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1830 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1831 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1832 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1833 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1834 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1835 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1836 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1837 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1838 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1839 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1840 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1841 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1842 { 247,	"divide","division sign, U+00F7 ISOnum" },
1843 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1844 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1845 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1846 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1847 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1848 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1849 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1850 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1851 
1852 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1853 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1854 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1855 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1856 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1857 
1858 /*
1859  * Anything below should really be kept as entity references
1860  */
1861 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1862 
1863 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1864 { 732,	"tilde","small tilde, U+02DC ISOdia" },
1865 
1866 { 913,	"Alpha","greek capital letter alpha, U+0391" },
1867 { 914,	"Beta",	"greek capital letter beta, U+0392" },
1868 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1869 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1870 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1871 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1872 { 919,	"Eta",	"greek capital letter eta, U+0397" },
1873 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1874 { 921,	"Iota",	"greek capital letter iota, U+0399" },
1875 { 922,	"Kappa","greek capital letter kappa, U+039A" },
1876 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1877 { 924,	"Mu",	"greek capital letter mu, U+039C" },
1878 { 925,	"Nu",	"greek capital letter nu, U+039D" },
1879 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1880 { 927,	"Omicron","greek capital letter omicron, U+039F" },
1881 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1882 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
1883 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1884 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
1885 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1886 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1887 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
1888 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1889 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1890 
1891 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1892 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1893 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1894 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1895 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1896 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1897 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1898 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1899 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1900 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1901 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1902 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1903 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1904 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1905 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1906 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1907 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1908 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1909 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1910 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1911 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1912 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1913 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1914 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1915 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1916 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1917 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1918 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1919 
1920 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
1921 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
1922 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
1923 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1924 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1925 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1926 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1927 { 8211,	"ndash","en dash, U+2013 ISOpub" },
1928 { 8212,	"mdash","em dash, U+2014 ISOpub" },
1929 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1930 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1931 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1932 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1933 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1934 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1935 { 8224,	"dagger","dagger, U+2020 ISOpub" },
1936 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1937 
1938 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1939 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1940 
1941 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
1942 
1943 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1944 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1945 
1946 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1947 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1948 
1949 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1950 { 8260,	"frasl","fraction slash, U+2044 NEW" },
1951 
1952 { 8364,	"euro",	"euro sign, U+20AC NEW" },
1953 
1954 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1955 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1956 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1957 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1958 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1959 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1960 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1961 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1962 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1963 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1964 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1965 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1966 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1967 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1968 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1969 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1970 
1971 { 8704,	"forall","for all, U+2200 ISOtech" },
1972 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
1973 { 8707,	"exist","there exists, U+2203 ISOtech" },
1974 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1975 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1976 { 8712,	"isin",	"element of, U+2208 ISOtech" },
1977 { 8713,	"notin","not an element of, U+2209 ISOtech" },
1978 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
1979 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1980 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1981 { 8722,	"minus","minus sign, U+2212 ISOtech" },
1982 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1983 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1984 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
1985 { 8734,	"infin","infinity, U+221E ISOtech" },
1986 { 8736,	"ang",	"angle, U+2220 ISOamso" },
1987 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1988 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1989 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1990 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
1991 { 8747,	"int",	"integral, U+222B ISOtech" },
1992 { 8756,	"there4","therefore, U+2234 ISOtech" },
1993 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1994 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1995 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1996 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1997 { 8801,	"equiv","identical to, U+2261 ISOtech" },
1998 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1999 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
2000 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
2001 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
2002 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
2003 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
2004 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
2005 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
2006 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
2007 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2008 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
2009 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2010 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
2011 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
2012 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
2013 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
2014 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
2015 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
2016 
2017 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
2018 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
2019 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
2020 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
2021 
2022 };
2023 
2024 /************************************************************************
2025  *									*
2026  *		Commodity functions to handle entities			*
2027  *									*
2028  ************************************************************************/
2029 
2030 /*
2031  * Macro used to grow the current buffer.
 *
 * Doubles the variable <buffer>_size and reallocates <buffer> to that
 * size with xmlRealloc(). The expansion also references a variable
 * named ctxt, so both ctxt and <buffer>_size must be in scope wherever
 * the macro is used.
 *
 * If the reallocation fails, the error is reported with htmlErrMemory(),
 * the old buffer is freed and the ENCLOSING function returns NULL, so
 * the macro is only usable inside functions returning a pointer.
 *
 * NOTE(review): the expansion is a bare { } block rather than
 * do { } while (0), and the size doubling is not checked for overflow.
2032  */
2033 #define growBuffer(buffer) {						\
2034     xmlChar *tmp;							\
2035     buffer##_size *= 2;							\
2036     tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size); 		\
2037     if (tmp == NULL) {							\
2038 	htmlErrMemory(ctxt, "growing buffer\n");			\
2039 	xmlFree(buffer);						\
2040 	return(NULL);							\
2041     }									\
2042     buffer = tmp;							\
2043 }
2044 
2045 /**
2046  * htmlEntityLookup:
2047  * @name: the entity name
2048  *
2049  * Lookup the given entity in EntitiesTable
2050  *
2051  * TODO: the linear scan is really ugly, an hash table is really needed.
2052  *
2053  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2054  */
2055 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2056 htmlEntityLookup(const xmlChar *name) {
2057     unsigned int i;
2058 
2059     for (i = 0;i < (sizeof(html40EntitiesTable)/
2060                     sizeof(html40EntitiesTable[0]));i++) {
2061         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2062             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2063 	}
2064     }
2065     return(NULL);
2066 }
2067 
2068 /**
2069  * htmlEntityValueLookup:
2070  * @value: the entity's unicode value
2071  *
2072  * Lookup the given entity in EntitiesTable
2073  *
2074  * TODO: the linear scan is really ugly, an hash table is really needed.
2075  *
2076  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2077  */
2078 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2079 htmlEntityValueLookup(unsigned int value) {
2080     unsigned int i;
2081 
2082     for (i = 0;i < (sizeof(html40EntitiesTable)/
2083                     sizeof(html40EntitiesTable[0]));i++) {
2084         if (html40EntitiesTable[i].value >= value) {
2085 	    if (html40EntitiesTable[i].value > value)
2086 		break;
2087             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2088 	}
2089     }
2090     return(NULL);
2091 }
2092 
2093 /**
2094  * UTF8ToHtml:
2095  * @out:  a pointer to an array of bytes to store the result
2096  * @outlen:  the length of @out
2097  * @in:  a pointer to an array of UTF-8 chars
2098  * @inlen:  the length of @in
2099  *
2100  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2101  * plus HTML entities block of chars out.
2102  *
2103  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2104  * The value of @inlen after return is the number of octets consumed
2105  *     as the return value is positive, else unpredictable.
2106  * The value of @outlen after return is the number of octets consumed.
2107  */
2108 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2109 UTF8ToHtml(unsigned char* out, int *outlen,
2110               const unsigned char* in, int *inlen) {
2111     const unsigned char* processed = in;
2112     const unsigned char* outend;
2113     const unsigned char* outstart = out;
2114     const unsigned char* instart = in;
2115     const unsigned char* inend;
2116     unsigned int c, d;
2117     int trailing;
2118 
2119     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2120     if (in == NULL) {
2121         /*
2122 	 * initialization nothing to do
2123 	 */
2124 	*outlen = 0;
2125 	*inlen = 0;
2126 	return(0);
2127     }
2128     inend = in + (*inlen);
2129     outend = out + (*outlen);
2130     while (in < inend) {
2131 	d = *in++;
2132 	if      (d < 0x80)  { c= d; trailing= 0; }
2133 	else if (d < 0xC0) {
2134 	    /* trailing byte in leading position */
2135 	    *outlen = out - outstart;
2136 	    *inlen = processed - instart;
2137 	    return(-2);
2138         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2139         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2140         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2141 	else {
2142 	    /* no chance for this in Ascii */
2143 	    *outlen = out - outstart;
2144 	    *inlen = processed - instart;
2145 	    return(-2);
2146 	}
2147 
2148 	if (inend - in < trailing) {
2149 	    break;
2150 	}
2151 
2152 	for ( ; trailing; trailing--) {
2153 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2154 		break;
2155 	    c <<= 6;
2156 	    c |= d & 0x3F;
2157 	}
2158 
2159 	/* assertion: c is a single UTF-4 value */
2160 	if (c < 0x80) {
2161 	    if (out + 1 >= outend)
2162 		break;
2163 	    *out++ = c;
2164 	} else {
2165 	    int len;
2166 	    const htmlEntityDesc * ent;
2167 	    const char *cp;
2168 	    char nbuf[16];
2169 
2170 	    /*
2171 	     * Try to lookup a predefined HTML entity for it
2172 	     */
2173 
2174 	    ent = htmlEntityValueLookup(c);
2175 	    if (ent == NULL) {
2176 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2177 	      cp = nbuf;
2178 	    }
2179 	    else
2180 	      cp = ent->name;
2181 	    len = strlen(cp);
2182 	    if (out + 2 + len >= outend)
2183 		break;
2184 	    *out++ = '&';
2185 	    memcpy(out, cp, len);
2186 	    out += len;
2187 	    *out++ = ';';
2188 	}
2189 	processed = in;
2190     }
2191     *outlen = out - outstart;
2192     *inlen = processed - instart;
2193     return(0);
2194 }
2195 
2196 /**
2197  * htmlEncodeEntities:
2198  * @out:  a pointer to an array of bytes to store the result
2199  * @outlen:  the length of @out
2200  * @in:  a pointer to an array of UTF-8 chars
2201  * @inlen:  the length of @in
2202  * @quoteChar: the quote character to escape (' or ") or zero.
2203  *
2204  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2205  * plus HTML entities block of chars out.
2206  *
2207  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2208  * The value of @inlen after return is the number of octets consumed
2209  *     as the return value is positive, else unpredictable.
2210  * The value of @outlen after return is the number of octets consumed.
2211  */
2212 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2213 htmlEncodeEntities(unsigned char* out, int *outlen,
2214 		   const unsigned char* in, int *inlen, int quoteChar) {
2215     const unsigned char* processed = in;
2216     const unsigned char* outend;
2217     const unsigned char* outstart = out;
2218     const unsigned char* instart = in;
2219     const unsigned char* inend;
2220     unsigned int c, d;
2221     int trailing;
2222 
2223     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2224         return(-1);
2225     outend = out + (*outlen);
2226     inend = in + (*inlen);
2227     while (in < inend) {
2228 	d = *in++;
2229 	if      (d < 0x80)  { c= d; trailing= 0; }
2230 	else if (d < 0xC0) {
2231 	    /* trailing byte in leading position */
2232 	    *outlen = out - outstart;
2233 	    *inlen = processed - instart;
2234 	    return(-2);
2235         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2236         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2237         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2238 	else {
2239 	    /* no chance for this in Ascii */
2240 	    *outlen = out - outstart;
2241 	    *inlen = processed - instart;
2242 	    return(-2);
2243 	}
2244 
2245 	if (inend - in < trailing)
2246 	    break;
2247 
2248 	while (trailing--) {
2249 	    if (((d= *in++) & 0xC0) != 0x80) {
2250 		*outlen = out - outstart;
2251 		*inlen = processed - instart;
2252 		return(-2);
2253 	    }
2254 	    c <<= 6;
2255 	    c |= d & 0x3F;
2256 	}
2257 
2258 	/* assertion: c is a single UTF-4 value */
2259 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2260 	    (c != '&') && (c != '<') && (c != '>')) {
2261 	    if (out >= outend)
2262 		break;
2263 	    *out++ = c;
2264 	} else {
2265 	    const htmlEntityDesc * ent;
2266 	    const char *cp;
2267 	    char nbuf[16];
2268 	    int len;
2269 
2270 	    /*
2271 	     * Try to lookup a predefined HTML entity for it
2272 	     */
2273 	    ent = htmlEntityValueLookup(c);
2274 	    if (ent == NULL) {
2275 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2276 		cp = nbuf;
2277 	    }
2278 	    else
2279 		cp = ent->name;
2280 	    len = strlen(cp);
2281 	    if (outend - out < len + 2)
2282 		break;
2283 	    *out++ = '&';
2284 	    memcpy(out, cp, len);
2285 	    out += len;
2286 	    *out++ = ';';
2287 	}
2288 	processed = in;
2289     }
2290     *outlen = out - outstart;
2291     *inlen = processed - instart;
2292     return(0);
2293 }
2294 
2295 /************************************************************************
2296  *									*
2297  *		Commodity functions to handle streams			*
2298  *									*
2299  ************************************************************************/
2300 
2301 #ifdef LIBXML_PUSH_ENABLED
2302 /**
2303  * htmlNewInputStream:
2304  * @ctxt:  an HTML parser context
2305  *
2306  * Create a new input stream structure
2307  * Returns the new input stream or NULL
2308  */
2309 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2310 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2311     htmlParserInputPtr input;
2312 
2313     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2314     if (input == NULL) {
2315         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2316 	return(NULL);
2317     }
2318     memset(input, 0, sizeof(htmlParserInput));
2319     input->filename = NULL;
2320     input->directory = NULL;
2321     input->base = NULL;
2322     input->cur = NULL;
2323     input->buf = NULL;
2324     input->line = 1;
2325     input->col = 1;
2326     input->buf = NULL;
2327     input->free = NULL;
2328     input->version = NULL;
2329     input->consumed = 0;
2330     input->length = 0;
2331     return(input);
2332 }
2333 #endif
2334 
2335 
2336 /************************************************************************
2337  *									*
2338  *		Commodity functions, cleanup needed ?			*
2339  *									*
2340  ************************************************************************/
/*
 * Names of all tags allowing PCDATA in the HTML 4.01 loose DTD, in
 * alphabetical order. The list is only read, so both the strings and
 * the array of pointers are declared const.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2355 
2356 /**
2357  * areBlanks:
2358  * @ctxt:  an HTML parser context
2359  * @str:  a xmlChar *
2360  * @len:  the size of @str
2361  *
2362  * Is this a sequence of blank chars that one can ignore ?
2363  *
2364  * Returns 1 if ignorable 0 otherwise.
2365  */
2366 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2367 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2368     unsigned int i;
2369     int j;
2370     xmlNodePtr lastChild;
2371     xmlDtdPtr dtd;
2372 
2373     for (j = 0;j < len;j++)
2374         if (!(IS_BLANK_CH(str[j]))) return(0);
2375 
2376     if (CUR == 0) return(1);
2377     if (CUR != '<') return(0);
2378     if (ctxt->name == NULL)
2379 	return(1);
2380     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2381 	return(1);
2382     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2383 	return(1);
2384 
2385     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2386     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2387         dtd = xmlGetIntSubset(ctxt->myDoc);
2388         if (dtd != NULL && dtd->ExternalID != NULL) {
2389             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2390                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2391                 return(1);
2392         }
2393     }
2394 
2395     if (ctxt->node == NULL) return(0);
2396     lastChild = xmlGetLastChild(ctxt->node);
2397     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2398 	lastChild = lastChild->prev;
2399     if (lastChild == NULL) {
2400         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2401             (ctxt->node->content != NULL)) return(0);
2402 	/* keep ws in constructs like ...<b> </b>...
2403 	   for all tags "b" allowing PCDATA */
2404 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2405 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2406 		return(0);
2407 	    }
2408 	}
2409     } else if (xmlNodeIsText(lastChild)) {
2410         return(0);
2411     } else {
2412 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2413 	   for all tags "p" allowing PCDATA */
2414 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2415 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2416 		return(0);
2417 	    }
2418 	}
2419     }
2420     return(1);
2421 }
2422 
2423 /**
2424  * htmlNewDocNoDtD:
2425  * @URI:  URI for the dtd, or NULL
2426  * @ExternalID:  the external ID of the DTD, or NULL
2427  *
2428  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2429  * are NULL
2430  *
2431  * Returns a new document, do not initialize the DTD if not provided
2432  */
2433 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2434 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2435     xmlDocPtr cur;
2436 
2437     /*
2438      * Allocate a new document and fill the fields.
2439      */
2440     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2441     if (cur == NULL) {
2442 	htmlErrMemory(NULL, "HTML document creation failed\n");
2443 	return(NULL);
2444     }
2445     memset(cur, 0, sizeof(xmlDoc));
2446 
2447     cur->type = XML_HTML_DOCUMENT_NODE;
2448     cur->version = NULL;
2449     cur->intSubset = NULL;
2450     cur->doc = cur;
2451     cur->name = NULL;
2452     cur->children = NULL;
2453     cur->extSubset = NULL;
2454     cur->oldNs = NULL;
2455     cur->encoding = NULL;
2456     cur->standalone = 1;
2457     cur->compression = 0;
2458     cur->ids = NULL;
2459     cur->refs = NULL;
2460     cur->_private = NULL;
2461     cur->charset = XML_CHAR_ENCODING_UTF8;
2462     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2463     if ((ExternalID != NULL) ||
2464 	(URI != NULL))
2465 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2466     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2467 	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2468     return(cur);
2469 }
2470 
2471 /**
2472  * htmlNewDoc:
2473  * @URI:  URI for the dtd, or NULL
2474  * @ExternalID:  the external ID of the DTD, or NULL
2475  *
2476  * Creates a new HTML document
2477  *
2478  * Returns a new document
2479  */
2480 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2481 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2482     if ((URI == NULL) && (ExternalID == NULL))
2483 	return(htmlNewDocNoDtD(
2484 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2485 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2486 
2487     return(htmlNewDocNoDtD(URI, ExternalID));
2488 }
2489 
2490 
2491 /************************************************************************
2492  *									*
2493  *			The parser itself				*
2494  *	Relates to http://www.w3.org/TR/html40				*
2495  *									*
2496  ************************************************************************/
2497 
2498 /************************************************************************
2499  *									*
2500  *			The parser itself				*
2501  *									*
2502  ************************************************************************/
2503 
2504 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2505 
2506 static void
htmlSkipBogusComment(htmlParserCtxtPtr ctxt)2507 htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2508     int c;
2509 
2510     htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2511                  "Incorrectly opened comment\n", NULL, NULL);
2512 
2513     do {
2514         c = CUR;
2515         if (c == 0)
2516             break;
2517         NEXT;
2518     } while (c != '>');
2519 }
2520 
2521 /**
2522  * htmlParseHTMLName:
2523  * @ctxt:  an HTML parser context
2524  *
2525  * parse an HTML tag or attribute name, note that we convert it to lowercase
2526  * since HTML names are not case-sensitive.
2527  *
2528  * Returns the Tag Name parsed or NULL
2529  */
2530 
2531 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2532 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2533     const xmlChar *ret;
2534     int i = 0;
2535     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2536 
2537     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2538         (CUR != ':') && (CUR != '.')) return(NULL);
2539 
2540     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2541            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2542 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2543            (CUR == '.'))) {
2544 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2545         else loc[i] = CUR;
2546 	i++;
2547 
2548 	NEXT;
2549     }
2550 
2551     ret = xmlDictLookup(ctxt->dict, loc, i);
2552     if (ret == NULL)
2553         htmlErrMemory(ctxt, NULL);
2554 
2555     return(ret);
2556 }
2557 
2558 
2559 /**
2560  * htmlParseHTMLName_nonInvasive:
2561  * @ctxt:  an HTML parser context
2562  *
2563  * parse an HTML tag or attribute name, note that we convert it to lowercase
2564  * since HTML names are not case-sensitive, this doesn't consume the data
2565  * from the stream, it's a look-ahead
2566  *
2567  * Returns the Tag Name parsed or NULL
2568  */
2569 
2570 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2571 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2572     int i = 0;
2573     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574 
2575     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2576         (NXT(1) != ':')) return(NULL);
2577 
2578     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2580 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2581 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2582         else loc[i] = NXT(1+i);
2583 	i++;
2584     }
2585 
2586     return(xmlDictLookup(ctxt->dict, loc, i));
2587 }
2588 
2589 
2590 /**
2591  * htmlParseName:
2592  * @ctxt:  an HTML parser context
2593  *
2594  * parse an HTML name, this routine is case sensitive.
2595  *
2596  * Returns the Name parsed or NULL
2597  */
2598 
2599 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2600 htmlParseName(htmlParserCtxtPtr ctxt) {
2601     const xmlChar *in;
2602     const xmlChar *ret;
2603     int count = 0;
2604 
2605     GROW;
2606 
2607     /*
2608      * Accelerator for simple ASCII names
2609      */
2610     in = ctxt->input->cur;
2611     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2612 	((*in >= 0x41) && (*in <= 0x5A)) ||
2613 	(*in == '_') || (*in == ':')) {
2614 	in++;
2615 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2616 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2617 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2618 	       (*in == '_') || (*in == '-') ||
2619 	       (*in == ':') || (*in == '.'))
2620 	    in++;
2621 
2622 	if (in == ctxt->input->end)
2623 	    return(NULL);
2624 
2625 	if ((*in > 0) && (*in < 0x80)) {
2626 	    count = in - ctxt->input->cur;
2627 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2628 	    ctxt->input->cur = in;
2629 	    ctxt->input->col += count;
2630 	    return(ret);
2631 	}
2632     }
2633     return(htmlParseNameComplex(ctxt));
2634 }
2635 
2636 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2637 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2638     int len = 0, l;
2639     int c;
2640     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2641                     XML_MAX_TEXT_LENGTH :
2642                     XML_MAX_NAME_LENGTH;
2643     const xmlChar *base = ctxt->input->base;
2644 
2645     /*
2646      * Handler for more complex cases
2647      */
2648     c = CUR_CHAR(l);
2649     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2650 	(!IS_LETTER(c) && (c != '_') &&
2651          (c != ':'))) {
2652 	return(NULL);
2653     }
2654 
2655     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2656 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2657             (c == '.') || (c == '-') ||
2658 	    (c == '_') || (c == ':') ||
2659 	    (IS_COMBINING(c)) ||
2660 	    (IS_EXTENDER(c)))) {
2661 	len += l;
2662         if (len > maxLength) {
2663             htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2664             return(NULL);
2665         }
2666 	NEXTL(l);
2667 	c = CUR_CHAR(l);
2668 	if (ctxt->input->base != base) {
2669 	    /*
2670 	     * We changed encoding from an unknown encoding
2671 	     * Input buffer changed location, so we better start again
2672 	     */
2673 	    return(htmlParseNameComplex(ctxt));
2674 	}
2675     }
2676     if (ctxt->instate == XML_PARSER_EOF)
2677         return(NULL);
2678 
2679     if (ctxt->input->cur - ctxt->input->base < len) {
2680         /* Sanity check */
2681 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2682                      "unexpected change of input buffer", NULL, NULL);
2683         return (NULL);
2684     }
2685 
2686     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2687 }
2688 
2689 
2690 /**
2691  * htmlParseHTMLAttribute:
2692  * @ctxt:  an HTML parser context
2693  * @stop:  a char stop value
2694  *
2695  * parse an HTML attribute value till the stop (quote), if
2696  * stop is 0 then it stops at the first space
2697  *
2698  * Returns the attribute parsed or NULL
2699  */
2700 
2701 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2702 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2703     xmlChar *buffer = NULL;
2704     int buffer_size = 0;
2705     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2706                     XML_MAX_HUGE_LENGTH :
2707                     XML_MAX_TEXT_LENGTH;
2708     xmlChar *out = NULL;
2709     const xmlChar *name = NULL;
2710     const xmlChar *cur = NULL;
2711     const htmlEntityDesc * ent;
2712 
2713     /*
2714      * allocate a translation buffer.
2715      */
2716     buffer_size = HTML_PARSER_BUFFER_SIZE;
2717     buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2718     if (buffer == NULL) {
2719 	htmlErrMemory(ctxt, "buffer allocation failed\n");
2720 	return(NULL);
2721     }
2722     out = buffer;
2723 
2724     /*
2725      * Ok loop until we reach one of the ending chars
2726      */
2727     while ((CUR != 0) && (CUR != stop)) {
2728 	if ((stop == 0) && (CUR == '>')) break;
2729 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2730         if (CUR == '&') {
2731 	    if (NXT(1) == '#') {
2732 		unsigned int c;
2733 		int bits;
2734 
2735 		c = htmlParseCharRef(ctxt);
2736 		if      (c <    0x80)
2737 		        { *out++  = c;                bits= -6; }
2738 		else if (c <   0x800)
2739 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2740 		else if (c < 0x10000)
2741 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2742 		else
2743 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2744 
2745 		for ( ; bits >= 0; bits-= 6) {
2746 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2747 		}
2748 
2749 		if (out - buffer > buffer_size - 100) {
2750 			int indx = out - buffer;
2751 
2752 			growBuffer(buffer);
2753 			out = &buffer[indx];
2754 		}
2755 	    } else {
2756 		ent = htmlParseEntityRef(ctxt, &name);
2757 		if (name == NULL) {
2758 		    *out++ = '&';
2759 		    if (out - buffer > buffer_size - 100) {
2760 			int indx = out - buffer;
2761 
2762 			growBuffer(buffer);
2763 			out = &buffer[indx];
2764 		    }
2765 		} else if (ent == NULL) {
2766 		    *out++ = '&';
2767 		    cur = name;
2768 		    while (*cur != 0) {
2769 			if (out - buffer > buffer_size - 100) {
2770 			    int indx = out - buffer;
2771 
2772 			    growBuffer(buffer);
2773 			    out = &buffer[indx];
2774 			}
2775 			*out++ = *cur++;
2776 		    }
2777 		} else {
2778 		    unsigned int c;
2779 		    int bits;
2780 
2781 		    if (out - buffer > buffer_size - 100) {
2782 			int indx = out - buffer;
2783 
2784 			growBuffer(buffer);
2785 			out = &buffer[indx];
2786 		    }
2787 		    c = ent->value;
2788 		    if      (c <    0x80)
2789 			{ *out++  = c;                bits= -6; }
2790 		    else if (c <   0x800)
2791 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2792 		    else if (c < 0x10000)
2793 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2794 		    else
2795 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2796 
2797 		    for ( ; bits >= 0; bits-= 6) {
2798 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2799 		    }
2800 		}
2801 	    }
2802 	} else {
2803 	    unsigned int c;
2804 	    int bits, l;
2805 
2806 	    if (out - buffer > buffer_size - 100) {
2807 		int indx = out - buffer;
2808 
2809 		growBuffer(buffer);
2810 		out = &buffer[indx];
2811 	    }
2812 	    c = CUR_CHAR(l);
2813             if (ctxt->instate == XML_PARSER_EOF) {
2814                 xmlFree(buffer);
2815                 return(NULL);
2816             }
2817 	    if      (c <    0x80)
2818 		    { *out++  = c;                bits= -6; }
2819 	    else if (c <   0x800)
2820 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2821 	    else if (c < 0x10000)
2822 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2823 	    else
2824 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2825 
2826 	    for ( ; bits >= 0; bits-= 6) {
2827 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2828 	    }
2829 	    NEXTL(l);
2830 	}
2831         if (out - buffer > maxLength) {
2832             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2833                          "attribute value too long\n", NULL, NULL);
2834             xmlFree(buffer);
2835             return(NULL);
2836         }
2837     }
2838     *out = 0;
2839     return(buffer);
2840 }
2841 
2842 /**
2843  * htmlParseEntityRef:
2844  * @ctxt:  an HTML parser context
2845  * @str:  location to store the entity name
2846  *
2847  * DEPRECATED: Internal function, don't use.
2848  *
2849  * parse an HTML ENTITY references
2850  *
2851  * [68] EntityRef ::= '&' Name ';'
2852  *
2853  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2854  *         if non-NULL *str will have to be freed by the caller.
2855  */
2856 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2857 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2858     const xmlChar *name;
2859     const htmlEntityDesc * ent = NULL;
2860 
2861     if (str != NULL) *str = NULL;
2862     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2863 
2864     if (CUR == '&') {
2865         NEXT;
2866         name = htmlParseName(ctxt);
2867 	if (name == NULL) {
2868 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2869 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2870 	} else {
2871 	    GROW;
2872 	    if (CUR == ';') {
2873 	        if (str != NULL)
2874 		    *str = name;
2875 
2876 		/*
2877 		 * Lookup the entity in the table.
2878 		 */
2879 		ent = htmlEntityLookup(name);
2880 		if (ent != NULL) /* OK that's ugly !!! */
2881 		    NEXT;
2882 	    } else {
2883 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2884 		             "htmlParseEntityRef: expecting ';'\n",
2885 			     NULL, NULL);
2886 	        if (str != NULL)
2887 		    *str = name;
2888 	    }
2889 	}
2890     }
2891     return(ent);
2892 }
2893 
2894 /**
2895  * htmlParseAttValue:
2896  * @ctxt:  an HTML parser context
2897  *
2898  * parse a value for an attribute
2899  * Note: the parser won't do substitution of entities here, this
2900  * will be handled later in xmlStringGetNodeList, unless it was
2901  * asked for ctxt->replaceEntities != 0
2902  *
2903  * Returns the AttValue parsed or NULL.
2904  */
2905 
2906 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2907 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2908     xmlChar *ret = NULL;
2909 
2910     if (CUR == '"') {
2911         NEXT;
2912 	ret = htmlParseHTMLAttribute(ctxt, '"');
2913         if (CUR != '"') {
2914 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2915 	                 "AttValue: \" expected\n", NULL, NULL);
2916 	} else
2917 	    NEXT;
2918     } else if (CUR == '\'') {
2919         NEXT;
2920 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2921         if (CUR != '\'') {
2922 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2923 	                 "AttValue: ' expected\n", NULL, NULL);
2924 	} else
2925 	    NEXT;
2926     } else {
2927         /*
2928 	 * That's an HTMLism, the attribute value may not be quoted
2929 	 */
2930 	ret = htmlParseHTMLAttribute(ctxt, 0);
2931 	if (ret == NULL) {
2932 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2933 	                 "AttValue: no value found\n", NULL, NULL);
2934 	}
2935     }
2936     return(ret);
2937 }
2938 
2939 /**
2940  * htmlParseSystemLiteral:
2941  * @ctxt:  an HTML parser context
2942  *
2943  * parse an HTML Literal
2944  *
2945  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2946  *
2947  * Returns the SystemLiteral parsed or NULL
2948  */
2949 
2950 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2951 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2952     size_t len = 0, startPosition = 0;
2953     int err = 0;
2954     int quote;
2955     xmlChar *ret = NULL;
2956 
2957     if ((CUR != '"') && (CUR != '\'')) {
2958 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2959 	             "SystemLiteral \" or ' expected\n", NULL, NULL);
2960         return(NULL);
2961     }
2962     quote = CUR;
2963     NEXT;
2964 
2965     if (CUR_PTR < BASE_PTR)
2966         return(ret);
2967     startPosition = CUR_PTR - BASE_PTR;
2968 
2969     while ((CUR != 0) && (CUR != quote)) {
2970         /* TODO: Handle UTF-8 */
2971         if (!IS_CHAR_CH(CUR)) {
2972             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2973                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2974             err = 1;
2975         }
2976         NEXT;
2977         len++;
2978     }
2979     if (CUR != quote) {
2980         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2981                      "Unfinished SystemLiteral\n", NULL, NULL);
2982     } else {
2983         if (err == 0)
2984             ret = xmlStrndup((BASE_PTR+startPosition), len);
2985         NEXT;
2986     }
2987 
2988     return(ret);
2989 }
2990 
2991 /**
2992  * htmlParsePubidLiteral:
2993  * @ctxt:  an HTML parser context
2994  *
2995  * parse an HTML public literal
2996  *
2997  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2998  *
2999  * Returns the PubidLiteral parsed or NULL.
3000  */
3001 
3002 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)3003 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3004     size_t len = 0, startPosition = 0;
3005     int err = 0;
3006     int quote;
3007     xmlChar *ret = NULL;
3008 
3009     if ((CUR != '"') && (CUR != '\'')) {
3010 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3011 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
3012         return(NULL);
3013     }
3014     quote = CUR;
3015     NEXT;
3016 
3017     /*
3018      * Name ::= (Letter | '_') (NameChar)*
3019      */
3020     if (CUR_PTR < BASE_PTR)
3021         return(ret);
3022     startPosition = CUR_PTR - BASE_PTR;
3023 
3024     while ((CUR != 0) && (CUR != quote)) {
3025         if (!IS_PUBIDCHAR_CH(CUR)) {
3026             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3027                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3028             err = 1;
3029         }
3030         len++;
3031         NEXT;
3032     }
3033 
3034     if (CUR != quote) {
3035         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3036                      "Unfinished PubidLiteral\n", NULL, NULL);
3037     } else {
3038         if (err == 0)
3039             ret = xmlStrndup((BASE_PTR + startPosition), len);
3040         NEXT;
3041     }
3042 
3043     return(ret);
3044 }
3045 
3046 /**
3047  * htmlParseScript:
3048  * @ctxt:  an HTML parser context
3049  *
3050  * parse the content of an HTML SCRIPT or STYLE element
3051  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3052  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3053  * http://www.w3.org/TR/html4/types.html#type-script
3054  * http://www.w3.org/TR/html4/types.html#h-6.15
3055  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3056  *
3057  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3058  * element and the value of intrinsic event attributes. User agents must
3059  * not evaluate script data as HTML markup but instead must pass it on as
3060  * data to a script engine.
3061  * NOTES:
3062  * - The content is passed like CDATA
3063  * - the attributes for style and scripting "onXXX" are also described
3064  *   as CDATA but SGML allows entities references in attributes so their
3065  *   processing is identical as other attributes
3066  */
3067 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3068 htmlParseScript(htmlParserCtxtPtr ctxt) {
3069     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3070     int nbchar = 0;
3071     int cur,l;
3072 
3073     cur = CUR_CHAR(l);
3074     while (cur != 0) {
3075 	if ((cur == '<') && (NXT(1) == '/')) {
3076             /*
3077              * One should break here, the specification is clear:
3078              * Authors should therefore escape "</" within the content.
3079              * Escape mechanisms are specific to each scripting or
3080              * style sheet language.
3081              *
3082              * In recovery mode, only break if end tag match the
3083              * current tag, effectively ignoring all tags inside the
3084              * script/style block and treating the entire block as
3085              * CDATA.
3086              */
3087             if (ctxt->recovery) {
3088                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3089 				   xmlStrlen(ctxt->name)) == 0)
3090                 {
3091                     break; /* while */
3092                 } else {
3093 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3094 				 "Element %s embeds close tag\n",
3095 		                 ctxt->name, NULL);
3096 		}
3097             } else {
3098                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3099                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3100                 {
3101                     break; /* while */
3102                 }
3103             }
3104 	}
3105         if (IS_CHAR(cur)) {
3106 	    COPY_BUF(l,buf,nbchar,cur);
3107         } else {
3108             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3109                             "Invalid char in CDATA 0x%X\n", cur);
3110         }
3111 	NEXTL(l);
3112 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3113             buf[nbchar] = 0;
3114 	    if (ctxt->sax->cdataBlock!= NULL) {
3115 		/*
3116 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3117 		 */
3118 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3119 	    } else if (ctxt->sax->characters != NULL) {
3120 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121 	    }
3122 	    nbchar = 0;
3123             SHRINK;
3124 	}
3125 	cur = CUR_CHAR(l);
3126     }
3127 
3128     if (ctxt->instate == XML_PARSER_EOF)
3129         return;
3130 
3131     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3132         buf[nbchar] = 0;
3133 	if (ctxt->sax->cdataBlock!= NULL) {
3134 	    /*
3135 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3136 	     */
3137 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3138 	} else if (ctxt->sax->characters != NULL) {
3139 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3140 	}
3141     }
3142 }
3143 
3144 
3145 /**
3146  * htmlParseCharDataInternal:
3147  * @ctxt:  an HTML parser context
3148  * @readahead: optional read ahead character in ascii range
3149  *
3150  * parse a CharData section.
3151  * if we are within a CDATA section ']]>' marks an end of section.
3152  *
3153  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3154  */
3155 
3156 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3157 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3158     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3159     int nbchar = 0;
3160     int cur, l;
3161 
3162     if (readahead)
3163         buf[nbchar++] = readahead;
3164 
3165     cur = CUR_CHAR(l);
3166     while (((cur != '<') || (ctxt->token == '<')) &&
3167            ((cur != '&') || (ctxt->token == '&')) &&
3168 	   (cur != 0)) {
3169 	if (!(IS_CHAR(cur))) {
3170 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3171 	                "Invalid char in CDATA 0x%X\n", cur);
3172 	} else {
3173 	    COPY_BUF(l,buf,nbchar,cur);
3174 	}
3175 	NEXTL(l);
3176 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3177             buf[nbchar] = 0;
3178 
3179 	    /*
3180 	     * Ok the segment is to be consumed as chars.
3181 	     */
3182 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3183 		if (areBlanks(ctxt, buf, nbchar)) {
3184 		    if (ctxt->keepBlanks) {
3185 			if (ctxt->sax->characters != NULL)
3186 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3187 		    } else {
3188 			if (ctxt->sax->ignorableWhitespace != NULL)
3189 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3190 			                                   buf, nbchar);
3191 		    }
3192 		} else {
3193 		    htmlCheckParagraph(ctxt);
3194 		    if (ctxt->sax->characters != NULL)
3195 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3196 		}
3197 	    }
3198 	    nbchar = 0;
3199             SHRINK;
3200 	}
3201 	cur = CUR_CHAR(l);
3202     }
3203     if (ctxt->instate == XML_PARSER_EOF)
3204         return;
3205     if (nbchar != 0) {
3206         buf[nbchar] = 0;
3207 
3208 	/*
3209 	 * Ok the segment is to be consumed as chars.
3210 	 */
3211 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3212 	    if (areBlanks(ctxt, buf, nbchar)) {
3213 		if (ctxt->keepBlanks) {
3214 		    if (ctxt->sax->characters != NULL)
3215 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3216 		} else {
3217 		    if (ctxt->sax->ignorableWhitespace != NULL)
3218 			ctxt->sax->ignorableWhitespace(ctxt->userData,
3219 			                               buf, nbchar);
3220 		}
3221 	    } else {
3222 		htmlCheckParagraph(ctxt);
3223 		if (ctxt->sax->characters != NULL)
3224 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3225 	    }
3226 	}
3227     }
3228 }
3229 
3230 /**
3231  * htmlParseCharData:
3232  * @ctxt:  an HTML parser context
3233  *
3234  * parse a CharData section.
3235  * if we are within a CDATA section ']]>' marks an end of section.
3236  *
3237  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3238  */
3239 
3240 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3241 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3242     htmlParseCharDataInternal(ctxt, 0);
3243 }
3244 
3245 /**
3246  * htmlParseExternalID:
3247  * @ctxt:  an HTML parser context
3248  * @publicID:  a xmlChar** receiving PubidLiteral
3249  *
3250  * Parse an External ID or a Public ID
3251  *
3252  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3253  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3254  *
3255  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3256  *
3257  * Returns the function returns SystemLiteral and in the second
3258  *                case publicID receives PubidLiteral, is strict is off
3259  *                it is possible to return NULL and have publicID set.
3260  */
3261 
3262 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3263 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3264     xmlChar *URI = NULL;
3265 
3266     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3267          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3268 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3269         SKIP(6);
3270 	if (!IS_BLANK_CH(CUR)) {
3271 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3272 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3273 	}
3274         SKIP_BLANKS;
3275 	URI = htmlParseSystemLiteral(ctxt);
3276 	if (URI == NULL) {
3277 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3278 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3279         }
3280     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3281 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3282 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3283         SKIP(6);
3284 	if (!IS_BLANK_CH(CUR)) {
3285 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3286 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3287 	}
3288         SKIP_BLANKS;
3289 	*publicID = htmlParsePubidLiteral(ctxt);
3290 	if (*publicID == NULL) {
3291 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3292 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3293 			 NULL, NULL);
3294 	}
3295         SKIP_BLANKS;
3296         if ((CUR == '"') || (CUR == '\'')) {
3297 	    URI = htmlParseSystemLiteral(ctxt);
3298 	}
3299     }
3300     return(URI);
3301 }
3302 
3303 /**
3304  * xmlParsePI:
3305  * @ctxt:  an XML parser context
3306  *
3307  * parse an XML Processing Instruction.
3308  *
3309  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3310  */
3311 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3312 htmlParsePI(htmlParserCtxtPtr ctxt) {
3313     xmlChar *buf = NULL;
3314     int len = 0;
3315     int size = HTML_PARSER_BUFFER_SIZE;
3316     int cur, l;
3317     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3318                     XML_MAX_HUGE_LENGTH :
3319                     XML_MAX_TEXT_LENGTH;
3320     const xmlChar *target;
3321     xmlParserInputState state;
3322 
3323     if ((RAW == '<') && (NXT(1) == '?')) {
3324 	state = ctxt->instate;
3325         ctxt->instate = XML_PARSER_PI;
3326 	/*
3327 	 * this is a Processing Instruction.
3328 	 */
3329 	SKIP(2);
3330 
3331 	/*
3332 	 * Parse the target name and check for special support like
3333 	 * namespace.
3334 	 */
3335         target = htmlParseName(ctxt);
3336 	if (target != NULL) {
3337 	    if (RAW == '>') {
3338 		SKIP(1);
3339 
3340 		/*
3341 		 * SAX: PI detected.
3342 		 */
3343 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3344 		    (ctxt->sax->processingInstruction != NULL))
3345 		    ctxt->sax->processingInstruction(ctxt->userData,
3346 		                                     target, NULL);
3347 		ctxt->instate = state;
3348 		return;
3349 	    }
3350 	    buf = (xmlChar *) xmlMallocAtomic(size);
3351 	    if (buf == NULL) {
3352 		htmlErrMemory(ctxt, NULL);
3353 		ctxt->instate = state;
3354 		return;
3355 	    }
3356 	    cur = CUR;
3357 	    if (!IS_BLANK(cur)) {
3358 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3359 			  "ParsePI: PI %s space expected\n", target, NULL);
3360 	    }
3361             SKIP_BLANKS;
3362 	    cur = CUR_CHAR(l);
3363 	    while ((cur != 0) && (cur != '>')) {
3364 		if (len + 5 >= size) {
3365 		    xmlChar *tmp;
3366 
3367 		    size *= 2;
3368 		    tmp = (xmlChar *) xmlRealloc(buf, size);
3369 		    if (tmp == NULL) {
3370 			htmlErrMemory(ctxt, NULL);
3371 			xmlFree(buf);
3372 			ctxt->instate = state;
3373 			return;
3374 		    }
3375 		    buf = tmp;
3376 		}
3377                 if (IS_CHAR(cur)) {
3378 		    COPY_BUF(l,buf,len,cur);
3379                 } else {
3380                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3381                                     "Invalid char in processing instruction "
3382                                     "0x%X\n", cur);
3383                 }
3384                 if (len > maxLength) {
3385                     htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3386                                  "PI %s too long", target, NULL);
3387                     xmlFree(buf);
3388                     ctxt->instate = state;
3389                     return;
3390                 }
3391 		NEXTL(l);
3392 		cur = CUR_CHAR(l);
3393 	    }
3394 	    buf[len] = 0;
3395             if (ctxt->instate == XML_PARSER_EOF) {
3396                 xmlFree(buf);
3397                 return;
3398             }
3399 	    if (cur != '>') {
3400 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3401 		      "ParsePI: PI %s never end ...\n", target, NULL);
3402 	    } else {
3403 		SKIP(1);
3404 
3405 		/*
3406 		 * SAX: PI detected.
3407 		 */
3408 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3409 		    (ctxt->sax->processingInstruction != NULL))
3410 		    ctxt->sax->processingInstruction(ctxt->userData,
3411 		                                     target, buf);
3412 	    }
3413 	    xmlFree(buf);
3414 	} else {
3415 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3416                          "PI is not started correctly", NULL, NULL);
3417 	}
3418 	ctxt->instate = state;
3419     }
3420 }
3421 
3422 /**
3423  * htmlParseComment:
3424  * @ctxt:  an HTML parser context
3425  *
3426  * Parse an XML (SGML) comment <!-- .... -->
3427  *
3428  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3429  */
3430 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3431 htmlParseComment(htmlParserCtxtPtr ctxt) {
3432     xmlChar *buf = NULL;
3433     int len;
3434     int size = HTML_PARSER_BUFFER_SIZE;
3435     int q, ql;
3436     int r, rl;
3437     int cur, l;
3438     int next, nl;
3439     int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3440                     XML_MAX_HUGE_LENGTH :
3441                     XML_MAX_TEXT_LENGTH;
3442     xmlParserInputState state;
3443 
3444     /*
3445      * Check that there is a comment right here.
3446      */
3447     if ((RAW != '<') || (NXT(1) != '!') ||
3448         (NXT(2) != '-') || (NXT(3) != '-')) return;
3449 
3450     state = ctxt->instate;
3451     ctxt->instate = XML_PARSER_COMMENT;
3452     SKIP(4);
3453     buf = (xmlChar *) xmlMallocAtomic(size);
3454     if (buf == NULL) {
3455         htmlErrMemory(ctxt, "buffer allocation failed\n");
3456 	ctxt->instate = state;
3457 	return;
3458     }
3459     len = 0;
3460     buf[len] = 0;
3461     q = CUR_CHAR(ql);
3462     if (q == 0)
3463         goto unfinished;
3464     if (q == '>') {
3465         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3466         cur = '>';
3467         goto finished;
3468     }
3469     NEXTL(ql);
3470     r = CUR_CHAR(rl);
3471     if (r == 0)
3472         goto unfinished;
3473     if (q == '-' && r == '>') {
3474         htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3475         cur = '>';
3476         goto finished;
3477     }
3478     NEXTL(rl);
3479     cur = CUR_CHAR(l);
3480     while ((cur != 0) &&
3481            ((cur != '>') ||
3482 	    (r != '-') || (q != '-'))) {
3483 	NEXTL(l);
3484 	next = CUR_CHAR(nl);
3485 
3486 	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3487 	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3488 		       "Comment incorrectly closed by '--!>'", NULL, NULL);
3489 	  cur = '>';
3490 	  break;
3491 	}
3492 
3493 	if (len + 5 >= size) {
3494 	    xmlChar *tmp;
3495 
3496 	    size *= 2;
3497 	    tmp = (xmlChar *) xmlRealloc(buf, size);
3498 	    if (tmp == NULL) {
3499 	        xmlFree(buf);
3500 	        htmlErrMemory(ctxt, "growing buffer failed\n");
3501 		ctxt->instate = state;
3502 		return;
3503 	    }
3504 	    buf = tmp;
3505 	}
3506         if (IS_CHAR(q)) {
3507 	    COPY_BUF(ql,buf,len,q);
3508         } else {
3509             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3510                             "Invalid char in comment 0x%X\n", q);
3511         }
3512         if (len > maxLength) {
3513             htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3514                          "comment too long", NULL, NULL);
3515             xmlFree(buf);
3516             ctxt->instate = state;
3517             return;
3518         }
3519 
3520 	q = r;
3521 	ql = rl;
3522 	r = cur;
3523 	rl = l;
3524 	cur = next;
3525 	l = nl;
3526     }
3527 finished:
3528     buf[len] = 0;
3529     if (ctxt->instate == XML_PARSER_EOF) {
3530         xmlFree(buf);
3531         return;
3532     }
3533     if (cur == '>') {
3534         NEXT;
3535 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3536 	    (!ctxt->disableSAX))
3537 	    ctxt->sax->comment(ctxt->userData, buf);
3538 	xmlFree(buf);
3539 	ctxt->instate = state;
3540 	return;
3541     }
3542 
3543 unfinished:
3544     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3545 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3546     xmlFree(buf);
3547 }
3548 
3549 /**
3550  * htmlParseCharRef:
3551  * @ctxt:  an HTML parser context
3552  *
3553  * DEPRECATED: Internal function, don't use.
3554  *
3555  * parse Reference declarations
3556  *
3557  * [66] CharRef ::= '&#' [0-9]+ ';' |
3558  *                  '&#x' [0-9a-fA-F]+ ';'
3559  *
3560  * Returns the value parsed (as an int)
3561  */
3562 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3563 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3564     int val = 0;
3565 
3566     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3567 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3568 		     "htmlParseCharRef: context error\n",
3569 		     NULL, NULL);
3570         return(0);
3571     }
3572     if ((CUR == '&') && (NXT(1) == '#') &&
3573         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3574 	SKIP(3);
3575 	while (CUR != ';') {
3576 	    if ((CUR >= '0') && (CUR <= '9')) {
3577                 if (val < 0x110000)
3578 	            val = val * 16 + (CUR - '0');
3579             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3580                 if (val < 0x110000)
3581 	            val = val * 16 + (CUR - 'a') + 10;
3582             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3583                 if (val < 0x110000)
3584 	            val = val * 16 + (CUR - 'A') + 10;
3585             } else {
3586 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3587 		             "htmlParseCharRef: missing semicolon\n",
3588 			     NULL, NULL);
3589 		break;
3590 	    }
3591 	    NEXT;
3592 	}
3593 	if (CUR == ';')
3594 	    NEXT;
3595     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3596 	SKIP(2);
3597 	while (CUR != ';') {
3598 	    if ((CUR >= '0') && (CUR <= '9')) {
3599                 if (val < 0x110000)
3600 	            val = val * 10 + (CUR - '0');
3601             } else {
3602 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3603 		             "htmlParseCharRef: missing semicolon\n",
3604 			     NULL, NULL);
3605 		break;
3606 	    }
3607 	    NEXT;
3608 	}
3609 	if (CUR == ';')
3610 	    NEXT;
3611     } else {
3612 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3613 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3614     }
3615     /*
3616      * Check the value IS_CHAR ...
3617      */
3618     if (IS_CHAR(val)) {
3619         return(val);
3620     } else if (val >= 0x110000) {
3621 	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3622 		     "htmlParseCharRef: value too large\n", NULL, NULL);
3623     } else {
3624 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3625 			"htmlParseCharRef: invalid xmlChar value %d\n",
3626 			val);
3627     }
3628     return(0);
3629 }
3630 
3631 
3632 /**
3633  * htmlParseDocTypeDecl:
3634  * @ctxt:  an HTML parser context
3635  *
3636  * parse a DOCTYPE declaration
3637  *
3638  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3639  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3640  */
3641 
3642 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3643 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3644     const xmlChar *name;
3645     xmlChar *ExternalID = NULL;
3646     xmlChar *URI = NULL;
3647 
3648     /*
3649      * We know that '<!DOCTYPE' has been detected.
3650      */
3651     SKIP(9);
3652 
3653     SKIP_BLANKS;
3654 
3655     /*
3656      * Parse the DOCTYPE name.
3657      */
3658     name = htmlParseName(ctxt);
3659     if (name == NULL) {
3660 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3661 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3662 		     NULL, NULL);
3663     }
3664     /*
3665      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3666      */
3667 
3668     SKIP_BLANKS;
3669 
3670     /*
3671      * Check for SystemID and ExternalID
3672      */
3673     URI = htmlParseExternalID(ctxt, &ExternalID);
3674     SKIP_BLANKS;
3675 
3676     /*
3677      * We should be at the end of the DOCTYPE declaration.
3678      */
3679     if (CUR != '>') {
3680 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3681 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3682         /* Ignore bogus content */
3683         while ((CUR != 0) && (CUR != '>') &&
3684                (ctxt->instate != XML_PARSER_EOF))
3685             NEXT;
3686     }
3687     if (CUR == '>')
3688         NEXT;
3689 
3690     /*
3691      * Create or update the document accordingly to the DOCTYPE
3692      */
3693     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3694 	(!ctxt->disableSAX))
3695 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3696 
3697     /*
3698      * Cleanup, since we don't use all those identifiers
3699      */
3700     if (URI != NULL) xmlFree(URI);
3701     if (ExternalID != NULL) xmlFree(ExternalID);
3702 }
3703 
3704 /**
3705  * htmlParseAttribute:
3706  * @ctxt:  an HTML parser context
3707  * @value:  a xmlChar ** used to store the value of the attribute
3708  *
3709  * parse an attribute
3710  *
3711  * [41] Attribute ::= Name Eq AttValue
3712  *
3713  * [25] Eq ::= S? '=' S?
3714  *
3715  * With namespace:
3716  *
3717  * [NS 11] Attribute ::= QName Eq AttValue
3718  *
3719  * Also the case QName == xmlns:??? is handled independently as a namespace
3720  * definition.
3721  *
3722  * Returns the attribute name, and the value in *value.
3723  */
3724 
3725 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3726 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3727     const xmlChar *name;
3728     xmlChar *val = NULL;
3729 
3730     *value = NULL;
3731     name = htmlParseHTMLName(ctxt);
3732     if (name == NULL) {
3733 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3734 	             "error parsing attribute name\n", NULL, NULL);
3735         return(NULL);
3736     }
3737 
3738     /*
3739      * read the value
3740      */
3741     SKIP_BLANKS;
3742     if (CUR == '=') {
3743         NEXT;
3744 	SKIP_BLANKS;
3745 	val = htmlParseAttValue(ctxt);
3746     }
3747 
3748     *value = val;
3749     return(name);
3750 }
3751 
3752 /**
3753  * htmlCheckEncoding:
3754  * @ctxt:  an HTML parser context
3755  * @attvalue: the attribute value
3756  *
3757  * Checks an http-equiv attribute from a Meta tag to detect
3758  * the encoding
3759  * If a new encoding is detected the parser is switched to decode
3760  * it and pass UTF8
3761  */
3762 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3763 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3764     const xmlChar *encoding;
3765 
3766     if (!attvalue)
3767 	return;
3768 
3769     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3770     if (encoding != NULL) {
3771 	encoding += 7;
3772     }
3773     /*
3774      * skip blank
3775      */
3776     if (encoding && IS_BLANK_CH(*encoding))
3777 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3778     if (encoding && *encoding == '=') {
3779 	encoding ++;
3780 	xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding));
3781     }
3782 }
3783 
3784 /**
3785  * htmlCheckMeta:
3786  * @ctxt:  an HTML parser context
3787  * @atts:  the attributes values
3788  *
3789  * Checks an attributes from a Meta tag
3790  */
3791 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3792 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3793     int i;
3794     const xmlChar *att, *value;
3795     int http = 0;
3796     const xmlChar *content = NULL;
3797 
3798     if ((ctxt == NULL) || (atts == NULL))
3799 	return;
3800 
3801     i = 0;
3802     att = atts[i++];
3803     while (att != NULL) {
3804 	value = atts[i++];
3805 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3806 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3807 	    http = 1;
3808 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3809 	    xmlSetDeclaredEncoding(ctxt, xmlStrdup(value));
3810 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3811 	    content = value;
3812 	att = atts[i++];
3813     }
3814     if ((http) && (content != NULL))
3815 	htmlCheckEncoding(ctxt, content);
3816 
3817 }
3818 
3819 /**
3820  * htmlParseStartTag:
3821  * @ctxt:  an HTML parser context
3822  *
3823  * parse a start of tag either for rule element or
3824  * EmptyElement. In both case we don't parse the tag closing chars.
3825  *
3826  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3827  *
3828  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3829  *
3830  * With namespace:
3831  *
3832  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3833  *
3834  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3835  *
3836  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3837  */
3838 
3839 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3840 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3841     const xmlChar *name;
3842     const xmlChar *attname;
3843     xmlChar *attvalue;
3844     const xmlChar **atts;
3845     int nbatts = 0;
3846     int maxatts;
3847     int meta = 0;
3848     int i;
3849     int discardtag = 0;
3850 
3851     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3852 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3853 		     "htmlParseStartTag: context error\n", NULL, NULL);
3854 	return -1;
3855     }
3856     if (ctxt->instate == XML_PARSER_EOF)
3857         return(-1);
3858     if (CUR != '<') return -1;
3859     NEXT;
3860 
3861     atts = ctxt->atts;
3862     maxatts = ctxt->maxatts;
3863 
3864     GROW;
3865     name = htmlParseHTMLName(ctxt);
3866     if (name == NULL) {
3867 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3868 	             "htmlParseStartTag: invalid element name\n",
3869 		     NULL, NULL);
3870 	/* Dump the bogus tag like browsers do */
3871 	while ((CUR != 0) && (CUR != '>') &&
3872                (ctxt->instate != XML_PARSER_EOF))
3873 	    NEXT;
3874         return -1;
3875     }
3876     if (xmlStrEqual(name, BAD_CAST"meta"))
3877 	meta = 1;
3878 
3879     /*
3880      * Check for auto-closure of HTML elements.
3881      */
3882     htmlAutoClose(ctxt, name);
3883 
3884     /*
3885      * Check for implied HTML elements.
3886      */
3887     htmlCheckImplied(ctxt, name);
3888 
3889     /*
3890      * Avoid html at any level > 0, head at any level != 1
3891      * or any attempt to recurse body
3892      */
3893     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3894 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3895 	             "htmlParseStartTag: misplaced <html> tag\n",
3896 		     name, NULL);
3897 	discardtag = 1;
3898 	ctxt->depth++;
3899     }
3900     if ((ctxt->nameNr != 1) &&
3901 	(xmlStrEqual(name, BAD_CAST"head"))) {
3902 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3903 	             "htmlParseStartTag: misplaced <head> tag\n",
3904 		     name, NULL);
3905 	discardtag = 1;
3906 	ctxt->depth++;
3907     }
3908     if (xmlStrEqual(name, BAD_CAST"body")) {
3909 	int indx;
3910 	for (indx = 0;indx < ctxt->nameNr;indx++) {
3911 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3912 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3913 		             "htmlParseStartTag: misplaced <body> tag\n",
3914 			     name, NULL);
3915 		discardtag = 1;
3916 		ctxt->depth++;
3917 	    }
3918 	}
3919     }
3920 
3921     /*
3922      * Now parse the attributes, it ends up with the ending
3923      *
3924      * (S Attribute)* S?
3925      */
3926     SKIP_BLANKS;
3927     while ((CUR != 0) &&
3928            (CUR != '>') &&
3929 	   ((CUR != '/') || (NXT(1) != '>')) &&
3930            (ctxt->instate != XML_PARSER_EOF)) {
3931 	GROW;
3932 	attname = htmlParseAttribute(ctxt, &attvalue);
3933         if (attname != NULL) {
3934 
3935 	    /*
3936 	     * Well formedness requires at most one declaration of an attribute
3937 	     */
3938 	    for (i = 0; i < nbatts;i += 2) {
3939 	        if (xmlStrEqual(atts[i], attname)) {
3940 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3941 		                 "Attribute %s redefined\n", attname, NULL);
3942 		    if (attvalue != NULL)
3943 			xmlFree(attvalue);
3944 		    goto failed;
3945 		}
3946 	    }
3947 
3948 	    /*
3949 	     * Add the pair to atts
3950 	     */
3951 	    if (atts == NULL) {
3952 	        maxatts = 22; /* allow for 10 attrs by default */
3953 	        atts = (const xmlChar **)
3954 		       xmlMalloc(maxatts * sizeof(xmlChar *));
3955 		if (atts == NULL) {
3956 		    htmlErrMemory(ctxt, NULL);
3957 		    if (attvalue != NULL)
3958 			xmlFree(attvalue);
3959 		    goto failed;
3960 		}
3961 		ctxt->atts = atts;
3962 		ctxt->maxatts = maxatts;
3963 	    } else if (nbatts + 4 > maxatts) {
3964 	        const xmlChar **n;
3965 
3966 	        maxatts *= 2;
3967 	        n = (const xmlChar **) xmlRealloc((void *) atts,
3968 					     maxatts * sizeof(const xmlChar *));
3969 		if (n == NULL) {
3970 		    htmlErrMemory(ctxt, NULL);
3971 		    if (attvalue != NULL)
3972 			xmlFree(attvalue);
3973 		    goto failed;
3974 		}
3975 		atts = n;
3976 		ctxt->atts = atts;
3977 		ctxt->maxatts = maxatts;
3978 	    }
3979 	    atts[nbatts++] = attname;
3980 	    atts[nbatts++] = attvalue;
3981 	    atts[nbatts] = NULL;
3982 	    atts[nbatts + 1] = NULL;
3983 	}
3984 	else {
3985 	    if (attvalue != NULL)
3986 	        xmlFree(attvalue);
3987 	    /* Dump the bogus attribute string up to the next blank or
3988 	     * the end of the tag. */
3989 	    while ((CUR != 0) &&
3990 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3991 		   ((CUR != '/') || (NXT(1) != '>')) &&
3992                    (ctxt->instate != XML_PARSER_EOF))
3993 		NEXT;
3994 	}
3995 
3996 failed:
3997 	SKIP_BLANKS;
3998     }
3999 
4000     /*
4001      * Handle specific association to the META tag
4002      */
4003     if (meta && (nbatts != 0))
4004 	htmlCheckMeta(ctxt, atts);
4005 
4006     /*
4007      * SAX: Start of Element !
4008      */
4009     if (!discardtag) {
4010 	htmlnamePush(ctxt, name);
4011 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4012 	    if (nbatts != 0)
4013 		ctxt->sax->startElement(ctxt->userData, name, atts);
4014 	    else
4015 		ctxt->sax->startElement(ctxt->userData, name, NULL);
4016 	}
4017     }
4018 
4019     if (atts != NULL) {
4020         for (i = 1;i < nbatts;i += 2) {
4021 	    if (atts[i] != NULL)
4022 		xmlFree((xmlChar *) atts[i]);
4023 	}
4024     }
4025 
4026     return(discardtag);
4027 }
4028 
4029 /**
4030  * htmlParseEndTag:
4031  * @ctxt:  an HTML parser context
4032  *
4033  * parse an end of tag
4034  *
4035  * [42] ETag ::= '</' Name S? '>'
4036  *
4037  * With namespace
4038  *
4039  * [NS 9] ETag ::= '</' QName S? '>'
4040  *
4041  * Returns 1 if the current level should be closed.
4042  */
4043 
4044 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)4045 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4046 {
4047     const xmlChar *name;
4048     const xmlChar *oldname;
4049     int i, ret;
4050 
4051     if ((CUR != '<') || (NXT(1) != '/')) {
4052         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4053 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
4054         return (0);
4055     }
4056     SKIP(2);
4057 
4058     name = htmlParseHTMLName(ctxt);
4059     if (name == NULL)
4060         return (0);
4061     /*
4062      * We should definitely be at the ending "S? '>'" part
4063      */
4064     SKIP_BLANKS;
4065     if (CUR != '>') {
4066         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4067 	             "End tag : expected '>'\n", NULL, NULL);
4068         /* Skip to next '>' */
4069         while ((CUR != 0) && (CUR != '>'))
4070             NEXT;
4071     }
4072     if (CUR == '>')
4073         NEXT;
4074 
4075     /*
4076      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4077      * out now.
4078      */
4079     if ((ctxt->depth > 0) &&
4080         (xmlStrEqual(name, BAD_CAST "html") ||
4081          xmlStrEqual(name, BAD_CAST "body") ||
4082 	 xmlStrEqual(name, BAD_CAST "head"))) {
4083 	ctxt->depth--;
4084 	return (0);
4085     }
4086 
4087     /*
4088      * If the name read is not one of the element in the parsing stack
4089      * then return, it's just an error.
4090      */
4091     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4092         if (xmlStrEqual(name, ctxt->nameTab[i]))
4093             break;
4094     }
4095     if (i < 0) {
4096         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4097 	             "Unexpected end tag : %s\n", name, NULL);
4098         return (0);
4099     }
4100 
4101 
4102     /*
4103      * Check for auto-closure of HTML elements.
4104      */
4105 
4106     htmlAutoCloseOnClose(ctxt, name);
4107 
4108     /*
4109      * Well formedness constraints, opening and closing must match.
4110      * With the exception that the autoclose may have popped stuff out
4111      * of the stack.
4112      */
4113     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4114         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4115                      "Opening and ending tag mismatch: %s and %s\n",
4116                      name, ctxt->name);
4117     }
4118 
4119     /*
4120      * SAX: End of Tag
4121      */
4122     oldname = ctxt->name;
4123     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4124         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4125             ctxt->sax->endElement(ctxt->userData, name);
4126 	htmlNodeInfoPop(ctxt);
4127         htmlnamePop(ctxt);
4128         ret = 1;
4129     } else {
4130         ret = 0;
4131     }
4132 
4133     return (ret);
4134 }
4135 
4136 
4137 /**
4138  * htmlParseReference:
4139  * @ctxt:  an HTML parser context
4140  *
4141  * parse and handle entity references in content,
4142  * this will end-up in a call to character() since this is either a
4143  * CharRef, or a predefined entity.
4144  */
4145 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4146 htmlParseReference(htmlParserCtxtPtr ctxt) {
4147     const htmlEntityDesc * ent;
4148     xmlChar out[6];
4149     const xmlChar *name;
4150     if (CUR != '&') return;
4151 
4152     if (NXT(1) == '#') {
4153 	unsigned int c;
4154 	int bits, i = 0;
4155 
4156 	c = htmlParseCharRef(ctxt);
4157 	if (c == 0)
4158 	    return;
4159 
4160         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4161         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4162         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4163         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4164 
4165         for ( ; bits >= 0; bits-= 6) {
4166             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4167         }
4168 	out[i] = 0;
4169 
4170 	htmlCheckParagraph(ctxt);
4171 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4172 	    ctxt->sax->characters(ctxt->userData, out, i);
4173     } else {
4174 	ent = htmlParseEntityRef(ctxt, &name);
4175 	if (name == NULL) {
4176 	    htmlCheckParagraph(ctxt);
4177 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4178 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4179 	    return;
4180 	}
4181 	if ((ent == NULL) || !(ent->value > 0)) {
4182 	    htmlCheckParagraph(ctxt);
4183 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4184 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4185 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4186 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4187 	    }
4188 	} else {
4189 	    unsigned int c;
4190 	    int bits, i = 0;
4191 
4192 	    c = ent->value;
4193 	    if      (c <    0x80)
4194 	            { out[i++]= c;                bits= -6; }
4195 	    else if (c <   0x800)
4196 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4197 	    else if (c < 0x10000)
4198 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4199 	    else
4200 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4201 
4202 	    for ( ; bits >= 0; bits-= 6) {
4203 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4204 	    }
4205 	    out[i] = 0;
4206 
4207 	    htmlCheckParagraph(ctxt);
4208 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4209 		ctxt->sax->characters(ctxt->userData, out, i);
4210 	}
4211     }
4212 }
4213 
4214 /**
4215  * htmlParseContent:
4216  * @ctxt:  an HTML parser context
4217  *
4218  * Parse a content: comment, sub-element, reference or text.
4219  * Kept for compatibility with old code
4220  */
4221 
4222 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4223 htmlParseContent(htmlParserCtxtPtr ctxt) {
4224     xmlChar *currentNode;
4225     int depth;
4226     const xmlChar *name;
4227 
4228     currentNode = xmlStrdup(ctxt->name);
4229     depth = ctxt->nameNr;
4230     while (1) {
4231         GROW;
4232 
4233         if (ctxt->instate == XML_PARSER_EOF)
4234             break;
4235 
4236 	/*
4237 	 * Our tag or one of it's parent or children is ending.
4238 	 */
4239         if ((CUR == '<') && (NXT(1) == '/')) {
4240 	    if (htmlParseEndTag(ctxt) &&
4241 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4242 		if (currentNode != NULL)
4243 		    xmlFree(currentNode);
4244 		return;
4245 	    }
4246 	    continue; /* while */
4247         }
4248 
4249 	else if ((CUR == '<') &&
4250 	         ((IS_ASCII_LETTER(NXT(1))) ||
4251 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4252 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4253 	    if (name == NULL) {
4254 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4255 			 "htmlParseStartTag: invalid element name\n",
4256 			 NULL, NULL);
4257 	        /* Dump the bogus tag like browsers do */
4258                 while ((CUR != 0) && (CUR != '>'))
4259 	            NEXT;
4260 
4261 	        if (currentNode != NULL)
4262 	            xmlFree(currentNode);
4263 	        return;
4264 	    }
4265 
4266 	    if (ctxt->name != NULL) {
4267 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4268 	            htmlAutoClose(ctxt, name);
4269 	            continue;
4270 	        }
4271 	    }
4272 	}
4273 
4274 	/*
4275 	 * Has this node been popped out during parsing of
4276 	 * the next element
4277 	 */
4278         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4279 	    (!xmlStrEqual(currentNode, ctxt->name)))
4280 	     {
4281 	    if (currentNode != NULL) xmlFree(currentNode);
4282 	    return;
4283 	}
4284 
4285 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4286 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4287 	    /*
4288 	     * Handle SCRIPT/STYLE separately
4289 	     */
4290 	    htmlParseScript(ctxt);
4291 	}
4292 
4293         else if ((CUR == '<') && (NXT(1) == '!')) {
4294             /*
4295              * Sometimes DOCTYPE arrives in the middle of the document
4296              */
4297             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4298                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4299                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4300                 (UPP(8) == 'E')) {
4301                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4302                              "Misplaced DOCTYPE declaration\n",
4303                              BAD_CAST "DOCTYPE" , NULL);
4304                 htmlParseDocTypeDecl(ctxt);
4305             }
4306             /*
4307              * First case :  a comment
4308              */
4309             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4310                 htmlParseComment(ctxt);
4311             }
4312             else {
4313                 htmlSkipBogusComment(ctxt);
4314             }
4315         }
4316 
4317         /*
4318          * Second case : a Processing Instruction.
4319          */
4320         else if ((CUR == '<') && (NXT(1) == '?')) {
4321             htmlParsePI(ctxt);
4322         }
4323 
4324         /*
4325          * Third case :  a sub-element.
4326          */
4327         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4328             htmlParseElement(ctxt);
4329         }
4330         else if (CUR == '<') {
4331             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4332                 (ctxt->sax->characters != NULL))
4333                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4334             NEXT;
4335         }
4336 
4337         /*
4338          * Fourth case : a reference. If if has not been resolved,
4339          *    parsing returns it's Name, create the node
4340          */
4341         else if (CUR == '&') {
4342             htmlParseReference(ctxt);
4343         }
4344 
4345         /*
4346          * Fifth case : end of the resource
4347          */
4348         else if (CUR == 0) {
4349             htmlAutoCloseOnEnd(ctxt);
4350             break;
4351         }
4352 
4353         /*
4354          * Last case, text. Note that References are handled directly.
4355          */
4356         else {
4357             htmlParseCharData(ctxt);
4358         }
4359 
4360         SHRINK;
4361         GROW;
4362     }
4363     if (currentNode != NULL) xmlFree(currentNode);
4364 }
4365 
4366 /**
4367  * htmlParseElement:
4368  * @ctxt:  an HTML parser context
4369  *
4370  * DEPRECATED: Internal function, don't use.
4371  *
4372  * parse an HTML element, this is highly recursive
4373  * this is kept for compatibility with previous code versions
4374  *
4375  * [39] element ::= EmptyElemTag | STag content ETag
4376  *
4377  * [41] Attribute ::= Name Eq AttValue
4378  */
4379 
4380 void
htmlParseElement(htmlParserCtxtPtr ctxt)4381 htmlParseElement(htmlParserCtxtPtr ctxt) {
4382     const xmlChar *name;
4383     xmlChar *currentNode = NULL;
4384     const htmlElemDesc * info;
4385     htmlParserNodeInfo node_info;
4386     int failed;
4387     int depth;
4388     const xmlChar *oldptr;
4389 
4390     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4391 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4392 		     "htmlParseElement: context error\n", NULL, NULL);
4393 	return;
4394     }
4395 
4396     if (ctxt->instate == XML_PARSER_EOF)
4397         return;
4398 
4399     /* Capture start position */
4400     if (ctxt->record_info) {
4401         node_info.begin_pos = ctxt->input->consumed +
4402                           (CUR_PTR - ctxt->input->base);
4403 	node_info.begin_line = ctxt->input->line;
4404     }
4405 
4406     failed = htmlParseStartTag(ctxt);
4407     name = ctxt->name;
4408     if ((failed == -1) || (name == NULL)) {
4409 	if (CUR == '>')
4410 	    NEXT;
4411         return;
4412     }
4413 
4414     /*
4415      * Lookup the info for that element.
4416      */
4417     info = htmlTagLookup(name);
4418     if (info == NULL) {
4419 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4420 	             "Tag %s invalid\n", name, NULL);
4421     }
4422 
4423     /*
4424      * Check for an Empty Element labeled the XML/SGML way
4425      */
4426     if ((CUR == '/') && (NXT(1) == '>')) {
4427         SKIP(2);
4428 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4429 	    ctxt->sax->endElement(ctxt->userData, name);
4430 	htmlnamePop(ctxt);
4431 	return;
4432     }
4433 
4434     if (CUR == '>') {
4435         NEXT;
4436     } else {
4437 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4438 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4439 
4440 	/*
4441 	 * end of parsing of this node.
4442 	 */
4443 	if (xmlStrEqual(name, ctxt->name)) {
4444 	    nodePop(ctxt);
4445 	    htmlnamePop(ctxt);
4446 	}
4447 
4448 	/*
4449 	 * Capture end position and add node
4450 	 */
4451 	if (ctxt->record_info) {
4452 	   node_info.end_pos = ctxt->input->consumed +
4453 			      (CUR_PTR - ctxt->input->base);
4454 	   node_info.end_line = ctxt->input->line;
4455 	   node_info.node = ctxt->node;
4456 	   xmlParserAddNodeInfo(ctxt, &node_info);
4457 	}
4458 	return;
4459     }
4460 
4461     /*
4462      * Check for an Empty Element from DTD definition
4463      */
4464     if ((info != NULL) && (info->empty)) {
4465 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4466 	    ctxt->sax->endElement(ctxt->userData, name);
4467 	htmlnamePop(ctxt);
4468 	return;
4469     }
4470 
4471     /*
4472      * Parse the content of the element:
4473      */
4474     currentNode = xmlStrdup(ctxt->name);
4475     depth = ctxt->nameNr;
4476     while (CUR != 0) {
4477 	oldptr = ctxt->input->cur;
4478 	htmlParseContent(ctxt);
4479 	if (oldptr==ctxt->input->cur) break;
4480 	if (ctxt->nameNr < depth) break;
4481     }
4482 
4483     /*
4484      * Capture end position and add node
4485      */
4486     if ( currentNode != NULL && ctxt->record_info ) {
4487        node_info.end_pos = ctxt->input->consumed +
4488                           (CUR_PTR - ctxt->input->base);
4489        node_info.end_line = ctxt->input->line;
4490        node_info.node = ctxt->node;
4491        xmlParserAddNodeInfo(ctxt, &node_info);
4492     }
4493     if (CUR == 0) {
4494 	htmlAutoCloseOnEnd(ctxt);
4495     }
4496 
4497     if (currentNode != NULL)
4498 	xmlFree(currentNode);
4499 }
4500 
4501 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4502 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4503     /*
4504      * Capture end position and add node
4505      */
4506     if ( ctxt->node != NULL && ctxt->record_info ) {
4507        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4508                                 (CUR_PTR - ctxt->input->base);
4509        ctxt->nodeInfo->end_line = ctxt->input->line;
4510        ctxt->nodeInfo->node = ctxt->node;
4511        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4512        htmlNodeInfoPop(ctxt);
4513     }
4514     if (CUR == 0) {
4515        htmlAutoCloseOnEnd(ctxt);
4516     }
4517 }
4518 
4519 /**
4520  * htmlParseElementInternal:
4521  * @ctxt:  an HTML parser context
4522  *
4523  * parse an HTML element, new version, non recursive
4524  *
4525  * [39] element ::= EmptyElemTag | STag content ETag
4526  *
4527  * [41] Attribute ::= Name Eq AttValue
4528  */
4529 
4530 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4531 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4532     const xmlChar *name;
4533     const htmlElemDesc * info;
4534     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4535     int failed;
4536 
4537     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4538 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4539 		     "htmlParseElementInternal: context error\n", NULL, NULL);
4540 	return;
4541     }
4542 
4543     if (ctxt->instate == XML_PARSER_EOF)
4544         return;
4545 
4546     /* Capture start position */
4547     if (ctxt->record_info) {
4548         node_info.begin_pos = ctxt->input->consumed +
4549                           (CUR_PTR - ctxt->input->base);
4550 	node_info.begin_line = ctxt->input->line;
4551     }
4552 
4553     failed = htmlParseStartTag(ctxt);
4554     name = ctxt->name;
4555     if ((failed == -1) || (name == NULL)) {
4556 	if (CUR == '>')
4557 	    NEXT;
4558         return;
4559     }
4560 
4561     /*
4562      * Lookup the info for that element.
4563      */
4564     info = htmlTagLookup(name);
4565     if (info == NULL) {
4566 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4567 	             "Tag %s invalid\n", name, NULL);
4568     }
4569 
4570     /*
4571      * Check for an Empty Element labeled the XML/SGML way
4572      */
4573     if ((CUR == '/') && (NXT(1) == '>')) {
4574         SKIP(2);
4575 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4576 	    ctxt->sax->endElement(ctxt->userData, name);
4577 	htmlnamePop(ctxt);
4578 	return;
4579     }
4580 
4581     if (CUR == '>') {
4582         NEXT;
4583     } else {
4584 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4585 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4586 
4587 	/*
4588 	 * end of parsing of this node.
4589 	 */
4590 	if (xmlStrEqual(name, ctxt->name)) {
4591 	    nodePop(ctxt);
4592 	    htmlnamePop(ctxt);
4593 	}
4594 
4595         if (ctxt->record_info)
4596             htmlNodeInfoPush(ctxt, &node_info);
4597         htmlParserFinishElementParsing(ctxt);
4598 	return;
4599     }
4600 
4601     /*
4602      * Check for an Empty Element from DTD definition
4603      */
4604     if ((info != NULL) && (info->empty)) {
4605 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4606 	    ctxt->sax->endElement(ctxt->userData, name);
4607 	htmlnamePop(ctxt);
4608 	return;
4609     }
4610 
4611     if (ctxt->record_info)
4612         htmlNodeInfoPush(ctxt, &node_info);
4613 }
4614 
4615 /**
4616  * htmlParseContentInternal:
4617  * @ctxt:  an HTML parser context
4618  *
4619  * Parse a content: comment, sub-element, reference or text.
4620  * New version for non recursive htmlParseElementInternal
4621  */
4622 
4623 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4624 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4625     xmlChar *currentNode;
4626     int depth;
4627     const xmlChar *name;
4628 
4629     depth = ctxt->nameNr;
4630     if (depth <= 0) {
4631         currentNode = NULL;
4632     } else {
4633         currentNode = xmlStrdup(ctxt->name);
4634         if (currentNode == NULL) {
4635             htmlErrMemory(ctxt, NULL);
4636             return;
4637         }
4638     }
4639     while (1) {
4640         GROW;
4641 
4642         if (ctxt->instate == XML_PARSER_EOF)
4643             break;
4644 
4645 	/*
4646 	 * Our tag or one of it's parent or children is ending.
4647 	 */
4648         if ((CUR == '<') && (NXT(1) == '/')) {
4649 	    if (htmlParseEndTag(ctxt) &&
4650 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4651 		if (currentNode != NULL)
4652 		    xmlFree(currentNode);
4653 
4654 	        depth = ctxt->nameNr;
4655                 if (depth <= 0) {
4656                     currentNode = NULL;
4657                 } else {
4658                     currentNode = xmlStrdup(ctxt->name);
4659                     if (currentNode == NULL) {
4660                         htmlErrMemory(ctxt, NULL);
4661                         break;
4662                     }
4663                 }
4664 	    }
4665 	    continue; /* while */
4666         }
4667 
4668 	else if ((CUR == '<') &&
4669 	         ((IS_ASCII_LETTER(NXT(1))) ||
4670 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4671 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4672 	    if (name == NULL) {
4673 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4674 			 "htmlParseStartTag: invalid element name\n",
4675 			 NULL, NULL);
4676 	        /* Dump the bogus tag like browsers do */
4677 	        while ((CUR == 0) && (CUR != '>'))
4678 	            NEXT;
4679 
4680 	        htmlParserFinishElementParsing(ctxt);
4681 	        if (currentNode != NULL)
4682 	            xmlFree(currentNode);
4683 
4684 	        currentNode = xmlStrdup(ctxt->name);
4685                 if (currentNode == NULL) {
4686                     htmlErrMemory(ctxt, NULL);
4687                     break;
4688                 }
4689 	        depth = ctxt->nameNr;
4690 	        continue;
4691 	    }
4692 
4693 	    if (ctxt->name != NULL) {
4694 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4695 	            htmlAutoClose(ctxt, name);
4696 	            continue;
4697 	        }
4698 	    }
4699 	}
4700 
4701 	/*
4702 	 * Has this node been popped out during parsing of
4703 	 * the next element
4704 	 */
4705         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4706 	    (!xmlStrEqual(currentNode, ctxt->name)))
4707 	     {
4708 	    htmlParserFinishElementParsing(ctxt);
4709 	    if (currentNode != NULL) xmlFree(currentNode);
4710 
4711 	    currentNode = xmlStrdup(ctxt->name);
4712             if (currentNode == NULL) {
4713                 htmlErrMemory(ctxt, NULL);
4714                 break;
4715             }
4716 	    depth = ctxt->nameNr;
4717 	    continue;
4718 	}
4719 
4720 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4721 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4722 	    /*
4723 	     * Handle SCRIPT/STYLE separately
4724 	     */
4725 	    htmlParseScript(ctxt);
4726 	}
4727 
4728         else if ((CUR == '<') && (NXT(1) == '!')) {
4729             /*
4730              * Sometimes DOCTYPE arrives in the middle of the document
4731              */
4732             if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4733                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
4734                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4735                 (UPP(8) == 'E')) {
4736                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4737                              "Misplaced DOCTYPE declaration\n",
4738                              BAD_CAST "DOCTYPE" , NULL);
4739                 htmlParseDocTypeDecl(ctxt);
4740             }
4741             /*
4742              * First case :  a comment
4743              */
4744             else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4745                 htmlParseComment(ctxt);
4746             }
4747             else {
4748                 htmlSkipBogusComment(ctxt);
4749             }
4750         }
4751 
4752         /*
4753          * Second case : a Processing Instruction.
4754          */
4755         else if ((CUR == '<') && (NXT(1) == '?')) {
4756             htmlParsePI(ctxt);
4757         }
4758 
4759         /*
4760          * Third case :  a sub-element.
4761          */
4762         else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4763             htmlParseElementInternal(ctxt);
4764             if (currentNode != NULL) xmlFree(currentNode);
4765 
4766             currentNode = xmlStrdup(ctxt->name);
4767             if (currentNode == NULL) {
4768                 htmlErrMemory(ctxt, NULL);
4769                 break;
4770             }
4771             depth = ctxt->nameNr;
4772         }
4773         else if (CUR == '<') {
4774             if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4775                 (ctxt->sax->characters != NULL))
4776                 ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4777             NEXT;
4778         }
4779 
4780         /*
4781          * Fourth case : a reference. If if has not been resolved,
4782          *    parsing returns it's Name, create the node
4783          */
4784         else if (CUR == '&') {
4785             htmlParseReference(ctxt);
4786         }
4787 
4788         /*
4789          * Fifth case : end of the resource
4790          */
4791         else if (CUR == 0) {
4792             htmlAutoCloseOnEnd(ctxt);
4793             break;
4794         }
4795 
4796         /*
4797          * Last case, text. Note that References are handled directly.
4798          */
4799         else {
4800             htmlParseCharData(ctxt);
4801         }
4802 
4803         SHRINK;
4804         GROW;
4805     }
4806     if (currentNode != NULL) xmlFree(currentNode);
4807 }
4808 
4809 /**
4810  * htmlParseContent:
4811  * @ctxt:  an HTML parser context
4812  *
4813  * Parse a content: comment, sub-element, reference or text.
4814  * This is the entry point when called from parser.c
4815  */
4816 
4817 void
__htmlParseContent(void * ctxt)4818 __htmlParseContent(void *ctxt) {
4819     if (ctxt != NULL)
4820 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4821 }
4822 
4823 /**
4824  * htmlParseDocument:
4825  * @ctxt:  an HTML parser context
4826  *
4827  * parse an HTML document (and build a tree if using the standard SAX
4828  * interface).
4829  *
4830  * Returns 0, -1 in case of error. the parser context is augmented
4831  *                as a result of the parsing.
4832  */
4833 
4834 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4835 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4836     xmlDtdPtr dtd;
4837 
4838     xmlInitParser();
4839 
4840     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4841 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4842 		     "htmlParseDocument: context error\n", NULL, NULL);
4843 	return(XML_ERR_INTERNAL_ERROR);
4844     }
4845 
4846     /*
4847      * SAX: beginning of the document processing.
4848      */
4849     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4850         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4851 
4852     xmlDetectEncoding(ctxt);
4853 
4854     /*
4855      * Wipe out everything which is before the first '<'
4856      */
4857     SKIP_BLANKS;
4858     if (CUR == 0) {
4859 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4860 	             "Document is empty\n", NULL, NULL);
4861     }
4862 
4863     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4864 	ctxt->sax->startDocument(ctxt->userData);
4865 
4866 
4867     /*
4868      * Parse possible comments and PIs before any content
4869      */
4870     while (((CUR == '<') && (NXT(1) == '!') &&
4871             (NXT(2) == '-') && (NXT(3) == '-')) ||
4872 	   ((CUR == '<') && (NXT(1) == '?'))) {
4873         htmlParseComment(ctxt);
4874         htmlParsePI(ctxt);
4875 	SKIP_BLANKS;
4876     }
4877 
4878 
4879     /*
4880      * Then possibly doc type declaration(s) and more Misc
4881      * (doctypedecl Misc*)?
4882      */
4883     if ((CUR == '<') && (NXT(1) == '!') &&
4884 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4885 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4886 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4887 	(UPP(8) == 'E')) {
4888 	htmlParseDocTypeDecl(ctxt);
4889     }
4890     SKIP_BLANKS;
4891 
4892     /*
4893      * Parse possible comments and PIs before any content
4894      */
4895     while (((CUR == '<') && (NXT(1) == '!') &&
4896             (NXT(2) == '-') && (NXT(3) == '-')) ||
4897 	   ((CUR == '<') && (NXT(1) == '?'))) {
4898         htmlParseComment(ctxt);
4899         htmlParsePI(ctxt);
4900 	SKIP_BLANKS;
4901     }
4902 
4903     /*
4904      * Time to start parsing the tree itself
4905      */
4906     htmlParseContentInternal(ctxt);
4907 
4908     /*
4909      * autoclose
4910      */
4911     if (CUR == 0)
4912 	htmlAutoCloseOnEnd(ctxt);
4913 
4914 
4915     /*
4916      * SAX: end of the document processing.
4917      */
4918     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4919         ctxt->sax->endDocument(ctxt->userData);
4920 
4921     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4922 	dtd = xmlGetIntSubset(ctxt->myDoc);
4923 	if (dtd == NULL)
4924 	    ctxt->myDoc->intSubset =
4925 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4926 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4927 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4928     }
4929     if (! ctxt->wellFormed) return(-1);
4930     return(0);
4931 }
4932 
4933 
4934 /************************************************************************
4935  *									*
4936  *			Parser contexts handling			*
4937  *									*
4938  ************************************************************************/
4939 
4940 /**
4941  * htmlInitParserCtxt:
4942  * @ctxt:  an HTML parser context
4943  * @sax:  SAX handler
4944  * @userData:  user data
4945  *
4946  * Initialize a parser context
4947  *
4948  * Returns 0 in case of success and -1 in case of error
4949  */
4950 
4951 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt,const htmlSAXHandler * sax,void * userData)4952 htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4953                    void *userData)
4954 {
4955     if (ctxt == NULL) return(-1);
4956     memset(ctxt, 0, sizeof(htmlParserCtxt));
4957 
4958     ctxt->dict = xmlDictCreate();
4959     if (ctxt->dict == NULL) {
4960         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4961 	return(-1);
4962     }
4963 
4964     if (ctxt->sax == NULL)
4965         ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4966     if (ctxt->sax == NULL) {
4967         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4968 	return(-1);
4969     }
4970     if (sax == NULL) {
4971         memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4972         xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4973         ctxt->userData = ctxt;
4974     } else {
4975         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4976         ctxt->userData = userData ? userData : ctxt;
4977     }
4978 
4979     /* Allocate the Input stack */
4980     ctxt->inputTab = (htmlParserInputPtr *)
4981                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
4982     if (ctxt->inputTab == NULL) {
4983         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4984 	ctxt->inputNr = 0;
4985 	ctxt->inputMax = 0;
4986 	ctxt->input = NULL;
4987 	return(-1);
4988     }
4989     ctxt->inputNr = 0;
4990     ctxt->inputMax = 5;
4991     ctxt->input = NULL;
4992     ctxt->version = NULL;
4993     ctxt->encoding = NULL;
4994     ctxt->standalone = -1;
4995     ctxt->instate = XML_PARSER_START;
4996 
4997     /* Allocate the Node stack */
4998     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4999     if (ctxt->nodeTab == NULL) {
5000         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5001 	ctxt->nodeNr = 0;
5002 	ctxt->nodeMax = 0;
5003 	ctxt->node = NULL;
5004 	ctxt->inputNr = 0;
5005 	ctxt->inputMax = 0;
5006 	ctxt->input = NULL;
5007 	return(-1);
5008     }
5009     ctxt->nodeNr = 0;
5010     ctxt->nodeMax = 10;
5011     ctxt->node = NULL;
5012 
5013     /* Allocate the Name stack */
5014     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5015     if (ctxt->nameTab == NULL) {
5016         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5017 	ctxt->nameNr = 0;
5018 	ctxt->nameMax = 0;
5019 	ctxt->name = NULL;
5020 	ctxt->nodeNr = 0;
5021 	ctxt->nodeMax = 0;
5022 	ctxt->node = NULL;
5023 	ctxt->inputNr = 0;
5024 	ctxt->inputMax = 0;
5025 	ctxt->input = NULL;
5026 	return(-1);
5027     }
5028     ctxt->nameNr = 0;
5029     ctxt->nameMax = 10;
5030     ctxt->name = NULL;
5031 
5032     ctxt->nodeInfoTab = NULL;
5033     ctxt->nodeInfoNr  = 0;
5034     ctxt->nodeInfoMax = 0;
5035 
5036     ctxt->myDoc = NULL;
5037     ctxt->wellFormed = 1;
5038     ctxt->replaceEntities = 0;
5039     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5040     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5041     ctxt->html = 1;
5042     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5043     ctxt->vctxt.userData = ctxt;
5044     ctxt->vctxt.error = xmlParserValidityError;
5045     ctxt->vctxt.warning = xmlParserValidityWarning;
5046     ctxt->record_info = 0;
5047     ctxt->validate = 0;
5048     ctxt->checkIndex = 0;
5049     ctxt->catalogs = NULL;
5050     xmlInitNodeInfoSeq(&ctxt->node_seq);
5051     return(0);
5052 }
5053 
5054 /**
5055  * htmlFreeParserCtxt:
5056  * @ctxt:  an HTML parser context
5057  *
5058  * Free all the memory used by a parser context. However the parsed
5059  * document in ctxt->myDoc is not freed.
5060  */
5061 
5062 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5063 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5064 {
5065     xmlFreeParserCtxt(ctxt);
5066 }
5067 
5068 /**
5069  * htmlNewParserCtxt:
5070  *
5071  * Allocate and initialize a new parser context.
5072  *
5073  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5074  */
5075 
5076 htmlParserCtxtPtr
htmlNewParserCtxt(void)5077 htmlNewParserCtxt(void)
5078 {
5079     return(htmlNewSAXParserCtxt(NULL, NULL));
5080 }
5081 
5082 /**
5083  * htmlNewSAXParserCtxt:
5084  * @sax:  SAX handler
5085  * @userData:  user data
5086  *
5087  * Allocate and initialize a new SAX parser context. If userData is NULL,
5088  * the parser context will be passed as user data.
5089  *
5090  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5091  */
5092 
5093 htmlParserCtxtPtr
htmlNewSAXParserCtxt(const htmlSAXHandler * sax,void * userData)5094 htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5095 {
5096     xmlParserCtxtPtr ctxt;
5097 
5098     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5099     if (ctxt == NULL) {
5100         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5101 	return(NULL);
5102     }
5103     memset(ctxt, 0, sizeof(xmlParserCtxt));
5104     if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5105         htmlFreeParserCtxt(ctxt);
5106 	return(NULL);
5107     }
5108     return(ctxt);
5109 }
5110 
5111 /**
5112  * htmlCreateMemoryParserCtxt:
5113  * @buffer:  a pointer to a char array
5114  * @size:  the size of the array
5115  *
5116  * Create a parser context for an HTML in-memory document.
5117  *
5118  * Returns the new parser context or NULL
5119  */
5120 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5121 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5122     xmlParserCtxtPtr ctxt;
5123     xmlParserInputPtr input;
5124     xmlParserInputBufferPtr buf;
5125 
5126     if (buffer == NULL)
5127 	return(NULL);
5128     if (size <= 0)
5129 	return(NULL);
5130 
5131     ctxt = htmlNewParserCtxt();
5132     if (ctxt == NULL)
5133 	return(NULL);
5134 
5135     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5136     if (buf == NULL) {
5137 	xmlFreeParserCtxt(ctxt);
5138         return(NULL);
5139     }
5140 
5141     input = xmlNewInputStream(ctxt);
5142     if (input == NULL) {
5143 	xmlFreeParserInputBuffer(buf);
5144 	xmlFreeParserCtxt(ctxt);
5145 	return(NULL);
5146     }
5147 
5148     input->filename = NULL;
5149     input->buf = buf;
5150     xmlBufResetInput(buf->buffer, input);
5151 
5152     inputPush(ctxt, input);
5153     return(ctxt);
5154 }
5155 
5156 /**
5157  * htmlCreateDocParserCtxt:
5158  * @str:  a pointer to an array of xmlChar
5159  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5160  *
5161  * Create a parser context for an HTML document.
5162  *
5163  * TODO: check the need to add encoding handling there
5164  *
5165  * Returns the new parser context or NULL
5166  */
5167 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * str,const char * encoding)5168 htmlCreateDocParserCtxt(const xmlChar *str, const char *encoding) {
5169     xmlParserCtxtPtr ctxt;
5170     xmlParserInputPtr input;
5171     xmlParserInputBufferPtr buf;
5172 
5173     if (str == NULL)
5174 	return(NULL);
5175 
5176     ctxt = htmlNewParserCtxt();
5177     if (ctxt == NULL)
5178 	return(NULL);
5179 
5180     buf = xmlParserInputBufferCreateString(str);
5181     if (buf == NULL) {
5182 	xmlFreeParserCtxt(ctxt);
5183         return(NULL);
5184     }
5185 
5186     input = xmlNewInputStream(ctxt);
5187     if (input == NULL) {
5188 	xmlFreeParserInputBuffer(buf);
5189 	xmlFreeParserCtxt(ctxt);
5190 	return(NULL);
5191     }
5192 
5193     input->filename = NULL;
5194     input->buf = buf;
5195     xmlBufResetInput(buf->buffer, input);
5196 
5197     inputPush(ctxt, input);
5198 
5199     if (encoding != NULL) {
5200 	xmlCharEncoding enc;
5201 	xmlCharEncodingHandlerPtr handler;
5202 
5203 	enc = xmlParseCharEncoding(encoding);
5204 	/*
5205 	 * registered set of known encodings
5206 	 */
5207 	if (enc != XML_CHAR_ENCODING_ERROR) {
5208 	    xmlSwitchEncoding(ctxt, enc);
5209 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5210 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5211 		             "Unsupported encoding %s\n",
5212 			     (const xmlChar *) encoding, NULL);
5213 	    }
5214 	} else {
5215 	    /*
5216 	     * fallback for unknown encodings
5217 	     */
5218 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
5219 	    if (handler != NULL) {
5220 		xmlSwitchToEncoding(ctxt, handler);
5221 	    } else {
5222 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5223 		             "Unsupported encoding %s\n",
5224 			     (const xmlChar *) encoding, NULL);
5225 	    }
5226 	}
5227     }
5228 
5229     return(ctxt);
5230 }
5231 
5232 #ifdef LIBXML_PUSH_ENABLED
5233 /************************************************************************
5234  *									*
5235  *	Progressive parsing interfaces				*
5236  *									*
5237  ************************************************************************/
5238 
5239 /**
5240  * htmlParseLookupSequence:
5241  * @ctxt:  an HTML parser context
5242  * @first:  the first char to lookup
5243  * @next:  the next char to lookup or zero
5244  * @third:  the next char to lookup or zero
5245  * @ignoreattrval: skip over attribute values
5246  *
5247  * Try to find if a sequence (first, next, third) or  just (first next) or
5248  * (first) is available in the input stream.
5249  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5250  * to avoid rescanning sequences of bytes, it DOES change the state of the
5251  * parser, do not use liberally.
5252  * This is basically similar to xmlParseLookupSequence()
5253  *
5254  * Returns the index to the current parsing point if the full sequence
5255  *      is available, -1 otherwise.
5256  */
5257 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5258 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5259                         xmlChar next, xmlChar third, int ignoreattrval)
5260 {
5261     size_t base, len;
5262     htmlParserInputPtr in;
5263     const xmlChar *buf;
5264     int quote;
5265 
5266     in = ctxt->input;
5267     if (in == NULL)
5268         return (-1);
5269 
5270     base = ctxt->checkIndex;
5271     quote = ctxt->endCheckState;
5272 
5273     buf = in->cur;
5274     len = in->end - in->cur;
5275 
5276     /* take into account the sequence length */
5277     if (third)
5278         len -= 2;
5279     else if (next)
5280         len--;
5281     for (; base < len; base++) {
5282         if (base >= INT_MAX / 2) {
5283             ctxt->checkIndex = 0;
5284             ctxt->endCheckState = 0;
5285             return (base - 2);
5286         }
5287         if (ignoreattrval) {
5288             if (quote) {
5289                 if (buf[base] == quote)
5290                     quote = 0;
5291                 continue;
5292             }
5293             if (buf[base] == '"' || buf[base] == '\'') {
5294                 quote = buf[base];
5295                 continue;
5296             }
5297         }
5298         if (buf[base] == first) {
5299             if (third != 0) {
5300                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5301                     continue;
5302             } else if (next != 0) {
5303                 if (buf[base + 1] != next)
5304                     continue;
5305             }
5306             ctxt->checkIndex = 0;
5307             ctxt->endCheckState = 0;
5308             return (base);
5309         }
5310     }
5311     ctxt->checkIndex = base;
5312     ctxt->endCheckState = quote;
5313     return (-1);
5314 }
5315 
5316 /**
5317  * htmlParseLookupCommentEnd:
5318  * @ctxt: an HTML parser context
5319  *
5320  * Try to find a comment end tag in the input stream
5321  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5322  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5323  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5324  * to avoid rescanning sequences of bytes, it DOES change the state of the
5325  * parser, do not use liberally.
5326  * This wraps to htmlParseLookupSequence()
5327  *
5328  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5329  */
5330 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5331 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5332 {
5333     int mark = 0;
5334     int offset;
5335 
5336     while (1) {
5337 	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5338 	if (mark < 0)
5339             break;
5340         if ((NXT(mark+2) == '>') ||
5341 	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5342             ctxt->checkIndex = 0;
5343 	    break;
5344 	}
5345         offset = (NXT(mark+2) == '!') ? 3 : 2;
5346         if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5347 	    ctxt->checkIndex = mark;
5348             return(-1);
5349         }
5350 	ctxt->checkIndex = mark + 1;
5351     }
5352     return mark;
5353 }
5354 
5355 
5356 /**
5357  * htmlParseTryOrFinish:
5358  * @ctxt:  an HTML parser context
5359  * @terminate:  last chunk indicator
5360  *
5361  * Try to progress on parsing
5362  *
5363  * Returns zero if no parsing was possible
5364  */
5365 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5366 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5367     int ret = 0;
5368     htmlParserInputPtr in;
5369     ptrdiff_t avail = 0;
5370     xmlChar cur, next;
5371 
5372     htmlParserNodeInfo node_info;
5373 
5374     while (1) {
5375 
5376 	in = ctxt->input;
5377 	if (in == NULL) break;
5378 	avail = in->end - in->cur;
5379 	if ((avail == 0) && (terminate)) {
5380 	    htmlAutoCloseOnEnd(ctxt);
5381 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5382 		/*
5383 		 * SAX: end of the document processing.
5384 		 */
5385 		ctxt->instate = XML_PARSER_EOF;
5386 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5387 		    ctxt->sax->endDocument(ctxt->userData);
5388 	    }
5389 	}
5390         if (avail < 1)
5391 	    goto done;
5392         /*
5393          * This is done to make progress and avoid an infinite loop
5394          * if a parsing attempt was aborted by hitting a NUL byte. After
5395          * changing htmlCurrentChar, this probably isn't necessary anymore.
5396          * We should consider removing this check.
5397          */
5398 	cur = in->cur[0];
5399 	if (cur == 0) {
5400 	    SKIP(1);
5401 	    continue;
5402 	}
5403 
5404         switch (ctxt->instate) {
5405             case XML_PARSER_EOF:
5406 	        /*
5407 		 * Document parsing is done !
5408 		 */
5409 	        goto done;
5410             case XML_PARSER_START:
5411 	        /*
5412 		 * Very first chars read from the document flow.
5413 		 */
5414 		cur = in->cur[0];
5415 		if (IS_BLANK_CH(cur)) {
5416 		    SKIP_BLANKS;
5417                     avail = in->end - in->cur;
5418 		}
5419 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5420 		    ctxt->sax->setDocumentLocator(ctxt->userData,
5421 						  &xmlDefaultSAXLocator);
5422 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5423 	            (!ctxt->disableSAX))
5424 		    ctxt->sax->startDocument(ctxt->userData);
5425                 if (ctxt->instate == XML_PARSER_EOF)
5426                     goto done;
5427 
5428 		cur = in->cur[0];
5429 		next = in->cur[1];
5430 		if ((cur == '<') && (next == '!') &&
5431 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5432 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5433 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5434 		    (UPP(8) == 'E')) {
5435 		    if ((!terminate) &&
5436 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5437 			goto done;
5438 		    htmlParseDocTypeDecl(ctxt);
5439                     if (ctxt->instate == XML_PARSER_EOF)
5440                         goto done;
5441 		    ctxt->instate = XML_PARSER_PROLOG;
5442                 } else {
5443 		    ctxt->instate = XML_PARSER_MISC;
5444 		}
5445 		break;
5446             case XML_PARSER_MISC:
5447 		SKIP_BLANKS;
5448                 avail = in->end - in->cur;
5449 		/*
5450 		 * no chars in buffer
5451 		 */
5452 		if (avail < 1)
5453 		    goto done;
5454 		/*
5455 		 * not enough chars in buffer
5456 		 */
5457 		if (avail < 2) {
5458 		    if (!terminate)
5459 			goto done;
5460 		    else
5461 			next = ' ';
5462 		} else {
5463 		    next = in->cur[1];
5464 		}
5465 		cur = in->cur[0];
5466 	        if ((cur == '<') && (next == '!') &&
5467 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5468 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5469 			goto done;
5470 		    htmlParseComment(ctxt);
5471                     if (ctxt->instate == XML_PARSER_EOF)
5472                         goto done;
5473 		    ctxt->instate = XML_PARSER_MISC;
5474 	        } else if ((cur == '<') && (next == '?')) {
5475 		    if ((!terminate) &&
5476 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5477 			goto done;
5478 		    htmlParsePI(ctxt);
5479                     if (ctxt->instate == XML_PARSER_EOF)
5480                         goto done;
5481 		    ctxt->instate = XML_PARSER_MISC;
5482 		} else if ((cur == '<') && (next == '!') &&
5483 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5484 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5485 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5486 		    (UPP(8) == 'E')) {
5487 		    if ((!terminate) &&
5488 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5489 			goto done;
5490 		    htmlParseDocTypeDecl(ctxt);
5491                     if (ctxt->instate == XML_PARSER_EOF)
5492                         goto done;
5493 		    ctxt->instate = XML_PARSER_PROLOG;
5494 		} else if ((cur == '<') && (next == '!') &&
5495 		           (avail < 9)) {
5496 		    goto done;
5497 		} else {
5498 		    ctxt->instate = XML_PARSER_CONTENT;
5499 		}
5500 		break;
5501             case XML_PARSER_PROLOG:
5502 		SKIP_BLANKS;
5503                 avail = in->end - in->cur;
5504 		if (avail < 2)
5505 		    goto done;
5506 		cur = in->cur[0];
5507 		next = in->cur[1];
5508 		if ((cur == '<') && (next == '!') &&
5509 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5510 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5511 			goto done;
5512 		    htmlParseComment(ctxt);
5513                     if (ctxt->instate == XML_PARSER_EOF)
5514                         goto done;
5515 		    ctxt->instate = XML_PARSER_PROLOG;
5516 	        } else if ((cur == '<') && (next == '?')) {
5517 		    if ((!terminate) &&
5518 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5519 			goto done;
5520 		    htmlParsePI(ctxt);
5521                     if (ctxt->instate == XML_PARSER_EOF)
5522                         goto done;
5523 		    ctxt->instate = XML_PARSER_PROLOG;
5524 		} else if ((cur == '<') && (next == '!') &&
5525 		           (avail < 4)) {
5526 		    goto done;
5527 		} else {
5528 		    ctxt->instate = XML_PARSER_CONTENT;
5529 		}
5530 		break;
5531             case XML_PARSER_EPILOG:
5532                 avail = in->end - in->cur;
5533 		if (avail < 1)
5534 		    goto done;
5535 		cur = in->cur[0];
5536 		if (IS_BLANK_CH(cur)) {
5537 		    htmlParseCharData(ctxt);
5538 		    goto done;
5539 		}
5540 		if (avail < 2)
5541 		    goto done;
5542 		next = in->cur[1];
5543 	        if ((cur == '<') && (next == '!') &&
5544 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5545 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5546 			goto done;
5547 		    htmlParseComment(ctxt);
5548                     if (ctxt->instate == XML_PARSER_EOF)
5549                         goto done;
5550 		    ctxt->instate = XML_PARSER_EPILOG;
5551 	        } else if ((cur == '<') && (next == '?')) {
5552 		    if ((!terminate) &&
5553 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5554 			goto done;
5555 		    htmlParsePI(ctxt);
5556                     if (ctxt->instate == XML_PARSER_EOF)
5557                         goto done;
5558 		    ctxt->instate = XML_PARSER_EPILOG;
5559 		} else if ((cur == '<') && (next == '!') &&
5560 		           (avail < 4)) {
5561 		    goto done;
5562 		} else {
5563 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5564 		    ctxt->wellFormed = 0;
5565 		    ctxt->instate = XML_PARSER_EOF;
5566 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5567 			ctxt->sax->endDocument(ctxt->userData);
5568 		    goto done;
5569 		}
5570 		break;
5571             case XML_PARSER_START_TAG: {
5572 	        const xmlChar *name;
5573 		int failed;
5574 		const htmlElemDesc * info;
5575 
5576 		/*
5577 		 * no chars in buffer
5578 		 */
5579 		if (avail < 1)
5580 		    goto done;
5581 		/*
5582 		 * not enough chars in buffer
5583 		 */
5584 		if (avail < 2) {
5585 		    if (!terminate)
5586 			goto done;
5587 		    else
5588 			next = ' ';
5589 		} else {
5590 		    next = in->cur[1];
5591 		}
5592 		cur = in->cur[0];
5593 	        if (cur != '<') {
5594 		    ctxt->instate = XML_PARSER_CONTENT;
5595 		    break;
5596 		}
5597 		if (next == '/') {
5598 		    ctxt->instate = XML_PARSER_END_TAG;
5599 		    ctxt->checkIndex = 0;
5600 		    break;
5601 		}
5602 		if ((!terminate) &&
5603 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5604 		    goto done;
5605 
5606                 /* Capture start position */
5607 	        if (ctxt->record_info) {
5608 	             node_info.begin_pos = ctxt->input->consumed +
5609 	                                (CUR_PTR - ctxt->input->base);
5610 	             node_info.begin_line = ctxt->input->line;
5611 	        }
5612 
5613 
5614 		failed = htmlParseStartTag(ctxt);
5615 		name = ctxt->name;
5616 		if ((failed == -1) ||
5617 		    (name == NULL)) {
5618 		    if (CUR == '>')
5619 			NEXT;
5620 		    break;
5621 		}
5622 
5623 		/*
5624 		 * Lookup the info for that element.
5625 		 */
5626 		info = htmlTagLookup(name);
5627 		if (info == NULL) {
5628 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5629 		                 "Tag %s invalid\n", name, NULL);
5630 		}
5631 
5632 		/*
5633 		 * Check for an Empty Element labeled the XML/SGML way
5634 		 */
5635 		if ((CUR == '/') && (NXT(1) == '>')) {
5636 		    SKIP(2);
5637 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5638 			ctxt->sax->endElement(ctxt->userData, name);
5639 		    htmlnamePop(ctxt);
5640                     if (ctxt->instate == XML_PARSER_EOF)
5641                         goto done;
5642 		    ctxt->instate = XML_PARSER_CONTENT;
5643 		    break;
5644 		}
5645 
5646 		if (CUR == '>') {
5647 		    NEXT;
5648 		} else {
5649 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5650 		                 "Couldn't find end of Start Tag %s\n",
5651 				 name, NULL);
5652 
5653 		    /*
5654 		     * end of parsing of this node.
5655 		     */
5656 		    if (xmlStrEqual(name, ctxt->name)) {
5657 			nodePop(ctxt);
5658 			htmlnamePop(ctxt);
5659 		    }
5660 
5661 		    if (ctxt->record_info)
5662 		        htmlNodeInfoPush(ctxt, &node_info);
5663 
5664                     if (ctxt->instate == XML_PARSER_EOF)
5665                         goto done;
5666 		    ctxt->instate = XML_PARSER_CONTENT;
5667 		    break;
5668 		}
5669 
5670 		/*
5671 		 * Check for an Empty Element from DTD definition
5672 		 */
5673 		if ((info != NULL) && (info->empty)) {
5674 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5675 			ctxt->sax->endElement(ctxt->userData, name);
5676 		    htmlnamePop(ctxt);
5677 		}
5678 
5679                 if (ctxt->record_info)
5680 	            htmlNodeInfoPush(ctxt, &node_info);
5681 
5682                 if (ctxt->instate == XML_PARSER_EOF)
5683                     goto done;
5684 		ctxt->instate = XML_PARSER_CONTENT;
5685                 break;
5686 	    }
5687             case XML_PARSER_CONTENT: {
5688 		xmlChar chr[2] = { 0, 0 };
5689 
5690                 /*
5691 		 * Handle preparsed entities and charRef
5692 		 */
5693 		if (ctxt->token != 0) {
5694 		    chr[0] = ctxt->token;
5695 		    htmlCheckParagraph(ctxt);
5696 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5697 			ctxt->sax->characters(ctxt->userData, chr, 1);
5698 		    ctxt->token = 0;
5699 		    ctxt->checkIndex = 0;
5700 		}
5701 		if ((avail == 1) && (terminate)) {
5702 		    cur = in->cur[0];
5703 		    if ((cur != '<') && (cur != '&')) {
5704 			if (ctxt->sax != NULL) {
5705                             chr[0] = cur;
5706 			    if (IS_BLANK_CH(cur)) {
5707 				if (ctxt->keepBlanks) {
5708 				    if (ctxt->sax->characters != NULL)
5709 					ctxt->sax->characters(
5710 						ctxt->userData, chr, 1);
5711 				} else {
5712 				    if (ctxt->sax->ignorableWhitespace != NULL)
5713 					ctxt->sax->ignorableWhitespace(
5714 						ctxt->userData, chr, 1);
5715 				}
5716 			    } else {
5717 				htmlCheckParagraph(ctxt);
5718 				if (ctxt->sax->characters != NULL)
5719 				    ctxt->sax->characters(
5720 					    ctxt->userData, chr, 1);
5721 			    }
5722 			}
5723 			ctxt->token = 0;
5724 			ctxt->checkIndex = 0;
5725 			in->cur++;
5726 			break;
5727 		    }
5728 		}
5729 		if (avail < 2)
5730 		    goto done;
5731 		cur = in->cur[0];
5732 		next = in->cur[1];
5733 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5734 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5735 		    /*
5736 		     * Handle SCRIPT/STYLE separately
5737 		     */
5738 		    if (!terminate) {
5739 		        int idx;
5740 			xmlChar val;
5741 
5742 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5743 			if (idx < 0)
5744 			    goto done;
5745 		        val = in->cur[idx + 2];
5746 			if (val == 0) { /* bad cut of input */
5747                             /*
5748                              * FIXME: htmlParseScript checks for additional
5749                              * characters after '</'.
5750                              */
5751                             ctxt->checkIndex = idx;
5752 			    goto done;
5753                         }
5754 		    }
5755 		    htmlParseScript(ctxt);
5756                     if (ctxt->instate == XML_PARSER_EOF)
5757                         goto done;
5758 		    if ((cur == '<') && (next == '/')) {
5759 			ctxt->instate = XML_PARSER_END_TAG;
5760 			ctxt->checkIndex = 0;
5761 			break;
5762 		    }
5763 		} else if ((cur == '<') && (next == '!')) {
5764                     if (avail < 4)
5765                         goto done;
5766                     /*
5767                      * Sometimes DOCTYPE arrives in the middle of the document
5768                      */
5769                     if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5770                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
5771                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5772                         (UPP(8) == 'E')) {
5773                         if ((!terminate) &&
5774                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5775                             goto done;
5776                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5777                                      "Misplaced DOCTYPE declaration\n",
5778                                      BAD_CAST "DOCTYPE" , NULL);
5779                         htmlParseDocTypeDecl(ctxt);
5780                     } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5781                         if ((!terminate) &&
5782                             (htmlParseLookupCommentEnd(ctxt) < 0))
5783                             goto done;
5784                         htmlParseComment(ctxt);
5785                         if (ctxt->instate == XML_PARSER_EOF)
5786                             goto done;
5787                         ctxt->instate = XML_PARSER_CONTENT;
5788                     } else {
5789                         if ((!terminate) &&
5790                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5791                             goto done;
5792                         htmlSkipBogusComment(ctxt);
5793                     }
5794                 } else if ((cur == '<') && (next == '?')) {
5795                     if ((!terminate) &&
5796                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5797                         goto done;
5798                     htmlParsePI(ctxt);
5799                     if (ctxt->instate == XML_PARSER_EOF)
5800                         goto done;
5801                     ctxt->instate = XML_PARSER_CONTENT;
5802                 } else if ((cur == '<') && (next == '/')) {
5803                     ctxt->instate = XML_PARSER_END_TAG;
5804                     ctxt->checkIndex = 0;
5805                     break;
5806                 } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5807                     if ((!terminate) && (next == 0))
5808                         goto done;
5809                     ctxt->instate = XML_PARSER_START_TAG;
5810                     ctxt->checkIndex = 0;
5811                     break;
5812                 } else if (cur == '<') {
5813                     if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5814                         (ctxt->sax->characters != NULL))
5815                         ctxt->sax->characters(ctxt->userData,
5816                                               BAD_CAST "<", 1);
5817                     NEXT;
5818                 } else {
5819                     /*
5820                      * check that the text sequence is complete
5821                      * before handing out the data to the parser
5822                      * to avoid problems with erroneous end of
5823                      * data detection.
5824                      */
5825                     if ((!terminate) &&
5826                         (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5827                         goto done;
5828                     ctxt->checkIndex = 0;
5829                     while ((ctxt->instate != XML_PARSER_EOF) &&
5830                            (cur != '<') && (in->cur < in->end)) {
5831                         if (cur == '&') {
5832                             htmlParseReference(ctxt);
5833                         } else {
5834                             htmlParseCharData(ctxt);
5835                         }
5836                         cur = in->cur[0];
5837                     }
5838 		}
5839 
5840 		break;
5841 	    }
5842             case XML_PARSER_END_TAG:
5843 		if (avail < 2)
5844 		    goto done;
5845 		if ((!terminate) &&
5846 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5847 		    goto done;
5848 		htmlParseEndTag(ctxt);
5849                 if (ctxt->instate == XML_PARSER_EOF)
5850                     goto done;
5851 		if (ctxt->nameNr == 0) {
5852 		    ctxt->instate = XML_PARSER_EPILOG;
5853 		} else {
5854 		    ctxt->instate = XML_PARSER_CONTENT;
5855 		}
5856 		ctxt->checkIndex = 0;
5857 	        break;
5858 	    default:
5859 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5860 			     "HPP: internal error\n", NULL, NULL);
5861 		ctxt->instate = XML_PARSER_EOF;
5862 		break;
5863 	}
5864     }
5865 done:
5866     if ((avail == 0) && (terminate)) {
5867 	htmlAutoCloseOnEnd(ctxt);
5868 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5869 	    /*
5870 	     * SAX: end of the document processing.
5871 	     */
5872 	    ctxt->instate = XML_PARSER_EOF;
5873 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5874 		ctxt->sax->endDocument(ctxt->userData);
5875 	}
5876     }
5877     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5878 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5879 	 (ctxt->instate == XML_PARSER_EPILOG))) {
5880 	xmlDtdPtr dtd;
5881 	dtd = xmlGetIntSubset(ctxt->myDoc);
5882 	if (dtd == NULL)
5883 	    ctxt->myDoc->intSubset =
5884 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5885 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5886 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5887     }
5888     return(ret);
5889 }
5890 
5891 /**
5892  * htmlParseChunk:
5893  * @ctxt:  an HTML parser context
5894  * @chunk:  an char array
5895  * @size:  the size in byte of the chunk
5896  * @terminate:  last chunk indicator
5897  *
5898  * Parse a Chunk of memory
5899  *
5900  * Returns zero if no error, the xmlParserErrors otherwise.
5901  */
5902 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)5903 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5904               int terminate) {
5905     if ((ctxt == NULL) || (ctxt->input == NULL)) {
5906 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5907 		     "htmlParseChunk: context error\n", NULL, NULL);
5908 	return(XML_ERR_INTERNAL_ERROR);
5909     }
5910     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5911         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5912 	size_t pos = ctxt->input->cur - ctxt->input->base;
5913 	int res;
5914 
5915 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5916         xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5917 	if (res < 0) {
5918             htmlParseErr(ctxt, ctxt->input->buf->error,
5919                          "xmlParserInputBufferPush failed", NULL, NULL);
5920             xmlHaltParser(ctxt);
5921 	    return (ctxt->errNo);
5922 	}
5923     }
5924     htmlParseTryOrFinish(ctxt, terminate);
5925     if (terminate) {
5926 	if ((ctxt->instate != XML_PARSER_EOF) &&
5927 	    (ctxt->instate != XML_PARSER_EPILOG) &&
5928 	    (ctxt->instate != XML_PARSER_MISC)) {
5929 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5930 	    ctxt->wellFormed = 0;
5931 	}
5932 	if (ctxt->instate != XML_PARSER_EOF) {
5933 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5934 		ctxt->sax->endDocument(ctxt->userData);
5935 	}
5936 	ctxt->instate = XML_PARSER_EOF;
5937     }
5938     return((xmlParserErrors) ctxt->errNo);
5939 }
5940 
5941 /************************************************************************
5942  *									*
5943  *			User entry points				*
5944  *									*
5945  ************************************************************************/
5946 
5947 /**
5948  * htmlCreatePushParserCtxt:
5949  * @sax:  a SAX handler
5950  * @user_data:  The user data returned on SAX callbacks
5951  * @chunk:  a pointer to an array of chars
5952  * @size:  number of chars in the array
5953  * @filename:  an optional file name or URI
5954  * @enc:  an optional encoding
5955  *
5956  * Create a parser context for using the HTML parser in push mode
5957  * The value of @filename is used for fetching external entities
5958  * and error/warning reports.
5959  *
5960  * Returns the new parser context or NULL
5961  */
5962 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)5963 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5964                          const char *chunk, int size, const char *filename,
5965 			 xmlCharEncoding enc) {
5966     htmlParserCtxtPtr ctxt;
5967     htmlParserInputPtr inputStream;
5968     xmlParserInputBufferPtr buf;
5969 
5970     xmlInitParser();
5971 
5972     buf = xmlAllocParserInputBuffer(enc);
5973     if (buf == NULL) return(NULL);
5974 
5975     ctxt = htmlNewSAXParserCtxt(sax, user_data);
5976     if (ctxt == NULL) {
5977 	xmlFreeParserInputBuffer(buf);
5978 	return(NULL);
5979     }
5980     if (filename == NULL) {
5981 	ctxt->directory = NULL;
5982     } else {
5983         ctxt->directory = xmlParserGetDirectory(filename);
5984     }
5985 
5986     inputStream = htmlNewInputStream(ctxt);
5987     if (inputStream == NULL) {
5988 	xmlFreeParserCtxt(ctxt);
5989 	xmlFreeParserInputBuffer(buf);
5990 	return(NULL);
5991     }
5992 
5993     if (filename == NULL)
5994 	inputStream->filename = NULL;
5995     else
5996 	inputStream->filename = (char *)
5997 	    xmlCanonicPath((const xmlChar *) filename);
5998     inputStream->buf = buf;
5999     xmlBufResetInput(buf->buffer, inputStream);
6000 
6001     inputPush(ctxt, inputStream);
6002 
6003     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6004         (ctxt->input->buf != NULL))  {
6005 	size_t pos = ctxt->input->cur - ctxt->input->base;
6006         int res;
6007 
6008 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6009         xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
6010         if (res < 0) {
6011             htmlParseErr(ctxt, ctxt->input->buf->error,
6012                          "xmlParserInputBufferPush failed\n", NULL, NULL);
6013             xmlHaltParser(ctxt);
6014         }
6015     }
6016     ctxt->progressive = 1;
6017 
6018     return(ctxt);
6019 }
6020 #endif /* LIBXML_PUSH_ENABLED */
6021 
6022 /**
6023  * htmlSAXParseDoc:
6024  * @cur:  a pointer to an array of xmlChar
6025  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6026  * @sax:  the SAX handler block
6027  * @userData: if using SAX, this pointer will be provided on callbacks.
6028  *
6029  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6030  *
6031  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6032  * to handle parse events. If sax is NULL, fallback to the default DOM
6033  * behavior and return a tree.
6034  *
6035  * Returns the resulting document tree unless SAX is NULL or the document is
6036  *     not well formed.
6037  */
6038 
6039 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6040 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6041                 htmlSAXHandlerPtr sax, void *userData) {
6042     htmlDocPtr ret;
6043     htmlParserCtxtPtr ctxt;
6044 
6045     xmlInitParser();
6046 
6047     if (cur == NULL) return(NULL);
6048 
6049 
6050     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6051     if (ctxt == NULL) return(NULL);
6052     if (sax != NULL) {
6053         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6054         ctxt->sax = sax;
6055         ctxt->userData = userData;
6056     }
6057 
6058     htmlParseDocument(ctxt);
6059     ret = ctxt->myDoc;
6060     if (sax != NULL) {
6061 	ctxt->sax = NULL;
6062 	ctxt->userData = NULL;
6063     }
6064     htmlFreeParserCtxt(ctxt);
6065 
6066     return(ret);
6067 }
6068 
6069 /**
6070  * htmlParseDoc:
6071  * @cur:  a pointer to an array of xmlChar
6072  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6073  *
6074  * parse an HTML in-memory document and build a tree.
6075  *
6076  * Returns the resulting document tree
6077  */
6078 
6079 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6080 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6081     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6082 }
6083 
6084 
6085 /**
6086  * htmlCreateFileParserCtxt:
6087  * @filename:  the filename
6088  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6089  *
6090  * Create a parser context for a file content.
6091  * Automatic support for ZLIB/Compress compressed document is provided
6092  * by default if found at compile-time.
6093  *
6094  * Returns the new parser context or NULL
6095  */
6096 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6097 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6098 {
6099     htmlParserCtxtPtr ctxt;
6100     htmlParserInputPtr inputStream;
6101     char *canonicFilename;
6102 
6103     if (filename == NULL)
6104         return(NULL);
6105 
6106     ctxt = htmlNewParserCtxt();
6107     if (ctxt == NULL) {
6108 	return(NULL);
6109     }
6110     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6111     if (canonicFilename == NULL) {
6112 	xmlFreeParserCtxt(ctxt);
6113 	return(NULL);
6114     }
6115 
6116     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6117     xmlFree(canonicFilename);
6118     if (inputStream == NULL) {
6119 	xmlFreeParserCtxt(ctxt);
6120 	return(NULL);
6121     }
6122 
6123     inputPush(ctxt, inputStream);
6124 
6125     /* set encoding */
6126     if (encoding) {
6127         xmlCharEncodingHandlerPtr hdlr;
6128 
6129         hdlr = xmlFindCharEncodingHandler(encoding);
6130         if (hdlr != NULL) {
6131             xmlSwitchToEncoding(ctxt, hdlr);
6132         }
6133     }
6134 
6135     return(ctxt);
6136 }
6137 
6138 /**
6139  * htmlSAXParseFile:
6140  * @filename:  the filename
6141  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6142  * @sax:  the SAX handler block
6143  * @userData: if using SAX, this pointer will be provided on callbacks.
6144  *
6145  * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6146  *
6147  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6148  * compressed document is provided by default if found at compile-time.
6149  * It use the given SAX function block to handle the parsing callback.
6150  * If sax is NULL, fallback to the default DOM tree building routines.
6151  *
6152  * Returns the resulting document tree unless SAX is NULL or the document is
6153  *     not well formed.
6154  */
6155 
6156 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6157 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6158                  void *userData) {
6159     htmlDocPtr ret;
6160     htmlParserCtxtPtr ctxt;
6161     htmlSAXHandlerPtr oldsax = NULL;
6162 
6163     xmlInitParser();
6164 
6165     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6166     if (ctxt == NULL) return(NULL);
6167     if (sax != NULL) {
6168 	oldsax = ctxt->sax;
6169         ctxt->sax = sax;
6170         ctxt->userData = userData;
6171     }
6172 
6173     htmlParseDocument(ctxt);
6174 
6175     ret = ctxt->myDoc;
6176     if (sax != NULL) {
6177         ctxt->sax = oldsax;
6178         ctxt->userData = NULL;
6179     }
6180     htmlFreeParserCtxt(ctxt);
6181 
6182     return(ret);
6183 }
6184 
6185 /**
6186  * htmlParseFile:
6187  * @filename:  the filename
6188  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6189  *
6190  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6191  * compressed document is provided by default if found at compile-time.
6192  *
6193  * Returns the resulting document tree
6194  */
6195 
6196 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6197 htmlParseFile(const char *filename, const char *encoding) {
6198     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6199 }
6200 
6201 /**
6202  * htmlHandleOmittedElem:
6203  * @val:  int 0 or 1
6204  *
6205  * Set and return the previous value for handling HTML omitted tags.
6206  *
6207  * Returns the last value for 0 for no handling, 1 for auto insertion.
6208  */
6209 
6210 int
htmlHandleOmittedElem(int val)6211 htmlHandleOmittedElem(int val) {
6212     int old = htmlOmittedDefaultValue;
6213 
6214     htmlOmittedDefaultValue = val;
6215     return(old);
6216 }
6217 
6218 /**
6219  * htmlElementAllowedHere:
6220  * @parent: HTML parent element
6221  * @elt: HTML element
6222  *
6223  * Checks whether an HTML element may be a direct child of a parent element.
6224  * Note - doesn't check for deprecated elements
6225  *
6226  * Returns 1 if allowed; 0 otherwise.
6227  */
6228 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6229 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6230   const char** p ;
6231 
6232   if ( ! elt || ! parent || ! parent->subelts )
6233 	return 0 ;
6234 
6235   for ( p = parent->subelts; *p; ++p )
6236     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6237       return 1 ;
6238 
6239   return 0 ;
6240 }
6241 /**
6242  * htmlElementStatusHere:
6243  * @parent: HTML parent element
6244  * @elt: HTML element
6245  *
6246  * Checks whether an HTML element may be a direct child of a parent element.
6247  * and if so whether it is valid or deprecated.
6248  *
6249  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6250  */
6251 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6252 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6253   if ( ! parent || ! elt )
6254     return HTML_INVALID ;
6255   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6256     return HTML_INVALID ;
6257 
6258   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6259 }
6260 /**
6261  * htmlAttrAllowed:
6262  * @elt: HTML element
6263  * @attr: HTML attribute
6264  * @legacy: whether to allow deprecated attributes
6265  *
6266  * Checks whether an attribute is valid for an element
6267  * Has full knowledge of Required and Deprecated attributes
6268  *
6269  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6270  */
6271 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6272 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6273   const char** p ;
6274 
6275   if ( !elt || ! attr )
6276 	return HTML_INVALID ;
6277 
6278   if ( elt->attrs_req )
6279     for ( p = elt->attrs_req; *p; ++p)
6280       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6281         return HTML_REQUIRED ;
6282 
6283   if ( elt->attrs_opt )
6284     for ( p = elt->attrs_opt; *p; ++p)
6285       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6286         return HTML_VALID ;
6287 
6288   if ( legacy && elt->attrs_depr )
6289     for ( p = elt->attrs_depr; *p; ++p)
6290       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6291         return HTML_DEPRECATED ;
6292 
6293   return HTML_INVALID ;
6294 }
6295 /**
6296  * htmlNodeStatus:
6297  * @node: an htmlNodePtr in a tree
6298  * @legacy: whether to allow deprecated elements (YES is faster here
6299  *	for Element nodes)
6300  *
6301  * Checks whether the tree node is valid.  Experimental (the author
6302  *     only uses the HTML enhancements in a SAX parser)
6303  *
6304  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6305  *	legacy allowed) or htmlElementStatusHere (otherwise).
6306  *	for Attribute nodes, a return from htmlAttrAllowed
6307  *	for other nodes, HTML_NA (no checks performed)
6308  */
6309 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6310 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6311   if ( ! node )
6312     return HTML_INVALID ;
6313 
6314   switch ( node->type ) {
6315     case XML_ELEMENT_NODE:
6316       return legacy
6317 	? ( htmlElementAllowedHere (
6318 		htmlTagLookup(node->parent->name) , node->name
6319 		) ? HTML_VALID : HTML_INVALID )
6320 	: htmlElementStatusHere(
6321 		htmlTagLookup(node->parent->name) ,
6322 		htmlTagLookup(node->name) )
6323 	;
6324     case XML_ATTRIBUTE_NODE:
6325       return htmlAttrAllowed(
6326 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6327     default: return HTML_NA ;
6328   }
6329 }
6330 /************************************************************************
6331  *									*
6332  *	New set (2.6.0) of simpler and more flexible APIs		*
6333  *									*
6334  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must be visible at the point
 * of expansion.
 *
 * The body is wrapped in do { } while (0) so that "DICT_FREE(x);" is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
6346 
6347 /**
6348  * htmlCtxtReset:
6349  * @ctxt: an HTML parser context
6350  *
6351  * Reset a parser context
6352  */
6353 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6354 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6355 {
6356     xmlParserInputPtr input;
6357     xmlDictPtr dict;
6358 
6359     if (ctxt == NULL)
6360         return;
6361 
6362     xmlInitParser();
6363     dict = ctxt->dict;
6364 
6365     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6366         xmlFreeInputStream(input);
6367     }
6368     ctxt->inputNr = 0;
6369     ctxt->input = NULL;
6370 
6371     ctxt->spaceNr = 0;
6372     if (ctxt->spaceTab != NULL) {
6373 	ctxt->spaceTab[0] = -1;
6374 	ctxt->space = &ctxt->spaceTab[0];
6375     } else {
6376 	ctxt->space = NULL;
6377     }
6378 
6379 
6380     ctxt->nodeNr = 0;
6381     ctxt->node = NULL;
6382 
6383     ctxt->nameNr = 0;
6384     ctxt->name = NULL;
6385 
6386     ctxt->nsNr = 0;
6387 
6388     DICT_FREE(ctxt->version);
6389     ctxt->version = NULL;
6390     DICT_FREE(ctxt->encoding);
6391     ctxt->encoding = NULL;
6392     DICT_FREE(ctxt->directory);
6393     ctxt->directory = NULL;
6394     DICT_FREE(ctxt->extSubURI);
6395     ctxt->extSubURI = NULL;
6396     DICT_FREE(ctxt->extSubSystem);
6397     ctxt->extSubSystem = NULL;
6398     if (ctxt->myDoc != NULL)
6399         xmlFreeDoc(ctxt->myDoc);
6400     ctxt->myDoc = NULL;
6401 
6402     ctxt->standalone = -1;
6403     ctxt->hasExternalSubset = 0;
6404     ctxt->hasPErefs = 0;
6405     ctxt->html = 1;
6406     ctxt->external = 0;
6407     ctxt->instate = XML_PARSER_START;
6408     ctxt->token = 0;
6409 
6410     ctxt->wellFormed = 1;
6411     ctxt->nsWellFormed = 1;
6412     ctxt->disableSAX = 0;
6413     ctxt->valid = 1;
6414     ctxt->vctxt.userData = ctxt;
6415     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6416     ctxt->vctxt.error = xmlParserValidityError;
6417     ctxt->vctxt.warning = xmlParserValidityWarning;
6418     ctxt->record_info = 0;
6419     ctxt->checkIndex = 0;
6420     ctxt->endCheckState = 0;
6421     ctxt->inSubset = 0;
6422     ctxt->errNo = XML_ERR_OK;
6423     ctxt->depth = 0;
6424     ctxt->catalogs = NULL;
6425     xmlInitNodeInfoSeq(&ctxt->node_seq);
6426 
6427     if (ctxt->attsDefault != NULL) {
6428         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6429         ctxt->attsDefault = NULL;
6430     }
6431     if (ctxt->attsSpecial != NULL) {
6432         xmlHashFree(ctxt->attsSpecial, NULL);
6433         ctxt->attsSpecial = NULL;
6434     }
6435 
6436     ctxt->nbErrors = 0;
6437     ctxt->nbWarnings = 0;
6438     if (ctxt->lastError.code != XML_ERR_OK)
6439         xmlResetError(&ctxt->lastError);
6440 }
6441 
6442 /**
6443  * htmlCtxtUseOptions:
6444  * @ctxt: an HTML parser context
6445  * @options:  a combination of htmlParserOption(s)
6446  *
6447  * Applies the options to the parser context
6448  *
6449  * Returns 0 in case of success, the set of unknown or unimplemented options
6450  *         in case of error.
6451  */
6452 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6453 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6454 {
6455     if (ctxt == NULL)
6456         return(-1);
6457 
6458     if (options & HTML_PARSE_NOWARNING) {
6459         ctxt->sax->warning = NULL;
6460         ctxt->vctxt.warning = NULL;
6461         options -= XML_PARSE_NOWARNING;
6462 	ctxt->options |= XML_PARSE_NOWARNING;
6463     }
6464     if (options & HTML_PARSE_NOERROR) {
6465         ctxt->sax->error = NULL;
6466         ctxt->vctxt.error = NULL;
6467         ctxt->sax->fatalError = NULL;
6468         options -= XML_PARSE_NOERROR;
6469 	ctxt->options |= XML_PARSE_NOERROR;
6470     }
6471     if (options & HTML_PARSE_PEDANTIC) {
6472         ctxt->pedantic = 1;
6473         options -= XML_PARSE_PEDANTIC;
6474 	ctxt->options |= XML_PARSE_PEDANTIC;
6475     } else
6476         ctxt->pedantic = 0;
6477     if (options & XML_PARSE_NOBLANKS) {
6478         ctxt->keepBlanks = 0;
6479         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6480         options -= XML_PARSE_NOBLANKS;
6481 	ctxt->options |= XML_PARSE_NOBLANKS;
6482     } else
6483         ctxt->keepBlanks = 1;
6484     if (options & HTML_PARSE_RECOVER) {
6485         ctxt->recovery = 1;
6486 	options -= HTML_PARSE_RECOVER;
6487     } else
6488         ctxt->recovery = 0;
6489     if (options & HTML_PARSE_COMPACT) {
6490 	ctxt->options |= HTML_PARSE_COMPACT;
6491         options -= HTML_PARSE_COMPACT;
6492     }
6493     if (options & XML_PARSE_HUGE) {
6494 	ctxt->options |= XML_PARSE_HUGE;
6495         options -= XML_PARSE_HUGE;
6496     }
6497     if (options & HTML_PARSE_NODEFDTD) {
6498 	ctxt->options |= HTML_PARSE_NODEFDTD;
6499         options -= HTML_PARSE_NODEFDTD;
6500     }
6501     if (options & HTML_PARSE_IGNORE_ENC) {
6502 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6503         options -= HTML_PARSE_IGNORE_ENC;
6504     }
6505     if (options & HTML_PARSE_NOIMPLIED) {
6506         ctxt->options |= HTML_PARSE_NOIMPLIED;
6507         options -= HTML_PARSE_NOIMPLIED;
6508     }
6509     ctxt->dictNames = 0;
6510     ctxt->linenumbers = 1;
6511     return (options);
6512 }
6513 
6514 /**
6515  * htmlDoRead:
6516  * @ctxt:  an HTML parser context
6517  * @URL:  the base URL to use for the document
6518  * @encoding:  the document encoding, or NULL
6519  * @options:  a combination of htmlParserOption(s)
6520  * @reuse:  keep the context for reuse
6521  *
6522  * Common front-end for the htmlRead functions
6523  *
6524  * Returns the resulting document tree or NULL
6525  */
6526 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6527 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6528           int options, int reuse)
6529 {
6530     htmlDocPtr ret;
6531 
6532     htmlCtxtUseOptions(ctxt, options);
6533     ctxt->html = 1;
6534     if (encoding != NULL) {
6535         xmlCharEncodingHandlerPtr hdlr;
6536 
6537 	hdlr = xmlFindCharEncodingHandler(encoding);
6538 	if (hdlr != NULL) {
6539 	    xmlSwitchToEncoding(ctxt, hdlr);
6540         }
6541     }
6542     if ((URL != NULL) && (ctxt->input != NULL) &&
6543         (ctxt->input->filename == NULL))
6544         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6545     htmlParseDocument(ctxt);
6546     ret = ctxt->myDoc;
6547     ctxt->myDoc = NULL;
6548     if (!reuse) {
6549         if ((ctxt->dictNames) &&
6550 	    (ret != NULL) &&
6551 	    (ret->dict == ctxt->dict))
6552 	    ctxt->dict = NULL;
6553 	xmlFreeParserCtxt(ctxt);
6554     }
6555     return (ret);
6556 }
6557 
6558 /**
6559  * htmlReadDoc:
6560  * @cur:  a pointer to a zero terminated string
6561  * @URL:  the base URL to use for the document
6562  * @encoding:  the document encoding, or NULL
6563  * @options:  a combination of htmlParserOption(s)
6564  *
6565  * parse an XML in-memory document and build a tree.
6566  *
6567  * Returns the resulting document tree
6568  */
6569 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6570 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6571 {
6572     htmlParserCtxtPtr ctxt;
6573 
6574     if (cur == NULL)
6575         return (NULL);
6576 
6577     xmlInitParser();
6578     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6579     if (ctxt == NULL)
6580         return (NULL);
6581     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6582 }
6583 
6584 /**
6585  * htmlReadFile:
6586  * @filename:  a file or URL
6587  * @encoding:  the document encoding, or NULL
6588  * @options:  a combination of htmlParserOption(s)
6589  *
6590  * parse an XML file from the filesystem or the network.
6591  *
6592  * Returns the resulting document tree
6593  */
6594 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6595 htmlReadFile(const char *filename, const char *encoding, int options)
6596 {
6597     htmlParserCtxtPtr ctxt;
6598 
6599     xmlInitParser();
6600     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6601     if (ctxt == NULL)
6602         return (NULL);
6603     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6604 }
6605 
6606 /**
6607  * htmlReadMemory:
6608  * @buffer:  a pointer to a char array
6609  * @size:  the size of the array
6610  * @URL:  the base URL to use for the document
6611  * @encoding:  the document encoding, or NULL
6612  * @options:  a combination of htmlParserOption(s)
6613  *
6614  * parse an XML in-memory document and build a tree.
6615  *
6616  * Returns the resulting document tree
6617  */
6618 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6619 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6620 {
6621     htmlParserCtxtPtr ctxt;
6622 
6623     xmlInitParser();
6624     ctxt = htmlCreateMemoryParserCtxt(buffer, size);
6625     if (ctxt == NULL)
6626         return (NULL);
6627     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6628 }
6629 
6630 /**
6631  * htmlReadFd:
6632  * @fd:  an open file descriptor
6633  * @URL:  the base URL to use for the document
6634  * @encoding:  the document encoding, or NULL
6635  * @options:  a combination of htmlParserOption(s)
6636  *
6637  * parse an HTML from a file descriptor and build a tree.
6638  * NOTE that the file descriptor will not be closed when the
6639  *      reader is closed or reset.
6640  *
6641  * Returns the resulting document tree
6642  */
6643 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)6644 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6645 {
6646     htmlParserCtxtPtr ctxt;
6647     xmlParserInputBufferPtr input;
6648     htmlParserInputPtr stream;
6649 
6650     if (fd < 0)
6651         return (NULL);
6652 
6653     xmlInitParser();
6654     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6655     if (input == NULL)
6656         return (NULL);
6657     input->closecallback = NULL;
6658     ctxt = htmlNewParserCtxt();
6659     if (ctxt == NULL) {
6660         xmlFreeParserInputBuffer(input);
6661         return (NULL);
6662     }
6663     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6664     if (stream == NULL) {
6665         xmlFreeParserInputBuffer(input);
6666 	htmlFreeParserCtxt(ctxt);
6667         return (NULL);
6668     }
6669     inputPush(ctxt, stream);
6670     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6671 }
6672 
6673 /**
6674  * htmlReadIO:
6675  * @ioread:  an I/O read function
6676  * @ioclose:  an I/O close function
6677  * @ioctx:  an I/O handler
6678  * @URL:  the base URL to use for the document
6679  * @encoding:  the document encoding, or NULL
6680  * @options:  a combination of htmlParserOption(s)
6681  *
6682  * parse an HTML document from I/O functions and source and build a tree.
6683  *
6684  * Returns the resulting document tree
6685  */
6686 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6687 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6688           void *ioctx, const char *URL, const char *encoding, int options)
6689 {
6690     htmlParserCtxtPtr ctxt;
6691     xmlParserInputBufferPtr input;
6692     xmlParserInputPtr stream;
6693 
6694     if (ioread == NULL)
6695         return (NULL);
6696     xmlInitParser();
6697 
6698     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6699                                          XML_CHAR_ENCODING_NONE);
6700     if (input == NULL) {
6701         if (ioclose != NULL)
6702             ioclose(ioctx);
6703         return (NULL);
6704     }
6705     ctxt = htmlNewParserCtxt();
6706     if (ctxt == NULL) {
6707         xmlFreeParserInputBuffer(input);
6708         return (NULL);
6709     }
6710     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6711     if (stream == NULL) {
6712         xmlFreeParserInputBuffer(input);
6713 	xmlFreeParserCtxt(ctxt);
6714         return (NULL);
6715     }
6716     inputPush(ctxt, stream);
6717     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6718 }
6719 
6720 /**
6721  * htmlCtxtReadDoc:
6722  * @ctxt:  an HTML parser context
6723  * @str:  a pointer to a zero terminated string
6724  * @URL:  the base URL to use for the document
6725  * @encoding:  the document encoding, or NULL
6726  * @options:  a combination of htmlParserOption(s)
6727  *
6728  * parse an XML in-memory document and build a tree.
6729  * This reuses the existing @ctxt parser context
6730  *
6731  * Returns the resulting document tree
6732  */
6733 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * str,const char * URL,const char * encoding,int options)6734 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6735                const char *URL, const char *encoding, int options)
6736 {
6737     xmlParserInputBufferPtr input;
6738     xmlParserInputPtr stream;
6739 
6740     if (ctxt == NULL)
6741         return (NULL);
6742     if (str == NULL)
6743         return (NULL);
6744     xmlInitParser();
6745 
6746     htmlCtxtReset(ctxt);
6747 
6748     input = xmlParserInputBufferCreateString(str);
6749     if (input == NULL) {
6750 	return(NULL);
6751     }
6752 
6753     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6754     if (stream == NULL) {
6755 	xmlFreeParserInputBuffer(input);
6756 	return(NULL);
6757     }
6758 
6759     inputPush(ctxt, stream);
6760     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6761 }
6762 
6763 /**
6764  * htmlCtxtReadFile:
6765  * @ctxt:  an HTML parser context
6766  * @filename:  a file or URL
6767  * @encoding:  the document encoding, or NULL
6768  * @options:  a combination of htmlParserOption(s)
6769  *
6770  * parse an XML file from the filesystem or the network.
6771  * This reuses the existing @ctxt parser context
6772  *
6773  * Returns the resulting document tree
6774  */
6775 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)6776 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6777                 const char *encoding, int options)
6778 {
6779     xmlParserInputPtr stream;
6780 
6781     if (filename == NULL)
6782         return (NULL);
6783     if (ctxt == NULL)
6784         return (NULL);
6785     xmlInitParser();
6786 
6787     htmlCtxtReset(ctxt);
6788 
6789     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6790     if (stream == NULL) {
6791         return (NULL);
6792     }
6793     inputPush(ctxt, stream);
6794     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6795 }
6796 
6797 /**
6798  * htmlCtxtReadMemory:
6799  * @ctxt:  an HTML parser context
6800  * @buffer:  a pointer to a char array
6801  * @size:  the size of the array
6802  * @URL:  the base URL to use for the document
6803  * @encoding:  the document encoding, or NULL
6804  * @options:  a combination of htmlParserOption(s)
6805  *
6806  * parse an XML in-memory document and build a tree.
6807  * This reuses the existing @ctxt parser context
6808  *
6809  * Returns the resulting document tree
6810  */
6811 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)6812 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6813                   const char *URL, const char *encoding, int options)
6814 {
6815     xmlParserInputBufferPtr input;
6816     xmlParserInputPtr stream;
6817 
6818     if (ctxt == NULL)
6819         return (NULL);
6820     if (buffer == NULL)
6821         return (NULL);
6822     xmlInitParser();
6823 
6824     htmlCtxtReset(ctxt);
6825 
6826     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6827     if (input == NULL) {
6828 	return(NULL);
6829     }
6830 
6831     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6832     if (stream == NULL) {
6833 	xmlFreeParserInputBuffer(input);
6834 	return(NULL);
6835     }
6836 
6837     inputPush(ctxt, stream);
6838     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6839 }
6840 
6841 /**
6842  * htmlCtxtReadFd:
6843  * @ctxt:  an HTML parser context
6844  * @fd:  an open file descriptor
6845  * @URL:  the base URL to use for the document
6846  * @encoding:  the document encoding, or NULL
6847  * @options:  a combination of htmlParserOption(s)
6848  *
6849  * parse an XML from a file descriptor and build a tree.
6850  * This reuses the existing @ctxt parser context
6851  *
6852  * Returns the resulting document tree
6853  */
6854 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)6855 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6856               const char *URL, const char *encoding, int options)
6857 {
6858     xmlParserInputBufferPtr input;
6859     xmlParserInputPtr stream;
6860 
6861     if (fd < 0)
6862         return (NULL);
6863     if (ctxt == NULL)
6864         return (NULL);
6865     xmlInitParser();
6866 
6867     htmlCtxtReset(ctxt);
6868 
6869 
6870     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6871     if (input == NULL)
6872         return (NULL);
6873     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6874     if (stream == NULL) {
6875         xmlFreeParserInputBuffer(input);
6876         return (NULL);
6877     }
6878     inputPush(ctxt, stream);
6879     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6880 }
6881 
6882 /**
6883  * htmlCtxtReadIO:
6884  * @ctxt:  an HTML parser context
6885  * @ioread:  an I/O read function
6886  * @ioclose:  an I/O close function
6887  * @ioctx:  an I/O handler
6888  * @URL:  the base URL to use for the document
6889  * @encoding:  the document encoding, or NULL
6890  * @options:  a combination of htmlParserOption(s)
6891  *
6892  * parse an HTML document from I/O functions and source and build a tree.
6893  * This reuses the existing @ctxt parser context
6894  *
6895  * Returns the resulting document tree
6896  */
6897 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)6898 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6899               xmlInputCloseCallback ioclose, void *ioctx,
6900 	      const char *URL,
6901               const char *encoding, int options)
6902 {
6903     xmlParserInputBufferPtr input;
6904     xmlParserInputPtr stream;
6905 
6906     if (ioread == NULL)
6907         return (NULL);
6908     if (ctxt == NULL)
6909         return (NULL);
6910     xmlInitParser();
6911 
6912     htmlCtxtReset(ctxt);
6913 
6914     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6915                                          XML_CHAR_ENCODING_NONE);
6916     if (input == NULL) {
6917         if (ioclose != NULL)
6918             ioclose(ioctx);
6919         return (NULL);
6920     }
6921     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6922     if (stream == NULL) {
6923         xmlFreeParserInputBuffer(input);
6924         return (NULL);
6925     }
6926     inputPush(ctxt, stream);
6927     return (htmlDoRead(ctxt, URL, encoding, options, 1));
6928 }
6929 
6930 #endif /* LIBXML_HTML_ENABLED */
6931