• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12 
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef LIBXML_ZLIB_ENABLED
30 #include <zlib.h>
31 #endif
32 
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46 
47 #include "buf.h"
48 #include "enc.h"
49 
50 #define HTML_MAX_NAMELEN 1000
51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
52 #define HTML_PARSER_BUFFER_SIZE 100
53 
54 /* #define DEBUG */
55 /* #define DEBUG_PUSH */
56 
57 static int htmlOmittedDefaultValue = 1;
58 
59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 			     xmlChar end, xmlChar  end2, xmlChar end3);
61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
62 
63 /************************************************************************
64  *									*
65  *		Some factorized error routines				*
66  *									*
67  ************************************************************************/
68 
69 /**
70  * htmlErrMemory:
71  * @ctxt:  an HTML parser context
72  * @extra:  extra information
73  *
74  * Handle a redefinition of attribute error
75  */
76 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78 {
79     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80         (ctxt->instate == XML_PARSER_EOF))
81 	return;
82     if (ctxt != NULL) {
83         ctxt->errNo = XML_ERR_NO_MEMORY;
84         ctxt->instate = XML_PARSER_EOF;
85         ctxt->disableSAX = 1;
86     }
87     if (extra)
88         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90                         NULL, NULL, 0, 0,
91                         "Memory allocation failed : %s\n", extra);
92     else
93         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95                         NULL, NULL, 0, 0, "Memory allocation failed\n");
96 }
97 
98 /**
99  * htmlParseErr:
100  * @ctxt:  an HTML parser context
101  * @error:  the error number
102  * @msg:  the error message
103  * @str1:  string infor
104  * @str2:  string infor
105  *
106  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107  */
108 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110              const char *msg, const xmlChar *str1, const xmlChar *str2)
111 {
112     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113         (ctxt->instate == XML_PARSER_EOF))
114 	return;
115     if (ctxt != NULL)
116 	ctxt->errNo = error;
117     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118                     XML_ERR_ERROR, NULL, 0,
119 		    (const char *) str1, (const char *) str2,
120 		    NULL, 0, 0,
121 		    msg, str1, str2);
122     if (ctxt != NULL)
123 	ctxt->wellFormed = 0;
124 }
125 
126 /**
127  * htmlParseErrInt:
128  * @ctxt:  an HTML parser context
129  * @error:  the error number
130  * @msg:  the error message
131  * @val:  integer info
132  *
133  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134  */
135 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137              const char *msg, int val)
138 {
139     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140         (ctxt->instate == XML_PARSER_EOF))
141 	return;
142     if (ctxt != NULL)
143 	ctxt->errNo = error;
144     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 		    NULL, val, 0, msg, val);
147     if (ctxt != NULL)
148 	ctxt->wellFormed = 0;
149 }
150 
151 /************************************************************************
152  *									*
153  *	Parser stacks related functions and macros		*
154  *									*
155  ************************************************************************/
156 
157 /**
158  * htmlnamePush:
159  * @ctxt:  an HTML parser context
160  * @value:  the element name
161  *
162  * Pushes a new element name on top of the name stack
163  *
164  * Returns 0 in case of error, the index in the stack otherwise
165  */
166 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168 {
169     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170         ctxt->html = 3;
171     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172         ctxt->html = 10;
173     if (ctxt->nameNr >= ctxt->nameMax) {
174         ctxt->nameMax *= 2;
175         ctxt->nameTab = (const xmlChar * *)
176                          xmlRealloc((xmlChar * *)ctxt->nameTab,
177                                     ctxt->nameMax *
178                                     sizeof(ctxt->nameTab[0]));
179         if (ctxt->nameTab == NULL) {
180             htmlErrMemory(ctxt, NULL);
181             return (0);
182         }
183     }
184     ctxt->nameTab[ctxt->nameNr] = value;
185     ctxt->name = value;
186     return (ctxt->nameNr++);
187 }
188 /**
189  * htmlnamePop:
190  * @ctxt: an HTML parser context
191  *
192  * Pops the top element name from the name stack
193  *
194  * Returns the name just removed
195  */
196 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)197 htmlnamePop(htmlParserCtxtPtr ctxt)
198 {
199     const xmlChar *ret;
200 
201     if (ctxt->nameNr <= 0)
202         return (NULL);
203     ctxt->nameNr--;
204     if (ctxt->nameNr < 0)
205         return (NULL);
206     if (ctxt->nameNr > 0)
207         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208     else
209         ctxt->name = NULL;
210     ret = ctxt->nameTab[ctxt->nameNr];
211     ctxt->nameTab[ctxt->nameNr] = NULL;
212     return (ret);
213 }
214 
215 /**
216  * htmlNodeInfoPush:
217  * @ctxt:  an HTML parser context
218  * @value:  the node info
219  *
220  * Pushes a new element name on top of the node info stack
221  *
222  * Returns 0 in case of error, the index in the stack otherwise
223  */
224 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226 {
227     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228         if (ctxt->nodeInfoMax == 0)
229                 ctxt->nodeInfoMax = 5;
230         ctxt->nodeInfoMax *= 2;
231         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233                                     ctxt->nodeInfoMax *
234                                     sizeof(ctxt->nodeInfoTab[0]));
235         if (ctxt->nodeInfoTab == NULL) {
236             htmlErrMemory(ctxt, NULL);
237             return (0);
238         }
239     }
240     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242     return (ctxt->nodeInfoNr++);
243 }
244 
245 /**
246  * htmlNodeInfoPop:
247  * @ctxt:  an HTML parser context
248  *
249  * Pops the top element name from the node info stack
250  *
251  * Returns 0 in case of error, the pointer to NodeInfo otherwise
252  */
253 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255 {
256     if (ctxt->nodeInfoNr <= 0)
257         return (NULL);
258     ctxt->nodeInfoNr--;
259     if (ctxt->nodeInfoNr < 0)
260         return (NULL);
261     if (ctxt->nodeInfoNr > 0)
262         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263     else
264         ctxt->nodeInfo = NULL;
265     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266 }
267 
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Advances both the cursor and the column; evaluates `val` twice. */
#define SKIP(val) (ctxt->input->cur += (val),ctxt->input->col+=(val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * SHRINK and GROW expand to a bare `if` statement: use them only as a
 * full statement, never directly before an `else`.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/* Updates line/column for the current byte, clears token, advances by l. */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l);				\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Appends character v (l bytes long) to buffer b at index i and advances i.
 * Expands to a bare if/else: use it only as a full statement.
 */
#define COPY_BUF(l,b,i,v)						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
348 
349 /**
350  * htmlFindEncoding:
351  * @the HTML parser context
352  *
353  * Ty to find and encoding in the current data available in the input
354  * buffer this is needed to try to switch to the proper encoding when
355  * one face a character error.
356  * That's an heuristic, since it's operating outside of parsing it could
357  * try to use a meta which had been commented out, that's the reason it
358  * should only be used in case of error, not as a default.
359  *
360  * Returns an encoding string or NULL if not found, the string need to
361  *   be freed
362  */
363 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365     const xmlChar *start, *cur, *end;
366 
367     if ((ctxt == NULL) || (ctxt->input == NULL) ||
368         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369         (ctxt->input->buf->encoder != NULL))
370         return(NULL);
371     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372         return(NULL);
373 
374     start = ctxt->input->cur;
375     end = ctxt->input->end;
376     /* we also expect the input buffer to be zero terminated */
377     if (*end != 0)
378         return(NULL);
379 
380     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381     if (cur == NULL)
382         return(NULL);
383     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
384     if (cur == NULL)
385         return(NULL);
386     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
387     if (cur == NULL)
388         return(NULL);
389     cur += 8;
390     start = cur;
391     while (((*cur >= 'A') && (*cur <= 'Z')) ||
392            ((*cur >= 'a') && (*cur <= 'z')) ||
393            ((*cur >= '0') && (*cur <= '9')) ||
394            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395            cur++;
396     if (cur == start)
397         return(NULL);
398     return(xmlStrndup(start, cur - start));
399 }
400 
401 /**
402  * htmlCurrentChar:
403  * @ctxt:  the HTML parser context
404  * @len:  pointer to the length of the char read
405  *
406  * The current char value, if using UTF-8 this may actually span multiple
407  * bytes in the input buffer. Implement the end of line normalization:
408  * 2.11 End-of-Line Handling
409  * If the encoding is unspecified, in the case we find an ISO-Latin-1
410  * char, then the encoding converter is plugged in automatically.
411  *
412  * Returns the current char value and its length
413  */
414 
415 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417     const unsigned char *cur;
418     unsigned char c;
419     unsigned int val;
420 
421     if (ctxt->instate == XML_PARSER_EOF)
422 	return(0);
423 
424     if (ctxt->token != 0) {
425 	*len = 0;
426 	return(ctxt->token);
427     }
428     if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
429         xmlChar * guess;
430         xmlCharEncodingHandlerPtr handler;
431 
432         /*
433          * Assume it's a fixed length encoding (1) with
434          * a compatible encoding for the ASCII set, since
435          * HTML constructs only use < 128 chars
436          */
437         if ((int) *ctxt->input->cur < 0x80) {
438             *len = 1;
439             if ((*ctxt->input->cur == 0) &&
440                 (ctxt->input->cur < ctxt->input->end)) {
441                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442                                 "Char 0x%X out of allowed range\n", 0);
443                 return(' ');
444             }
445             return((int) *ctxt->input->cur);
446         }
447 
448         /*
449          * Humm this is bad, do an automatic flow conversion
450          */
451         guess = htmlFindEncoding(ctxt);
452         if (guess == NULL) {
453             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454         } else {
455             if (ctxt->input->encoding != NULL)
456                 xmlFree((xmlChar *) ctxt->input->encoding);
457             ctxt->input->encoding = guess;
458             handler = xmlFindCharEncodingHandler((const char *) guess);
459             if (handler != NULL) {
460                 /*
461                  * Don't use UTF-8 encoder which isn't required and
462                  * can produce invalid UTF-8.
463                  */
464                 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465                     xmlSwitchToEncoding(ctxt, handler);
466             } else {
467                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468                              "Unsupported encoding %s", guess, NULL);
469             }
470         }
471         ctxt->charset = XML_CHAR_ENCODING_UTF8;
472     }
473 
474     /*
475      * We are supposed to handle UTF8, check it's valid
476      * From rfc2044: encoding of the Unicode values on UTF-8:
477      *
478      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
479      * 0000 0000-0000 007F   0xxxxxxx
480      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
481      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
482      *
483      * Check for the 0x110000 limit too
484      */
485     cur = ctxt->input->cur;
486     c = *cur;
487     if (c & 0x80) {
488         if ((c & 0x40) == 0)
489             goto encoding_error;
490         if (cur[1] == 0) {
491             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492             cur = ctxt->input->cur;
493         }
494         if ((cur[1] & 0xc0) != 0x80)
495             goto encoding_error;
496         if ((c & 0xe0) == 0xe0) {
497 
498             if (cur[2] == 0) {
499                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500                 cur = ctxt->input->cur;
501             }
502             if ((cur[2] & 0xc0) != 0x80)
503                 goto encoding_error;
504             if ((c & 0xf0) == 0xf0) {
505                 if (cur[3] == 0) {
506                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507                     cur = ctxt->input->cur;
508                 }
509                 if (((c & 0xf8) != 0xf0) ||
510                     ((cur[3] & 0xc0) != 0x80))
511                     goto encoding_error;
512                 /* 4-byte code */
513                 *len = 4;
514                 val = (cur[0] & 0x7) << 18;
515                 val |= (cur[1] & 0x3f) << 12;
516                 val |= (cur[2] & 0x3f) << 6;
517                 val |= cur[3] & 0x3f;
518                 if (val < 0x10000)
519                     goto encoding_error;
520             } else {
521               /* 3-byte code */
522                 *len = 3;
523                 val = (cur[0] & 0xf) << 12;
524                 val |= (cur[1] & 0x3f) << 6;
525                 val |= cur[2] & 0x3f;
526                 if (val < 0x800)
527                     goto encoding_error;
528             }
529         } else {
530           /* 2-byte code */
531             *len = 2;
532             val = (cur[0] & 0x1f) << 6;
533             val |= cur[1] & 0x3f;
534             if (val < 0x80)
535                 goto encoding_error;
536         }
537         if (!IS_CHAR(val)) {
538             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539                             "Char 0x%X out of allowed range\n", val);
540         }
541         return(val);
542     } else {
543         if ((*ctxt->input->cur == 0) &&
544             (ctxt->input->cur < ctxt->input->end)) {
545             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546                             "Char 0x%X out of allowed range\n", 0);
547             *len = 1;
548             return(' ');
549         }
550         /* 1-byte code */
551         *len = 1;
552         return((int) *ctxt->input->cur);
553     }
554 
555 encoding_error:
556     /*
557      * If we detect an UTF8 error that probably mean that the
558      * input encoding didn't get properly advertised in the
559      * declaration header. Report the error and switch the encoding
560      * to ISO-Latin-1 (if you don't like this policy, just declare the
561      * encoding !)
562      */
563     {
564         char buffer[150];
565 
566 	if (ctxt->input->end - ctxt->input->cur >= 4) {
567 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568 			    ctxt->input->cur[0], ctxt->input->cur[1],
569 			    ctxt->input->cur[2], ctxt->input->cur[3]);
570 	} else {
571 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572 	}
573 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574 		     "Input is not proper UTF-8, indicate encoding !\n",
575 		     BAD_CAST buffer, NULL);
576     }
577 
578     /*
579      * Don't switch encodings twice. Note that if there's an encoder, we
580      * shouldn't receive invalid UTF-8 anyway.
581      *
582      * Note that if ctxt->input->buf == NULL, switching encodings is
583      * impossible, see Gitlab issue #34.
584      */
585     if ((ctxt->input->buf != NULL) &&
586         (ctxt->input->buf->encoder == NULL))
587         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
588     *len = 1;
589     return((int) *ctxt->input->cur);
590 }
591 
592 /**
593  * htmlSkipBlankChars:
594  * @ctxt:  the HTML parser context
595  *
596  * skip all blanks character found at that point in the input streams.
597  *
598  * Returns the number of space chars skipped
599  */
600 
601 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603     int res = 0;
604 
605     while (IS_BLANK_CH(*(ctxt->input->cur))) {
606 	if ((*ctxt->input->cur == 0) &&
607 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608 		xmlPopInput(ctxt);
609 	} else {
610 	    if (*(ctxt->input->cur) == '\n') {
611 		ctxt->input->line++; ctxt->input->col = 1;
612 	    } else ctxt->input->col++;
613 	    ctxt->input->cur++;
614 	    if (*ctxt->input->cur == 0)
615 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
616 	}
617 	res++;
618     }
619     return(res);
620 }
621 
622 
623 
624 /************************************************************************
625  *									*
626  *	The list of HTML elements and their properties		*
627  *									*
628  ************************************************************************/
629 
630 /*
631  *  Start Tag: 1 means the start tag can be omitted
632  *  End Tag:   1 means the end tag can be omitted
633  *             2 means it's forbidden (empty elements)
634  *             3 means the tag is stylistic and should be closed easily
635  *  Depr:      this element is deprecated
636  *  DTD:       1 means that this element is valid only in the Loose DTD
637  *             2 means that this element is valid only in the Frameset DTD
638  *
639  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
640 	, subElements , impliedsubelt , Attributes, userdata
641  */
642 
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma-separated list of element names and
 * has a matching NB_* macro giving the number of names in the list. The
 * NB_* sums are parenthesized so they can be used inside larger
 * expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated name lists built from the groups above. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
676 
677 
/* ... and for HTML Attributes */

/*
 * Each group macro expands to a comma-separated list of attribute names
 * and has a matching NB_* macro giving the number of names in the list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists built from the groups above. */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
697 
698 
699 /* Other declarations that should go inline ... */
700 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
701 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
702 	"tabindex", "onfocus", "onblur", NULL } ;
703 static const char* const target_attr[] = { "target", NULL } ;
704 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
705 static const char* const alt_attr[] = { "alt", NULL } ;
706 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
707 static const char* const href_attrs[] = { "href", NULL } ;
708 static const char* const clear_attrs[] = { "clear", NULL } ;
709 static const char* const inline_p[] = { INLINE, "p", NULL } ;
710 
711 static const char* const flow_param[] = { FLOW, "param", NULL } ;
712 static const char* const applet_attrs[] = { COREATTRS , "codebase",
713 		"archive", "alt", "name", "height", "width", "align",
714 		"hspace", "vspace", NULL } ;
715 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
716 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
717 static const char* const basefont_attrs[] =
718 	{ "id", "size", "color", "face", NULL } ;
719 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
720 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
721 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
722 static const char* const body_depr[] = { "background", "bgcolor", "text",
723 	"link", "vlink", "alink", NULL } ;
724 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
725 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
726 
727 
728 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
729 static const char* const col_elt[] = { "col", NULL } ;
730 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
731 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
732 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
733 static const char* const compact_attr[] = { "compact", NULL } ;
734 static const char* const label_attr[] = { "label", NULL } ;
735 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
736 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
737 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
738 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
739 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
740 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
741 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
742 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
743 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
744 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
745 static const char* const version_attr[] = { "version", NULL } ;
746 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
747 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
748 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
749 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
750 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
751 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
752 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
753 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
754 static const char* const align_attr[] = { "align", NULL } ;
755 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
756 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
757 static const char* const name_attr[] = { "name", NULL } ;
758 static const char* const action_attr[] = { "action", NULL } ;
759 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
760 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
761 static const char* const content_attr[] = { "content", NULL } ;
762 static const char* const type_attr[] = { "type", NULL } ;
763 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
764 static const char* const object_contents[] = { FLOW, "param", NULL } ;
765 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
766 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
767 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
768 static const char* const option_elt[] = { "option", NULL } ;
769 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
770 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
771 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
772 static const char* const width_attr[] = { "width", NULL } ;
773 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
774 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
775 static const char* const language_attr[] = { "language", NULL } ;
776 static const char* const select_content[] = { "optgroup", "option", NULL } ;
777 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
778 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
779 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
780 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
781 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
782 static const char* const tr_elt[] = { "tr", NULL } ;
783 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
784 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
785 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
786 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
787 static const char* const tr_contents[] = { "th", "td", NULL } ;
788 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
789 static const char* const li_elt[] = { "li", NULL } ;
790 static const char* const ul_depr[] = { "type", "compact", NULL} ;
791 static const char* const dir_attr[] = { "dir", NULL} ;
792 
793 #define DECL (const char**)
794 
795 static const htmlElemDesc
796 html40ElementTable[] = {
797 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
798 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
799 },
800 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802 },
803 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
804 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805 },
806 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
807 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
808 },
809 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
810 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
811 },
812 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
814 },
815 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
816 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817 },
818 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
819 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
820 },
821 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
822 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
823 },
824 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
826 },
827 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
828 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829 },
830 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
831 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
832 },
833 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
834 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
835 },
836 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
837 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
838 },
839 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
840 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
841 },
842 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
843 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
844 },
845 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
847 },
848 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
849 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
850 },
851 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853 },
854 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
855 	EMPTY , NULL , DECL col_attrs , NULL, NULL
856 },
857 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
858 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
859 },
860 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
861 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
862 },
863 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
864 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
865 },
866 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
867 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
868 },
869 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
870 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
871 },
872 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
874 },
875 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
876 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
877 },
878 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
879 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
880 },
881 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
882 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883 },
884 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
886 },
887 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
888 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
889 },
890 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
891 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
892 },
893 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
894 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
895 },
896 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
898 },
899 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
901 },
902 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
903 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904 },
905 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
906 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907 },
908 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
909 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910 },
911 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
912 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913 },
914 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
915 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916 },
917 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
918 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919 },
920 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
921 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
922 },
923 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
925 },
926 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
927 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
928 },
929 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
930 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931 },
932 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
934 },
935 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
936 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
937 },
938 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
939 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
940 },
941 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
942 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
943 },
944 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
946 },
947 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949 },
950 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
951 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
952 },
953 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
955 },
956 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
957 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
958 },
959 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
961 },
962 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
964 },
965 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
966 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
967 },
968 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
970 },
971 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
973 },
974 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
976 },
977 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
979 },
980 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
981 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
982 },
983 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
984 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
985 },
986 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
988 },
989 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
990 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
991 },
992 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
993 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
994 },
995 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
997 },
998 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000 },
1001 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003 },
1004 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006 },
1007 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
1008 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009 },
1010 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
1011 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012 },
1013 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
1014 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021 },
1022 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024 },
1025 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
1026 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027 },
1028 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
1029 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030 },
1031 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
1032 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "table",	0, 0, 0, 0, 0, 0, 0, "",
1035 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036 },
1037 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
1038 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
1041 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042 },
1043 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045 },
1046 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
1047 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048 },
1049 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
1050 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051 },
1052 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
1053 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054 },
1055 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
1056 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057 },
1058 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
1059 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060 },
1061 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063 },
1064 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066 },
1067 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069 },
1070 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072 }
1073 };
1074 
/*
 * One auto-close rule: while element oldTag is the current open element,
 * a start tag newTag implies the end of oldTag.
 */
typedef struct {
    const char *oldTag;     /* name of the currently open element */
    const char *newTag;     /* start tag that implicitly closes it */
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * htmlCheckAutoClose() looks pairs up with bsearch(), comparing oldTag
 * and then newTag with strcmp(), so the entries MUST stay sorted in that
 * order. Entries are grouped below by oldTag.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr, i */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend, li, link */
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" },
    { "menu", "form" }, { "menu", "ul" },
    /* ol, option */
    { "ol", "form" }, { "ol", "ul" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* tbody, td, tfoot, th, thead, title, tr, tt */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    /* u, ul */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" },
    { "ul", "ol" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1336 
/*
 * NULL-terminated list of the HTML elements that are not supposed to
 * hold CDATA content directly and for which a p element gets implied.
 *
 * TODO: extend the list from the implied-paragraph rules of the
 *       HTML SGML DTD.
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1349 
/*
 * HTML attributes whose content is of type %Script;.
 * The array has no terminating NULL entry.
 * NOTE: before adding a name, check htmlIsScriptAttribute(): it assumes
 *       that every name starts with 'on'.
 */
static const char *const htmlScriptAttributes[] = {
    /* pointer events */
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset",
    "onchange", "onselect"
};
1375 
/*
 * End-tag priorities, used by the HTML parser when it meets broken
 * pages. Each element gets a priority, and a stray end tag may only
 * close open elements whose priority is lower than or equal to its own;
 * that is how the parser decides what to do with extra end tags.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* terminator; also the default priority */
};
1403 
1404 /************************************************************************
1405  *									*
1406  *	functions to handle HTML specific data			*
1407  *									*
1408  ************************************************************************/
1409 
/**
 * htmlInitAutoClose:
 *
 * Does nothing: the function body is empty.
 */
void
htmlInitAutoClose(void)
{
}
1418 
1419 static int
htmlCompareTags(const void * key,const void * member)1420 htmlCompareTags(const void *key, const void *member) {
1421     const xmlChar *tag = (const xmlChar *) key;
1422     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1423 
1424     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1425 }
1426 
1427 /**
1428  * htmlTagLookup:
1429  * @tag:  The tag name in lowercase
1430  *
1431  * Lookup the HTML tag in the ElementTable
1432  *
1433  * Returns the related htmlElemDescPtr or NULL if not found.
1434  */
1435 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1436 htmlTagLookup(const xmlChar *tag) {
1437     if (tag == NULL)
1438         return(NULL);
1439 
1440     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442                 sizeof(htmlElemDesc), htmlCompareTags));
1443 }
1444 
1445 /**
1446  * htmlGetEndPriority:
1447  * @name: The name of the element to look up the priority for.
1448  *
1449  * Return value: The "endtag" priority.
1450  **/
1451 static int
htmlGetEndPriority(const xmlChar * name)1452 htmlGetEndPriority (const xmlChar *name) {
1453     int i = 0;
1454 
1455     while ((htmlEndPriority[i].name != NULL) &&
1456 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457 	i++;
1458 
1459     return(htmlEndPriority[i].priority);
1460 }
1461 
1462 
1463 static int
htmlCompareStartClose(const void * vkey,const void * member)1464 htmlCompareStartClose(const void *vkey, const void *member) {
1465     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467     int ret;
1468 
1469     ret = strcmp(key->oldTag, entry->oldTag);
1470     if (ret == 0)
1471         ret = strcmp(key->newTag, entry->newTag);
1472 
1473     return(ret);
1474 }
1475 
1476 /**
1477  * htmlCheckAutoClose:
1478  * @newtag:  The new tag name
1479  * @oldtag:  The old tag name
1480  *
1481  * Checks whether the new tag is one of the registered valid tags for
1482  * closing old.
1483  *
1484  * Returns 0 if no, 1 if yes.
1485  */
1486 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488 {
1489     htmlStartCloseEntry key;
1490     void *res;
1491 
1492     key.oldTag = (const char *) oldtag;
1493     key.newTag = (const char *) newtag;
1494     res = bsearch(&key, htmlStartClose,
1495             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497     return(res != NULL);
1498 }
1499 
1500 /**
1501  * htmlAutoCloseOnClose:
1502  * @ctxt:  an HTML parser context
1503  * @newtag:  The new tag name
1504  * @force:  force the tag closure
1505  *
1506  * The HTML DTD allows an ending tag to implicitly close other tags.
1507  */
1508 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510 {
1511     const htmlElemDesc *info;
1512     int i, priority;
1513 
1514     priority = htmlGetEndPriority(newtag);
1515 
1516     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517 
1518         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519             break;
1520         /*
1521          * A misplaced endtag can only close elements with lower
1522          * or equal priority, so if we find an element with higher
1523          * priority before we find an element with
1524          * matching name, we just ignore this endtag
1525          */
1526         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527             return;
1528     }
1529     if (i < 0)
1530         return;
1531 
1532     while (!xmlStrEqual(newtag, ctxt->name)) {
1533         info = htmlTagLookup(ctxt->name);
1534         if ((info != NULL) && (info->endTag == 3)) {
1535             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536 	                 "Opening and ending tag mismatch: %s and %s\n",
1537 			 newtag, ctxt->name);
1538         }
1539         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541 	htmlnamePop(ctxt);
1542     }
1543 }
1544 
1545 /**
1546  * htmlAutoCloseOnEnd:
1547  * @ctxt:  an HTML parser context
1548  *
1549  * Close all remaining tags at the end of the stream
1550  */
1551 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553 {
1554     int i;
1555 
1556     if (ctxt->nameNr == 0)
1557         return;
1558     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561 	htmlnamePop(ctxt);
1562     }
1563 }
1564 
1565 /**
1566  * htmlAutoClose:
1567  * @ctxt:  an HTML parser context
1568  * @newtag:  The new tag name or NULL
1569  *
1570  * The HTML DTD allows a tag to implicitly close other tags.
1571  * The list is kept in htmlStartClose array. This function is
1572  * called when a new tag has been detected and generates the
1573  * appropriates closes if possible/needed.
1574  * If newtag is NULL this mean we are at the end of the resource
1575  * and we should check
1576  */
1577 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579 {
1580     while ((newtag != NULL) && (ctxt->name != NULL) &&
1581            (htmlCheckAutoClose(newtag, ctxt->name))) {
1582         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584 	htmlnamePop(ctxt);
1585     }
1586     if (newtag == NULL) {
1587         htmlAutoCloseOnEnd(ctxt);
1588         return;
1589     }
1590     while ((newtag == NULL) && (ctxt->name != NULL) &&
1591            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596 	htmlnamePop(ctxt);
1597     }
1598 }
1599 
1600 /**
1601  * htmlAutoCloseTag:
1602  * @doc:  the HTML document
1603  * @name:  The tag name
1604  * @elem:  the HTML element
1605  *
1606  * The HTML DTD allows a tag to implicitly close other tags.
1607  * The list is kept in htmlStartClose array. This function checks
1608  * if the element or one of it's children would autoclose the
1609  * given tag.
1610  *
1611  * Returns 1 if autoclose, 0 otherwise
1612  */
1613 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615     htmlNodePtr child;
1616 
1617     if (elem == NULL) return(1);
1618     if (xmlStrEqual(name, elem->name)) return(0);
1619     if (htmlCheckAutoClose(elem->name, name)) return(1);
1620     child = elem->children;
1621     while (child != NULL) {
1622         if (htmlAutoCloseTag(doc, name, child)) return(1);
1623 	child = child->next;
1624     }
1625     return(0);
1626 }
1627 
1628 /**
1629  * htmlIsAutoClosed:
1630  * @doc:  the HTML document
1631  * @elem:  the HTML element
1632  *
1633  * The HTML DTD allows a tag to implicitly close other tags.
1634  * The list is kept in htmlStartClose array. This function checks
1635  * if a tag is autoclosed by one of it's child
1636  *
1637  * Returns 1 if autoclosed, 0 otherwise
1638  */
1639 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641     htmlNodePtr child;
1642 
1643     if (elem == NULL) return(1);
1644     child = elem->children;
1645     while (child != NULL) {
1646 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647 	child = child->next;
1648     }
1649     return(0);
1650 }
1651 
1652 /**
1653  * htmlCheckImplied:
1654  * @ctxt:  an HTML parser context
1655  * @newtag:  The new tag name
1656  *
1657  * The HTML DTD allows a tag to exists only implicitly
1658  * called when a new tag has been detected and generates the
1659  * appropriates implicit tags if missing
1660  */
1661 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663     int i;
1664 
1665     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666         return;
1667     if (!htmlOmittedDefaultValue)
1668 	return;
1669     if (xmlStrEqual(newtag, BAD_CAST"html"))
1670 	return;
1671     if (ctxt->nameNr <= 0) {
1672 	htmlnamePush(ctxt, BAD_CAST"html");
1673 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675     }
1676     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677         return;
1678     if ((ctxt->nameNr <= 1) &&
1679         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685         if (ctxt->html >= 3) {
1686             /* we already saw or generated an <head> before */
1687             return;
1688         }
1689         /*
1690          * dropped OBJECT ... i you put it first BODY will be
1691          * assumed !
1692          */
1693         htmlnamePush(ctxt, BAD_CAST"head");
1694         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699         if (ctxt->html >= 10) {
1700             /* we already saw or generated a <body> before */
1701             return;
1702         }
1703 	for (i = 0;i < ctxt->nameNr;i++) {
1704 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705 		return;
1706 	    }
1707 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708 		return;
1709 	    }
1710 	}
1711 
1712 	htmlnamePush(ctxt, BAD_CAST"body");
1713 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715     }
1716 }
1717 
1718 /**
1719  * htmlCheckParagraph
1720  * @ctxt:  an HTML parser context
1721  *
1722  * Check whether a p element need to be implied before inserting
1723  * characters in the current element.
1724  *
1725  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726  *         in case of error.
1727  */
1728 
1729 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731     const xmlChar *tag;
1732     int i;
1733 
1734     if (ctxt == NULL)
1735 	return(-1);
1736     tag = ctxt->name;
1737     if (tag == NULL) {
1738 	htmlAutoClose(ctxt, BAD_CAST"p");
1739 	htmlCheckImplied(ctxt, BAD_CAST"p");
1740 	htmlnamePush(ctxt, BAD_CAST"p");
1741 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743 	return(1);
1744     }
1745     if (!htmlOmittedDefaultValue)
1746 	return(0);
1747     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749 	    htmlAutoClose(ctxt, BAD_CAST"p");
1750 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1751 	    htmlnamePush(ctxt, BAD_CAST"p");
1752 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754 	    return(1);
1755 	}
1756     }
1757     return(0);
1758 }
1759 
1760 /**
1761  * htmlIsScriptAttribute:
1762  * @name:  an attribute name
1763  *
1764  * Check if an attribute is of content type Script
1765  *
1766  * Returns 1 is the attribute is a script 0 otherwise
1767  */
1768 int
htmlIsScriptAttribute(const xmlChar * name)1769 htmlIsScriptAttribute(const xmlChar *name) {
1770     unsigned int i;
1771 
1772     if (name == NULL)
1773       return(0);
1774     /*
1775      * all script attributes start with 'on'
1776      */
1777     if ((name[0] != 'o') || (name[1] != 'n'))
1778       return(0);
1779     for (i = 0;
1780 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781 	 i++) {
1782 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783 	    return(1);
1784     }
1785     return(0);
1786 }
1787 
1788 /************************************************************************
1789  *									*
1790  *	The list of HTML predefined entities			*
1791  *									*
1792  ************************************************************************/
1793 
1794 
/*
 * Table of the predefined HTML 4.0 character entities. Each entry gives
 * the Unicode code point, the entity name (without '&' and ';') and a
 * human readable description.
 *
 * The entries are listed by increasing code point value.
 * htmlEntityValueLookup() stops scanning at the first entry whose value
 * is greater than the one searched, so any new entry has to be inserted
 * at its sorted position.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * the 4 absolute ones (quot, amp, lt, gt), plus the apostrophe.
 */
{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
{ 39,	"apos",	"single quote" },
{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range.
 * Replacing them really depends on the charset used.
 */
{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
{ 163,	"pound","pound sign, U+00A3 ISOnum" },
{ 164,	"curren","currency sign, U+00A4 ISOnum" },
{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172,	"not",	"not sign, U+00AC ISOnum" },
{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
{ 181,	"micro","micro sign, U+00B5 ISOnum" },
{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247,	"divide","division sign, U+00F7 ISOnum" },
{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entity references
 */
{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732,	"tilde","small tilde, U+02DC ISOdia" },

{ 913,	"Alpha","greek capital letter alpha, U+0391" },
{ 914,	"Beta",	"greek capital letter beta, U+0392" },
{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
{ 919,	"Eta",	"greek capital letter eta, U+0397" },
{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
{ 921,	"Iota",	"greek capital letter iota, U+0399" },
{ 922,	"Kappa","greek capital letter kappa, U+039A" },
{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924,	"Mu",	"greek capital letter mu, U+039C" },
{ 925,	"Nu",	"greek capital letter nu, U+039D" },
{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
{ 927,	"Omicron","greek capital letter omicron, U+039F" },
{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },

{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },

{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
{ 8211,	"ndash","en dash, U+2013 ISOpub" },
{ 8212,	"mdash","em dash, U+2014 ISOpub" },
{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
{ 8224,	"dagger","dagger, U+2020 ISOpub" },
{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },

{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240,	"permil","per mille sign, U+2030 ISOtech" },

{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },

{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
{ 8260,	"frasl","fraction slash, U+2044 NEW" },

{ 8364,	"euro",	"euro sign, U+20AC NEW" },

{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },

{ 8704,	"forall","for all, U+2200 ISOtech" },
{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
{ 8707,	"exist","there exists, U+2203 ISOtech" },
{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
{ 8712,	"isin",	"element of, U+2208 ISOtech" },
{ 8713,	"notin","not an element of, U+2209 ISOtech" },
{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
{ 8722,	"minus","minus sign, U+2212 ISOtech" },
{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
{ 8734,	"infin","infinity, U+221E ISOtech" },
{ 8736,	"ang",	"angle, U+2220 ISOamso" },
{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
{ 8747,	"int",	"integral, U+222B ISOtech" },
{ 8756,	"there4","therefore, U+2234 ISOtech" },
{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
{ 8801,	"equiv","identical to, U+2261 ISOtech" },
{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },

{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },

};
2077 
2078 /************************************************************************
2079  *									*
2080  *		Commodity functions to handle entities			*
2081  *									*
2082  ************************************************************************/
2083 
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable to grow; a companion
 *           variable named <buffer>_size must be in scope
 *
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * @buffer is reallocated to hold that many xmlChar.
 *
 * On allocation failure the error is reported through htmlErrMemory(),
 * the old buffer is freed and the *enclosing function* returns NULL.
 * The macro can therefore only be used inside functions which return
 * a pointer and have a variable named ctxt in scope. Note that
 * <buffer>_size has already been doubled when the failure is detected.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
2098 
2099 /**
2100  * htmlEntityLookup:
2101  * @name: the entity name
2102  *
2103  * Lookup the given entity in EntitiesTable
2104  *
2105  * TODO: the linear scan is really ugly, an hash table is really needed.
2106  *
2107  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108  */
2109 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2110 htmlEntityLookup(const xmlChar *name) {
2111     unsigned int i;
2112 
2113     for (i = 0;i < (sizeof(html40EntitiesTable)/
2114                     sizeof(html40EntitiesTable[0]));i++) {
2115         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117 	}
2118     }
2119     return(NULL);
2120 }
2121 
2122 /**
2123  * htmlEntityValueLookup:
2124  * @value: the entity's unicode value
2125  *
2126  * Lookup the given entity in EntitiesTable
2127  *
2128  * TODO: the linear scan is really ugly, an hash table is really needed.
2129  *
2130  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2131  */
2132 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2133 htmlEntityValueLookup(unsigned int value) {
2134     unsigned int i;
2135 
2136     for (i = 0;i < (sizeof(html40EntitiesTable)/
2137                     sizeof(html40EntitiesTable[0]));i++) {
2138         if (html40EntitiesTable[i].value >= value) {
2139 	    if (html40EntitiesTable[i].value > value)
2140 		break;
2141             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142 	}
2143     }
2144     return(NULL);
2145 }
2146 
2147 /**
2148  * UTF8ToHtml:
2149  * @out:  a pointer to an array of bytes to store the result
2150  * @outlen:  the length of @out
2151  * @in:  a pointer to an array of UTF-8 chars
2152  * @inlen:  the length of @in
2153  *
2154  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155  * plus HTML entities block of chars out.
2156  *
2157  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158  * The value of @inlen after return is the number of octets consumed
2159  *     as the return value is positive, else unpredictable.
2160  * The value of @outlen after return is the number of octets consumed.
2161  */
2162 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2163 UTF8ToHtml(unsigned char* out, int *outlen,
2164               const unsigned char* in, int *inlen) {
2165     const unsigned char* processed = in;
2166     const unsigned char* outend;
2167     const unsigned char* outstart = out;
2168     const unsigned char* instart = in;
2169     const unsigned char* inend;
2170     unsigned int c, d;
2171     int trailing;
2172 
2173     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174     if (in == NULL) {
2175         /*
2176 	 * initialization nothing to do
2177 	 */
2178 	*outlen = 0;
2179 	*inlen = 0;
2180 	return(0);
2181     }
2182     inend = in + (*inlen);
2183     outend = out + (*outlen);
2184     while (in < inend) {
2185 	d = *in++;
2186 	if      (d < 0x80)  { c= d; trailing= 0; }
2187 	else if (d < 0xC0) {
2188 	    /* trailing byte in leading position */
2189 	    *outlen = out - outstart;
2190 	    *inlen = processed - instart;
2191 	    return(-2);
2192         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2193         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2194         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2195 	else {
2196 	    /* no chance for this in Ascii */
2197 	    *outlen = out - outstart;
2198 	    *inlen = processed - instart;
2199 	    return(-2);
2200 	}
2201 
2202 	if (inend - in < trailing) {
2203 	    break;
2204 	}
2205 
2206 	for ( ; trailing; trailing--) {
2207 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208 		break;
2209 	    c <<= 6;
2210 	    c |= d & 0x3F;
2211 	}
2212 
2213 	/* assertion: c is a single UTF-4 value */
2214 	if (c < 0x80) {
2215 	    if (out + 1 >= outend)
2216 		break;
2217 	    *out++ = c;
2218 	} else {
2219 	    int len;
2220 	    const htmlEntityDesc * ent;
2221 	    const char *cp;
2222 	    char nbuf[16];
2223 
2224 	    /*
2225 	     * Try to lookup a predefined HTML entity for it
2226 	     */
2227 
2228 	    ent = htmlEntityValueLookup(c);
2229 	    if (ent == NULL) {
2230 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231 	      cp = nbuf;
2232 	    }
2233 	    else
2234 	      cp = ent->name;
2235 	    len = strlen(cp);
2236 	    if (out + 2 + len >= outend)
2237 		break;
2238 	    *out++ = '&';
2239 	    memcpy(out, cp, len);
2240 	    out += len;
2241 	    *out++ = ';';
2242 	}
2243 	processed = in;
2244     }
2245     *outlen = out - outstart;
2246     *inlen = processed - instart;
2247     return(0);
2248 }
2249 
2250 /**
2251  * htmlEncodeEntities:
2252  * @out:  a pointer to an array of bytes to store the result
2253  * @outlen:  the length of @out
2254  * @in:  a pointer to an array of UTF-8 chars
2255  * @inlen:  the length of @in
2256  * @quoteChar: the quote character to escape (' or ") or zero.
2257  *
2258  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259  * plus HTML entities block of chars out.
2260  *
2261  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262  * The value of @inlen after return is the number of octets consumed
2263  *     as the return value is positive, else unpredictable.
2264  * The value of @outlen after return is the number of octets consumed.
2265  */
2266 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2267 htmlEncodeEntities(unsigned char* out, int *outlen,
2268 		   const unsigned char* in, int *inlen, int quoteChar) {
2269     const unsigned char* processed = in;
2270     const unsigned char* outend;
2271     const unsigned char* outstart = out;
2272     const unsigned char* instart = in;
2273     const unsigned char* inend;
2274     unsigned int c, d;
2275     int trailing;
2276 
2277     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278         return(-1);
2279     outend = out + (*outlen);
2280     inend = in + (*inlen);
2281     while (in < inend) {
2282 	d = *in++;
2283 	if      (d < 0x80)  { c= d; trailing= 0; }
2284 	else if (d < 0xC0) {
2285 	    /* trailing byte in leading position */
2286 	    *outlen = out - outstart;
2287 	    *inlen = processed - instart;
2288 	    return(-2);
2289         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2290         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2291         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2292 	else {
2293 	    /* no chance for this in Ascii */
2294 	    *outlen = out - outstart;
2295 	    *inlen = processed - instart;
2296 	    return(-2);
2297 	}
2298 
2299 	if (inend - in < trailing)
2300 	    break;
2301 
2302 	while (trailing--) {
2303 	    if (((d= *in++) & 0xC0) != 0x80) {
2304 		*outlen = out - outstart;
2305 		*inlen = processed - instart;
2306 		return(-2);
2307 	    }
2308 	    c <<= 6;
2309 	    c |= d & 0x3F;
2310 	}
2311 
2312 	/* assertion: c is a single UTF-4 value */
2313 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314 	    (c != '&') && (c != '<') && (c != '>')) {
2315 	    if (out >= outend)
2316 		break;
2317 	    *out++ = c;
2318 	} else {
2319 	    const htmlEntityDesc * ent;
2320 	    const char *cp;
2321 	    char nbuf[16];
2322 	    int len;
2323 
2324 	    /*
2325 	     * Try to lookup a predefined HTML entity for it
2326 	     */
2327 	    ent = htmlEntityValueLookup(c);
2328 	    if (ent == NULL) {
2329 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330 		cp = nbuf;
2331 	    }
2332 	    else
2333 		cp = ent->name;
2334 	    len = strlen(cp);
2335 	    if (out + 2 + len > outend)
2336 		break;
2337 	    *out++ = '&';
2338 	    memcpy(out, cp, len);
2339 	    out += len;
2340 	    *out++ = ';';
2341 	}
2342 	processed = in;
2343     }
2344     *outlen = out - outstart;
2345     *inlen = processed - instart;
2346     return(0);
2347 }
2348 
2349 /************************************************************************
2350  *									*
2351  *		Commodity functions to handle streams			*
2352  *									*
2353  ************************************************************************/
2354 
2355 #ifdef LIBXML_PUSH_ENABLED
2356 /**
2357  * htmlNewInputStream:
2358  * @ctxt:  an HTML parser context
2359  *
2360  * Create a new input stream structure
2361  * Returns the new input stream or NULL
2362  */
2363 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2364 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2365     htmlParserInputPtr input;
2366 
2367     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2368     if (input == NULL) {
2369         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2370 	return(NULL);
2371     }
2372     memset(input, 0, sizeof(htmlParserInput));
2373     input->filename = NULL;
2374     input->directory = NULL;
2375     input->base = NULL;
2376     input->cur = NULL;
2377     input->buf = NULL;
2378     input->line = 1;
2379     input->col = 1;
2380     input->buf = NULL;
2381     input->free = NULL;
2382     input->version = NULL;
2383     input->consumed = 0;
2384     input->length = 0;
2385     return(input);
2386 }
2387 #endif
2388 
2389 
2390 /************************************************************************
2391  *									*
2392  *		Commodity functions, cleanup needed ?			*
2393  *									*
2394  ************************************************************************/
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD,
 * one row per initial letter.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2409 
2410 /**
2411  * areBlanks:
2412  * @ctxt:  an HTML parser context
2413  * @str:  a xmlChar *
2414  * @len:  the size of @str
2415  *
2416  * Is this a sequence of blank chars that one can ignore ?
2417  *
2418  * Returns 1 if ignorable 0 otherwise.
2419  */
2420 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422     unsigned int i;
2423     int j;
2424     xmlNodePtr lastChild;
2425     xmlDtdPtr dtd;
2426 
2427     for (j = 0;j < len;j++)
2428         if (!(IS_BLANK_CH(str[j]))) return(0);
2429 
2430     if (CUR == 0) return(1);
2431     if (CUR != '<') return(0);
2432     if (ctxt->name == NULL)
2433 	return(1);
2434     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435 	return(1);
2436     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437 	return(1);
2438 
2439     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441         dtd = xmlGetIntSubset(ctxt->myDoc);
2442         if (dtd != NULL && dtd->ExternalID != NULL) {
2443             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445                 return(1);
2446         }
2447     }
2448 
2449     if (ctxt->node == NULL) return(0);
2450     lastChild = xmlGetLastChild(ctxt->node);
2451     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452 	lastChild = lastChild->prev;
2453     if (lastChild == NULL) {
2454         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455             (ctxt->node->content != NULL)) return(0);
2456 	/* keep ws in constructs like ...<b> </b>...
2457 	   for all tags "b" allowing PCDATA */
2458 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460 		return(0);
2461 	    }
2462 	}
2463     } else if (xmlNodeIsText(lastChild)) {
2464         return(0);
2465     } else {
2466 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467 	   for all tags "p" allowing PCDATA */
2468 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470 		return(0);
2471 	    }
2472 	}
2473     }
2474     return(1);
2475 }
2476 
2477 /**
2478  * htmlNewDocNoDtD:
2479  * @URI:  URI for the dtd, or NULL
2480  * @ExternalID:  the external ID of the DTD, or NULL
2481  *
2482  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483  * are NULL
2484  *
2485  * Returns a new document, do not initialize the DTD if not provided
2486  */
2487 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489     xmlDocPtr cur;
2490 
2491     /*
2492      * Allocate a new document and fill the fields.
2493      */
2494     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495     if (cur == NULL) {
2496 	htmlErrMemory(NULL, "HTML document creation failed\n");
2497 	return(NULL);
2498     }
2499     memset(cur, 0, sizeof(xmlDoc));
2500 
2501     cur->type = XML_HTML_DOCUMENT_NODE;
2502     cur->version = NULL;
2503     cur->intSubset = NULL;
2504     cur->doc = cur;
2505     cur->name = NULL;
2506     cur->children = NULL;
2507     cur->extSubset = NULL;
2508     cur->oldNs = NULL;
2509     cur->encoding = NULL;
2510     cur->standalone = 1;
2511     cur->compression = 0;
2512     cur->ids = NULL;
2513     cur->refs = NULL;
2514     cur->_private = NULL;
2515     cur->charset = XML_CHAR_ENCODING_UTF8;
2516     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517     if ((ExternalID != NULL) ||
2518 	(URI != NULL))
2519 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2521 	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2522     return(cur);
2523 }
2524 
2525 /**
2526  * htmlNewDoc:
2527  * @URI:  URI for the dtd, or NULL
2528  * @ExternalID:  the external ID of the DTD, or NULL
2529  *
2530  * Creates a new HTML document
2531  *
2532  * Returns a new document
2533  */
2534 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2535 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2536     if ((URI == NULL) && (ExternalID == NULL))
2537 	return(htmlNewDocNoDtD(
2538 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2539 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2540 
2541     return(htmlNewDocNoDtD(URI, ExternalID));
2542 }
2543 
2544 
2545 /************************************************************************
2546  *									*
2547  *			The parser itself				*
2548  *	Relates to http://www.w3.org/TR/html40				*
2549  *									*
2550  ************************************************************************/
2551 
2552 /************************************************************************
2553  *									*
2554  *			The parser itself				*
2555  *									*
2556  ************************************************************************/
2557 
2558 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2559 
2560 /**
2561  * htmlParseHTMLName:
2562  * @ctxt:  an HTML parser context
2563  *
2564  * parse an HTML tag or attribute name, note that we convert it to lowercase
2565  * since HTML names are not case-sensitive.
2566  *
2567  * Returns the Tag Name parsed or NULL
2568  */
2569 
2570 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2571 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2572     int i = 0;
2573     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574 
2575     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2576         (CUR != ':') && (CUR != '.')) return(NULL);
2577 
2578     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2580 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2581            (CUR == '.'))) {
2582 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2583         else loc[i] = CUR;
2584 	i++;
2585 
2586 	NEXT;
2587     }
2588 
2589     return(xmlDictLookup(ctxt->dict, loc, i));
2590 }
2591 
2592 
2593 /**
2594  * htmlParseHTMLName_nonInvasive:
2595  * @ctxt:  an HTML parser context
2596  *
2597  * parse an HTML tag or attribute name, note that we convert it to lowercase
2598  * since HTML names are not case-sensitive, this doesn't consume the data
2599  * from the stream, it's a look-ahead
2600  *
2601  * Returns the Tag Name parsed or NULL
2602  */
2603 
2604 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2605 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2606     int i = 0;
2607     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2608 
2609     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2610         (NXT(1) != ':')) return(NULL);
2611 
2612     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2613            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2614 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2615 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2616         else loc[i] = NXT(1+i);
2617 	i++;
2618     }
2619 
2620     return(xmlDictLookup(ctxt->dict, loc, i));
2621 }
2622 
2623 
2624 /**
2625  * htmlParseName:
2626  * @ctxt:  an HTML parser context
2627  *
2628  * parse an HTML name, this routine is case sensitive.
2629  *
2630  * Returns the Name parsed or NULL
2631  */
2632 
2633 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2634 htmlParseName(htmlParserCtxtPtr ctxt) {
2635     const xmlChar *in;
2636     const xmlChar *ret;
2637     int count = 0;
2638 
2639     GROW;
2640 
2641     /*
2642      * Accelerator for simple ASCII names
2643      */
2644     in = ctxt->input->cur;
2645     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2646 	((*in >= 0x41) && (*in <= 0x5A)) ||
2647 	(*in == '_') || (*in == ':')) {
2648 	in++;
2649 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2650 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2651 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2652 	       (*in == '_') || (*in == '-') ||
2653 	       (*in == ':') || (*in == '.'))
2654 	    in++;
2655 
2656 	if (in == ctxt->input->end)
2657 	    return(NULL);
2658 
2659 	if ((*in > 0) && (*in < 0x80)) {
2660 	    count = in - ctxt->input->cur;
2661 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2662 	    ctxt->input->cur = in;
2663 	    ctxt->input->col += count;
2664 	    return(ret);
2665 	}
2666     }
2667     return(htmlParseNameComplex(ctxt));
2668 }
2669 
2670 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2671 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2672     int len = 0, l;
2673     int c;
2674     int count = 0;
2675     const xmlChar *base = ctxt->input->base;
2676 
2677     /*
2678      * Handler for more complex cases
2679      */
2680     GROW;
2681     c = CUR_CHAR(l);
2682     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2683 	(!IS_LETTER(c) && (c != '_') &&
2684          (c != ':'))) {
2685 	return(NULL);
2686     }
2687 
2688     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2689 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2690             (c == '.') || (c == '-') ||
2691 	    (c == '_') || (c == ':') ||
2692 	    (IS_COMBINING(c)) ||
2693 	    (IS_EXTENDER(c)))) {
2694 	if (count++ > 100) {
2695 	    count = 0;
2696 	    GROW;
2697 	}
2698 	len += l;
2699 	NEXTL(l);
2700 	c = CUR_CHAR(l);
2701 	if (ctxt->input->base != base) {
2702 	    /*
2703 	     * We changed encoding from an unknown encoding
2704 	     * Input buffer changed location, so we better start again
2705 	     */
2706 	    return(htmlParseNameComplex(ctxt));
2707 	}
2708     }
2709 
2710     if (ctxt->input->cur - ctxt->input->base < len) {
2711         /* Sanity check */
2712 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2713                      "unexpected change of input buffer", NULL, NULL);
2714         return (NULL);
2715     }
2716 
2717     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2718 }
2719 
2720 
2721 /**
2722  * htmlParseHTMLAttribute:
2723  * @ctxt:  an HTML parser context
2724  * @stop:  a char stop value
2725  *
2726  * parse an HTML attribute value till the stop (quote), if
2727  * stop is 0 then it stops at the first space
2728  *
2729  * Returns the attribute parsed or NULL
2730  */
2731 
2732 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2733 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2734     xmlChar *buffer = NULL;
2735     int buffer_size = 0;
2736     xmlChar *out = NULL;
2737     const xmlChar *name = NULL;
2738     const xmlChar *cur = NULL;
2739     const htmlEntityDesc * ent;
2740 
2741     /*
2742      * allocate a translation buffer.
2743      */
2744     buffer_size = HTML_PARSER_BUFFER_SIZE;
2745     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2746     if (buffer == NULL) {
2747 	htmlErrMemory(ctxt, "buffer allocation failed\n");
2748 	return(NULL);
2749     }
2750     out = buffer;
2751 
2752     /*
2753      * Ok loop until we reach one of the ending chars
2754      */
2755     while ((CUR != 0) && (CUR != stop)) {
2756 	if ((stop == 0) && (CUR == '>')) break;
2757 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2758         if (CUR == '&') {
2759 	    if (NXT(1) == '#') {
2760 		unsigned int c;
2761 		int bits;
2762 
2763 		c = htmlParseCharRef(ctxt);
2764 		if      (c <    0x80)
2765 		        { *out++  = c;                bits= -6; }
2766 		else if (c <   0x800)
2767 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2768 		else if (c < 0x10000)
2769 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2770 		else
2771 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2772 
2773 		for ( ; bits >= 0; bits-= 6) {
2774 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2775 		}
2776 
2777 		if (out - buffer > buffer_size - 100) {
2778 			int indx = out - buffer;
2779 
2780 			growBuffer(buffer);
2781 			out = &buffer[indx];
2782 		}
2783 	    } else {
2784 		ent = htmlParseEntityRef(ctxt, &name);
2785 		if (name == NULL) {
2786 		    *out++ = '&';
2787 		    if (out - buffer > buffer_size - 100) {
2788 			int indx = out - buffer;
2789 
2790 			growBuffer(buffer);
2791 			out = &buffer[indx];
2792 		    }
2793 		} else if (ent == NULL) {
2794 		    *out++ = '&';
2795 		    cur = name;
2796 		    while (*cur != 0) {
2797 			if (out - buffer > buffer_size - 100) {
2798 			    int indx = out - buffer;
2799 
2800 			    growBuffer(buffer);
2801 			    out = &buffer[indx];
2802 			}
2803 			*out++ = *cur++;
2804 		    }
2805 		} else {
2806 		    unsigned int c;
2807 		    int bits;
2808 
2809 		    if (out - buffer > buffer_size - 100) {
2810 			int indx = out - buffer;
2811 
2812 			growBuffer(buffer);
2813 			out = &buffer[indx];
2814 		    }
2815 		    c = ent->value;
2816 		    if      (c <    0x80)
2817 			{ *out++  = c;                bits= -6; }
2818 		    else if (c <   0x800)
2819 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2820 		    else if (c < 0x10000)
2821 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2822 		    else
2823 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2824 
2825 		    for ( ; bits >= 0; bits-= 6) {
2826 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2827 		    }
2828 		}
2829 	    }
2830 	} else {
2831 	    unsigned int c;
2832 	    int bits, l;
2833 
2834 	    if (out - buffer > buffer_size - 100) {
2835 		int indx = out - buffer;
2836 
2837 		growBuffer(buffer);
2838 		out = &buffer[indx];
2839 	    }
2840 	    c = CUR_CHAR(l);
2841 	    if      (c <    0x80)
2842 		    { *out++  = c;                bits= -6; }
2843 	    else if (c <   0x800)
2844 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2845 	    else if (c < 0x10000)
2846 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2847 	    else
2848 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2849 
2850 	    for ( ; bits >= 0; bits-= 6) {
2851 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2852 	    }
2853 	    NEXT;
2854 	}
2855     }
2856     *out = 0;
2857     return(buffer);
2858 }
2859 
2860 /**
2861  * htmlParseEntityRef:
2862  * @ctxt:  an HTML parser context
2863  * @str:  location to store the entity name
2864  *
2865  * parse an HTML ENTITY references
2866  *
2867  * [68] EntityRef ::= '&' Name ';'
2868  *
2869  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2870  *         if non-NULL *str will have to be freed by the caller.
2871  */
2872 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2873 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2874     const xmlChar *name;
2875     const htmlEntityDesc * ent = NULL;
2876 
2877     if (str != NULL) *str = NULL;
2878     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2879 
2880     if (CUR == '&') {
2881         NEXT;
2882         name = htmlParseName(ctxt);
2883 	if (name == NULL) {
2884 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2885 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2886 	} else {
2887 	    GROW;
2888 	    if (CUR == ';') {
2889 	        if (str != NULL)
2890 		    *str = name;
2891 
2892 		/*
2893 		 * Lookup the entity in the table.
2894 		 */
2895 		ent = htmlEntityLookup(name);
2896 		if (ent != NULL) /* OK that's ugly !!! */
2897 		    NEXT;
2898 	    } else {
2899 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2900 		             "htmlParseEntityRef: expecting ';'\n",
2901 			     NULL, NULL);
2902 	        if (str != NULL)
2903 		    *str = name;
2904 	    }
2905 	}
2906     }
2907     return(ent);
2908 }
2909 
2910 /**
2911  * htmlParseAttValue:
2912  * @ctxt:  an HTML parser context
2913  *
2914  * parse a value for an attribute
2915  * Note: the parser won't do substitution of entities here, this
2916  * will be handled later in xmlStringGetNodeList, unless it was
2917  * asked for ctxt->replaceEntities != 0
2918  *
2919  * Returns the AttValue parsed or NULL.
2920  */
2921 
2922 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2923 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2924     xmlChar *ret = NULL;
2925 
2926     if (CUR == '"') {
2927         NEXT;
2928 	ret = htmlParseHTMLAttribute(ctxt, '"');
2929         if (CUR != '"') {
2930 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2931 	                 "AttValue: \" expected\n", NULL, NULL);
2932 	} else
2933 	    NEXT;
2934     } else if (CUR == '\'') {
2935         NEXT;
2936 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2937         if (CUR != '\'') {
2938 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2939 	                 "AttValue: ' expected\n", NULL, NULL);
2940 	} else
2941 	    NEXT;
2942     } else {
2943         /*
2944 	 * That's an HTMLism, the attribute value may not be quoted
2945 	 */
2946 	ret = htmlParseHTMLAttribute(ctxt, 0);
2947 	if (ret == NULL) {
2948 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2949 	                 "AttValue: no value found\n", NULL, NULL);
2950 	}
2951     }
2952     return(ret);
2953 }
2954 
2955 /**
2956  * htmlParseSystemLiteral:
2957  * @ctxt:  an HTML parser context
2958  *
2959  * parse an HTML Literal
2960  *
2961  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2962  *
2963  * Returns the SystemLiteral parsed or NULL
2964  */
2965 
2966 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2967 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2968     size_t len = 0, startPosition = 0;
2969     int err = 0;
2970     int quote;
2971     xmlChar *ret = NULL;
2972 
2973     if ((CUR != '"') && (CUR != '\'')) {
2974 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2975 	             "SystemLiteral \" or ' expected\n", NULL, NULL);
2976         return(NULL);
2977     }
2978     quote = CUR;
2979     NEXT;
2980 
2981     if (CUR_PTR < BASE_PTR)
2982         return(ret);
2983     startPosition = CUR_PTR - BASE_PTR;
2984 
2985     while ((CUR != 0) && (CUR != quote)) {
2986         /* TODO: Handle UTF-8 */
2987         if (!IS_CHAR_CH(CUR)) {
2988             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2989                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2990             err = 1;
2991         }
2992         NEXT;
2993         len++;
2994     }
2995     if (CUR != quote) {
2996         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2997                      "Unfinished SystemLiteral\n", NULL, NULL);
2998     } else {
2999         NEXT;
3000         if (err == 0)
3001             ret = xmlStrndup((BASE_PTR+startPosition), len);
3002     }
3003 
3004     return(ret);
3005 }
3006 
3007 /**
3008  * htmlParsePubidLiteral:
3009  * @ctxt:  an HTML parser context
3010  *
3011  * parse an HTML public literal
3012  *
3013  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3014  *
3015  * Returns the PubidLiteral parsed or NULL.
3016  */
3017 
3018 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)3019 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3020     size_t len = 0, startPosition = 0;
3021     int err = 0;
3022     int quote;
3023     xmlChar *ret = NULL;
3024 
3025     if ((CUR != '"') && (CUR != '\'')) {
3026 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3027 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
3028         return(NULL);
3029     }
3030     quote = CUR;
3031     NEXT;
3032 
3033     /*
3034      * Name ::= (Letter | '_') (NameChar)*
3035      */
3036     if (CUR_PTR < BASE_PTR)
3037         return(ret);
3038     startPosition = CUR_PTR - BASE_PTR;
3039 
3040     while ((CUR != 0) && (CUR != quote)) {
3041         if (!IS_PUBIDCHAR_CH(CUR)) {
3042             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3043                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3044             err = 1;
3045         }
3046         len++;
3047         NEXT;
3048     }
3049 
3050     if (CUR != quote) {
3051         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3052                      "Unfinished PubidLiteral\n", NULL, NULL);
3053     } else {
3054         NEXT;
3055         if (err == 0)
3056             ret = xmlStrndup((BASE_PTR + startPosition), len);
3057     }
3058 
3059     return(ret);
3060 }
3061 
3062 /**
3063  * htmlParseScript:
3064  * @ctxt:  an HTML parser context
3065  *
3066  * parse the content of an HTML SCRIPT or STYLE element
3067  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3068  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3069  * http://www.w3.org/TR/html4/types.html#type-script
3070  * http://www.w3.org/TR/html4/types.html#h-6.15
3071  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3072  *
3073  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3074  * element and the value of intrinsic event attributes. User agents must
3075  * not evaluate script data as HTML markup but instead must pass it on as
3076  * data to a script engine.
3077  * NOTES:
3078  * - The content is passed like CDATA
3079  * - the attributes for style and scripting "onXXX" are also described
3080  *   as CDATA but SGML allows entities references in attributes so their
3081  *   processing is identical as other attributes
3082  */
3083 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3084 htmlParseScript(htmlParserCtxtPtr ctxt) {
3085     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3086     int nbchar = 0;
3087     int cur,l;
3088 
3089     SHRINK;
3090     cur = CUR_CHAR(l);
3091     while (cur != 0) {
3092 	if ((cur == '<') && (NXT(1) == '/')) {
3093             /*
3094              * One should break here, the specification is clear:
3095              * Authors should therefore escape "</" within the content.
3096              * Escape mechanisms are specific to each scripting or
3097              * style sheet language.
3098              *
3099              * In recovery mode, only break if end tag match the
3100              * current tag, effectively ignoring all tags inside the
3101              * script/style block and treating the entire block as
3102              * CDATA.
3103              */
3104             if (ctxt->recovery) {
3105                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3106 				   xmlStrlen(ctxt->name)) == 0)
3107                 {
3108                     break; /* while */
3109                 } else {
3110 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3111 				 "Element %s embeds close tag\n",
3112 		                 ctxt->name, NULL);
3113 		}
3114             } else {
3115                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3116                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3117                 {
3118                     break; /* while */
3119                 }
3120             }
3121 	}
3122         if (IS_CHAR(cur)) {
3123 	    COPY_BUF(l,buf,nbchar,cur);
3124         } else {
3125             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3126                             "Invalid char in CDATA 0x%X\n", cur);
3127         }
3128 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3129             buf[nbchar] = 0;
3130 	    if (ctxt->sax->cdataBlock!= NULL) {
3131 		/*
3132 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3133 		 */
3134 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3135 	    } else if (ctxt->sax->characters != NULL) {
3136 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
3137 	    }
3138 	    nbchar = 0;
3139 	}
3140 	GROW;
3141 	NEXTL(l);
3142 	cur = CUR_CHAR(l);
3143     }
3144 
3145     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3146         buf[nbchar] = 0;
3147 	if (ctxt->sax->cdataBlock!= NULL) {
3148 	    /*
3149 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3150 	     */
3151 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3152 	} else if (ctxt->sax->characters != NULL) {
3153 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3154 	}
3155     }
3156 }
3157 
3158 
3159 /**
3160  * htmlParseCharDataInternal:
3161  * @ctxt:  an HTML parser context
3162  * @readahead: optional read ahead character in ascii range
3163  *
3164  * parse a CharData section.
3165  * if we are within a CDATA section ']]>' marks an end of section.
3166  *
3167  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3168  */
3169 
3170 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3171 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3172     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3173     int nbchar = 0;
3174     int cur, l;
3175     int chunk = 0;
3176 
3177     if (readahead)
3178         buf[nbchar++] = readahead;
3179 
3180     SHRINK;
3181     cur = CUR_CHAR(l);
3182     while (((cur != '<') || (ctxt->token == '<')) &&
3183            ((cur != '&') || (ctxt->token == '&')) &&
3184 	   (cur != 0)) {
3185 	if (!(IS_CHAR(cur))) {
3186 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3187 	                "Invalid char in CDATA 0x%X\n", cur);
3188 	} else {
3189 	    COPY_BUF(l,buf,nbchar,cur);
3190 	}
3191 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3192             buf[nbchar] = 0;
3193 
3194 	    /*
3195 	     * Ok the segment is to be consumed as chars.
3196 	     */
3197 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3198 		if (areBlanks(ctxt, buf, nbchar)) {
3199 		    if (ctxt->keepBlanks) {
3200 			if (ctxt->sax->characters != NULL)
3201 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3202 		    } else {
3203 			if (ctxt->sax->ignorableWhitespace != NULL)
3204 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3205 			                                   buf, nbchar);
3206 		    }
3207 		} else {
3208 		    htmlCheckParagraph(ctxt);
3209 		    if (ctxt->sax->characters != NULL)
3210 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3211 		}
3212 	    }
3213 	    nbchar = 0;
3214 	}
3215 	NEXTL(l);
3216         chunk++;
3217         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3218             chunk = 0;
3219             SHRINK;
3220             GROW;
3221         }
3222 	cur = CUR_CHAR(l);
3223 	if (cur == 0) {
3224 	    SHRINK;
3225 	    GROW;
3226 	    cur = CUR_CHAR(l);
3227 	}
3228     }
3229     if (nbchar != 0) {
3230         buf[nbchar] = 0;
3231 
3232 	/*
3233 	 * Ok the segment is to be consumed as chars.
3234 	 */
3235 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3236 	    if (areBlanks(ctxt, buf, nbchar)) {
3237 		if (ctxt->keepBlanks) {
3238 		    if (ctxt->sax->characters != NULL)
3239 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3240 		} else {
3241 		    if (ctxt->sax->ignorableWhitespace != NULL)
3242 			ctxt->sax->ignorableWhitespace(ctxt->userData,
3243 			                               buf, nbchar);
3244 		}
3245 	    } else {
3246 		htmlCheckParagraph(ctxt);
3247 		if (ctxt->sax->characters != NULL)
3248 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3249 	    }
3250 	}
3251     } else {
3252 	/*
3253 	 * Loop detection
3254 	 */
3255 	if (cur == 0)
3256 	    ctxt->instate = XML_PARSER_EOF;
3257     }
3258 }
3259 
3260 /**
3261  * htmlParseCharData:
3262  * @ctxt:  an HTML parser context
3263  *
3264  * parse a CharData section.
3265  * if we are within a CDATA section ']]>' marks an end of section.
3266  *
3267  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3268  */
3269 
3270 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3271 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3272     htmlParseCharDataInternal(ctxt, 0);
3273 }
3274 
3275 /**
3276  * htmlParseExternalID:
3277  * @ctxt:  an HTML parser context
3278  * @publicID:  a xmlChar** receiving PubidLiteral
3279  *
3280  * Parse an External ID or a Public ID
3281  *
3282  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3283  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3284  *
3285  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3286  *
3287  * Returns the function returns SystemLiteral and in the second
3288  *                case publicID receives PubidLiteral, is strict is off
3289  *                it is possible to return NULL and have publicID set.
3290  */
3291 
3292 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3293 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3294     xmlChar *URI = NULL;
3295 
3296     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3297          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3298 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3299         SKIP(6);
3300 	if (!IS_BLANK_CH(CUR)) {
3301 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3302 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3303 	}
3304         SKIP_BLANKS;
3305 	URI = htmlParseSystemLiteral(ctxt);
3306 	if (URI == NULL) {
3307 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3308 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3309         }
3310     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3311 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3312 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3313         SKIP(6);
3314 	if (!IS_BLANK_CH(CUR)) {
3315 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3316 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3317 	}
3318         SKIP_BLANKS;
3319 	*publicID = htmlParsePubidLiteral(ctxt);
3320 	if (*publicID == NULL) {
3321 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3322 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3323 			 NULL, NULL);
3324 	}
3325         SKIP_BLANKS;
3326         if ((CUR == '"') || (CUR == '\'')) {
3327 	    URI = htmlParseSystemLiteral(ctxt);
3328 	}
3329     }
3330     return(URI);
3331 }
3332 
3333 /**
3334  * xmlParsePI:
3335  * @ctxt:  an XML parser context
3336  *
3337  * parse an XML Processing Instruction.
3338  *
3339  * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3340  */
3341 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3342 htmlParsePI(htmlParserCtxtPtr ctxt) {
3343     xmlChar *buf = NULL;
3344     int len = 0;
3345     int size = HTML_PARSER_BUFFER_SIZE;
3346     int cur, l;
3347     const xmlChar *target;
3348     xmlParserInputState state;
3349     int count = 0;
3350 
3351     if ((RAW == '<') && (NXT(1) == '?')) {
3352 	state = ctxt->instate;
3353         ctxt->instate = XML_PARSER_PI;
3354 	/*
3355 	 * this is a Processing Instruction.
3356 	 */
3357 	SKIP(2);
3358 	SHRINK;
3359 
3360 	/*
3361 	 * Parse the target name and check for special support like
3362 	 * namespace.
3363 	 */
3364         target = htmlParseName(ctxt);
3365 	if (target != NULL) {
3366 	    if (RAW == '>') {
3367 		SKIP(1);
3368 
3369 		/*
3370 		 * SAX: PI detected.
3371 		 */
3372 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3373 		    (ctxt->sax->processingInstruction != NULL))
3374 		    ctxt->sax->processingInstruction(ctxt->userData,
3375 		                                     target, NULL);
3376 		ctxt->instate = state;
3377 		return;
3378 	    }
3379 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3380 	    if (buf == NULL) {
3381 		htmlErrMemory(ctxt, NULL);
3382 		ctxt->instate = state;
3383 		return;
3384 	    }
3385 	    cur = CUR;
3386 	    if (!IS_BLANK(cur)) {
3387 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3388 			  "ParsePI: PI %s space expected\n", target, NULL);
3389 	    }
3390             SKIP_BLANKS;
3391 	    cur = CUR_CHAR(l);
3392 	    while ((cur != 0) && (cur != '>')) {
3393 		if (len + 5 >= size) {
3394 		    xmlChar *tmp;
3395 
3396 		    size *= 2;
3397 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3398 		    if (tmp == NULL) {
3399 			htmlErrMemory(ctxt, NULL);
3400 			xmlFree(buf);
3401 			ctxt->instate = state;
3402 			return;
3403 		    }
3404 		    buf = tmp;
3405 		}
3406 		count++;
3407 		if (count > 50) {
3408 		    GROW;
3409 		    count = 0;
3410 		}
3411                 if (IS_CHAR(cur)) {
3412 		    COPY_BUF(l,buf,len,cur);
3413                 } else {
3414                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3415                                     "Invalid char in processing instruction "
3416                                     "0x%X\n", cur);
3417                 }
3418 		NEXTL(l);
3419 		cur = CUR_CHAR(l);
3420 		if (cur == 0) {
3421 		    SHRINK;
3422 		    GROW;
3423 		    cur = CUR_CHAR(l);
3424 		}
3425 	    }
3426 	    buf[len] = 0;
3427 	    if (cur != '>') {
3428 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3429 		      "ParsePI: PI %s never end ...\n", target, NULL);
3430 	    } else {
3431 		SKIP(1);
3432 
3433 		/*
3434 		 * SAX: PI detected.
3435 		 */
3436 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3437 		    (ctxt->sax->processingInstruction != NULL))
3438 		    ctxt->sax->processingInstruction(ctxt->userData,
3439 		                                     target, buf);
3440 	    }
3441 	    xmlFree(buf);
3442 	} else {
3443 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3444                          "PI is not started correctly", NULL, NULL);
3445 	}
3446 	ctxt->instate = state;
3447     }
3448 }
3449 
3450 /**
3451  * htmlParseComment:
3452  * @ctxt:  an HTML parser context
3453  *
3454  * Parse an XML (SGML) comment <!-- .... -->
3455  *
3456  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3457  */
3458 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3459 htmlParseComment(htmlParserCtxtPtr ctxt) {
3460     xmlChar *buf = NULL;
3461     int len;
3462     int size = HTML_PARSER_BUFFER_SIZE;
3463     int q, ql;
3464     int r, rl;
3465     int cur, l;
3466     int next, nl;
3467     xmlParserInputState state;
3468 
3469     /*
3470      * Check that there is a comment right here.
3471      */
3472     if ((RAW != '<') || (NXT(1) != '!') ||
3473         (NXT(2) != '-') || (NXT(3) != '-')) return;
3474 
3475     state = ctxt->instate;
3476     ctxt->instate = XML_PARSER_COMMENT;
3477     SHRINK;
3478     SKIP(4);
3479     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3480     if (buf == NULL) {
3481         htmlErrMemory(ctxt, "buffer allocation failed\n");
3482 	ctxt->instate = state;
3483 	return;
3484     }
3485     len = 0;
3486     buf[len] = 0;
3487     q = CUR_CHAR(ql);
3488     if (q == 0)
3489         goto unfinished;
3490     NEXTL(ql);
3491     r = CUR_CHAR(rl);
3492     if (r == 0)
3493         goto unfinished;
3494     NEXTL(rl);
3495     cur = CUR_CHAR(l);
3496     while ((cur != 0) &&
3497            ((cur != '>') ||
3498 	    (r != '-') || (q != '-'))) {
3499 	NEXTL(l);
3500 	next = CUR_CHAR(nl);
3501 	if (next == 0) {
3502 	    SHRINK;
3503 	    GROW;
3504 	    next = CUR_CHAR(nl);
3505 	}
3506 
3507 	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3508 	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3509 		       "Comment incorrectly closed by '--!>'", NULL, NULL);
3510 	  cur = '>';
3511 	  break;
3512 	}
3513 
3514 	if (len + 5 >= size) {
3515 	    xmlChar *tmp;
3516 
3517 	    size *= 2;
3518 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3519 	    if (tmp == NULL) {
3520 	        xmlFree(buf);
3521 	        htmlErrMemory(ctxt, "growing buffer failed\n");
3522 		ctxt->instate = state;
3523 		return;
3524 	    }
3525 	    buf = tmp;
3526 	}
3527         if (IS_CHAR(q)) {
3528 	    COPY_BUF(ql,buf,len,q);
3529         } else {
3530             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3531                             "Invalid char in comment 0x%X\n", q);
3532         }
3533 
3534 	q = r;
3535 	ql = rl;
3536 	r = cur;
3537 	rl = l;
3538 	cur = next;
3539 	l = nl;
3540     }
3541     buf[len] = 0;
3542     if (cur == '>') {
3543         NEXT;
3544 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3545 	    (!ctxt->disableSAX))
3546 	    ctxt->sax->comment(ctxt->userData, buf);
3547 	xmlFree(buf);
3548 	ctxt->instate = state;
3549 	return;
3550     }
3551 
3552 unfinished:
3553     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3554 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3555     xmlFree(buf);
3556 }
3557 
3558 /**
3559  * htmlParseCharRef:
3560  * @ctxt:  an HTML parser context
3561  *
3562  * parse Reference declarations
3563  *
3564  * [66] CharRef ::= '&#' [0-9]+ ';' |
3565  *                  '&#x' [0-9a-fA-F]+ ';'
3566  *
3567  * Returns the value parsed (as an int)
3568  */
3569 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3570 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3571     int val = 0;
3572 
3573     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3574 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3575 		     "htmlParseCharRef: context error\n",
3576 		     NULL, NULL);
3577         return(0);
3578     }
3579     if ((CUR == '&') && (NXT(1) == '#') &&
3580         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3581 	SKIP(3);
3582 	while (CUR != ';') {
3583 	    if ((CUR >= '0') && (CUR <= '9')) {
3584                 if (val < 0x110000)
3585 	            val = val * 16 + (CUR - '0');
3586             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3587                 if (val < 0x110000)
3588 	            val = val * 16 + (CUR - 'a') + 10;
3589             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3590                 if (val < 0x110000)
3591 	            val = val * 16 + (CUR - 'A') + 10;
3592             } else {
3593 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3594 		             "htmlParseCharRef: missing semicolon\n",
3595 			     NULL, NULL);
3596 		break;
3597 	    }
3598 	    NEXT;
3599 	}
3600 	if (CUR == ';')
3601 	    NEXT;
3602     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3603 	SKIP(2);
3604 	while (CUR != ';') {
3605 	    if ((CUR >= '0') && (CUR <= '9')) {
3606                 if (val < 0x110000)
3607 	            val = val * 10 + (CUR - '0');
3608             } else {
3609 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3610 		             "htmlParseCharRef: missing semicolon\n",
3611 			     NULL, NULL);
3612 		break;
3613 	    }
3614 	    NEXT;
3615 	}
3616 	if (CUR == ';')
3617 	    NEXT;
3618     } else {
3619 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3620 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3621     }
3622     /*
3623      * Check the value IS_CHAR ...
3624      */
3625     if (IS_CHAR(val)) {
3626         return(val);
3627     } else if (val >= 0x110000) {
3628 	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3629 		     "htmlParseCharRef: value too large\n", NULL, NULL);
3630     } else {
3631 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3632 			"htmlParseCharRef: invalid xmlChar value %d\n",
3633 			val);
3634     }
3635     return(0);
3636 }
3637 
3638 
3639 /**
3640  * htmlParseDocTypeDecl:
3641  * @ctxt:  an HTML parser context
3642  *
3643  * parse a DOCTYPE declaration
3644  *
3645  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3646  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3647  */
3648 
3649 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3650 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3651     const xmlChar *name;
3652     xmlChar *ExternalID = NULL;
3653     xmlChar *URI = NULL;
3654 
3655     /*
3656      * We know that '<!DOCTYPE' has been detected.
3657      */
3658     SKIP(9);
3659 
3660     SKIP_BLANKS;
3661 
3662     /*
3663      * Parse the DOCTYPE name.
3664      */
3665     name = htmlParseName(ctxt);
3666     if (name == NULL) {
3667 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3668 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3669 		     NULL, NULL);
3670     }
3671     /*
3672      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3673      */
3674 
3675     SKIP_BLANKS;
3676 
3677     /*
3678      * Check for SystemID and ExternalID
3679      */
3680     URI = htmlParseExternalID(ctxt, &ExternalID);
3681     SKIP_BLANKS;
3682 
3683     /*
3684      * We should be at the end of the DOCTYPE declaration.
3685      */
3686     if (CUR != '>') {
3687 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3688 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3689         /* Ignore bogus content */
3690         while ((CUR != 0) && (CUR != '>'))
3691             NEXT;
3692     }
3693     if (CUR == '>')
3694         NEXT;
3695 
3696     /*
3697      * Create or update the document accordingly to the DOCTYPE
3698      */
3699     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3700 	(!ctxt->disableSAX))
3701 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3702 
3703     /*
3704      * Cleanup, since we don't use all those identifiers
3705      */
3706     if (URI != NULL) xmlFree(URI);
3707     if (ExternalID != NULL) xmlFree(ExternalID);
3708 }
3709 
3710 /**
3711  * htmlParseAttribute:
3712  * @ctxt:  an HTML parser context
3713  * @value:  a xmlChar ** used to store the value of the attribute
3714  *
3715  * parse an attribute
3716  *
3717  * [41] Attribute ::= Name Eq AttValue
3718  *
3719  * [25] Eq ::= S? '=' S?
3720  *
3721  * With namespace:
3722  *
3723  * [NS 11] Attribute ::= QName Eq AttValue
3724  *
3725  * Also the case QName == xmlns:??? is handled independently as a namespace
3726  * definition.
3727  *
3728  * Returns the attribute name, and the value in *value.
3729  */
3730 
3731 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3732 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3733     const xmlChar *name;
3734     xmlChar *val = NULL;
3735 
3736     *value = NULL;
3737     name = htmlParseHTMLName(ctxt);
3738     if (name == NULL) {
3739 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3740 	             "error parsing attribute name\n", NULL, NULL);
3741         return(NULL);
3742     }
3743 
3744     /*
3745      * read the value
3746      */
3747     SKIP_BLANKS;
3748     if (CUR == '=') {
3749         NEXT;
3750 	SKIP_BLANKS;
3751 	val = htmlParseAttValue(ctxt);
3752     }
3753 
3754     *value = val;
3755     return(name);
3756 }
3757 
3758 /**
3759  * htmlCheckEncodingDirect:
3760  * @ctxt:  an HTML parser context
3761  * @attvalue: the attribute value
3762  *
3763  * Checks an attribute value to detect
3764  * the encoding
3765  * If a new encoding is detected the parser is switched to decode
3766  * it and pass UTF8
3767  */
3768 static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,const xmlChar * encoding)3769 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3770 
3771     if ((ctxt == NULL) || (encoding == NULL) ||
3772         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3773 	return;
3774 
3775     /* do not change encoding */
3776     if (ctxt->input->encoding != NULL)
3777         return;
3778 
3779     if (encoding != NULL) {
3780 	xmlCharEncoding enc;
3781 	xmlCharEncodingHandlerPtr handler;
3782 
3783 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3784 
3785 	if (ctxt->input->encoding != NULL)
3786 	    xmlFree((xmlChar *) ctxt->input->encoding);
3787 	ctxt->input->encoding = xmlStrdup(encoding);
3788 
3789 	enc = xmlParseCharEncoding((const char *) encoding);
3790 	/*
3791 	 * registered set of known encodings
3792 	 */
3793 	if (enc != XML_CHAR_ENCODING_ERROR) {
3794 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3795 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3796 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3797 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3798 		(ctxt->input->buf != NULL) &&
3799 		(ctxt->input->buf->encoder == NULL)) {
3800 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3801 		             "htmlCheckEncoding: wrong encoding meta\n",
3802 			     NULL, NULL);
3803 	    } else {
3804 		xmlSwitchEncoding(ctxt, enc);
3805 	    }
3806 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3807 	} else {
3808 	    /*
3809 	     * fallback for unknown encodings
3810 	     */
3811 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3812 	    if (handler != NULL) {
3813 		xmlSwitchToEncoding(ctxt, handler);
3814 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3815 	    } else {
3816 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3817 		             "htmlCheckEncoding: unknown encoding %s\n",
3818 			     encoding, NULL);
3819 	    }
3820 	}
3821 
3822 	if ((ctxt->input->buf != NULL) &&
3823 	    (ctxt->input->buf->encoder != NULL) &&
3824 	    (ctxt->input->buf->raw != NULL) &&
3825 	    (ctxt->input->buf->buffer != NULL)) {
3826 	    int nbchars;
3827 	    int processed;
3828 
3829 	    /*
3830 	     * convert as much as possible to the parser reading buffer.
3831 	     */
3832 	    processed = ctxt->input->cur - ctxt->input->base;
3833 	    xmlBufShrink(ctxt->input->buf->buffer, processed);
3834 	    nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3835             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3836 	    if (nbchars < 0) {
3837 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3838 		             "htmlCheckEncoding: encoder error\n",
3839 			     NULL, NULL);
3840 	    }
3841 	}
3842     }
3843 }
3844 
3845 /**
3846  * htmlCheckEncoding:
3847  * @ctxt:  an HTML parser context
3848  * @attvalue: the attribute value
3849  *
3850  * Checks an http-equiv attribute from a Meta tag to detect
3851  * the encoding
3852  * If a new encoding is detected the parser is switched to decode
3853  * it and pass UTF8
3854  */
3855 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3856 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3857     const xmlChar *encoding;
3858 
3859     if (!attvalue)
3860 	return;
3861 
3862     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3863     if (encoding != NULL) {
3864 	encoding += 7;
3865     }
3866     /*
3867      * skip blank
3868      */
3869     if (encoding && IS_BLANK_CH(*encoding))
3870 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3871     if (encoding && *encoding == '=') {
3872 	encoding ++;
3873 	htmlCheckEncodingDirect(ctxt, encoding);
3874     }
3875 }
3876 
3877 /**
3878  * htmlCheckMeta:
3879  * @ctxt:  an HTML parser context
3880  * @atts:  the attributes values
3881  *
3882  * Checks an attributes from a Meta tag
3883  */
3884 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3885 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3886     int i;
3887     const xmlChar *att, *value;
3888     int http = 0;
3889     const xmlChar *content = NULL;
3890 
3891     if ((ctxt == NULL) || (atts == NULL))
3892 	return;
3893 
3894     i = 0;
3895     att = atts[i++];
3896     while (att != NULL) {
3897 	value = atts[i++];
3898 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3899 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3900 	    http = 1;
3901 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3902 	    htmlCheckEncodingDirect(ctxt, value);
3903 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3904 	    content = value;
3905 	att = atts[i++];
3906     }
3907     if ((http) && (content != NULL))
3908 	htmlCheckEncoding(ctxt, content);
3909 
3910 }
3911 
3912 /**
3913  * htmlParseStartTag:
3914  * @ctxt:  an HTML parser context
3915  *
3916  * parse a start of tag either for rule element or
3917  * EmptyElement. In both case we don't parse the tag closing chars.
3918  *
3919  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3920  *
3921  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3922  *
3923  * With namespace:
3924  *
3925  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3926  *
3927  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3928  *
3929  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3930  */
3931 
3932 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3933 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3934     const xmlChar *name;
3935     const xmlChar *attname;
3936     xmlChar *attvalue;
3937     const xmlChar **atts;
3938     int nbatts = 0;
3939     int maxatts;
3940     int meta = 0;
3941     int i;
3942     int discardtag = 0;
3943 
3944     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3945 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3946 		     "htmlParseStartTag: context error\n", NULL, NULL);
3947 	return -1;
3948     }
3949     if (ctxt->instate == XML_PARSER_EOF)
3950         return(-1);
3951     if (CUR != '<') return -1;
3952     NEXT;
3953 
3954     atts = ctxt->atts;
3955     maxatts = ctxt->maxatts;
3956 
3957     GROW;
3958     name = htmlParseHTMLName(ctxt);
3959     if (name == NULL) {
3960 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3961 	             "htmlParseStartTag: invalid element name\n",
3962 		     NULL, NULL);
3963         /*
3964          * The recovery code is disabled for now as it can result in
3965          * quadratic behavior with the push parser. htmlParseStartTag
3966          * must consume all content up to the final '>' in order to avoid
3967          * rescanning for this terminator.
3968          *
3969          * For a proper fix in line with HTML5, htmlParseStartTag and
3970          * htmlParseElement should only be called when there's an ASCII
3971          * alpha character following the initial '<'. Otherwise, the '<'
3972          * should be emitted as text (unless followed by '!', '/' or '?').
3973          */
3974 #if 0
3975 	/* if recover preserve text on classic misconstructs */
3976 	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3977 	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3978 	    htmlParseCharDataInternal(ctxt, '<');
3979 	    return(-1);
3980 	}
3981 #endif
3982 
3983 	/* Dump the bogus tag like browsers do */
3984 	while ((CUR != 0) && (CUR != '>') &&
3985                (ctxt->instate != XML_PARSER_EOF))
3986 	    NEXT;
3987         return -1;
3988     }
3989     if (xmlStrEqual(name, BAD_CAST"meta"))
3990 	meta = 1;
3991 
3992     /*
3993      * Check for auto-closure of HTML elements.
3994      */
3995     htmlAutoClose(ctxt, name);
3996 
3997     /*
3998      * Check for implied HTML elements.
3999      */
4000     htmlCheckImplied(ctxt, name);
4001 
4002     /*
4003      * Avoid html at any level > 0, head at any level != 1
4004      * or any attempt to recurse body
4005      */
4006     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
4007 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4008 	             "htmlParseStartTag: misplaced <html> tag\n",
4009 		     name, NULL);
4010 	discardtag = 1;
4011 	ctxt->depth++;
4012     }
4013     if ((ctxt->nameNr != 1) &&
4014 	(xmlStrEqual(name, BAD_CAST"head"))) {
4015 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4016 	             "htmlParseStartTag: misplaced <head> tag\n",
4017 		     name, NULL);
4018 	discardtag = 1;
4019 	ctxt->depth++;
4020     }
4021     if (xmlStrEqual(name, BAD_CAST"body")) {
4022 	int indx;
4023 	for (indx = 0;indx < ctxt->nameNr;indx++) {
4024 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4025 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4026 		             "htmlParseStartTag: misplaced <body> tag\n",
4027 			     name, NULL);
4028 		discardtag = 1;
4029 		ctxt->depth++;
4030 	    }
4031 	}
4032     }
4033 
4034     /*
4035      * Now parse the attributes, it ends up with the ending
4036      *
4037      * (S Attribute)* S?
4038      */
4039     SKIP_BLANKS;
4040     while ((CUR != 0) &&
4041            (CUR != '>') &&
4042 	   ((CUR != '/') || (NXT(1) != '>'))) {
4043 	GROW;
4044 	attname = htmlParseAttribute(ctxt, &attvalue);
4045         if (attname != NULL) {
4046 
4047 	    /*
4048 	     * Well formedness requires at most one declaration of an attribute
4049 	     */
4050 	    for (i = 0; i < nbatts;i += 2) {
4051 	        if (xmlStrEqual(atts[i], attname)) {
4052 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4053 		                 "Attribute %s redefined\n", attname, NULL);
4054 		    if (attvalue != NULL)
4055 			xmlFree(attvalue);
4056 		    goto failed;
4057 		}
4058 	    }
4059 
4060 	    /*
4061 	     * Add the pair to atts
4062 	     */
4063 	    if (atts == NULL) {
4064 	        maxatts = 22; /* allow for 10 attrs by default */
4065 	        atts = (const xmlChar **)
4066 		       xmlMalloc(maxatts * sizeof(xmlChar *));
4067 		if (atts == NULL) {
4068 		    htmlErrMemory(ctxt, NULL);
4069 		    if (attvalue != NULL)
4070 			xmlFree(attvalue);
4071 		    goto failed;
4072 		}
4073 		ctxt->atts = atts;
4074 		ctxt->maxatts = maxatts;
4075 	    } else if (nbatts + 4 > maxatts) {
4076 	        const xmlChar **n;
4077 
4078 	        maxatts *= 2;
4079 	        n = (const xmlChar **) xmlRealloc((void *) atts,
4080 					     maxatts * sizeof(const xmlChar *));
4081 		if (n == NULL) {
4082 		    htmlErrMemory(ctxt, NULL);
4083 		    if (attvalue != NULL)
4084 			xmlFree(attvalue);
4085 		    goto failed;
4086 		}
4087 		atts = n;
4088 		ctxt->atts = atts;
4089 		ctxt->maxatts = maxatts;
4090 	    }
4091 	    atts[nbatts++] = attname;
4092 	    atts[nbatts++] = attvalue;
4093 	    atts[nbatts] = NULL;
4094 	    atts[nbatts + 1] = NULL;
4095 	}
4096 	else {
4097 	    if (attvalue != NULL)
4098 	        xmlFree(attvalue);
4099 	    /* Dump the bogus attribute string up to the next blank or
4100 	     * the end of the tag. */
4101 	    while ((CUR != 0) &&
4102 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4103 		   ((CUR != '/') || (NXT(1) != '>')))
4104 		NEXT;
4105 	}
4106 
4107 failed:
4108 	SKIP_BLANKS;
4109     }
4110 
4111     /*
4112      * Handle specific association to the META tag
4113      */
4114     if (meta && (nbatts != 0))
4115 	htmlCheckMeta(ctxt, atts);
4116 
4117     /*
4118      * SAX: Start of Element !
4119      */
4120     if (!discardtag) {
4121 	htmlnamePush(ctxt, name);
4122 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4123 	    if (nbatts != 0)
4124 		ctxt->sax->startElement(ctxt->userData, name, atts);
4125 	    else
4126 		ctxt->sax->startElement(ctxt->userData, name, NULL);
4127 	}
4128     }
4129 
4130     if (atts != NULL) {
4131         for (i = 1;i < nbatts;i += 2) {
4132 	    if (atts[i] != NULL)
4133 		xmlFree((xmlChar *) atts[i]);
4134 	}
4135     }
4136 
4137     return(discardtag);
4138 }
4139 
4140 /**
4141  * htmlParseEndTag:
4142  * @ctxt:  an HTML parser context
4143  *
4144  * parse an end of tag
4145  *
4146  * [42] ETag ::= '</' Name S? '>'
4147  *
4148  * With namespace
4149  *
4150  * [NS 9] ETag ::= '</' QName S? '>'
4151  *
4152  * Returns 1 if the current level should be closed.
4153  */
4154 
4155 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)4156 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4157 {
4158     const xmlChar *name;
4159     const xmlChar *oldname;
4160     int i, ret;
4161 
4162     if ((CUR != '<') || (NXT(1) != '/')) {
4163         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4164 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
4165         return (0);
4166     }
4167     SKIP(2);
4168 
4169     name = htmlParseHTMLName(ctxt);
4170     if (name == NULL)
4171         return (0);
4172     /*
4173      * We should definitely be at the ending "S? '>'" part
4174      */
4175     SKIP_BLANKS;
4176     if (CUR != '>') {
4177         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4178 	             "End tag : expected '>'\n", NULL, NULL);
4179         /* Skip to next '>' */
4180         while ((CUR != 0) && (CUR != '>'))
4181             NEXT;
4182     }
4183     if (CUR == '>')
4184         NEXT;
4185 
4186     /*
4187      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4188      * out now.
4189      */
4190     if ((ctxt->depth > 0) &&
4191         (xmlStrEqual(name, BAD_CAST "html") ||
4192          xmlStrEqual(name, BAD_CAST "body") ||
4193 	 xmlStrEqual(name, BAD_CAST "head"))) {
4194 	ctxt->depth--;
4195 	return (0);
4196     }
4197 
4198     /*
4199      * If the name read is not one of the element in the parsing stack
4200      * then return, it's just an error.
4201      */
4202     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4203         if (xmlStrEqual(name, ctxt->nameTab[i]))
4204             break;
4205     }
4206     if (i < 0) {
4207         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4208 	             "Unexpected end tag : %s\n", name, NULL);
4209         return (0);
4210     }
4211 
4212 
4213     /*
4214      * Check for auto-closure of HTML elements.
4215      */
4216 
4217     htmlAutoCloseOnClose(ctxt, name);
4218 
4219     /*
4220      * Well formedness constraints, opening and closing must match.
4221      * With the exception that the autoclose may have popped stuff out
4222      * of the stack.
4223      */
4224     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4225         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4226                      "Opening and ending tag mismatch: %s and %s\n",
4227                      name, ctxt->name);
4228     }
4229 
4230     /*
4231      * SAX: End of Tag
4232      */
4233     oldname = ctxt->name;
4234     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4235         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4236             ctxt->sax->endElement(ctxt->userData, name);
4237 	htmlNodeInfoPop(ctxt);
4238         htmlnamePop(ctxt);
4239         ret = 1;
4240     } else {
4241         ret = 0;
4242     }
4243 
4244     return (ret);
4245 }
4246 
4247 
4248 /**
4249  * htmlParseReference:
4250  * @ctxt:  an HTML parser context
4251  *
4252  * parse and handle entity references in content,
4253  * this will end-up in a call to character() since this is either a
4254  * CharRef, or a predefined entity.
4255  */
4256 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4257 htmlParseReference(htmlParserCtxtPtr ctxt) {
4258     const htmlEntityDesc * ent;
4259     xmlChar out[6];
4260     const xmlChar *name;
4261     if (CUR != '&') return;
4262 
4263     if (NXT(1) == '#') {
4264 	unsigned int c;
4265 	int bits, i = 0;
4266 
4267 	c = htmlParseCharRef(ctxt);
4268 	if (c == 0)
4269 	    return;
4270 
4271         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4272         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4273         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4274         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4275 
4276         for ( ; bits >= 0; bits-= 6) {
4277             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4278         }
4279 	out[i] = 0;
4280 
4281 	htmlCheckParagraph(ctxt);
4282 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4283 	    ctxt->sax->characters(ctxt->userData, out, i);
4284     } else {
4285 	ent = htmlParseEntityRef(ctxt, &name);
4286 	if (name == NULL) {
4287 	    htmlCheckParagraph(ctxt);
4288 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4289 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4290 	    return;
4291 	}
4292 	if ((ent == NULL) || !(ent->value > 0)) {
4293 	    htmlCheckParagraph(ctxt);
4294 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4295 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4296 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4297 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4298 	    }
4299 	} else {
4300 	    unsigned int c;
4301 	    int bits, i = 0;
4302 
4303 	    c = ent->value;
4304 	    if      (c <    0x80)
4305 	            { out[i++]= c;                bits= -6; }
4306 	    else if (c <   0x800)
4307 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4308 	    else if (c < 0x10000)
4309 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4310 	    else
4311 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4312 
4313 	    for ( ; bits >= 0; bits-= 6) {
4314 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4315 	    }
4316 	    out[i] = 0;
4317 
4318 	    htmlCheckParagraph(ctxt);
4319 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4320 		ctxt->sax->characters(ctxt->userData, out, i);
4321 	}
4322     }
4323 }
4324 
4325 /**
4326  * htmlParseContent:
4327  * @ctxt:  an HTML parser context
4328  *
4329  * Parse a content: comment, sub-element, reference or text.
4330  * Kept for compatibility with old code
4331  */
4332 
4333 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4334 htmlParseContent(htmlParserCtxtPtr ctxt) {
4335     xmlChar *currentNode;
4336     int depth;
4337     const xmlChar *name;
4338 
4339     currentNode = xmlStrdup(ctxt->name);
4340     depth = ctxt->nameNr;
4341     while (1) {
4342         GROW;
4343 
4344         if (ctxt->instate == XML_PARSER_EOF)
4345             break;
4346 
4347 	/*
4348 	 * Our tag or one of it's parent or children is ending.
4349 	 */
4350         if ((CUR == '<') && (NXT(1) == '/')) {
4351 	    if (htmlParseEndTag(ctxt) &&
4352 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4353 		if (currentNode != NULL)
4354 		    xmlFree(currentNode);
4355 		return;
4356 	    }
4357 	    continue; /* while */
4358         }
4359 
4360 	else if ((CUR == '<') &&
4361 	         ((IS_ASCII_LETTER(NXT(1))) ||
4362 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4363 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4364 	    if (name == NULL) {
4365 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4366 			 "htmlParseStartTag: invalid element name\n",
4367 			 NULL, NULL);
4368 	        /* Dump the bogus tag like browsers do */
4369                 while ((CUR != 0) && (CUR != '>'))
4370 	            NEXT;
4371 
4372 	        if (currentNode != NULL)
4373 	            xmlFree(currentNode);
4374 	        return;
4375 	    }
4376 
4377 	    if (ctxt->name != NULL) {
4378 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4379 	            htmlAutoClose(ctxt, name);
4380 	            continue;
4381 	        }
4382 	    }
4383 	}
4384 
4385 	/*
4386 	 * Has this node been popped out during parsing of
4387 	 * the next element
4388 	 */
4389         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4390 	    (!xmlStrEqual(currentNode, ctxt->name)))
4391 	     {
4392 	    if (currentNode != NULL) xmlFree(currentNode);
4393 	    return;
4394 	}
4395 
4396 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4397 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4398 	    /*
4399 	     * Handle SCRIPT/STYLE separately
4400 	     */
4401 	    htmlParseScript(ctxt);
4402 	} else {
4403 	    /*
4404 	     * Sometimes DOCTYPE arrives in the middle of the document
4405 	     */
4406 	    if ((CUR == '<') && (NXT(1) == '!') &&
4407 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4408 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4409 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4410 		(UPP(8) == 'E')) {
4411 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4412 		             "Misplaced DOCTYPE declaration\n",
4413 			     BAD_CAST "DOCTYPE" , NULL);
4414 		htmlParseDocTypeDecl(ctxt);
4415 	    }
4416 
4417 	    /*
4418 	     * First case :  a comment
4419 	     */
4420 	    if ((CUR == '<') && (NXT(1) == '!') &&
4421 		(NXT(2) == '-') && (NXT(3) == '-')) {
4422 		htmlParseComment(ctxt);
4423 	    }
4424 
4425 	    /*
4426 	     * Second case : a Processing Instruction.
4427 	     */
4428 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4429 		htmlParsePI(ctxt);
4430 	    }
4431 
4432 	    /*
4433 	     * Third case :  a sub-element.
4434 	     */
4435 	    else if (CUR == '<') {
4436 		htmlParseElement(ctxt);
4437 	    }
4438 
4439 	    /*
4440 	     * Fourth case : a reference. If if has not been resolved,
4441 	     *    parsing returns it's Name, create the node
4442 	     */
4443 	    else if (CUR == '&') {
4444 		htmlParseReference(ctxt);
4445 	    }
4446 
4447 	    /*
4448 	     * Fifth case : end of the resource
4449 	     */
4450 	    else if (CUR == 0) {
4451 		htmlAutoCloseOnEnd(ctxt);
4452 		break;
4453 	    }
4454 
4455 	    /*
4456 	     * Last case, text. Note that References are handled directly.
4457 	     */
4458 	    else {
4459 		htmlParseCharData(ctxt);
4460 	    }
4461 	}
4462         GROW;
4463     }
4464     if (currentNode != NULL) xmlFree(currentNode);
4465 }
4466 
4467 /**
4468  * htmlParseElement:
4469  * @ctxt:  an HTML parser context
4470  *
4471  * parse an HTML element, this is highly recursive
4472  * this is kept for compatibility with previous code versions
4473  *
4474  * [39] element ::= EmptyElemTag | STag content ETag
4475  *
4476  * [41] Attribute ::= Name Eq AttValue
4477  */
4478 
4479 void
htmlParseElement(htmlParserCtxtPtr ctxt)4480 htmlParseElement(htmlParserCtxtPtr ctxt) {
4481     const xmlChar *name;
4482     xmlChar *currentNode = NULL;
4483     const htmlElemDesc * info;
4484     htmlParserNodeInfo node_info;
4485     int failed;
4486     int depth;
4487     const xmlChar *oldptr;
4488 
4489     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4490 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4491 		     "htmlParseElement: context error\n", NULL, NULL);
4492 	return;
4493     }
4494 
4495     if (ctxt->instate == XML_PARSER_EOF)
4496         return;
4497 
4498     /* Capture start position */
4499     if (ctxt->record_info) {
4500         node_info.begin_pos = ctxt->input->consumed +
4501                           (CUR_PTR - ctxt->input->base);
4502 	node_info.begin_line = ctxt->input->line;
4503     }
4504 
4505     failed = htmlParseStartTag(ctxt);
4506     name = ctxt->name;
4507     if ((failed == -1) || (name == NULL)) {
4508 	if (CUR == '>')
4509 	    NEXT;
4510         return;
4511     }
4512 
4513     /*
4514      * Lookup the info for that element.
4515      */
4516     info = htmlTagLookup(name);
4517     if (info == NULL) {
4518 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4519 	             "Tag %s invalid\n", name, NULL);
4520     }
4521 
4522     /*
4523      * Check for an Empty Element labeled the XML/SGML way
4524      */
4525     if ((CUR == '/') && (NXT(1) == '>')) {
4526         SKIP(2);
4527 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4528 	    ctxt->sax->endElement(ctxt->userData, name);
4529 	htmlnamePop(ctxt);
4530 	return;
4531     }
4532 
4533     if (CUR == '>') {
4534         NEXT;
4535     } else {
4536 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4537 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4538 
4539 	/*
4540 	 * end of parsing of this node.
4541 	 */
4542 	if (xmlStrEqual(name, ctxt->name)) {
4543 	    nodePop(ctxt);
4544 	    htmlnamePop(ctxt);
4545 	}
4546 
4547 	/*
4548 	 * Capture end position and add node
4549 	 */
4550 	if (ctxt->record_info) {
4551 	   node_info.end_pos = ctxt->input->consumed +
4552 			      (CUR_PTR - ctxt->input->base);
4553 	   node_info.end_line = ctxt->input->line;
4554 	   node_info.node = ctxt->node;
4555 	   xmlParserAddNodeInfo(ctxt, &node_info);
4556 	}
4557 	return;
4558     }
4559 
4560     /*
4561      * Check for an Empty Element from DTD definition
4562      */
4563     if ((info != NULL) && (info->empty)) {
4564 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4565 	    ctxt->sax->endElement(ctxt->userData, name);
4566 	htmlnamePop(ctxt);
4567 	return;
4568     }
4569 
4570     /*
4571      * Parse the content of the element:
4572      */
4573     currentNode = xmlStrdup(ctxt->name);
4574     depth = ctxt->nameNr;
4575     while (CUR != 0) {
4576 	oldptr = ctxt->input->cur;
4577 	htmlParseContent(ctxt);
4578 	if (oldptr==ctxt->input->cur) break;
4579 	if (ctxt->nameNr < depth) break;
4580     }
4581 
4582     /*
4583      * Capture end position and add node
4584      */
4585     if ( currentNode != NULL && ctxt->record_info ) {
4586        node_info.end_pos = ctxt->input->consumed +
4587                           (CUR_PTR - ctxt->input->base);
4588        node_info.end_line = ctxt->input->line;
4589        node_info.node = ctxt->node;
4590        xmlParserAddNodeInfo(ctxt, &node_info);
4591     }
4592     if (CUR == 0) {
4593 	htmlAutoCloseOnEnd(ctxt);
4594     }
4595 
4596     if (currentNode != NULL)
4597 	xmlFree(currentNode);
4598 }
4599 
4600 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4601 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4602     /*
4603      * Capture end position and add node
4604      */
4605     if ( ctxt->node != NULL && ctxt->record_info ) {
4606        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4607                                 (CUR_PTR - ctxt->input->base);
4608        ctxt->nodeInfo->end_line = ctxt->input->line;
4609        ctxt->nodeInfo->node = ctxt->node;
4610        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4611        htmlNodeInfoPop(ctxt);
4612     }
4613     if (CUR == 0) {
4614        htmlAutoCloseOnEnd(ctxt);
4615     }
4616 }
4617 
4618 /**
4619  * htmlParseElementInternal:
4620  * @ctxt:  an HTML parser context
4621  *
4622  * parse an HTML element, new version, non recursive
4623  *
4624  * [39] element ::= EmptyElemTag | STag content ETag
4625  *
4626  * [41] Attribute ::= Name Eq AttValue
4627  */
4628 
4629 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4630 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4631     const xmlChar *name;
4632     const htmlElemDesc * info;
4633     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4634     int failed;
4635 
4636     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4637 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4638 		     "htmlParseElementInternal: context error\n", NULL, NULL);
4639 	return;
4640     }
4641 
4642     if (ctxt->instate == XML_PARSER_EOF)
4643         return;
4644 
4645     /* Capture start position */
4646     if (ctxt->record_info) {
4647         node_info.begin_pos = ctxt->input->consumed +
4648                           (CUR_PTR - ctxt->input->base);
4649 	node_info.begin_line = ctxt->input->line;
4650     }
4651 
4652     failed = htmlParseStartTag(ctxt);
4653     name = ctxt->name;
4654     if ((failed == -1) || (name == NULL)) {
4655 	if (CUR == '>')
4656 	    NEXT;
4657         return;
4658     }
4659 
4660     /*
4661      * Lookup the info for that element.
4662      */
4663     info = htmlTagLookup(name);
4664     if (info == NULL) {
4665 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4666 	             "Tag %s invalid\n", name, NULL);
4667     }
4668 
4669     /*
4670      * Check for an Empty Element labeled the XML/SGML way
4671      */
4672     if ((CUR == '/') && (NXT(1) == '>')) {
4673         SKIP(2);
4674 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4675 	    ctxt->sax->endElement(ctxt->userData, name);
4676 	htmlnamePop(ctxt);
4677 	return;
4678     }
4679 
4680     if (CUR == '>') {
4681         NEXT;
4682     } else {
4683 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4684 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4685 
4686 	/*
4687 	 * end of parsing of this node.
4688 	 */
4689 	if (xmlStrEqual(name, ctxt->name)) {
4690 	    nodePop(ctxt);
4691 	    htmlnamePop(ctxt);
4692 	}
4693 
4694         if (ctxt->record_info)
4695             htmlNodeInfoPush(ctxt, &node_info);
4696         htmlParserFinishElementParsing(ctxt);
4697 	return;
4698     }
4699 
4700     /*
4701      * Check for an Empty Element from DTD definition
4702      */
4703     if ((info != NULL) && (info->empty)) {
4704 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4705 	    ctxt->sax->endElement(ctxt->userData, name);
4706 	htmlnamePop(ctxt);
4707 	return;
4708     }
4709 
4710     if (ctxt->record_info)
4711         htmlNodeInfoPush(ctxt, &node_info);
4712 }
4713 
4714 /**
4715  * htmlParseContentInternal:
4716  * @ctxt:  an HTML parser context
4717  *
4718  * Parse a content: comment, sub-element, reference or text.
4719  * New version for non recursive htmlParseElementInternal
4720  */
4721 
4722 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4723 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4724     xmlChar *currentNode;
4725     int depth;
4726     const xmlChar *name;
4727 
4728     currentNode = xmlStrdup(ctxt->name);
4729     depth = ctxt->nameNr;
4730     while (1) {
4731         GROW;
4732 
4733         if (ctxt->instate == XML_PARSER_EOF)
4734             break;
4735 
4736 	/*
4737 	 * Our tag or one of it's parent or children is ending.
4738 	 */
4739         if ((CUR == '<') && (NXT(1) == '/')) {
4740 	    if (htmlParseEndTag(ctxt) &&
4741 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4742 		if (currentNode != NULL)
4743 		    xmlFree(currentNode);
4744 
4745 	        currentNode = xmlStrdup(ctxt->name);
4746 	        depth = ctxt->nameNr;
4747 	    }
4748 	    continue; /* while */
4749         }
4750 
4751 	else if ((CUR == '<') &&
4752 	         ((IS_ASCII_LETTER(NXT(1))) ||
4753 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4754 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4755 	    if (name == NULL) {
4756 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4757 			 "htmlParseStartTag: invalid element name\n",
4758 			 NULL, NULL);
4759 	        /* Dump the bogus tag like browsers do */
4760 	        while ((CUR == 0) && (CUR != '>'))
4761 	            NEXT;
4762 
4763 	        htmlParserFinishElementParsing(ctxt);
4764 	        if (currentNode != NULL)
4765 	            xmlFree(currentNode);
4766 
4767 	        currentNode = xmlStrdup(ctxt->name);
4768 	        depth = ctxt->nameNr;
4769 	        continue;
4770 	    }
4771 
4772 	    if (ctxt->name != NULL) {
4773 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4774 	            htmlAutoClose(ctxt, name);
4775 	            continue;
4776 	        }
4777 	    }
4778 	}
4779 
4780 	/*
4781 	 * Has this node been popped out during parsing of
4782 	 * the next element
4783 	 */
4784         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4785 	    (!xmlStrEqual(currentNode, ctxt->name)))
4786 	     {
4787 	    htmlParserFinishElementParsing(ctxt);
4788 	    if (currentNode != NULL) xmlFree(currentNode);
4789 
4790 	    currentNode = xmlStrdup(ctxt->name);
4791 	    depth = ctxt->nameNr;
4792 	    continue;
4793 	}
4794 
4795 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4796 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4797 	    /*
4798 	     * Handle SCRIPT/STYLE separately
4799 	     */
4800 	    htmlParseScript(ctxt);
4801 	} else {
4802 	    /*
4803 	     * Sometimes DOCTYPE arrives in the middle of the document
4804 	     */
4805 	    if ((CUR == '<') && (NXT(1) == '!') &&
4806 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4807 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4808 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4809 		(UPP(8) == 'E')) {
4810 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4811 		             "Misplaced DOCTYPE declaration\n",
4812 			     BAD_CAST "DOCTYPE" , NULL);
4813 		htmlParseDocTypeDecl(ctxt);
4814 	    }
4815 
4816 	    /*
4817 	     * First case :  a comment
4818 	     */
4819 	    if ((CUR == '<') && (NXT(1) == '!') &&
4820 		(NXT(2) == '-') && (NXT(3) == '-')) {
4821 		htmlParseComment(ctxt);
4822 	    }
4823 
4824 	    /*
4825 	     * Second case : a Processing Instruction.
4826 	     */
4827 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4828 		htmlParsePI(ctxt);
4829 	    }
4830 
4831 	    /*
4832 	     * Third case :  a sub-element.
4833 	     */
4834 	    else if (CUR == '<') {
4835 		htmlParseElementInternal(ctxt);
4836 		if (currentNode != NULL) xmlFree(currentNode);
4837 
4838 		currentNode = xmlStrdup(ctxt->name);
4839 		depth = ctxt->nameNr;
4840 	    }
4841 
4842 	    /*
4843 	     * Fourth case : a reference. If if has not been resolved,
4844 	     *    parsing returns it's Name, create the node
4845 	     */
4846 	    else if (CUR == '&') {
4847 		htmlParseReference(ctxt);
4848 	    }
4849 
4850 	    /*
4851 	     * Fifth case : end of the resource
4852 	     */
4853 	    else if (CUR == 0) {
4854 		htmlAutoCloseOnEnd(ctxt);
4855 		break;
4856 	    }
4857 
4858 	    /*
4859 	     * Last case, text. Note that References are handled directly.
4860 	     */
4861 	    else {
4862 		htmlParseCharData(ctxt);
4863 	    }
4864 	}
4865         GROW;
4866     }
4867     if (currentNode != NULL) xmlFree(currentNode);
4868 }
4869 
4870 /**
4871  * htmlParseContent:
4872  * @ctxt:  an HTML parser context
4873  *
4874  * Parse a content: comment, sub-element, reference or text.
4875  * This is the entry point when called from parser.c
4876  */
4877 
4878 void
__htmlParseContent(void * ctxt)4879 __htmlParseContent(void *ctxt) {
4880     if (ctxt != NULL)
4881 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4882 }
4883 
4884 /**
4885  * htmlParseDocument:
4886  * @ctxt:  an HTML parser context
4887  *
4888  * parse an HTML document (and build a tree if using the standard SAX
4889  * interface).
4890  *
4891  * Returns 0, -1 in case of error. the parser context is augmented
4892  *                as a result of the parsing.
4893  */
4894 
4895 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4896 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4897     xmlChar start[4];
4898     xmlCharEncoding enc;
4899     xmlDtdPtr dtd;
4900 
4901     xmlInitParser();
4902 
4903     htmlDefaultSAXHandlerInit();
4904 
4905     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4906 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4907 		     "htmlParseDocument: context error\n", NULL, NULL);
4908 	return(XML_ERR_INTERNAL_ERROR);
4909     }
4910     ctxt->html = 1;
4911     ctxt->linenumbers = 1;
4912     GROW;
4913     /*
4914      * SAX: beginning of the document processing.
4915      */
4916     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4917         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4918 
4919     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4920         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4921 	/*
4922 	 * Get the 4 first bytes and decode the charset
4923 	 * if enc != XML_CHAR_ENCODING_NONE
4924 	 * plug some encoding conversion routines.
4925 	 */
4926 	start[0] = RAW;
4927 	start[1] = NXT(1);
4928 	start[2] = NXT(2);
4929 	start[3] = NXT(3);
4930 	enc = xmlDetectCharEncoding(&start[0], 4);
4931 	if (enc != XML_CHAR_ENCODING_NONE) {
4932 	    xmlSwitchEncoding(ctxt, enc);
4933 	}
4934     }
4935 
4936     /*
4937      * Wipe out everything which is before the first '<'
4938      */
4939     SKIP_BLANKS;
4940     if (CUR == 0) {
4941 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4942 	             "Document is empty\n", NULL, NULL);
4943     }
4944 
4945     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4946 	ctxt->sax->startDocument(ctxt->userData);
4947 
4948 
4949     /*
4950      * Parse possible comments and PIs before any content
4951      */
4952     while (((CUR == '<') && (NXT(1) == '!') &&
4953             (NXT(2) == '-') && (NXT(3) == '-')) ||
4954 	   ((CUR == '<') && (NXT(1) == '?'))) {
4955         htmlParseComment(ctxt);
4956         htmlParsePI(ctxt);
4957 	SKIP_BLANKS;
4958     }
4959 
4960 
4961     /*
4962      * Then possibly doc type declaration(s) and more Misc
4963      * (doctypedecl Misc*)?
4964      */
4965     if ((CUR == '<') && (NXT(1) == '!') &&
4966 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4967 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4968 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4969 	(UPP(8) == 'E')) {
4970 	htmlParseDocTypeDecl(ctxt);
4971     }
4972     SKIP_BLANKS;
4973 
4974     /*
4975      * Parse possible comments and PIs before any content
4976      */
4977     while (((CUR == '<') && (NXT(1) == '!') &&
4978             (NXT(2) == '-') && (NXT(3) == '-')) ||
4979 	   ((CUR == '<') && (NXT(1) == '?'))) {
4980         htmlParseComment(ctxt);
4981         htmlParsePI(ctxt);
4982 	SKIP_BLANKS;
4983     }
4984 
4985     /*
4986      * Time to start parsing the tree itself
4987      */
4988     htmlParseContentInternal(ctxt);
4989 
4990     /*
4991      * autoclose
4992      */
4993     if (CUR == 0)
4994 	htmlAutoCloseOnEnd(ctxt);
4995 
4996 
4997     /*
4998      * SAX: end of the document processing.
4999      */
5000     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5001         ctxt->sax->endDocument(ctxt->userData);
5002 
5003     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5004 	dtd = xmlGetIntSubset(ctxt->myDoc);
5005 	if (dtd == NULL)
5006 	    ctxt->myDoc->intSubset =
5007 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5008 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5009 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5010     }
5011     if (! ctxt->wellFormed) return(-1);
5012     return(0);
5013 }
5014 
5015 
5016 /************************************************************************
5017  *									*
5018  *			Parser contexts handling			*
5019  *									*
5020  ************************************************************************/
5021 
5022 /**
5023  * htmlInitParserCtxt:
5024  * @ctxt:  an HTML parser context
5025  *
5026  * Initialize a parser context
5027  *
5028  * Returns 0 in case of success and -1 in case of error
5029  */
5030 
5031 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)5032 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5033 {
5034     htmlSAXHandler *sax;
5035 
5036     if (ctxt == NULL) return(-1);
5037     memset(ctxt, 0, sizeof(htmlParserCtxt));
5038 
5039     ctxt->dict = xmlDictCreate();
5040     if (ctxt->dict == NULL) {
5041         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5042 	return(-1);
5043     }
5044     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5045     if (sax == NULL) {
5046         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5047 	return(-1);
5048     }
5049     else
5050         memset(sax, 0, sizeof(htmlSAXHandler));
5051 
5052     /* Allocate the Input stack */
5053     ctxt->inputTab = (htmlParserInputPtr *)
5054                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5055     if (ctxt->inputTab == NULL) {
5056         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5057 	ctxt->inputNr = 0;
5058 	ctxt->inputMax = 0;
5059 	ctxt->input = NULL;
5060 	return(-1);
5061     }
5062     ctxt->inputNr = 0;
5063     ctxt->inputMax = 5;
5064     ctxt->input = NULL;
5065     ctxt->version = NULL;
5066     ctxt->encoding = NULL;
5067     ctxt->standalone = -1;
5068     ctxt->instate = XML_PARSER_START;
5069 
5070     /* Allocate the Node stack */
5071     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5072     if (ctxt->nodeTab == NULL) {
5073         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5074 	ctxt->nodeNr = 0;
5075 	ctxt->nodeMax = 0;
5076 	ctxt->node = NULL;
5077 	ctxt->inputNr = 0;
5078 	ctxt->inputMax = 0;
5079 	ctxt->input = NULL;
5080 	return(-1);
5081     }
5082     ctxt->nodeNr = 0;
5083     ctxt->nodeMax = 10;
5084     ctxt->node = NULL;
5085 
5086     /* Allocate the Name stack */
5087     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5088     if (ctxt->nameTab == NULL) {
5089         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5090 	ctxt->nameNr = 0;
5091 	ctxt->nameMax = 0;
5092 	ctxt->name = NULL;
5093 	ctxt->nodeNr = 0;
5094 	ctxt->nodeMax = 0;
5095 	ctxt->node = NULL;
5096 	ctxt->inputNr = 0;
5097 	ctxt->inputMax = 0;
5098 	ctxt->input = NULL;
5099 	return(-1);
5100     }
5101     ctxt->nameNr = 0;
5102     ctxt->nameMax = 10;
5103     ctxt->name = NULL;
5104 
5105     ctxt->nodeInfoTab = NULL;
5106     ctxt->nodeInfoNr  = 0;
5107     ctxt->nodeInfoMax = 0;
5108 
5109     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5110     else {
5111         ctxt->sax = sax;
5112 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5113     }
5114     ctxt->userData = ctxt;
5115     ctxt->myDoc = NULL;
5116     ctxt->wellFormed = 1;
5117     ctxt->replaceEntities = 0;
5118     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5119     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5120     ctxt->html = 1;
5121     ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
5122     ctxt->vctxt.userData = ctxt;
5123     ctxt->vctxt.error = xmlParserValidityError;
5124     ctxt->vctxt.warning = xmlParserValidityWarning;
5125     ctxt->record_info = 0;
5126     ctxt->validate = 0;
5127     ctxt->checkIndex = 0;
5128     ctxt->catalogs = NULL;
5129     xmlInitNodeInfoSeq(&ctxt->node_seq);
5130     return(0);
5131 }
5132 
5133 /**
5134  * htmlFreeParserCtxt:
5135  * @ctxt:  an HTML parser context
5136  *
5137  * Free all the memory used by a parser context. However the parsed
5138  * document in ctxt->myDoc is not freed.
5139  */
5140 
5141 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5142 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5143 {
5144     xmlFreeParserCtxt(ctxt);
5145 }
5146 
5147 /**
5148  * htmlNewParserCtxt:
5149  *
5150  * Allocate and initialize a new parser context.
5151  *
5152  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5153  */
5154 
5155 htmlParserCtxtPtr
htmlNewParserCtxt(void)5156 htmlNewParserCtxt(void)
5157 {
5158     xmlParserCtxtPtr ctxt;
5159 
5160     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5161     if (ctxt == NULL) {
5162         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5163 	return(NULL);
5164     }
5165     memset(ctxt, 0, sizeof(xmlParserCtxt));
5166     if (htmlInitParserCtxt(ctxt) < 0) {
5167         htmlFreeParserCtxt(ctxt);
5168 	return(NULL);
5169     }
5170     return(ctxt);
5171 }
5172 
5173 /**
5174  * htmlCreateMemoryParserCtxt:
5175  * @buffer:  a pointer to a char array
5176  * @size:  the size of the array
5177  *
5178  * Create a parser context for an HTML in-memory document.
5179  *
5180  * Returns the new parser context or NULL
5181  */
5182 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5183 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5184     xmlParserCtxtPtr ctxt;
5185     xmlParserInputPtr input;
5186     xmlParserInputBufferPtr buf;
5187 
5188     if (buffer == NULL)
5189 	return(NULL);
5190     if (size <= 0)
5191 	return(NULL);
5192 
5193     ctxt = htmlNewParserCtxt();
5194     if (ctxt == NULL)
5195 	return(NULL);
5196 
5197     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5198     if (buf == NULL) return(NULL);
5199 
5200     input = xmlNewInputStream(ctxt);
5201     if (input == NULL) {
5202 	xmlFreeParserInputBuffer(buf);
5203 	xmlFreeParserCtxt(ctxt);
5204 	return(NULL);
5205     }
5206 
5207     input->filename = NULL;
5208     input->buf = buf;
5209     xmlBufResetInput(buf->buffer, input);
5210 
5211     inputPush(ctxt, input);
5212     return(ctxt);
5213 }
5214 
5215 /**
5216  * htmlCreateDocParserCtxt:
5217  * @cur:  a pointer to an array of xmlChar
5218  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5219  *
5220  * Create a parser context for an HTML document.
5221  *
5222  * TODO: check the need to add encoding handling there
5223  *
5224  * Returns the new parser context or NULL
5225  */
5226 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)5227 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5228     int len;
5229     htmlParserCtxtPtr ctxt;
5230 
5231     if (cur == NULL)
5232 	return(NULL);
5233     len = xmlStrlen(cur);
5234     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5235     if (ctxt == NULL)
5236 	return(NULL);
5237 
5238     if (encoding != NULL) {
5239 	xmlCharEncoding enc;
5240 	xmlCharEncodingHandlerPtr handler;
5241 
5242 	if (ctxt->input->encoding != NULL)
5243 	    xmlFree((xmlChar *) ctxt->input->encoding);
5244 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5245 
5246 	enc = xmlParseCharEncoding(encoding);
5247 	/*
5248 	 * registered set of known encodings
5249 	 */
5250 	if (enc != XML_CHAR_ENCODING_ERROR) {
5251 	    xmlSwitchEncoding(ctxt, enc);
5252 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5253 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5254 		             "Unsupported encoding %s\n",
5255 			     (const xmlChar *) encoding, NULL);
5256 	    }
5257 	} else {
5258 	    /*
5259 	     * fallback for unknown encodings
5260 	     */
5261 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
5262 	    if (handler != NULL) {
5263 		xmlSwitchToEncoding(ctxt, handler);
5264 	    } else {
5265 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5266 		             "Unsupported encoding %s\n",
5267 			     (const xmlChar *) encoding, NULL);
5268 	    }
5269 	}
5270     }
5271     return(ctxt);
5272 }
5273 
5274 #ifdef LIBXML_PUSH_ENABLED
5275 /************************************************************************
5276  *									*
5277  *	Progressive parsing interfaces				*
5278  *									*
5279  ************************************************************************/
5280 
5281 /**
5282  * htmlParseLookupSequence:
5283  * @ctxt:  an HTML parser context
5284  * @first:  the first char to lookup
5285  * @next:  the next char to lookup or zero
5286  * @third:  the next char to lookup or zero
5287  * @ignoreattrval: skip over attribute values
5288  *
5289  * Try to find if a sequence (first, next, third) or  just (first next) or
5290  * (first) is available in the input stream.
5291  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5292  * to avoid rescanning sequences of bytes, it DOES change the state of the
5293  * parser, do not use liberally.
5294  * This is basically similar to xmlParseLookupSequence()
5295  *
5296  * Returns the index to the current parsing point if the full sequence
5297  *      is available, -1 otherwise.
5298  */
5299 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5300 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5301                         xmlChar next, xmlChar third, int ignoreattrval)
5302 {
5303     int base, len;
5304     htmlParserInputPtr in;
5305     const xmlChar *buf;
5306     int invalue = 0;
5307     char valdellim = 0x0;
5308 
5309     in = ctxt->input;
5310     if (in == NULL)
5311         return (-1);
5312 
5313     base = in->cur - in->base;
5314     if (base < 0)
5315         return (-1);
5316 
5317     if (ctxt->checkIndex > base) {
5318         base = ctxt->checkIndex;
5319         /* Abuse hasPErefs member to restore current state. */
5320         invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5321     }
5322 
5323     if (in->buf == NULL) {
5324         buf = in->base;
5325         len = in->length;
5326     } else {
5327         buf = xmlBufContent(in->buf->buffer);
5328         len = xmlBufUse(in->buf->buffer);
5329     }
5330 
5331     /* take into account the sequence length */
5332     if (third)
5333         len -= 2;
5334     else if (next)
5335         len--;
5336     for (; base < len; base++) {
5337         if (ignoreattrval) {
5338             if (buf[base] == '"' || buf[base] == '\'') {
5339                 if (invalue) {
5340                     if (buf[base] == valdellim) {
5341                         invalue = 0;
5342                         continue;
5343                     }
5344                 } else {
5345                     valdellim = buf[base];
5346                     invalue = 1;
5347                     continue;
5348                 }
5349             } else if (invalue) {
5350                 continue;
5351             }
5352         }
5353         if (buf[base] == first) {
5354             if (third != 0) {
5355                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5356                     continue;
5357             } else if (next != 0) {
5358                 if (buf[base + 1] != next)
5359                     continue;
5360             }
5361             ctxt->checkIndex = 0;
5362 #ifdef DEBUG_PUSH
5363             if (next == 0)
5364                 xmlGenericError(xmlGenericErrorContext,
5365                                 "HPP: lookup '%c' found at %d\n",
5366                                 first, base);
5367             else if (third == 0)
5368                 xmlGenericError(xmlGenericErrorContext,
5369                                 "HPP: lookup '%c%c' found at %d\n",
5370                                 first, next, base);
5371             else
5372                 xmlGenericError(xmlGenericErrorContext,
5373                                 "HPP: lookup '%c%c%c' found at %d\n",
5374                                 first, next, third, base);
5375 #endif
5376             return (base - (in->cur - in->base));
5377         }
5378     }
5379     ctxt->checkIndex = base;
5380     /* Abuse hasPErefs member to track current state. */
5381     if (invalue)
5382         ctxt->hasPErefs |= 1;
5383     else
5384         ctxt->hasPErefs &= ~1;
5385 #ifdef DEBUG_PUSH
5386     if (next == 0)
5387         xmlGenericError(xmlGenericErrorContext,
5388                         "HPP: lookup '%c' failed\n", first);
5389     else if (third == 0)
5390         xmlGenericError(xmlGenericErrorContext,
5391                         "HPP: lookup '%c%c' failed\n", first, next);
5392     else
5393         xmlGenericError(xmlGenericErrorContext,
5394                         "HPP: lookup '%c%c%c' failed\n", first, next,
5395                         third);
5396 #endif
5397     return (-1);
5398 }
5399 
5400 /**
5401  * htmlParseLookupCommentEnd:
5402  * @ctxt: an HTML parser context
5403  *
5404  * Try to find a comment end tag in the input stream
5405  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5406  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5407  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5408  * to avoid rescanning sequences of bytes, it DOES change the state of the
5409  * parser, do not use liberally.
5410  * This wraps to htmlParseLookupSequence()
5411  *
5412  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5413  */
5414 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5415 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5416 {
5417     int mark = 0;
5418     int cur = CUR_PTR - BASE_PTR;
5419 
5420     while (mark >= 0) {
5421 	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5422 	if ((mark < 0) ||
5423 	    (NXT(mark+2) == '>') ||
5424 	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5425 	    return mark;
5426 	}
5427 	ctxt->checkIndex = cur + mark + 1;
5428     }
5429     return mark;
5430 }
5431 
5432 
5433 /**
5434  * htmlParseTryOrFinish:
5435  * @ctxt:  an HTML parser context
5436  * @terminate:  last chunk indicator
5437  *
5438  * Try to progress on parsing
5439  *
5440  * Returns zero if no parsing was possible
5441  */
5442 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5443 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5444     int ret = 0;
5445     htmlParserInputPtr in;
5446     ptrdiff_t avail = 0;
5447     xmlChar cur, next;
5448 
5449     htmlParserNodeInfo node_info;
5450 
5451 #ifdef DEBUG_PUSH
5452     switch (ctxt->instate) {
5453 	case XML_PARSER_EOF:
5454 	    xmlGenericError(xmlGenericErrorContext,
5455 		    "HPP: try EOF\n"); break;
5456 	case XML_PARSER_START:
5457 	    xmlGenericError(xmlGenericErrorContext,
5458 		    "HPP: try START\n"); break;
5459 	case XML_PARSER_MISC:
5460 	    xmlGenericError(xmlGenericErrorContext,
5461 		    "HPP: try MISC\n");break;
5462 	case XML_PARSER_COMMENT:
5463 	    xmlGenericError(xmlGenericErrorContext,
5464 		    "HPP: try COMMENT\n");break;
5465 	case XML_PARSER_PROLOG:
5466 	    xmlGenericError(xmlGenericErrorContext,
5467 		    "HPP: try PROLOG\n");break;
5468 	case XML_PARSER_START_TAG:
5469 	    xmlGenericError(xmlGenericErrorContext,
5470 		    "HPP: try START_TAG\n");break;
5471 	case XML_PARSER_CONTENT:
5472 	    xmlGenericError(xmlGenericErrorContext,
5473 		    "HPP: try CONTENT\n");break;
5474 	case XML_PARSER_CDATA_SECTION:
5475 	    xmlGenericError(xmlGenericErrorContext,
5476 		    "HPP: try CDATA_SECTION\n");break;
5477 	case XML_PARSER_END_TAG:
5478 	    xmlGenericError(xmlGenericErrorContext,
5479 		    "HPP: try END_TAG\n");break;
5480 	case XML_PARSER_ENTITY_DECL:
5481 	    xmlGenericError(xmlGenericErrorContext,
5482 		    "HPP: try ENTITY_DECL\n");break;
5483 	case XML_PARSER_ENTITY_VALUE:
5484 	    xmlGenericError(xmlGenericErrorContext,
5485 		    "HPP: try ENTITY_VALUE\n");break;
5486 	case XML_PARSER_ATTRIBUTE_VALUE:
5487 	    xmlGenericError(xmlGenericErrorContext,
5488 		    "HPP: try ATTRIBUTE_VALUE\n");break;
5489 	case XML_PARSER_DTD:
5490 	    xmlGenericError(xmlGenericErrorContext,
5491 		    "HPP: try DTD\n");break;
5492 	case XML_PARSER_EPILOG:
5493 	    xmlGenericError(xmlGenericErrorContext,
5494 		    "HPP: try EPILOG\n");break;
5495 	case XML_PARSER_PI:
5496 	    xmlGenericError(xmlGenericErrorContext,
5497 		    "HPP: try PI\n");break;
5498 	case XML_PARSER_SYSTEM_LITERAL:
5499 	    xmlGenericError(xmlGenericErrorContext,
5500 		    "HPP: try SYSTEM_LITERAL\n");break;
5501     }
5502 #endif
5503 
5504     while (1) {
5505 
5506 	in = ctxt->input;
5507 	if (in == NULL) break;
5508 	if (in->buf == NULL)
5509 	    avail = in->length - (in->cur - in->base);
5510 	else
5511 	    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5512                     (in->cur - in->base);
5513 	if ((avail == 0) && (terminate)) {
5514 	    htmlAutoCloseOnEnd(ctxt);
5515 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5516 		/*
5517 		 * SAX: end of the document processing.
5518 		 */
5519 		ctxt->instate = XML_PARSER_EOF;
5520 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5521 		    ctxt->sax->endDocument(ctxt->userData);
5522 	    }
5523 	}
5524         if (avail < 1)
5525 	    goto done;
5526         /*
5527          * This is done to make progress and avoid an infinite loop
5528          * if a parsing attempt was aborted by hitting a NUL byte. After
5529          * changing htmlCurrentChar, this probably isn't necessary anymore.
5530          * We should consider removing this check.
5531          */
5532 	cur = in->cur[0];
5533 	if (cur == 0) {
5534 	    SKIP(1);
5535 	    continue;
5536 	}
5537 
5538         switch (ctxt->instate) {
5539             case XML_PARSER_EOF:
5540 	        /*
5541 		 * Document parsing is done !
5542 		 */
5543 	        goto done;
5544             case XML_PARSER_START:
5545 	        /*
5546 		 * Very first chars read from the document flow.
5547 		 */
5548 		cur = in->cur[0];
5549 		if (IS_BLANK_CH(cur)) {
5550 		    SKIP_BLANKS;
5551 		    if (in->buf == NULL)
5552 			avail = in->length - (in->cur - in->base);
5553 		    else
5554 			avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5555                                 (in->cur - in->base);
5556 		}
5557 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5558 		    ctxt->sax->setDocumentLocator(ctxt->userData,
5559 						  &xmlDefaultSAXLocator);
5560 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5561 	            (!ctxt->disableSAX))
5562 		    ctxt->sax->startDocument(ctxt->userData);
5563 
5564 		cur = in->cur[0];
5565 		next = in->cur[1];
5566 		if ((cur == '<') && (next == '!') &&
5567 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5568 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5569 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5570 		    (UPP(8) == 'E')) {
5571 		    if ((!terminate) &&
5572 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5573 			goto done;
5574 #ifdef DEBUG_PUSH
5575 		    xmlGenericError(xmlGenericErrorContext,
5576 			    "HPP: Parsing internal subset\n");
5577 #endif
5578 		    htmlParseDocTypeDecl(ctxt);
5579 		    ctxt->instate = XML_PARSER_PROLOG;
5580 #ifdef DEBUG_PUSH
5581 		    xmlGenericError(xmlGenericErrorContext,
5582 			    "HPP: entering PROLOG\n");
5583 #endif
5584                 } else {
5585 		    ctxt->instate = XML_PARSER_MISC;
5586 #ifdef DEBUG_PUSH
5587 		    xmlGenericError(xmlGenericErrorContext,
5588 			    "HPP: entering MISC\n");
5589 #endif
5590 		}
5591 		break;
5592             case XML_PARSER_MISC:
5593 		SKIP_BLANKS;
5594 		if (in->buf == NULL)
5595 		    avail = in->length - (in->cur - in->base);
5596 		else
5597 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5598                             (in->cur - in->base);
5599 		/*
5600 		 * no chars in buffer
5601 		 */
5602 		if (avail < 1)
5603 		    goto done;
5604 		/*
5605 		 * not enough chars in buffer
5606 		 */
5607 		if (avail < 2) {
5608 		    if (!terminate)
5609 			goto done;
5610 		    else
5611 			next = ' ';
5612 		} else {
5613 		    next = in->cur[1];
5614 		}
5615 		cur = in->cur[0];
5616 	        if ((cur == '<') && (next == '!') &&
5617 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5618 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5619 			goto done;
5620 #ifdef DEBUG_PUSH
5621 		    xmlGenericError(xmlGenericErrorContext,
5622 			    "HPP: Parsing Comment\n");
5623 #endif
5624 		    htmlParseComment(ctxt);
5625 		    ctxt->instate = XML_PARSER_MISC;
5626 	        } else if ((cur == '<') && (next == '?')) {
5627 		    if ((!terminate) &&
5628 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5629 			goto done;
5630 #ifdef DEBUG_PUSH
5631 		    xmlGenericError(xmlGenericErrorContext,
5632 			    "HPP: Parsing PI\n");
5633 #endif
5634 		    htmlParsePI(ctxt);
5635 		    ctxt->instate = XML_PARSER_MISC;
5636 		} else if ((cur == '<') && (next == '!') &&
5637 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5638 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5639 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5640 		    (UPP(8) == 'E')) {
5641 		    if ((!terminate) &&
5642 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5643 			goto done;
5644 #ifdef DEBUG_PUSH
5645 		    xmlGenericError(xmlGenericErrorContext,
5646 			    "HPP: Parsing internal subset\n");
5647 #endif
5648 		    htmlParseDocTypeDecl(ctxt);
5649 		    ctxt->instate = XML_PARSER_PROLOG;
5650 #ifdef DEBUG_PUSH
5651 		    xmlGenericError(xmlGenericErrorContext,
5652 			    "HPP: entering PROLOG\n");
5653 #endif
5654 		} else if ((cur == '<') && (next == '!') &&
5655 		           (avail < 9)) {
5656 		    goto done;
5657 		} else {
5658 		    ctxt->instate = XML_PARSER_CONTENT;
5659 #ifdef DEBUG_PUSH
5660 		    xmlGenericError(xmlGenericErrorContext,
5661 			    "HPP: entering START_TAG\n");
5662 #endif
5663 		}
5664 		break;
5665             case XML_PARSER_PROLOG:
5666 		SKIP_BLANKS;
5667 		if (in->buf == NULL)
5668 		    avail = in->length - (in->cur - in->base);
5669 		else
5670 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5671                             (in->cur - in->base);
5672 		if (avail < 2)
5673 		    goto done;
5674 		cur = in->cur[0];
5675 		next = in->cur[1];
5676 		if ((cur == '<') && (next == '!') &&
5677 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5678 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5679 			goto done;
5680 #ifdef DEBUG_PUSH
5681 		    xmlGenericError(xmlGenericErrorContext,
5682 			    "HPP: Parsing Comment\n");
5683 #endif
5684 		    htmlParseComment(ctxt);
5685 		    ctxt->instate = XML_PARSER_PROLOG;
5686 	        } else if ((cur == '<') && (next == '?')) {
5687 		    if ((!terminate) &&
5688 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5689 			goto done;
5690 #ifdef DEBUG_PUSH
5691 		    xmlGenericError(xmlGenericErrorContext,
5692 			    "HPP: Parsing PI\n");
5693 #endif
5694 		    htmlParsePI(ctxt);
5695 		    ctxt->instate = XML_PARSER_PROLOG;
5696 		} else if ((cur == '<') && (next == '!') &&
5697 		           (avail < 4)) {
5698 		    goto done;
5699 		} else {
5700 		    ctxt->instate = XML_PARSER_CONTENT;
5701 #ifdef DEBUG_PUSH
5702 		    xmlGenericError(xmlGenericErrorContext,
5703 			    "HPP: entering START_TAG\n");
5704 #endif
5705 		}
5706 		break;
5707             case XML_PARSER_EPILOG:
5708 		if (in->buf == NULL)
5709 		    avail = in->length - (in->cur - in->base);
5710 		else
5711 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5712                             (in->cur - in->base);
5713 		if (avail < 1)
5714 		    goto done;
5715 		cur = in->cur[0];
5716 		if (IS_BLANK_CH(cur)) {
5717 		    htmlParseCharData(ctxt);
5718 		    goto done;
5719 		}
5720 		if (avail < 2)
5721 		    goto done;
5722 		next = in->cur[1];
5723 	        if ((cur == '<') && (next == '!') &&
5724 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5725 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5726 			goto done;
5727 #ifdef DEBUG_PUSH
5728 		    xmlGenericError(xmlGenericErrorContext,
5729 			    "HPP: Parsing Comment\n");
5730 #endif
5731 		    htmlParseComment(ctxt);
5732 		    ctxt->instate = XML_PARSER_EPILOG;
5733 	        } else if ((cur == '<') && (next == '?')) {
5734 		    if ((!terminate) &&
5735 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5736 			goto done;
5737 #ifdef DEBUG_PUSH
5738 		    xmlGenericError(xmlGenericErrorContext,
5739 			    "HPP: Parsing PI\n");
5740 #endif
5741 		    htmlParsePI(ctxt);
5742 		    ctxt->instate = XML_PARSER_EPILOG;
5743 		} else if ((cur == '<') && (next == '!') &&
5744 		           (avail < 4)) {
5745 		    goto done;
5746 		} else {
5747 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5748 		    ctxt->wellFormed = 0;
5749 		    ctxt->instate = XML_PARSER_EOF;
5750 #ifdef DEBUG_PUSH
5751 		    xmlGenericError(xmlGenericErrorContext,
5752 			    "HPP: entering EOF\n");
5753 #endif
5754 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5755 			ctxt->sax->endDocument(ctxt->userData);
5756 		    goto done;
5757 		}
5758 		break;
5759             case XML_PARSER_START_TAG: {
5760 	        const xmlChar *name;
5761 		int failed;
5762 		const htmlElemDesc * info;
5763 
5764 		/*
5765 		 * no chars in buffer
5766 		 */
5767 		if (avail < 1)
5768 		    goto done;
5769 		/*
5770 		 * not enough chars in buffer
5771 		 */
5772 		if (avail < 2) {
5773 		    if (!terminate)
5774 			goto done;
5775 		    else
5776 			next = ' ';
5777 		} else {
5778 		    next = in->cur[1];
5779 		}
5780 		cur = in->cur[0];
5781 	        if (cur != '<') {
5782 		    ctxt->instate = XML_PARSER_CONTENT;
5783 #ifdef DEBUG_PUSH
5784 		    xmlGenericError(xmlGenericErrorContext,
5785 			    "HPP: entering CONTENT\n");
5786 #endif
5787 		    break;
5788 		}
5789 		if (next == '/') {
5790 		    ctxt->instate = XML_PARSER_END_TAG;
5791 		    ctxt->checkIndex = 0;
5792 #ifdef DEBUG_PUSH
5793 		    xmlGenericError(xmlGenericErrorContext,
5794 			    "HPP: entering END_TAG\n");
5795 #endif
5796 		    break;
5797 		}
5798 		if ((!terminate) &&
5799 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5800 		    goto done;
5801 
5802                 /* Capture start position */
5803 	        if (ctxt->record_info) {
5804 	             node_info.begin_pos = ctxt->input->consumed +
5805 	                                (CUR_PTR - ctxt->input->base);
5806 	             node_info.begin_line = ctxt->input->line;
5807 	        }
5808 
5809 
5810 		failed = htmlParseStartTag(ctxt);
5811 		name = ctxt->name;
5812 		if ((failed == -1) ||
5813 		    (name == NULL)) {
5814 		    if (CUR == '>')
5815 			NEXT;
5816 		    break;
5817 		}
5818 
5819 		/*
5820 		 * Lookup the info for that element.
5821 		 */
5822 		info = htmlTagLookup(name);
5823 		if (info == NULL) {
5824 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5825 		                 "Tag %s invalid\n", name, NULL);
5826 		}
5827 
5828 		/*
5829 		 * Check for an Empty Element labeled the XML/SGML way
5830 		 */
5831 		if ((CUR == '/') && (NXT(1) == '>')) {
5832 		    SKIP(2);
5833 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5834 			ctxt->sax->endElement(ctxt->userData, name);
5835 		    htmlnamePop(ctxt);
5836 		    ctxt->instate = XML_PARSER_CONTENT;
5837 #ifdef DEBUG_PUSH
5838 		    xmlGenericError(xmlGenericErrorContext,
5839 			    "HPP: entering CONTENT\n");
5840 #endif
5841 		    break;
5842 		}
5843 
5844 		if (CUR == '>') {
5845 		    NEXT;
5846 		} else {
5847 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5848 		                 "Couldn't find end of Start Tag %s\n",
5849 				 name, NULL);
5850 
5851 		    /*
5852 		     * end of parsing of this node.
5853 		     */
5854 		    if (xmlStrEqual(name, ctxt->name)) {
5855 			nodePop(ctxt);
5856 			htmlnamePop(ctxt);
5857 		    }
5858 
5859 		    if (ctxt->record_info)
5860 		        htmlNodeInfoPush(ctxt, &node_info);
5861 
5862 		    ctxt->instate = XML_PARSER_CONTENT;
5863 #ifdef DEBUG_PUSH
5864 		    xmlGenericError(xmlGenericErrorContext,
5865 			    "HPP: entering CONTENT\n");
5866 #endif
5867 		    break;
5868 		}
5869 
5870 		/*
5871 		 * Check for an Empty Element from DTD definition
5872 		 */
5873 		if ((info != NULL) && (info->empty)) {
5874 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5875 			ctxt->sax->endElement(ctxt->userData, name);
5876 		    htmlnamePop(ctxt);
5877 		}
5878 
5879                 if (ctxt->record_info)
5880 	            htmlNodeInfoPush(ctxt, &node_info);
5881 
5882 		ctxt->instate = XML_PARSER_CONTENT;
5883 #ifdef DEBUG_PUSH
5884 		xmlGenericError(xmlGenericErrorContext,
5885 			"HPP: entering CONTENT\n");
5886 #endif
5887                 break;
5888 	    }
5889             case XML_PARSER_CONTENT: {
5890 		xmlChar chr[2] = { 0, 0 };
5891 
5892                 /*
5893 		 * Handle preparsed entities and charRef
5894 		 */
5895 		if (ctxt->token != 0) {
5896 		    chr[0] = (xmlChar) ctxt->token;
5897 		    htmlCheckParagraph(ctxt);
5898 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5899 			ctxt->sax->characters(ctxt->userData, chr, 1);
5900 		    ctxt->token = 0;
5901 		    ctxt->checkIndex = 0;
5902 		}
5903 		if ((avail == 1) && (terminate)) {
5904 		    cur = in->cur[0];
5905 		    if ((cur != '<') && (cur != '&')) {
5906 			if (ctxt->sax != NULL) {
5907                             chr[0] = cur;
5908 			    if (IS_BLANK_CH(cur)) {
5909 				if (ctxt->keepBlanks) {
5910 				    if (ctxt->sax->characters != NULL)
5911 					ctxt->sax->characters(
5912 						ctxt->userData, chr, 1);
5913 				} else {
5914 				    if (ctxt->sax->ignorableWhitespace != NULL)
5915 					ctxt->sax->ignorableWhitespace(
5916 						ctxt->userData, chr, 1);
5917 				}
5918 			    } else {
5919 				htmlCheckParagraph(ctxt);
5920 				if (ctxt->sax->characters != NULL)
5921 				    ctxt->sax->characters(
5922 					    ctxt->userData, chr, 1);
5923 			    }
5924 			}
5925 			ctxt->token = 0;
5926 			ctxt->checkIndex = 0;
5927 			in->cur++;
5928 			break;
5929 		    }
5930 		}
5931 		if (avail < 2)
5932 		    goto done;
5933 		cur = in->cur[0];
5934 		next = in->cur[1];
5935 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5936 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5937 		    /*
5938 		     * Handle SCRIPT/STYLE separately
5939 		     */
5940 		    if (!terminate) {
5941 		        int idx;
5942 			xmlChar val;
5943 
5944 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5945 			if (idx < 0)
5946 			    goto done;
5947 		        val = in->cur[idx + 2];
5948 			if (val == 0) /* bad cut of input */
5949 			    goto done;
5950 		    }
5951 		    htmlParseScript(ctxt);
5952 		    if ((cur == '<') && (next == '/')) {
5953 			ctxt->instate = XML_PARSER_END_TAG;
5954 			ctxt->checkIndex = 0;
5955 #ifdef DEBUG_PUSH
5956 			xmlGenericError(xmlGenericErrorContext,
5957 				"HPP: entering END_TAG\n");
5958 #endif
5959 			break;
5960 		    }
5961 		} else {
5962 		    /*
5963 		     * Sometimes DOCTYPE arrives in the middle of the document
5964 		     */
5965 		    if ((cur == '<') && (next == '!') &&
5966 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5967 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5968 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5969 			(UPP(8) == 'E')) {
5970 			if ((!terminate) &&
5971 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5972 			    goto done;
5973 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5974 			             "Misplaced DOCTYPE declaration\n",
5975 				     BAD_CAST "DOCTYPE" , NULL);
5976 			htmlParseDocTypeDecl(ctxt);
5977 		    } else if ((cur == '<') && (next == '!') &&
5978 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5979 			if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5980 			    goto done;
5981 #ifdef DEBUG_PUSH
5982 			xmlGenericError(xmlGenericErrorContext,
5983 				"HPP: Parsing Comment\n");
5984 #endif
5985 			htmlParseComment(ctxt);
5986 			ctxt->instate = XML_PARSER_CONTENT;
5987 		    } else if ((cur == '<') && (next == '?')) {
5988 			if ((!terminate) &&
5989 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5990 			    goto done;
5991 #ifdef DEBUG_PUSH
5992 			xmlGenericError(xmlGenericErrorContext,
5993 				"HPP: Parsing PI\n");
5994 #endif
5995 			htmlParsePI(ctxt);
5996 			ctxt->instate = XML_PARSER_CONTENT;
5997 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5998 			goto done;
5999 		    } else if ((cur == '<') && (next == '/')) {
6000 			ctxt->instate = XML_PARSER_END_TAG;
6001 			ctxt->checkIndex = 0;
6002 #ifdef DEBUG_PUSH
6003 			xmlGenericError(xmlGenericErrorContext,
6004 				"HPP: entering END_TAG\n");
6005 #endif
6006 			break;
6007 		    } else if (cur == '<') {
6008                         if ((!terminate) && (next == 0))
6009                             goto done;
6010                         ctxt->instate = XML_PARSER_START_TAG;
6011                         ctxt->checkIndex = 0;
6012 #ifdef DEBUG_PUSH
6013                         xmlGenericError(xmlGenericErrorContext,
6014                                 "HPP: entering START_TAG\n");
6015 #endif
6016 			break;
6017 		    } else {
6018 		        /*
6019 			 * check that the text sequence is complete
6020 			 * before handing out the data to the parser
6021 			 * to avoid problems with erroneous end of
6022 			 * data detection.
6023 			 */
6024 			if ((!terminate) &&
6025                             (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6026 			    goto done;
6027 			ctxt->checkIndex = 0;
6028 #ifdef DEBUG_PUSH
6029 			xmlGenericError(xmlGenericErrorContext,
6030 				"HPP: Parsing char data\n");
6031 #endif
6032                         while ((ctxt->instate != XML_PARSER_EOF) &&
6033                                (cur != '<') && (in->cur < in->end)) {
6034                             if (cur == '&') {
6035 			        htmlParseReference(ctxt);
6036                             } else {
6037 			        htmlParseCharData(ctxt);
6038                             }
6039                             cur = in->cur[0];
6040                         }
6041 		    }
6042 		}
6043 
6044 		break;
6045 	    }
6046             case XML_PARSER_END_TAG:
6047 		if (avail < 2)
6048 		    goto done;
6049 		if ((!terminate) &&
6050 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6051 		    goto done;
6052 		htmlParseEndTag(ctxt);
6053 		if (ctxt->nameNr == 0) {
6054 		    ctxt->instate = XML_PARSER_EPILOG;
6055 		} else {
6056 		    ctxt->instate = XML_PARSER_CONTENT;
6057 		}
6058 		ctxt->checkIndex = 0;
6059 #ifdef DEBUG_PUSH
6060 		xmlGenericError(xmlGenericErrorContext,
6061 			"HPP: entering CONTENT\n");
6062 #endif
6063 	        break;
6064             case XML_PARSER_CDATA_SECTION:
6065 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6066 			"HPP: internal error, state == CDATA\n",
6067 			     NULL, NULL);
6068 		ctxt->instate = XML_PARSER_CONTENT;
6069 		ctxt->checkIndex = 0;
6070 #ifdef DEBUG_PUSH
6071 		xmlGenericError(xmlGenericErrorContext,
6072 			"HPP: entering CONTENT\n");
6073 #endif
6074 		break;
6075             case XML_PARSER_DTD:
6076 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6077 			"HPP: internal error, state == DTD\n",
6078 			     NULL, NULL);
6079 		ctxt->instate = XML_PARSER_CONTENT;
6080 		ctxt->checkIndex = 0;
6081 #ifdef DEBUG_PUSH
6082 		xmlGenericError(xmlGenericErrorContext,
6083 			"HPP: entering CONTENT\n");
6084 #endif
6085 		break;
6086             case XML_PARSER_COMMENT:
6087 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6088 			"HPP: internal error, state == COMMENT\n",
6089 			     NULL, NULL);
6090 		ctxt->instate = XML_PARSER_CONTENT;
6091 		ctxt->checkIndex = 0;
6092 #ifdef DEBUG_PUSH
6093 		xmlGenericError(xmlGenericErrorContext,
6094 			"HPP: entering CONTENT\n");
6095 #endif
6096 		break;
6097             case XML_PARSER_PI:
6098 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6099 			"HPP: internal error, state == PI\n",
6100 			     NULL, NULL);
6101 		ctxt->instate = XML_PARSER_CONTENT;
6102 		ctxt->checkIndex = 0;
6103 #ifdef DEBUG_PUSH
6104 		xmlGenericError(xmlGenericErrorContext,
6105 			"HPP: entering CONTENT\n");
6106 #endif
6107 		break;
6108             case XML_PARSER_ENTITY_DECL:
6109 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6110 			"HPP: internal error, state == ENTITY_DECL\n",
6111 			     NULL, NULL);
6112 		ctxt->instate = XML_PARSER_CONTENT;
6113 		ctxt->checkIndex = 0;
6114 #ifdef DEBUG_PUSH
6115 		xmlGenericError(xmlGenericErrorContext,
6116 			"HPP: entering CONTENT\n");
6117 #endif
6118 		break;
6119             case XML_PARSER_ENTITY_VALUE:
6120 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6121 			"HPP: internal error, state == ENTITY_VALUE\n",
6122 			     NULL, NULL);
6123 		ctxt->instate = XML_PARSER_CONTENT;
6124 		ctxt->checkIndex = 0;
6125 #ifdef DEBUG_PUSH
6126 		xmlGenericError(xmlGenericErrorContext,
6127 			"HPP: entering DTD\n");
6128 #endif
6129 		break;
6130             case XML_PARSER_ATTRIBUTE_VALUE:
6131 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6132 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
6133 			     NULL, NULL);
6134 		ctxt->instate = XML_PARSER_START_TAG;
6135 		ctxt->checkIndex = 0;
6136 #ifdef DEBUG_PUSH
6137 		xmlGenericError(xmlGenericErrorContext,
6138 			"HPP: entering START_TAG\n");
6139 #endif
6140 		break;
6141 	    case XML_PARSER_SYSTEM_LITERAL:
6142 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6143 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6144 			     NULL, NULL);
6145 		ctxt->instate = XML_PARSER_CONTENT;
6146 		ctxt->checkIndex = 0;
6147 #ifdef DEBUG_PUSH
6148 		xmlGenericError(xmlGenericErrorContext,
6149 			"HPP: entering CONTENT\n");
6150 #endif
6151 		break;
6152 	    case XML_PARSER_IGNORE:
6153 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6154 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
6155 			     NULL, NULL);
6156 		ctxt->instate = XML_PARSER_CONTENT;
6157 		ctxt->checkIndex = 0;
6158 #ifdef DEBUG_PUSH
6159 		xmlGenericError(xmlGenericErrorContext,
6160 			"HPP: entering CONTENT\n");
6161 #endif
6162 		break;
6163 	    case XML_PARSER_PUBLIC_LITERAL:
6164 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6165 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
6166 			     NULL, NULL);
6167 		ctxt->instate = XML_PARSER_CONTENT;
6168 		ctxt->checkIndex = 0;
6169 #ifdef DEBUG_PUSH
6170 		xmlGenericError(xmlGenericErrorContext,
6171 			"HPP: entering CONTENT\n");
6172 #endif
6173 		break;
6174 
6175 	}
6176     }
6177 done:
6178     if ((avail == 0) && (terminate)) {
6179 	htmlAutoCloseOnEnd(ctxt);
6180 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6181 	    /*
6182 	     * SAX: end of the document processing.
6183 	     */
6184 	    ctxt->instate = XML_PARSER_EOF;
6185 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6186 		ctxt->sax->endDocument(ctxt->userData);
6187 	}
6188     }
6189     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6190 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6191 	 (ctxt->instate == XML_PARSER_EPILOG))) {
6192 	xmlDtdPtr dtd;
6193 	dtd = xmlGetIntSubset(ctxt->myDoc);
6194 	if (dtd == NULL)
6195 	    ctxt->myDoc->intSubset =
6196 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6197 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6198 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6199     }
6200 #ifdef DEBUG_PUSH
6201     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6202 #endif
6203     return(ret);
6204 }
6205 
6206 /**
6207  * htmlParseChunk:
6208  * @ctxt:  an HTML parser context
6209  * @chunk:  an char array
6210  * @size:  the size in byte of the chunk
6211  * @terminate:  last chunk indicator
6212  *
6213  * Parse a Chunk of memory
6214  *
6215  * Returns zero if no error, the xmlParserErrors otherwise.
6216  */
6217 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)6218 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6219               int terminate) {
6220     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6221 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6222 		     "htmlParseChunk: context error\n", NULL, NULL);
6223 	return(XML_ERR_INTERNAL_ERROR);
6224     }
6225     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6226         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6227 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6228 	size_t cur = ctxt->input->cur - ctxt->input->base;
6229 	int res;
6230 
6231 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6232         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6233 	if (res < 0) {
6234 	    ctxt->errNo = XML_PARSER_EOF;
6235 	    ctxt->disableSAX = 1;
6236 	    return (XML_PARSER_EOF);
6237 	}
6238 #ifdef DEBUG_PUSH
6239 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6240 #endif
6241 
6242 #if 0
6243 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6244 	    htmlParseTryOrFinish(ctxt, terminate);
6245 #endif
6246     } else if (ctxt->instate != XML_PARSER_EOF) {
6247 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6248 	    xmlParserInputBufferPtr in = ctxt->input->buf;
6249 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
6250 		    (in->raw != NULL)) {
6251 		int nbchars;
6252 		size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6253 		size_t current = ctxt->input->cur - ctxt->input->base;
6254 
6255 		nbchars = xmlCharEncInput(in, terminate);
6256 		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6257 		if (nbchars < 0) {
6258 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6259 			         "encoder error\n", NULL, NULL);
6260 		    return(XML_ERR_INVALID_ENCODING);
6261 		}
6262 	    }
6263 	}
6264     }
6265     htmlParseTryOrFinish(ctxt, terminate);
6266     if (terminate) {
6267 	if ((ctxt->instate != XML_PARSER_EOF) &&
6268 	    (ctxt->instate != XML_PARSER_EPILOG) &&
6269 	    (ctxt->instate != XML_PARSER_MISC)) {
6270 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
6271 	    ctxt->wellFormed = 0;
6272 	}
6273 	if (ctxt->instate != XML_PARSER_EOF) {
6274 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6275 		ctxt->sax->endDocument(ctxt->userData);
6276 	}
6277 	ctxt->instate = XML_PARSER_EOF;
6278     }
6279     return((xmlParserErrors) ctxt->errNo);
6280 }
6281 
6282 /************************************************************************
6283  *									*
6284  *			User entry points				*
6285  *									*
6286  ************************************************************************/
6287 
6288 /**
6289  * htmlCreatePushParserCtxt:
6290  * @sax:  a SAX handler
6291  * @user_data:  The user data returned on SAX callbacks
6292  * @chunk:  a pointer to an array of chars
6293  * @size:  number of chars in the array
6294  * @filename:  an optional file name or URI
6295  * @enc:  an optional encoding
6296  *
6297  * Create a parser context for using the HTML parser in push mode
6298  * The value of @filename is used for fetching external entities
6299  * and error/warning reports.
6300  *
6301  * Returns the new parser context or NULL
6302  */
6303 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)6304 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6305                          const char *chunk, int size, const char *filename,
6306 			 xmlCharEncoding enc) {
6307     htmlParserCtxtPtr ctxt;
6308     htmlParserInputPtr inputStream;
6309     xmlParserInputBufferPtr buf;
6310 
6311     xmlInitParser();
6312 
6313     buf = xmlAllocParserInputBuffer(enc);
6314     if (buf == NULL) return(NULL);
6315 
6316     ctxt = htmlNewParserCtxt();
6317     if (ctxt == NULL) {
6318 	xmlFreeParserInputBuffer(buf);
6319 	return(NULL);
6320     }
6321     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6322 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6323     if (sax != NULL) {
6324 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6325 	    xmlFree(ctxt->sax);
6326 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6327 	if (ctxt->sax == NULL) {
6328 	    xmlFree(buf);
6329 	    xmlFree(ctxt);
6330 	    return(NULL);
6331 	}
6332 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6333 	if (user_data != NULL)
6334 	    ctxt->userData = user_data;
6335     }
6336     if (filename == NULL) {
6337 	ctxt->directory = NULL;
6338     } else {
6339         ctxt->directory = xmlParserGetDirectory(filename);
6340     }
6341 
6342     inputStream = htmlNewInputStream(ctxt);
6343     if (inputStream == NULL) {
6344 	xmlFreeParserCtxt(ctxt);
6345 	xmlFree(buf);
6346 	return(NULL);
6347     }
6348 
6349     if (filename == NULL)
6350 	inputStream->filename = NULL;
6351     else
6352 	inputStream->filename = (char *)
6353 	    xmlCanonicPath((const xmlChar *) filename);
6354     inputStream->buf = buf;
6355     xmlBufResetInput(buf->buffer, inputStream);
6356 
6357     inputPush(ctxt, inputStream);
6358 
6359     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6360         (ctxt->input->buf != NULL))  {
6361 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6362 	size_t cur = ctxt->input->cur - ctxt->input->base;
6363 
6364 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6365 
6366         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6367 #ifdef DEBUG_PUSH
6368 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6369 #endif
6370     }
6371     ctxt->progressive = 1;
6372 
6373     return(ctxt);
6374 }
6375 #endif /* LIBXML_PUSH_ENABLED */
6376 
6377 /**
6378  * htmlSAXParseDoc:
6379  * @cur:  a pointer to an array of xmlChar
6380  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6381  * @sax:  the SAX handler block
6382  * @userData: if using SAX, this pointer will be provided on callbacks.
6383  *
6384  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6385  * to handle parse events. If sax is NULL, fallback to the default DOM
6386  * behavior and return a tree.
6387  *
6388  * Returns the resulting document tree unless SAX is NULL or the document is
6389  *     not well formed.
6390  */
6391 
6392 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6393 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6394                 htmlSAXHandlerPtr sax, void *userData) {
6395     htmlDocPtr ret;
6396     htmlParserCtxtPtr ctxt;
6397 
6398     xmlInitParser();
6399 
6400     if (cur == NULL) return(NULL);
6401 
6402 
6403     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6404     if (ctxt == NULL) return(NULL);
6405     if (sax != NULL) {
6406         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6407         ctxt->sax = sax;
6408         ctxt->userData = userData;
6409     }
6410 
6411     htmlParseDocument(ctxt);
6412     ret = ctxt->myDoc;
6413     if (sax != NULL) {
6414 	ctxt->sax = NULL;
6415 	ctxt->userData = NULL;
6416     }
6417     htmlFreeParserCtxt(ctxt);
6418 
6419     return(ret);
6420 }
6421 
6422 /**
6423  * htmlParseDoc:
6424  * @cur:  a pointer to an array of xmlChar
6425  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6426  *
6427  * parse an HTML in-memory document and build a tree.
6428  *
6429  * Returns the resulting document tree
6430  */
6431 
6432 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6433 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6434     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6435 }
6436 
6437 
6438 /**
6439  * htmlCreateFileParserCtxt:
6440  * @filename:  the filename
6441  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6442  *
6443  * Create a parser context for a file content.
6444  * Automatic support for ZLIB/Compress compressed document is provided
6445  * by default if found at compile-time.
6446  *
6447  * Returns the new parser context or NULL
6448  */
6449 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6450 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6451 {
6452     htmlParserCtxtPtr ctxt;
6453     htmlParserInputPtr inputStream;
6454     char *canonicFilename;
6455     /* htmlCharEncoding enc; */
6456     xmlChar *content, *content_line = (xmlChar *) "charset=";
6457 
6458     if (filename == NULL)
6459         return(NULL);
6460 
6461     ctxt = htmlNewParserCtxt();
6462     if (ctxt == NULL) {
6463 	return(NULL);
6464     }
6465     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6466     if (canonicFilename == NULL) {
6467 #ifdef LIBXML_SAX1_ENABLED
6468 	if (xmlDefaultSAXHandler.error != NULL) {
6469 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6470 	}
6471 #endif
6472 	xmlFreeParserCtxt(ctxt);
6473 	return(NULL);
6474     }
6475 
6476     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6477     xmlFree(canonicFilename);
6478     if (inputStream == NULL) {
6479 	xmlFreeParserCtxt(ctxt);
6480 	return(NULL);
6481     }
6482 
6483     inputPush(ctxt, inputStream);
6484 
6485     /* set encoding */
6486     if (encoding) {
6487         size_t l = strlen(encoding);
6488 
6489 	if (l < 1000) {
6490 	    content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6491 	    if (content) {
6492 		strcpy ((char *)content, (char *)content_line);
6493 		strcat ((char *)content, (char *)encoding);
6494 		htmlCheckEncoding (ctxt, content);
6495 		xmlFree (content);
6496 	    }
6497 	}
6498     }
6499 
6500     return(ctxt);
6501 }
6502 
6503 /**
6504  * htmlSAXParseFile:
6505  * @filename:  the filename
6506  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6507  * @sax:  the SAX handler block
6508  * @userData: if using SAX, this pointer will be provided on callbacks.
6509  *
6510  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6511  * compressed document is provided by default if found at compile-time.
6512  * It use the given SAX function block to handle the parsing callback.
6513  * If sax is NULL, fallback to the default DOM tree building routines.
6514  *
6515  * Returns the resulting document tree unless SAX is NULL or the document is
6516  *     not well formed.
6517  */
6518 
6519 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6520 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6521                  void *userData) {
6522     htmlDocPtr ret;
6523     htmlParserCtxtPtr ctxt;
6524     htmlSAXHandlerPtr oldsax = NULL;
6525 
6526     xmlInitParser();
6527 
6528     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6529     if (ctxt == NULL) return(NULL);
6530     if (sax != NULL) {
6531 	oldsax = ctxt->sax;
6532         ctxt->sax = sax;
6533         ctxt->userData = userData;
6534     }
6535 
6536     htmlParseDocument(ctxt);
6537 
6538     ret = ctxt->myDoc;
6539     if (sax != NULL) {
6540         ctxt->sax = oldsax;
6541         ctxt->userData = NULL;
6542     }
6543     htmlFreeParserCtxt(ctxt);
6544 
6545     return(ret);
6546 }
6547 
6548 /**
6549  * htmlParseFile:
6550  * @filename:  the filename
6551  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6552  *
6553  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6554  * compressed document is provided by default if found at compile-time.
6555  *
6556  * Returns the resulting document tree
6557  */
6558 
6559 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6560 htmlParseFile(const char *filename, const char *encoding) {
6561     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6562 }
6563 
6564 /**
6565  * htmlHandleOmittedElem:
6566  * @val:  int 0 or 1
6567  *
6568  * Set and return the previous value for handling HTML omitted tags.
6569  *
6570  * Returns the last value for 0 for no handling, 1 for auto insertion.
6571  */
6572 
6573 int
htmlHandleOmittedElem(int val)6574 htmlHandleOmittedElem(int val) {
6575     int old = htmlOmittedDefaultValue;
6576 
6577     htmlOmittedDefaultValue = val;
6578     return(old);
6579 }
6580 
6581 /**
6582  * htmlElementAllowedHere:
6583  * @parent: HTML parent element
6584  * @elt: HTML element
6585  *
6586  * Checks whether an HTML element may be a direct child of a parent element.
6587  * Note - doesn't check for deprecated elements
6588  *
6589  * Returns 1 if allowed; 0 otherwise.
6590  */
6591 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6592 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6593   const char** p ;
6594 
6595   if ( ! elt || ! parent || ! parent->subelts )
6596 	return 0 ;
6597 
6598   for ( p = parent->subelts; *p; ++p )
6599     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6600       return 1 ;
6601 
6602   return 0 ;
6603 }
6604 /**
6605  * htmlElementStatusHere:
6606  * @parent: HTML parent element
6607  * @elt: HTML element
6608  *
6609  * Checks whether an HTML element may be a direct child of a parent element.
6610  * and if so whether it is valid or deprecated.
6611  *
6612  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6613  */
6614 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6615 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6616   if ( ! parent || ! elt )
6617     return HTML_INVALID ;
6618   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6619     return HTML_INVALID ;
6620 
6621   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6622 }
6623 /**
6624  * htmlAttrAllowed:
6625  * @elt: HTML element
6626  * @attr: HTML attribute
6627  * @legacy: whether to allow deprecated attributes
6628  *
6629  * Checks whether an attribute is valid for an element
6630  * Has full knowledge of Required and Deprecated attributes
6631  *
6632  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6633  */
6634 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6635 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6636   const char** p ;
6637 
6638   if ( !elt || ! attr )
6639 	return HTML_INVALID ;
6640 
6641   if ( elt->attrs_req )
6642     for ( p = elt->attrs_req; *p; ++p)
6643       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6644         return HTML_REQUIRED ;
6645 
6646   if ( elt->attrs_opt )
6647     for ( p = elt->attrs_opt; *p; ++p)
6648       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6649         return HTML_VALID ;
6650 
6651   if ( legacy && elt->attrs_depr )
6652     for ( p = elt->attrs_depr; *p; ++p)
6653       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6654         return HTML_DEPRECATED ;
6655 
6656   return HTML_INVALID ;
6657 }
6658 /**
6659  * htmlNodeStatus:
6660  * @node: an htmlNodePtr in a tree
6661  * @legacy: whether to allow deprecated elements (YES is faster here
6662  *	for Element nodes)
6663  *
6664  * Checks whether the tree node is valid.  Experimental (the author
6665  *     only uses the HTML enhancements in a SAX parser)
6666  *
6667  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6668  *	legacy allowed) or htmlElementStatusHere (otherwise).
6669  *	for Attribute nodes, a return from htmlAttrAllowed
6670  *	for other nodes, HTML_NA (no checks performed)
6671  */
6672 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6673 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6674   if ( ! node )
6675     return HTML_INVALID ;
6676 
6677   switch ( node->type ) {
6678     case XML_ELEMENT_NODE:
6679       return legacy
6680 	? ( htmlElementAllowedHere (
6681 		htmlTagLookup(node->parent->name) , node->name
6682 		) ? HTML_VALID : HTML_INVALID )
6683 	: htmlElementStatusHere(
6684 		htmlTagLookup(node->parent->name) ,
6685 		htmlTagLookup(node->name) )
6686 	;
6687     case XML_ATTRIBUTE_NODE:
6688       return htmlAttrAllowed(
6689 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6690     default: return HTML_NA ;
6691   }
6692 }
6693 /************************************************************************
6694  *									*
6695  *	New set (2.6.0) of simpler and more flexible APIs		*
6696  *									*
6697  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded; when it is NULL the string is always freed.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and must be terminated with a semicolon by the caller.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6709 
6710 /**
6711  * htmlCtxtReset:
6712  * @ctxt: an HTML parser context
6713  *
6714  * Reset a parser context
6715  */
6716 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6717 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6718 {
6719     xmlParserInputPtr input;
6720     xmlDictPtr dict;
6721 
6722     if (ctxt == NULL)
6723         return;
6724 
6725     xmlInitParser();
6726     dict = ctxt->dict;
6727 
6728     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6729         xmlFreeInputStream(input);
6730     }
6731     ctxt->inputNr = 0;
6732     ctxt->input = NULL;
6733 
6734     ctxt->spaceNr = 0;
6735     if (ctxt->spaceTab != NULL) {
6736 	ctxt->spaceTab[0] = -1;
6737 	ctxt->space = &ctxt->spaceTab[0];
6738     } else {
6739 	ctxt->space = NULL;
6740     }
6741 
6742 
6743     ctxt->nodeNr = 0;
6744     ctxt->node = NULL;
6745 
6746     ctxt->nameNr = 0;
6747     ctxt->name = NULL;
6748 
6749     DICT_FREE(ctxt->version);
6750     ctxt->version = NULL;
6751     DICT_FREE(ctxt->encoding);
6752     ctxt->encoding = NULL;
6753     DICT_FREE(ctxt->directory);
6754     ctxt->directory = NULL;
6755     DICT_FREE(ctxt->extSubURI);
6756     ctxt->extSubURI = NULL;
6757     DICT_FREE(ctxt->extSubSystem);
6758     ctxt->extSubSystem = NULL;
6759     if (ctxt->myDoc != NULL)
6760         xmlFreeDoc(ctxt->myDoc);
6761     ctxt->myDoc = NULL;
6762 
6763     ctxt->standalone = -1;
6764     ctxt->hasExternalSubset = 0;
6765     ctxt->hasPErefs = 0;
6766     ctxt->html = 1;
6767     ctxt->external = 0;
6768     ctxt->instate = XML_PARSER_START;
6769     ctxt->token = 0;
6770 
6771     ctxt->wellFormed = 1;
6772     ctxt->nsWellFormed = 1;
6773     ctxt->disableSAX = 0;
6774     ctxt->valid = 1;
6775     ctxt->vctxt.userData = ctxt;
6776     ctxt->vctxt.error = xmlParserValidityError;
6777     ctxt->vctxt.warning = xmlParserValidityWarning;
6778     ctxt->record_info = 0;
6779     ctxt->checkIndex = 0;
6780     ctxt->inSubset = 0;
6781     ctxt->errNo = XML_ERR_OK;
6782     ctxt->depth = 0;
6783     ctxt->charset = XML_CHAR_ENCODING_NONE;
6784     ctxt->catalogs = NULL;
6785     xmlInitNodeInfoSeq(&ctxt->node_seq);
6786 
6787     if (ctxt->attsDefault != NULL) {
6788         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6789         ctxt->attsDefault = NULL;
6790     }
6791     if (ctxt->attsSpecial != NULL) {
6792         xmlHashFree(ctxt->attsSpecial, NULL);
6793         ctxt->attsSpecial = NULL;
6794     }
6795 }
6796 
6797 /**
6798  * htmlCtxtUseOptions:
6799  * @ctxt: an HTML parser context
6800  * @options:  a combination of htmlParserOption(s)
6801  *
6802  * Applies the options to the parser context
6803  *
6804  * Returns 0 in case of success, the set of unknown or unimplemented options
6805  *         in case of error.
6806  */
6807 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6808 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6809 {
6810     if (ctxt == NULL)
6811         return(-1);
6812 
6813     if (options & HTML_PARSE_NOWARNING) {
6814         ctxt->sax->warning = NULL;
6815         ctxt->vctxt.warning = NULL;
6816         options -= XML_PARSE_NOWARNING;
6817 	ctxt->options |= XML_PARSE_NOWARNING;
6818     }
6819     if (options & HTML_PARSE_NOERROR) {
6820         ctxt->sax->error = NULL;
6821         ctxt->vctxt.error = NULL;
6822         ctxt->sax->fatalError = NULL;
6823         options -= XML_PARSE_NOERROR;
6824 	ctxt->options |= XML_PARSE_NOERROR;
6825     }
6826     if (options & HTML_PARSE_PEDANTIC) {
6827         ctxt->pedantic = 1;
6828         options -= XML_PARSE_PEDANTIC;
6829 	ctxt->options |= XML_PARSE_PEDANTIC;
6830     } else
6831         ctxt->pedantic = 0;
6832     if (options & XML_PARSE_NOBLANKS) {
6833         ctxt->keepBlanks = 0;
6834         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6835         options -= XML_PARSE_NOBLANKS;
6836 	ctxt->options |= XML_PARSE_NOBLANKS;
6837     } else
6838         ctxt->keepBlanks = 1;
6839     if (options & HTML_PARSE_RECOVER) {
6840         ctxt->recovery = 1;
6841 	options -= HTML_PARSE_RECOVER;
6842     } else
6843         ctxt->recovery = 0;
6844     if (options & HTML_PARSE_COMPACT) {
6845 	ctxt->options |= HTML_PARSE_COMPACT;
6846         options -= HTML_PARSE_COMPACT;
6847     }
6848     if (options & XML_PARSE_HUGE) {
6849 	ctxt->options |= XML_PARSE_HUGE;
6850         options -= XML_PARSE_HUGE;
6851     }
6852     if (options & HTML_PARSE_NODEFDTD) {
6853 	ctxt->options |= HTML_PARSE_NODEFDTD;
6854         options -= HTML_PARSE_NODEFDTD;
6855     }
6856     if (options & HTML_PARSE_IGNORE_ENC) {
6857 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6858         options -= HTML_PARSE_IGNORE_ENC;
6859     }
6860     if (options & HTML_PARSE_NOIMPLIED) {
6861         ctxt->options |= HTML_PARSE_NOIMPLIED;
6862         options -= HTML_PARSE_NOIMPLIED;
6863     }
6864     ctxt->dictNames = 0;
6865     return (options);
6866 }
6867 
6868 /**
6869  * htmlDoRead:
6870  * @ctxt:  an HTML parser context
6871  * @URL:  the base URL to use for the document
6872  * @encoding:  the document encoding, or NULL
6873  * @options:  a combination of htmlParserOption(s)
6874  * @reuse:  keep the context for reuse
6875  *
6876  * Common front-end for the htmlRead functions
6877  *
6878  * Returns the resulting document tree or NULL
6879  */
6880 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6881 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6882           int options, int reuse)
6883 {
6884     htmlDocPtr ret;
6885 
6886     htmlCtxtUseOptions(ctxt, options);
6887     ctxt->html = 1;
6888     if (encoding != NULL) {
6889         xmlCharEncodingHandlerPtr hdlr;
6890 
6891 	hdlr = xmlFindCharEncodingHandler(encoding);
6892 	if (hdlr != NULL) {
6893 	    xmlSwitchToEncoding(ctxt, hdlr);
6894 	    if (ctxt->input->encoding != NULL)
6895 	      xmlFree((xmlChar *) ctxt->input->encoding);
6896             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6897         }
6898     }
6899     if ((URL != NULL) && (ctxt->input != NULL) &&
6900         (ctxt->input->filename == NULL))
6901         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6902     htmlParseDocument(ctxt);
6903     ret = ctxt->myDoc;
6904     ctxt->myDoc = NULL;
6905     if (!reuse) {
6906         if ((ctxt->dictNames) &&
6907 	    (ret != NULL) &&
6908 	    (ret->dict == ctxt->dict))
6909 	    ctxt->dict = NULL;
6910 	xmlFreeParserCtxt(ctxt);
6911     }
6912     return (ret);
6913 }
6914 
6915 /**
6916  * htmlReadDoc:
6917  * @cur:  a pointer to a zero terminated string
6918  * @URL:  the base URL to use for the document
6919  * @encoding:  the document encoding, or NULL
6920  * @options:  a combination of htmlParserOption(s)
6921  *
6922  * parse an XML in-memory document and build a tree.
6923  *
6924  * Returns the resulting document tree
6925  */
6926 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6927 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6928 {
6929     htmlParserCtxtPtr ctxt;
6930 
6931     if (cur == NULL)
6932         return (NULL);
6933 
6934     xmlInitParser();
6935     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6936     if (ctxt == NULL)
6937         return (NULL);
6938     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6939 }
6940 
6941 /**
6942  * htmlReadFile:
6943  * @filename:  a file or URL
6944  * @encoding:  the document encoding, or NULL
6945  * @options:  a combination of htmlParserOption(s)
6946  *
6947  * parse an XML file from the filesystem or the network.
6948  *
6949  * Returns the resulting document tree
6950  */
6951 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6952 htmlReadFile(const char *filename, const char *encoding, int options)
6953 {
6954     htmlParserCtxtPtr ctxt;
6955 
6956     xmlInitParser();
6957     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6958     if (ctxt == NULL)
6959         return (NULL);
6960     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6961 }
6962 
6963 /**
6964  * htmlReadMemory:
6965  * @buffer:  a pointer to a char array
6966  * @size:  the size of the array
6967  * @URL:  the base URL to use for the document
6968  * @encoding:  the document encoding, or NULL
6969  * @options:  a combination of htmlParserOption(s)
6970  *
6971  * parse an XML in-memory document and build a tree.
6972  *
6973  * Returns the resulting document tree
6974  */
6975 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6976 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6977 {
6978     htmlParserCtxtPtr ctxt;
6979 
6980     xmlInitParser();
6981     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6982     if (ctxt == NULL)
6983         return (NULL);
6984     htmlDefaultSAXHandlerInit();
6985     if (ctxt->sax != NULL)
6986         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6987     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6988 }
6989 
6990 /**
6991  * htmlReadFd:
6992  * @fd:  an open file descriptor
6993  * @URL:  the base URL to use for the document
6994  * @encoding:  the document encoding, or NULL
6995  * @options:  a combination of htmlParserOption(s)
6996  *
6997  * parse an HTML from a file descriptor and build a tree.
6998  * NOTE that the file descriptor will not be closed when the
6999  *      reader is closed or reset.
7000  *
7001  * Returns the resulting document tree
7002  */
7003 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)7004 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7005 {
7006     htmlParserCtxtPtr ctxt;
7007     xmlParserInputBufferPtr input;
7008     htmlParserInputPtr stream;
7009 
7010     if (fd < 0)
7011         return (NULL);
7012 
7013     xmlInitParser();
7014     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7015     if (input == NULL)
7016         return (NULL);
7017     input->closecallback = NULL;
7018     ctxt = htmlNewParserCtxt();
7019     if (ctxt == NULL) {
7020         xmlFreeParserInputBuffer(input);
7021         return (NULL);
7022     }
7023     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7024     if (stream == NULL) {
7025         xmlFreeParserInputBuffer(input);
7026 	htmlFreeParserCtxt(ctxt);
7027         return (NULL);
7028     }
7029     inputPush(ctxt, stream);
7030     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7031 }
7032 
7033 /**
7034  * htmlReadIO:
7035  * @ioread:  an I/O read function
7036  * @ioclose:  an I/O close function
7037  * @ioctx:  an I/O handler
7038  * @URL:  the base URL to use for the document
7039  * @encoding:  the document encoding, or NULL
7040  * @options:  a combination of htmlParserOption(s)
7041  *
7042  * parse an HTML document from I/O functions and source and build a tree.
7043  *
7044  * Returns the resulting document tree
7045  */
7046 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7047 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7048           void *ioctx, const char *URL, const char *encoding, int options)
7049 {
7050     htmlParserCtxtPtr ctxt;
7051     xmlParserInputBufferPtr input;
7052     xmlParserInputPtr stream;
7053 
7054     if (ioread == NULL)
7055         return (NULL);
7056     xmlInitParser();
7057 
7058     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7059                                          XML_CHAR_ENCODING_NONE);
7060     if (input == NULL) {
7061         if (ioclose != NULL)
7062             ioclose(ioctx);
7063         return (NULL);
7064     }
7065     ctxt = htmlNewParserCtxt();
7066     if (ctxt == NULL) {
7067         xmlFreeParserInputBuffer(input);
7068         return (NULL);
7069     }
7070     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7071     if (stream == NULL) {
7072         xmlFreeParserInputBuffer(input);
7073 	xmlFreeParserCtxt(ctxt);
7074         return (NULL);
7075     }
7076     inputPush(ctxt, stream);
7077     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7078 }
7079 
7080 /**
7081  * htmlCtxtReadDoc:
7082  * @ctxt:  an HTML parser context
7083  * @cur:  a pointer to a zero terminated string
7084  * @URL:  the base URL to use for the document
7085  * @encoding:  the document encoding, or NULL
7086  * @options:  a combination of htmlParserOption(s)
7087  *
7088  * parse an XML in-memory document and build a tree.
7089  * This reuses the existing @ctxt parser context
7090  *
7091  * Returns the resulting document tree
7092  */
7093 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)7094 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7095                const char *URL, const char *encoding, int options)
7096 {
7097     xmlParserInputPtr stream;
7098 
7099     if (cur == NULL)
7100         return (NULL);
7101     if (ctxt == NULL)
7102         return (NULL);
7103     xmlInitParser();
7104 
7105     htmlCtxtReset(ctxt);
7106 
7107     stream = xmlNewStringInputStream(ctxt, cur);
7108     if (stream == NULL) {
7109         return (NULL);
7110     }
7111     inputPush(ctxt, stream);
7112     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7113 }
7114 
7115 /**
7116  * htmlCtxtReadFile:
7117  * @ctxt:  an HTML parser context
7118  * @filename:  a file or URL
7119  * @encoding:  the document encoding, or NULL
7120  * @options:  a combination of htmlParserOption(s)
7121  *
7122  * parse an XML file from the filesystem or the network.
7123  * This reuses the existing @ctxt parser context
7124  *
7125  * Returns the resulting document tree
7126  */
7127 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)7128 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7129                 const char *encoding, int options)
7130 {
7131     xmlParserInputPtr stream;
7132 
7133     if (filename == NULL)
7134         return (NULL);
7135     if (ctxt == NULL)
7136         return (NULL);
7137     xmlInitParser();
7138 
7139     htmlCtxtReset(ctxt);
7140 
7141     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7142     if (stream == NULL) {
7143         return (NULL);
7144     }
7145     inputPush(ctxt, stream);
7146     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7147 }
7148 
7149 /**
7150  * htmlCtxtReadMemory:
7151  * @ctxt:  an HTML parser context
7152  * @buffer:  a pointer to a char array
7153  * @size:  the size of the array
7154  * @URL:  the base URL to use for the document
7155  * @encoding:  the document encoding, or NULL
7156  * @options:  a combination of htmlParserOption(s)
7157  *
7158  * parse an XML in-memory document and build a tree.
7159  * This reuses the existing @ctxt parser context
7160  *
7161  * Returns the resulting document tree
7162  */
7163 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)7164 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7165                   const char *URL, const char *encoding, int options)
7166 {
7167     xmlParserInputBufferPtr input;
7168     xmlParserInputPtr stream;
7169 
7170     if (ctxt == NULL)
7171         return (NULL);
7172     if (buffer == NULL)
7173         return (NULL);
7174     xmlInitParser();
7175 
7176     htmlCtxtReset(ctxt);
7177 
7178     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7179     if (input == NULL) {
7180 	return(NULL);
7181     }
7182 
7183     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7184     if (stream == NULL) {
7185 	xmlFreeParserInputBuffer(input);
7186 	return(NULL);
7187     }
7188 
7189     inputPush(ctxt, stream);
7190     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7191 }
7192 
7193 /**
7194  * htmlCtxtReadFd:
7195  * @ctxt:  an HTML parser context
7196  * @fd:  an open file descriptor
7197  * @URL:  the base URL to use for the document
7198  * @encoding:  the document encoding, or NULL
7199  * @options:  a combination of htmlParserOption(s)
7200  *
7201  * parse an XML from a file descriptor and build a tree.
7202  * This reuses the existing @ctxt parser context
7203  *
7204  * Returns the resulting document tree
7205  */
7206 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)7207 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7208               const char *URL, const char *encoding, int options)
7209 {
7210     xmlParserInputBufferPtr input;
7211     xmlParserInputPtr stream;
7212 
7213     if (fd < 0)
7214         return (NULL);
7215     if (ctxt == NULL)
7216         return (NULL);
7217     xmlInitParser();
7218 
7219     htmlCtxtReset(ctxt);
7220 
7221 
7222     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7223     if (input == NULL)
7224         return (NULL);
7225     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7226     if (stream == NULL) {
7227         xmlFreeParserInputBuffer(input);
7228         return (NULL);
7229     }
7230     inputPush(ctxt, stream);
7231     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7232 }
7233 
7234 /**
7235  * htmlCtxtReadIO:
7236  * @ctxt:  an HTML parser context
7237  * @ioread:  an I/O read function
7238  * @ioclose:  an I/O close function
7239  * @ioctx:  an I/O handler
7240  * @URL:  the base URL to use for the document
7241  * @encoding:  the document encoding, or NULL
7242  * @options:  a combination of htmlParserOption(s)
7243  *
7244  * parse an HTML document from I/O functions and source and build a tree.
7245  * This reuses the existing @ctxt parser context
7246  *
7247  * Returns the resulting document tree
7248  */
7249 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7250 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7251               xmlInputCloseCallback ioclose, void *ioctx,
7252 	      const char *URL,
7253               const char *encoding, int options)
7254 {
7255     xmlParserInputBufferPtr input;
7256     xmlParserInputPtr stream;
7257 
7258     if (ioread == NULL)
7259         return (NULL);
7260     if (ctxt == NULL)
7261         return (NULL);
7262     xmlInitParser();
7263 
7264     htmlCtxtReset(ctxt);
7265 
7266     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7267                                          XML_CHAR_ENCODING_NONE);
7268     if (input == NULL) {
7269         if (ioclose != NULL)
7270             ioclose(ioctx);
7271         return (NULL);
7272     }
7273     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7274     if (stream == NULL) {
7275         xmlFreeParserInputBuffer(input);
7276         return (NULL);
7277     }
7278     inputPush(ctxt, stream);
7279     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7280 }
7281 
7282 #define bottom_HTMLparser
7283 #include "elfgcchack.h"
7284 #endif /* LIBXML_HTML_ENABLED */
7285