• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * HTMLparser.c : an HTML 4.0 non-verifying parser
3  *
4  * See Copyright for the status of this software.
5  *
6  * daniel@veillard.com
7  */
8 
9 #define IN_LIBXML
10 #include "libxml.h"
11 #ifdef LIBXML_HTML_ENABLED
12 
13 #include <string.h>
14 #ifdef HAVE_CTYPE_H
15 #include <ctype.h>
16 #endif
17 #ifdef HAVE_STDLIB_H
18 #include <stdlib.h>
19 #endif
20 #ifdef HAVE_SYS_STAT_H
21 #include <sys/stat.h>
22 #endif
23 #ifdef HAVE_FCNTL_H
24 #include <fcntl.h>
25 #endif
26 #ifdef HAVE_UNISTD_H
27 #include <unistd.h>
28 #endif
29 #ifdef LIBXML_ZLIB_ENABLED
30 #include <zlib.h>
31 #endif
32 
33 #include <libxml/xmlmemory.h>
34 #include <libxml/tree.h>
35 #include <libxml/parser.h>
36 #include <libxml/parserInternals.h>
37 #include <libxml/xmlerror.h>
38 #include <libxml/HTMLparser.h>
39 #include <libxml/HTMLtree.h>
40 #include <libxml/entities.h>
41 #include <libxml/encoding.h>
42 #include <libxml/valid.h>
43 #include <libxml/xmlIO.h>
44 #include <libxml/globals.h>
45 #include <libxml/uri.h>
46 
47 #include "buf.h"
48 #include "enc.h"
49 
50 #define HTML_MAX_NAMELEN 1000
51 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
52 #define HTML_PARSER_BUFFER_SIZE 100
53 
54 /* #define DEBUG */
55 /* #define DEBUG_PUSH */
56 
57 static int htmlOmittedDefaultValue = 1;
58 
59 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60 			     xmlChar end, xmlChar  end2, xmlChar end3);
61 static void htmlParseComment(htmlParserCtxtPtr ctxt);
62 
63 /************************************************************************
64  *									*
65  *		Some factorized error routines				*
66  *									*
67  ************************************************************************/
68 
69 /**
70  * htmlErrMemory:
71  * @ctxt:  an HTML parser context
72  * @extra:  extra information
73  *
74  * Handle a redefinition of attribute error
75  */
76 static void
htmlErrMemory(xmlParserCtxtPtr ctxt,const char * extra)77 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78 {
79     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80         (ctxt->instate == XML_PARSER_EOF))
81 	return;
82     if (ctxt != NULL) {
83         ctxt->errNo = XML_ERR_NO_MEMORY;
84         ctxt->instate = XML_PARSER_EOF;
85         ctxt->disableSAX = 1;
86     }
87     if (extra)
88         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90                         NULL, NULL, 0, 0,
91                         "Memory allocation failed : %s\n", extra);
92     else
93         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95                         NULL, NULL, 0, 0, "Memory allocation failed\n");
96 }
97 
98 /**
99  * htmlParseErr:
100  * @ctxt:  an HTML parser context
101  * @error:  the error number
102  * @msg:  the error message
103  * @str1:  string infor
104  * @str2:  string infor
105  *
106  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107  */
108 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErr(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,const xmlChar * str1,const xmlChar * str2)109 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110              const char *msg, const xmlChar *str1, const xmlChar *str2)
111 {
112     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113         (ctxt->instate == XML_PARSER_EOF))
114 	return;
115     if (ctxt != NULL)
116 	ctxt->errNo = error;
117     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118                     XML_ERR_ERROR, NULL, 0,
119 		    (const char *) str1, (const char *) str2,
120 		    NULL, 0, 0,
121 		    msg, str1, str2);
122     if (ctxt != NULL)
123 	ctxt->wellFormed = 0;
124 }
125 
126 /**
127  * htmlParseErrInt:
128  * @ctxt:  an HTML parser context
129  * @error:  the error number
130  * @msg:  the error message
131  * @val:  integer info
132  *
133  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134  */
135 static void LIBXML_ATTR_FORMAT(3,0)
htmlParseErrInt(xmlParserCtxtPtr ctxt,xmlParserErrors error,const char * msg,int val)136 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137              const char *msg, int val)
138 {
139     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140         (ctxt->instate == XML_PARSER_EOF))
141 	return;
142     if (ctxt != NULL)
143 	ctxt->errNo = error;
144     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
146 		    NULL, val, 0, msg, val);
147     if (ctxt != NULL)
148 	ctxt->wellFormed = 0;
149 }
150 
151 /************************************************************************
152  *									*
153  *	Parser stacks related functions and macros		*
154  *									*
155  ************************************************************************/
156 
157 /**
158  * htmlnamePush:
159  * @ctxt:  an HTML parser context
160  * @value:  the element name
161  *
162  * Pushes a new element name on top of the name stack
163  *
164  * Returns 0 in case of error, the index in the stack otherwise
165  */
166 static int
htmlnamePush(htmlParserCtxtPtr ctxt,const xmlChar * value)167 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168 {
169     if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170         ctxt->html = 3;
171     if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172         ctxt->html = 10;
173     if (ctxt->nameNr >= ctxt->nameMax) {
174         ctxt->nameMax *= 2;
175         ctxt->nameTab = (const xmlChar * *)
176                          xmlRealloc((xmlChar * *)ctxt->nameTab,
177                                     ctxt->nameMax *
178                                     sizeof(ctxt->nameTab[0]));
179         if (ctxt->nameTab == NULL) {
180             htmlErrMemory(ctxt, NULL);
181             return (0);
182         }
183     }
184     ctxt->nameTab[ctxt->nameNr] = value;
185     ctxt->name = value;
186     return (ctxt->nameNr++);
187 }
188 /**
189  * htmlnamePop:
190  * @ctxt: an HTML parser context
191  *
192  * Pops the top element name from the name stack
193  *
194  * Returns the name just removed
195  */
196 static const xmlChar *
htmlnamePop(htmlParserCtxtPtr ctxt)197 htmlnamePop(htmlParserCtxtPtr ctxt)
198 {
199     const xmlChar *ret;
200 
201     if (ctxt->nameNr <= 0)
202         return (NULL);
203     ctxt->nameNr--;
204     if (ctxt->nameNr < 0)
205         return (NULL);
206     if (ctxt->nameNr > 0)
207         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208     else
209         ctxt->name = NULL;
210     ret = ctxt->nameTab[ctxt->nameNr];
211     ctxt->nameTab[ctxt->nameNr] = NULL;
212     return (ret);
213 }
214 
215 /**
216  * htmlNodeInfoPush:
217  * @ctxt:  an HTML parser context
218  * @value:  the node info
219  *
220  * Pushes a new element name on top of the node info stack
221  *
222  * Returns 0 in case of error, the index in the stack otherwise
223  */
224 static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt,htmlParserNodeInfo * value)225 htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226 {
227     if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228         if (ctxt->nodeInfoMax == 0)
229                 ctxt->nodeInfoMax = 5;
230         ctxt->nodeInfoMax *= 2;
231         ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232                          xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233                                     ctxt->nodeInfoMax *
234                                     sizeof(ctxt->nodeInfoTab[0]));
235         if (ctxt->nodeInfoTab == NULL) {
236             htmlErrMemory(ctxt, NULL);
237             return (0);
238         }
239     }
240     ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241     ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242     return (ctxt->nodeInfoNr++);
243 }
244 
245 /**
246  * htmlNodeInfoPop:
247  * @ctxt:  an HTML parser context
248  *
249  * Pops the top element name from the node info stack
250  *
251  * Returns 0 in case of error, the pointer to NodeInfo otherwise
252  */
253 static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)254 htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255 {
256     if (ctxt->nodeInfoNr <= 0)
257         return (NULL);
258     ctxt->nodeInfoNr--;
259     if (ctxt->nodeInfoNr < 0)
260         return (NULL);
261     if (ctxt->nodeInfoNr > 0)
262         ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263     else
264         ctxt->nodeInfo = NULL;
265     return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266 }
267 
/*
 * Macros for accessing the content. Those should be used only by the
 * parser, and not exported. All of them expect a parser context variable
 * named `ctxt' to be in scope at the point of use.
 *
 * Raw byte access, only meaningful for comparing against ASCII values:
 *
 *   CUR_PTR   the current input pointer.
 *   BASE_PTR  the base pointer of the current input.
 *   CUR       the byte at the current input pointer, as an int.
 *   CURRENT   same expansion as CUR.
 *   RAW       -1 when ctxt->token is non-zero, the current byte otherwise.
 *   NXT(n)    the n'th byte after the current pointer.
 *   UPPER     the current byte passed through toupper().
 *   UPP(n)    the n'th next byte passed through toupper().
 *   SKIP(n)   advance both the input pointer and the column by n; it does
 *             no line accounting, so use it only on text without newlines.
 *
 * Buffer management:
 *
 *   SHRINK    call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *             bytes are behind the cursor and fewer than that remain.
 *   GROW      when not in progressive mode, call xmlParserInputGrow() if
 *             fewer than INPUT_CHUNK bytes remain.
 *
 * Character level access:
 *
 *   NEXT            xmlNextChar(ctxt).
 *   SKIP_BLANKS     htmlSkipBlankChars(ctxt).
 *   CUR_CHAR(l)     htmlCurrentChar(ctxt, &l).
 *   CUR_SCHAR(s,l)  xmlStringCurrentChar(ctxt, s, &l).
 *   NEXTL(l)        update line/column for the current byte, clear
 *                   ctxt->token and advance the input pointer by l.
 *   COPY_BUF(l,b,i,v)  store character v at b[i] and advance i: a plain
 *                   byte store when l == 1, xmlCopyChar() otherwise.
 *
 * NOTE(review): SKIP expands to a comma expression, and SHRINK, GROW and
 * COPY_BUF expand to unbraced `if' statements. Use them only as complete
 * statements, never as the body of an if/else.
 */

/* Raw byte access */

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define CUR ((int) (*ctxt->input->cur))

#define CURRENT ((int) (*ctxt->input->cur))

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NXT(val) ctxt->input->cur[(val)]

#define UPPER (toupper(*ctxt->input->cur))

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Buffer management */

#define SHRINK								\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input)

#define GROW								\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Character level access */

#define NEXT xmlNextChar(ctxt)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l;				\
  } while (0)

#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
348 
349 /**
350  * htmlFindEncoding:
351  * @the HTML parser context
352  *
353  * Ty to find and encoding in the current data available in the input
354  * buffer this is needed to try to switch to the proper encoding when
355  * one face a character error.
356  * That's an heuristic, since it's operating outside of parsing it could
357  * try to use a meta which had been commented out, that's the reason it
358  * should only be used in case of error, not as a default.
359  *
360  * Returns an encoding string or NULL if not found, the string need to
361  *   be freed
362  */
363 static xmlChar *
htmlFindEncoding(xmlParserCtxtPtr ctxt)364 htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365     const xmlChar *start, *cur, *end;
366 
367     if ((ctxt == NULL) || (ctxt->input == NULL) ||
368         (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369         (ctxt->input->buf->encoder != NULL))
370         return(NULL);
371     if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372         return(NULL);
373 
374     start = ctxt->input->cur;
375     end = ctxt->input->end;
376     /* we also expect the input buffer to be zero terminated */
377     if (*end != 0)
378         return(NULL);
379 
380     cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381     if (cur == NULL)
382         return(NULL);
383     cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
384     if (cur == NULL)
385         return(NULL);
386     cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
387     if (cur == NULL)
388         return(NULL);
389     cur += 8;
390     start = cur;
391     while (((*cur >= 'A') && (*cur <= 'Z')) ||
392            ((*cur >= 'a') && (*cur <= 'z')) ||
393            ((*cur >= '0') && (*cur <= '9')) ||
394            (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395            cur++;
396     if (cur == start)
397         return(NULL);
398     return(xmlStrndup(start, cur - start));
399 }
400 
401 /**
402  * htmlCurrentChar:
403  * @ctxt:  the HTML parser context
404  * @len:  pointer to the length of the char read
405  *
406  * The current char value, if using UTF-8 this may actually span multiple
407  * bytes in the input buffer. Implement the end of line normalization:
408  * 2.11 End-of-Line Handling
409  * If the encoding is unspecified, in the case we find an ISO-Latin-1
410  * char, then the encoding converter is plugged in automatically.
411  *
412  * Returns the current char value and its length
413  */
414 
415 static int
htmlCurrentChar(xmlParserCtxtPtr ctxt,int * len)416 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417     const unsigned char *cur;
418     unsigned char c;
419     unsigned int val;
420 
421     if (ctxt->instate == XML_PARSER_EOF)
422 	return(0);
423 
424     if (ctxt->token != 0) {
425 	*len = 0;
426 	return(ctxt->token);
427     }
428     if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
429         xmlChar * guess;
430         xmlCharEncodingHandlerPtr handler;
431 
432         /*
433          * Assume it's a fixed length encoding (1) with
434          * a compatible encoding for the ASCII set, since
435          * HTML constructs only use < 128 chars
436          */
437         if ((int) *ctxt->input->cur < 0x80) {
438             *len = 1;
439             if ((*ctxt->input->cur == 0) &&
440                 (ctxt->input->cur < ctxt->input->end)) {
441                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442                                 "Char 0x%X out of allowed range\n", 0);
443                 return(' ');
444             }
445             return((int) *ctxt->input->cur);
446         }
447 
448         /*
449          * Humm this is bad, do an automatic flow conversion
450          */
451         guess = htmlFindEncoding(ctxt);
452         if (guess == NULL) {
453             xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454         } else {
455             if (ctxt->input->encoding != NULL)
456                 xmlFree((xmlChar *) ctxt->input->encoding);
457             ctxt->input->encoding = guess;
458             handler = xmlFindCharEncodingHandler((const char *) guess);
459             if (handler != NULL) {
460                 /*
461                  * Don't use UTF-8 encoder which isn't required and
462                  * can produce invalid UTF-8.
463                  */
464                 if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465                     xmlSwitchToEncoding(ctxt, handler);
466             } else {
467                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468                              "Unsupported encoding %s", guess, NULL);
469             }
470         }
471         ctxt->charset = XML_CHAR_ENCODING_UTF8;
472     }
473 
474     /*
475      * We are supposed to handle UTF8, check it's valid
476      * From rfc2044: encoding of the Unicode values on UTF-8:
477      *
478      * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
479      * 0000 0000-0000 007F   0xxxxxxx
480      * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
481      * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
482      *
483      * Check for the 0x110000 limit too
484      */
485     cur = ctxt->input->cur;
486     c = *cur;
487     if (c & 0x80) {
488         if ((c & 0x40) == 0)
489             goto encoding_error;
490         if (cur[1] == 0) {
491             xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492             cur = ctxt->input->cur;
493         }
494         if ((cur[1] & 0xc0) != 0x80)
495             goto encoding_error;
496         if ((c & 0xe0) == 0xe0) {
497 
498             if (cur[2] == 0) {
499                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500                 cur = ctxt->input->cur;
501             }
502             if ((cur[2] & 0xc0) != 0x80)
503                 goto encoding_error;
504             if ((c & 0xf0) == 0xf0) {
505                 if (cur[3] == 0) {
506                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507                     cur = ctxt->input->cur;
508                 }
509                 if (((c & 0xf8) != 0xf0) ||
510                     ((cur[3] & 0xc0) != 0x80))
511                     goto encoding_error;
512                 /* 4-byte code */
513                 *len = 4;
514                 val = (cur[0] & 0x7) << 18;
515                 val |= (cur[1] & 0x3f) << 12;
516                 val |= (cur[2] & 0x3f) << 6;
517                 val |= cur[3] & 0x3f;
518                 if (val < 0x10000)
519                     goto encoding_error;
520             } else {
521               /* 3-byte code */
522                 *len = 3;
523                 val = (cur[0] & 0xf) << 12;
524                 val |= (cur[1] & 0x3f) << 6;
525                 val |= cur[2] & 0x3f;
526                 if (val < 0x800)
527                     goto encoding_error;
528             }
529         } else {
530           /* 2-byte code */
531             *len = 2;
532             val = (cur[0] & 0x1f) << 6;
533             val |= cur[1] & 0x3f;
534             if (val < 0x80)
535                 goto encoding_error;
536         }
537         if (!IS_CHAR(val)) {
538             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539                             "Char 0x%X out of allowed range\n", val);
540         }
541         return(val);
542     } else {
543         if ((*ctxt->input->cur == 0) &&
544             (ctxt->input->cur < ctxt->input->end)) {
545             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546                             "Char 0x%X out of allowed range\n", 0);
547             *len = 1;
548             return(' ');
549         }
550         /* 1-byte code */
551         *len = 1;
552         return((int) *ctxt->input->cur);
553     }
554 
555 encoding_error:
556     /*
557      * If we detect an UTF8 error that probably mean that the
558      * input encoding didn't get properly advertised in the
559      * declaration header. Report the error and switch the encoding
560      * to ISO-Latin-1 (if you don't like this policy, just declare the
561      * encoding !)
562      */
563     {
564         char buffer[150];
565 
566 	if (ctxt->input->end - ctxt->input->cur >= 4) {
567 	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568 			    ctxt->input->cur[0], ctxt->input->cur[1],
569 			    ctxt->input->cur[2], ctxt->input->cur[3]);
570 	} else {
571 	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572 	}
573 	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574 		     "Input is not proper UTF-8, indicate encoding !\n",
575 		     BAD_CAST buffer, NULL);
576     }
577 
578     /*
579      * Don't switch encodings twice. Note that if there's an encoder, we
580      * shouldn't receive invalid UTF-8 anyway.
581      *
582      * Note that if ctxt->input->buf == NULL, switching encodings is
583      * impossible, see Gitlab issue #34.
584      */
585     if ((ctxt->input->buf != NULL) &&
586         (ctxt->input->buf->encoder == NULL))
587         xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
588     *len = 1;
589     return((int) *ctxt->input->cur);
590 }
591 
592 /**
593  * htmlSkipBlankChars:
594  * @ctxt:  the HTML parser context
595  *
596  * skip all blanks character found at that point in the input streams.
597  *
598  * Returns the number of space chars skipped
599  */
600 
601 static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt)602 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603     int res = 0;
604 
605     while (IS_BLANK_CH(*(ctxt->input->cur))) {
606 	if ((*ctxt->input->cur == 0) &&
607 	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608 		xmlPopInput(ctxt);
609 	} else {
610 	    if (*(ctxt->input->cur) == '\n') {
611 		ctxt->input->line++; ctxt->input->col = 1;
612 	    } else ctxt->input->col++;
613 	    ctxt->input->cur++;
614 	    if (*ctxt->input->cur == 0)
615 		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
616 	}
617 	res++;
618     }
619     return(res);
620 }
621 
622 
623 
624 /************************************************************************
625  *									*
626  *	The list of HTML elements and their properties		*
627  *									*
628  ************************************************************************/
629 
630 /*
631  *  Start Tag: 1 means the start tag can be omitted
632  *  End Tag:   1 means the end tag can be omitted
633  *             2 means it's forbidden (empty elements)
634  *             3 means the tag is stylistic and should be closed easily
635  *  Depr:      this element is deprecated
636  *  DTD:       1 means that this element is valid only in the Loose DTD
637  *             2 means that this element is valid only in the Frameset DTD
638  *
639  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
640 	, subElements , impliedsubelt , Attributes, userdata
641  */
642 
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma-separated list of element names
 * and the matching NB_* macro is the number of names in that list.
 * Composite counts are parenthesized so that they can be used safely
 * inside larger expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated element name lists */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
676 
677 
/* ... and for HTML Attributes */

/*
 * Each group macro expands to a comma-separated list of attribute names
 * and the matching NB_* macro is the number of names in that list.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
697 
698 
699 /* Other declarations that should go inline ... */
700 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
701 	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
702 	"tabindex", "onfocus", "onblur", NULL } ;
703 static const char* const target_attr[] = { "target", NULL } ;
704 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
705 static const char* const alt_attr[] = { "alt", NULL } ;
706 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
707 static const char* const href_attrs[] = { "href", NULL } ;
708 static const char* const clear_attrs[] = { "clear", NULL } ;
709 static const char* const inline_p[] = { INLINE, "p", NULL } ;
710 
711 static const char* const flow_param[] = { FLOW, "param", NULL } ;
712 static const char* const applet_attrs[] = { COREATTRS , "codebase",
713 		"archive", "alt", "name", "height", "width", "align",
714 		"hspace", "vspace", NULL } ;
715 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
716 	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
717 static const char* const basefont_attrs[] =
718 	{ "id", "size", "color", "face", NULL } ;
719 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
720 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
721 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
722 static const char* const body_depr[] = { "background", "bgcolor", "text",
723 	"link", "vlink", "alink", NULL } ;
724 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
725 	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
726 
727 
728 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
729 static const char* const col_elt[] = { "col", NULL } ;
730 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
731 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
732 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
733 static const char* const compact_attr[] = { "compact", NULL } ;
734 static const char* const label_attr[] = { "label", NULL } ;
735 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
736 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
737 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
738 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
739 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
740 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
741 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
742 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
743 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
744 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
745 static const char* const version_attr[] = { "version", NULL } ;
746 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
747 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
748 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
749 static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
750 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
751 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
752 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
753 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
754 static const char* const align_attr[] = { "align", NULL } ;
755 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
756 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
757 static const char* const name_attr[] = { "name", NULL } ;
758 static const char* const action_attr[] = { "action", NULL } ;
759 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
760 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
761 static const char* const content_attr[] = { "content", NULL } ;
762 static const char* const type_attr[] = { "type", NULL } ;
763 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
764 static const char* const object_contents[] = { FLOW, "param", NULL } ;
765 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
766 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
767 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
768 static const char* const option_elt[] = { "option", NULL } ;
769 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
770 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
771 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
772 static const char* const width_attr[] = { "width", NULL } ;
773 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
774 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
775 static const char* const language_attr[] = { "language", NULL } ;
776 static const char* const select_content[] = { "optgroup", "option", NULL } ;
777 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
778 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
779 static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
780 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
781 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
782 static const char* const tr_elt[] = { "tr", NULL } ;
783 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
784 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
785 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
786 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
787 static const char* const tr_contents[] = { "th", "td", NULL } ;
788 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
789 static const char* const li_elt[] = { "li", NULL } ;
790 static const char* const ul_depr[] = { "type", "compact", NULL} ;
791 static const char* const dir_attr[] = { "dir", NULL} ;
792 
793 #define DECL (const char**)
794 
795 static const htmlElemDesc
796 html40ElementTable[] = {
797 { "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
798 	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
799 },
800 { "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802 },
803 { "acronym",	0, 0, 0, 0, 0, 0, 1, "",
804 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805 },
806 { "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
807 	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
808 },
809 { "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
810 	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
811 },
812 { "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813 	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
814 },
815 { "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
816 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817 },
818 { "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
819 	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
820 },
821 { "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
822 	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
823 },
824 { "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825 	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
826 },
827 { "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
828 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829 },
830 { "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
831 	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
832 },
833 { "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
834 	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
835 },
836 { "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
837 	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
838 },
839 { "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
840 	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
841 },
842 { "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
843 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
844 },
845 { "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846 	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
847 },
848 { "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
849 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
850 },
851 { "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853 },
854 { "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
855 	EMPTY , NULL , DECL col_attrs , NULL, NULL
856 },
857 { "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
858 	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
859 },
860 { "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
861 	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
862 },
863 { "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
864 	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
865 },
866 { "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
867 	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
868 },
869 { "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
870 	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
871 },
872 { "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873 	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
874 },
875 { "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
876 	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
877 },
878 { "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
879 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
880 },
881 { "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
882 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883 },
884 { "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885 	EMPTY, NULL, DECL embed_attrs, NULL, NULL
886 },
887 { "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
888 	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
889 },
890 { "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
891 	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
892 },
893 { "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
894 	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
895 },
896 { "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897 	EMPTY, NULL, NULL, DECL frame_attrs, NULL
898 },
899 { "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900 	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
901 },
902 { "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
903 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904 },
905 { "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
906 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907 },
908 { "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
909 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910 },
911 { "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
912 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913 },
914 { "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
915 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916 },
917 { "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
918 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919 },
920 { "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
921 	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
922 },
923 { "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924 	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
925 },
926 { "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
927 	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
928 },
929 { "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
930 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931 },
932 { "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933 	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
934 },
935 { "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
936 	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
937 },
938 { "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
939 	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
940 },
941 { "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
942 	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
943 },
944 { "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945 	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
946 },
947 { "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949 },
950 { "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
951 	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
952 },
953 { "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954 	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
955 },
956 { "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
957 	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
958 },
959 { "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960 	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
961 },
962 { "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963 	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
964 },
965 { "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
966 	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
967 },
968 { "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969 	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
970 },
971 { "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972 	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
973 },
974 { "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975 	DECL html_flow, "div", DECL html_attrs, NULL, NULL
976 },
977 { "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978 	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
979 },
980 { "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
981 	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
982 },
983 { "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
984 	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
985 },
986 { "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987 	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
988 },
989 { "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
990 	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
991 },
992 { "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
993 	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
994 },
995 { "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996 	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
997 },
998 { "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999 	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000 },
1001 { "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003 },
1004 { "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006 },
1007 { "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
1008 	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009 },
1010 { "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
1011 	DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012 },
1013 { "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
1014 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015 },
1016 { "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018 },
1019 { "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021 },
1022 { "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024 },
1025 { "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
1026 	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027 },
1028 { "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
1029 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030 },
1031 { "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
1032 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033 },
1034 { "table",	0, 0, 0, 0, 0, 0, 0, "",
1035 	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036 },
1037 { "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
1038 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039 },
1040 { "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
1041 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042 },
1043 { "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044 	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045 },
1046 { "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
1047 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048 },
1049 { "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
1050 	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051 },
1052 { "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
1053 	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054 },
1055 { "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
1056 	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057 },
1058 { "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
1059 	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060 },
1061 { "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063 },
1064 { "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065 	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066 },
1067 { "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068 	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069 },
1070 { "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071 	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072 }
1073 };
1074 
/* One auto-close rule: an open oldTag element is ended by a newTag start. */
typedef struct {
    const char *oldTag;     /* name of the currently open element */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The array MUST stay sorted by (oldTag, newTag) in strcmp() order:
 * htmlCheckAutoClose() searches it with bsearch() and
 * htmlCompareStartClose().
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b, big */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd, dir, dl, dt */
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    /* font, form */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr, i, legend, li, link */
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu, ol, option */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" },
    { "menu", "form" }, { "menu", "ul" },
    { "ol", "form" }, { "ol", "ul" },
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s, script, small, span, strike, style */
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    /* table parts */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt, u, ul */
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" },
    { "ul", "ol" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1336 
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * have CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1349 
/*
 * The HTML attributes whose content is of type %Script;.
 * The array has no terminating NULL entry.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1375 
/*
 * This table tells the HTML parser what to do with broken pages: each
 * element gets a priority, and a stray end tag is only allowed to close
 * elements whose priority is lower than or equal to its own.
 */
typedef struct {
    const char *name;   /* element name; NULL marks the default entry */
    int priority;       /* priority used when matching end tags */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority; must remain the last entry */
};
1403 
1404 /************************************************************************
1405  *									*
1406  *	functions to handle HTML specific data			*
1407  *									*
1408  ************************************************************************/
1409 
/**
 * htmlInitAutoClose:
 *
 * Does nothing; the function body is intentionally empty.
 */
void
htmlInitAutoClose(void)
{
    /* no-op */
}
1418 
1419 static int
htmlCompareTags(const void * key,const void * member)1420 htmlCompareTags(const void *key, const void *member) {
1421     const xmlChar *tag = (const xmlChar *) key;
1422     const htmlElemDesc *desc = (const htmlElemDesc *) member;
1423 
1424     return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1425 }
1426 
1427 /**
1428  * htmlTagLookup:
1429  * @tag:  The tag name in lowercase
1430  *
1431  * Lookup the HTML tag in the ElementTable
1432  *
1433  * Returns the related htmlElemDescPtr or NULL if not found.
1434  */
1435 const htmlElemDesc *
htmlTagLookup(const xmlChar * tag)1436 htmlTagLookup(const xmlChar *tag) {
1437     if (tag == NULL)
1438         return(NULL);
1439 
1440     return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441                 sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442                 sizeof(htmlElemDesc), htmlCompareTags));
1443 }
1444 
1445 /**
1446  * htmlGetEndPriority:
1447  * @name: The name of the element to look up the priority for.
1448  *
1449  * Return value: The "endtag" priority.
1450  **/
1451 static int
htmlGetEndPriority(const xmlChar * name)1452 htmlGetEndPriority (const xmlChar *name) {
1453     int i = 0;
1454 
1455     while ((htmlEndPriority[i].name != NULL) &&
1456 	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457 	i++;
1458 
1459     return(htmlEndPriority[i].priority);
1460 }
1461 
1462 
1463 static int
htmlCompareStartClose(const void * vkey,const void * member)1464 htmlCompareStartClose(const void *vkey, const void *member) {
1465     const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466     const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467     int ret;
1468 
1469     ret = strcmp(key->oldTag, entry->oldTag);
1470     if (ret == 0)
1471         ret = strcmp(key->newTag, entry->newTag);
1472 
1473     return(ret);
1474 }
1475 
1476 /**
1477  * htmlCheckAutoClose:
1478  * @newtag:  The new tag name
1479  * @oldtag:  The old tag name
1480  *
1481  * Checks whether the new tag is one of the registered valid tags for
1482  * closing old.
1483  *
1484  * Returns 0 if no, 1 if yes.
1485  */
1486 static int
htmlCheckAutoClose(const xmlChar * newtag,const xmlChar * oldtag)1487 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488 {
1489     htmlStartCloseEntry key;
1490     void *res;
1491 
1492     key.oldTag = (const char *) oldtag;
1493     key.newTag = (const char *) newtag;
1494     res = bsearch(&key, htmlStartClose,
1495             sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496             sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497     return(res != NULL);
1498 }
1499 
1500 /**
1501  * htmlAutoCloseOnClose:
1502  * @ctxt:  an HTML parser context
1503  * @newtag:  The new tag name
1504  * @force:  force the tag closure
1505  *
1506  * The HTML DTD allows an ending tag to implicitly close other tags.
1507  */
1508 static void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1509 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510 {
1511     const htmlElemDesc *info;
1512     int i, priority;
1513 
1514     priority = htmlGetEndPriority(newtag);
1515 
1516     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517 
1518         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519             break;
1520         /*
1521          * A misplaced endtag can only close elements with lower
1522          * or equal priority, so if we find an element with higher
1523          * priority before we find an element with
1524          * matching name, we just ignore this endtag
1525          */
1526         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527             return;
1528     }
1529     if (i < 0)
1530         return;
1531 
1532     while (!xmlStrEqual(newtag, ctxt->name)) {
1533         info = htmlTagLookup(ctxt->name);
1534         if ((info != NULL) && (info->endTag == 3)) {
1535             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536 	                 "Opening and ending tag mismatch: %s and %s\n",
1537 			 newtag, ctxt->name);
1538         }
1539         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541 	htmlnamePop(ctxt);
1542     }
1543 }
1544 
1545 /**
1546  * htmlAutoCloseOnEnd:
1547  * @ctxt:  an HTML parser context
1548  *
1549  * Close all remaining tags at the end of the stream
1550  */
1551 static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)1552 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553 {
1554     int i;
1555 
1556     if (ctxt->nameNr == 0)
1557         return;
1558     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561 	htmlnamePop(ctxt);
1562     }
1563 }
1564 
1565 /**
1566  * htmlAutoClose:
1567  * @ctxt:  an HTML parser context
1568  * @newtag:  The new tag name or NULL
1569  *
1570  * The HTML DTD allows a tag to implicitly close other tags.
1571  * The list is kept in htmlStartClose array. This function is
1572  * called when a new tag has been detected and generates the
1573  * appropriates closes if possible/needed.
1574  * If newtag is NULL this mean we are at the end of the resource
1575  * and we should check
1576  */
1577 static void
htmlAutoClose(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1578 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579 {
1580     while ((newtag != NULL) && (ctxt->name != NULL) &&
1581            (htmlCheckAutoClose(newtag, ctxt->name))) {
1582         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584 	htmlnamePop(ctxt);
1585     }
1586     if (newtag == NULL) {
1587         htmlAutoCloseOnEnd(ctxt);
1588         return;
1589     }
1590     while ((newtag == NULL) && (ctxt->name != NULL) &&
1591            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595             ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596 	htmlnamePop(ctxt);
1597     }
1598 }
1599 
1600 /**
1601  * htmlAutoCloseTag:
1602  * @doc:  the HTML document
1603  * @name:  The tag name
1604  * @elem:  the HTML element
1605  *
1606  * The HTML DTD allows a tag to implicitly close other tags.
1607  * The list is kept in htmlStartClose array. This function checks
1608  * if the element or one of it's children would autoclose the
1609  * given tag.
1610  *
1611  * Returns 1 if autoclose, 0 otherwise
1612  */
1613 int
htmlAutoCloseTag(htmlDocPtr doc,const xmlChar * name,htmlNodePtr elem)1614 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615     htmlNodePtr child;
1616 
1617     if (elem == NULL) return(1);
1618     if (xmlStrEqual(name, elem->name)) return(0);
1619     if (htmlCheckAutoClose(elem->name, name)) return(1);
1620     child = elem->children;
1621     while (child != NULL) {
1622         if (htmlAutoCloseTag(doc, name, child)) return(1);
1623 	child = child->next;
1624     }
1625     return(0);
1626 }
1627 
1628 /**
1629  * htmlIsAutoClosed:
1630  * @doc:  the HTML document
1631  * @elem:  the HTML element
1632  *
1633  * The HTML DTD allows a tag to implicitly close other tags.
1634  * The list is kept in htmlStartClose array. This function checks
1635  * if a tag is autoclosed by one of it's child
1636  *
1637  * Returns 1 if autoclosed, 0 otherwise
1638  */
1639 int
htmlIsAutoClosed(htmlDocPtr doc,htmlNodePtr elem)1640 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641     htmlNodePtr child;
1642 
1643     if (elem == NULL) return(1);
1644     child = elem->children;
1645     while (child != NULL) {
1646 	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647 	child = child->next;
1648     }
1649     return(0);
1650 }
1651 
1652 /**
1653  * htmlCheckImplied:
1654  * @ctxt:  an HTML parser context
1655  * @newtag:  The new tag name
1656  *
1657  * The HTML DTD allows a tag to exists only implicitly
1658  * called when a new tag has been detected and generates the
1659  * appropriates implicit tags if missing
1660  */
1661 static void
htmlCheckImplied(htmlParserCtxtPtr ctxt,const xmlChar * newtag)1662 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663     int i;
1664 
1665     if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666         return;
1667     if (!htmlOmittedDefaultValue)
1668 	return;
1669     if (xmlStrEqual(newtag, BAD_CAST"html"))
1670 	return;
1671     if (ctxt->nameNr <= 0) {
1672 	htmlnamePush(ctxt, BAD_CAST"html");
1673 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675     }
1676     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677         return;
1678     if ((ctxt->nameNr <= 1) &&
1679         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680 	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681 	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682 	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683 	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684 	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685         if (ctxt->html >= 3) {
1686             /* we already saw or generated an <head> before */
1687             return;
1688         }
1689         /*
1690          * dropped OBJECT ... i you put it first BODY will be
1691          * assumed !
1692          */
1693         htmlnamePush(ctxt, BAD_CAST"head");
1694         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695             ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697 	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698 	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699         if (ctxt->html >= 10) {
1700             /* we already saw or generated a <body> before */
1701             return;
1702         }
1703 	for (i = 0;i < ctxt->nameNr;i++) {
1704 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705 		return;
1706 	    }
1707 	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708 		return;
1709 	    }
1710 	}
1711 
1712 	htmlnamePush(ctxt, BAD_CAST"body");
1713 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715     }
1716 }
1717 
1718 /**
1719  * htmlCheckParagraph
1720  * @ctxt:  an HTML parser context
1721  *
1722  * Check whether a p element need to be implied before inserting
1723  * characters in the current element.
1724  *
1725  * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726  *         in case of error.
1727  */
1728 
1729 static int
htmlCheckParagraph(htmlParserCtxtPtr ctxt)1730 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731     const xmlChar *tag;
1732     int i;
1733 
1734     if (ctxt == NULL)
1735 	return(-1);
1736     tag = ctxt->name;
1737     if (tag == NULL) {
1738 	htmlAutoClose(ctxt, BAD_CAST"p");
1739 	htmlCheckImplied(ctxt, BAD_CAST"p");
1740 	htmlnamePush(ctxt, BAD_CAST"p");
1741 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742 	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743 	return(1);
1744     }
1745     if (!htmlOmittedDefaultValue)
1746 	return(0);
1747     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748 	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749 	    htmlAutoClose(ctxt, BAD_CAST"p");
1750 	    htmlCheckImplied(ctxt, BAD_CAST"p");
1751 	    htmlnamePush(ctxt, BAD_CAST"p");
1752 	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753 		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754 	    return(1);
1755 	}
1756     }
1757     return(0);
1758 }
1759 
1760 /**
1761  * htmlIsScriptAttribute:
1762  * @name:  an attribute name
1763  *
1764  * Check if an attribute is of content type Script
1765  *
1766  * Returns 1 is the attribute is a script 0 otherwise
1767  */
1768 int
htmlIsScriptAttribute(const xmlChar * name)1769 htmlIsScriptAttribute(const xmlChar *name) {
1770     unsigned int i;
1771 
1772     if (name == NULL)
1773       return(0);
1774     /*
1775      * all script attributes start with 'on'
1776      */
1777     if ((name[0] != 'o') || (name[1] != 'n'))
1778       return(0);
1779     for (i = 0;
1780 	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781 	 i++) {
1782 	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783 	    return(1);
1784     }
1785     return(0);
1786 }
1787 
1788 /************************************************************************
1789  *									*
1790  *	The list of HTML predefined entities			*
1791  *									*
1792  ************************************************************************/
1793 
1794 
1795 static const htmlEntityDesc  html40EntitiesTable[] = {
1796 /*
1797  * the 4 absolute ones, plus apostrophe.
1798  */
1799 { 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1800 { 38,	"amp",	"ampersand, U+0026 ISOnum" },
1801 { 39,	"apos",	"single quote" },
1802 { 60,	"lt",	"less-than sign, U+003C ISOnum" },
1803 { 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1804 
1805 /*
1806  * A bunch still in the 128-255 range
1807  * Replacing them depend really on the charset used.
1808  */
1809 { 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1810 { 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1811 { 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1812 { 163,	"pound","pound sign, U+00A3 ISOnum" },
1813 { 164,	"curren","currency sign, U+00A4 ISOnum" },
1814 { 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1815 { 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1816 { 167,	"sect",	"section sign, U+00A7 ISOnum" },
1817 { 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1818 { 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1819 { 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1820 { 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1821 { 172,	"not",	"not sign, U+00AC ISOnum" },
1822 { 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1823 { 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1824 { 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1825 { 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1826 { 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1827 { 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1828 { 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1829 { 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1830 { 181,	"micro","micro sign, U+00B5 ISOnum" },
1831 { 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1832 { 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1833 { 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1834 { 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1835 { 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1836 { 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1837 { 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1838 { 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1839 { 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1840 { 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1841 { 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1842 { 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1843 { 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1844 { 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1845 { 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1846 { 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1847 { 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1848 { 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1849 { 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1850 { 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1851 { 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1852 { 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1853 { 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1854 { 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1855 { 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1856 { 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1857 { 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1858 { 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1859 { 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1860 { 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1861 { 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1862 { 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1863 { 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1864 { 215,	"times","multiplication sign, U+00D7 ISOnum" },
1865 { 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1866 { 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1867 { 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1868 { 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1869 { 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1870 { 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1871 { 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1872 { 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1873 { 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1874 { 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1875 { 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1876 { 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1877 { 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1878 { 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1879 { 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1880 { 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1881 { 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1882 { 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1883 { 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1884 { 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1885 { 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1886 { 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1887 { 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1888 { 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1889 { 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1890 { 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1891 { 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1892 { 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1893 { 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1894 { 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1895 { 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1896 { 247,	"divide","division sign, U+00F7 ISOnum" },
1897 { 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1898 { 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1899 { 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1900 { 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1901 { 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1902 { 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1903 { 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1904 { 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1905 
1906 { 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1907 { 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1908 { 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1909 { 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1910 { 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1911 
1912 /*
1913  * Anything below should really be kept as entities references
1914  */
1915 { 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1916 
1917 { 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1918 { 732,	"tilde","small tilde, U+02DC ISOdia" },
1919 
1920 { 913,	"Alpha","greek capital letter alpha, U+0391" },
1921 { 914,	"Beta",	"greek capital letter beta, U+0392" },
1922 { 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1923 { 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1924 { 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1925 { 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1926 { 919,	"Eta",	"greek capital letter eta, U+0397" },
1927 { 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1928 { 921,	"Iota",	"greek capital letter iota, U+0399" },
1929 { 922,	"Kappa","greek capital letter kappa, U+039A" },
1930 { 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1931 { 924,	"Mu",	"greek capital letter mu, U+039C" },
1932 { 925,	"Nu",	"greek capital letter nu, U+039D" },
1933 { 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1934 { 927,	"Omicron","greek capital letter omicron, U+039F" },
1935 { 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1936 { 929,	"Rho",	"greek capital letter rho, U+03A1" },
1937 { 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1938 { 932,	"Tau",	"greek capital letter tau, U+03A4" },
1939 { 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1940 { 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1941 { 935,	"Chi",	"greek capital letter chi, U+03A7" },
1942 { 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1943 { 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1944 
1945 { 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1946 { 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1947 { 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1948 { 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1949 { 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1950 { 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1951 { 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1952 { 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1953 { 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1954 { 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1955 { 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1956 { 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1957 { 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1958 { 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1959 { 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1960 { 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1961 { 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1962 { 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1963 { 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1964 { 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1965 { 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1966 { 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1967 { 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1968 { 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1969 { 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1970 { 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1971 { 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1972 { 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1973 
1974 { 8194,	"ensp",	"en space, U+2002 ISOpub" },
1975 { 8195,	"emsp",	"em space, U+2003 ISOpub" },
1976 { 8201,	"thinsp","thin space, U+2009 ISOpub" },
1977 { 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1978 { 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1979 { 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1980 { 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1981 { 8211,	"ndash","en dash, U+2013 ISOpub" },
1982 { 8212,	"mdash","em dash, U+2014 ISOpub" },
1983 { 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1984 { 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1985 { 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1986 { 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1987 { 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1988 { 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1989 { 8224,	"dagger","dagger, U+2020 ISOpub" },
1990 { 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1991 
1992 { 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1993 { 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1994 
1995 { 8240,	"permil","per mille sign, U+2030 ISOtech" },
1996 
1997 { 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1998 { 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1999 
2000 { 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
2001 { 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
2002 
2003 { 8254,	"oline","overline = spacing overscore, U+203E NEW" },
2004 { 8260,	"frasl","fraction slash, U+2044 NEW" },
2005 
2006 { 8364,	"euro",	"euro sign, U+20AC NEW" },
2007 
2008 { 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
2009 { 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
2010 { 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
2011 { 8482,	"trade","trade mark sign, U+2122 ISOnum" },
2012 { 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2013 { 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
2014 { 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
2015 { 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
2016 { 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
2017 { 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
2018 { 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2019 { 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
2020 { 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
2021 { 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
2022 { 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
2023 { 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
2024 
2025 { 8704,	"forall","for all, U+2200 ISOtech" },
2026 { 8706,	"part",	"partial differential, U+2202 ISOtech" },
2027 { 8707,	"exist","there exists, U+2203 ISOtech" },
2028 { 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
2029 { 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
2030 { 8712,	"isin",	"element of, U+2208 ISOtech" },
2031 { 8713,	"notin","not an element of, U+2209 ISOtech" },
2032 { 8715,	"ni",	"contains as member, U+220B ISOtech" },
2033 { 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
2034 { 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
2035 { 8722,	"minus","minus sign, U+2212 ISOtech" },
2036 { 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
2037 { 8730,	"radic","square root = radical sign, U+221A ISOtech" },
2038 { 8733,	"prop",	"proportional to, U+221D ISOtech" },
2039 { 8734,	"infin","infinity, U+221E ISOtech" },
2040 { 8736,	"ang",	"angle, U+2220 ISOamso" },
2041 { 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
2042 { 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
2043 { 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
2044 { 8746,	"cup",	"union = cup, U+222A ISOtech" },
2045 { 8747,	"int",	"integral, U+222B ISOtech" },
2046 { 8756,	"there4","therefore, U+2234 ISOtech" },
2047 { 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
2048 { 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
2049 { 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2050 { 8800,	"ne",	"not equal to, U+2260 ISOtech" },
2051 { 8801,	"equiv","identical to, U+2261 ISOtech" },
2052 { 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
2053 { 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
2054 { 8834,	"sub",	"subset of, U+2282 ISOtech" },
2055 { 8835,	"sup",	"superset of, U+2283 ISOtech" },
2056 { 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
2057 { 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
2058 { 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
2059 { 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
2060 { 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
2061 { 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2062 { 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
2063 { 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2064 { 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
2065 { 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
2066 { 8971,	"rfloor","right floor, U+230B ISOamsc" },
2067 { 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
2068 { 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
2069 { 9674,	"loz",	"lozenge, U+25CA ISOpub" },
2070 
2071 { 9824,	"spades","black spade suit, U+2660 ISOpub" },
2072 { 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
2073 { 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
2074 { 9830,	"diams","black diamond suit, U+2666 ISOpub" },
2075 
2076 };
2077 
2078 /************************************************************************
2079  *									*
2080  *		Commodity functions to handle entities			*
2081  *									*
2082  ************************************************************************/
2083 
/*
 * Macro used to double the size of the current buffer.
 *
 * It expects a variable named <buffer>_size holding the current size
 * and a parser context named ctxt to be visible where it is expanded.
 * When the reallocation fails the error is reported, the old buffer
 * is freed and the enclosing function returns NULL.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (grown != NULL) {						\
	buffer = grown;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
2098 
2099 /**
2100  * htmlEntityLookup:
2101  * @name: the entity name
2102  *
2103  * Lookup the given entity in EntitiesTable
2104  *
2105  * TODO: the linear scan is really ugly, an hash table is really needed.
2106  *
2107  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108  */
2109 const htmlEntityDesc *
htmlEntityLookup(const xmlChar * name)2110 htmlEntityLookup(const xmlChar *name) {
2111     unsigned int i;
2112 
2113     for (i = 0;i < (sizeof(html40EntitiesTable)/
2114                     sizeof(html40EntitiesTable[0]));i++) {
2115         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117 	}
2118     }
2119     return(NULL);
2120 }
2121 
2122 /**
2123  * htmlEntityValueLookup:
2124  * @value: the entity's unicode value
2125  *
2126  * Lookup the given entity in EntitiesTable
2127  *
2128  * TODO: the linear scan is really ugly, an hash table is really needed.
2129  *
2130  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2131  */
2132 const htmlEntityDesc *
htmlEntityValueLookup(unsigned int value)2133 htmlEntityValueLookup(unsigned int value) {
2134     unsigned int i;
2135 
2136     for (i = 0;i < (sizeof(html40EntitiesTable)/
2137                     sizeof(html40EntitiesTable[0]));i++) {
2138         if (html40EntitiesTable[i].value >= value) {
2139 	    if (html40EntitiesTable[i].value > value)
2140 		break;
2141             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142 	}
2143     }
2144     return(NULL);
2145 }
2146 
2147 /**
2148  * UTF8ToHtml:
2149  * @out:  a pointer to an array of bytes to store the result
2150  * @outlen:  the length of @out
2151  * @in:  a pointer to an array of UTF-8 chars
2152  * @inlen:  the length of @in
2153  *
2154  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155  * plus HTML entities block of chars out.
2156  *
2157  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158  * The value of @inlen after return is the number of octets consumed
2159  *     as the return value is positive, else unpredictable.
2160  * The value of @outlen after return is the number of octets consumed.
2161  */
2162 int
UTF8ToHtml(unsigned char * out,int * outlen,const unsigned char * in,int * inlen)2163 UTF8ToHtml(unsigned char* out, int *outlen,
2164               const unsigned char* in, int *inlen) {
2165     const unsigned char* processed = in;
2166     const unsigned char* outend;
2167     const unsigned char* outstart = out;
2168     const unsigned char* instart = in;
2169     const unsigned char* inend;
2170     unsigned int c, d;
2171     int trailing;
2172 
2173     if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174     if (in == NULL) {
2175         /*
2176 	 * initialization nothing to do
2177 	 */
2178 	*outlen = 0;
2179 	*inlen = 0;
2180 	return(0);
2181     }
2182     inend = in + (*inlen);
2183     outend = out + (*outlen);
2184     while (in < inend) {
2185 	d = *in++;
2186 	if      (d < 0x80)  { c= d; trailing= 0; }
2187 	else if (d < 0xC0) {
2188 	    /* trailing byte in leading position */
2189 	    *outlen = out - outstart;
2190 	    *inlen = processed - instart;
2191 	    return(-2);
2192         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2193         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2194         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2195 	else {
2196 	    /* no chance for this in Ascii */
2197 	    *outlen = out - outstart;
2198 	    *inlen = processed - instart;
2199 	    return(-2);
2200 	}
2201 
2202 	if (inend - in < trailing) {
2203 	    break;
2204 	}
2205 
2206 	for ( ; trailing; trailing--) {
2207 	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208 		break;
2209 	    c <<= 6;
2210 	    c |= d & 0x3F;
2211 	}
2212 
2213 	/* assertion: c is a single UTF-4 value */
2214 	if (c < 0x80) {
2215 	    if (out + 1 >= outend)
2216 		break;
2217 	    *out++ = c;
2218 	} else {
2219 	    int len;
2220 	    const htmlEntityDesc * ent;
2221 	    const char *cp;
2222 	    char nbuf[16];
2223 
2224 	    /*
2225 	     * Try to lookup a predefined HTML entity for it
2226 	     */
2227 
2228 	    ent = htmlEntityValueLookup(c);
2229 	    if (ent == NULL) {
2230 	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231 	      cp = nbuf;
2232 	    }
2233 	    else
2234 	      cp = ent->name;
2235 	    len = strlen(cp);
2236 	    if (out + 2 + len >= outend)
2237 		break;
2238 	    *out++ = '&';
2239 	    memcpy(out, cp, len);
2240 	    out += len;
2241 	    *out++ = ';';
2242 	}
2243 	processed = in;
2244     }
2245     *outlen = out - outstart;
2246     *inlen = processed - instart;
2247     return(0);
2248 }
2249 
2250 /**
2251  * htmlEncodeEntities:
2252  * @out:  a pointer to an array of bytes to store the result
2253  * @outlen:  the length of @out
2254  * @in:  a pointer to an array of UTF-8 chars
2255  * @inlen:  the length of @in
2256  * @quoteChar: the quote character to escape (' or ") or zero.
2257  *
2258  * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259  * plus HTML entities block of chars out.
2260  *
2261  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262  * The value of @inlen after return is the number of octets consumed
2263  *     as the return value is positive, else unpredictable.
2264  * The value of @outlen after return is the number of octets consumed.
2265  */
2266 int
htmlEncodeEntities(unsigned char * out,int * outlen,const unsigned char * in,int * inlen,int quoteChar)2267 htmlEncodeEntities(unsigned char* out, int *outlen,
2268 		   const unsigned char* in, int *inlen, int quoteChar) {
2269     const unsigned char* processed = in;
2270     const unsigned char* outend;
2271     const unsigned char* outstart = out;
2272     const unsigned char* instart = in;
2273     const unsigned char* inend;
2274     unsigned int c, d;
2275     int trailing;
2276 
2277     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278         return(-1);
2279     outend = out + (*outlen);
2280     inend = in + (*inlen);
2281     while (in < inend) {
2282 	d = *in++;
2283 	if      (d < 0x80)  { c= d; trailing= 0; }
2284 	else if (d < 0xC0) {
2285 	    /* trailing byte in leading position */
2286 	    *outlen = out - outstart;
2287 	    *inlen = processed - instart;
2288 	    return(-2);
2289         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2290         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2291         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2292 	else {
2293 	    /* no chance for this in Ascii */
2294 	    *outlen = out - outstart;
2295 	    *inlen = processed - instart;
2296 	    return(-2);
2297 	}
2298 
2299 	if (inend - in < trailing)
2300 	    break;
2301 
2302 	while (trailing--) {
2303 	    if (((d= *in++) & 0xC0) != 0x80) {
2304 		*outlen = out - outstart;
2305 		*inlen = processed - instart;
2306 		return(-2);
2307 	    }
2308 	    c <<= 6;
2309 	    c |= d & 0x3F;
2310 	}
2311 
2312 	/* assertion: c is a single UTF-4 value */
2313 	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314 	    (c != '&') && (c != '<') && (c != '>')) {
2315 	    if (out >= outend)
2316 		break;
2317 	    *out++ = c;
2318 	} else {
2319 	    const htmlEntityDesc * ent;
2320 	    const char *cp;
2321 	    char nbuf[16];
2322 	    int len;
2323 
2324 	    /*
2325 	     * Try to lookup a predefined HTML entity for it
2326 	     */
2327 	    ent = htmlEntityValueLookup(c);
2328 	    if (ent == NULL) {
2329 		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330 		cp = nbuf;
2331 	    }
2332 	    else
2333 		cp = ent->name;
2334 	    len = strlen(cp);
2335 	    if (out + 2 + len > outend)
2336 		break;
2337 	    *out++ = '&';
2338 	    memcpy(out, cp, len);
2339 	    out += len;
2340 	    *out++ = ';';
2341 	}
2342 	processed = in;
2343     }
2344     *outlen = out - outstart;
2345     *inlen = processed - instart;
2346     return(0);
2347 }
2348 
2349 /************************************************************************
2350  *									*
2351  *		Commodity functions to handle streams			*
2352  *									*
2353  ************************************************************************/
2354 
2355 #ifdef LIBXML_PUSH_ENABLED
2356 /**
2357  * htmlNewInputStream:
2358  * @ctxt:  an HTML parser context
2359  *
2360  * Create a new input stream structure
2361  * Returns the new input stream or NULL
2362  */
2363 static htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt)2364 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2365     htmlParserInputPtr input;
2366 
2367     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2368     if (input == NULL) {
2369         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2370 	return(NULL);
2371     }
2372     memset(input, 0, sizeof(htmlParserInput));
2373     input->filename = NULL;
2374     input->directory = NULL;
2375     input->base = NULL;
2376     input->cur = NULL;
2377     input->buf = NULL;
2378     input->line = 1;
2379     input->col = 1;
2380     input->buf = NULL;
2381     input->free = NULL;
2382     input->version = NULL;
2383     input->consumed = 0;
2384     input->length = 0;
2385     return(input);
2386 }
2387 #endif
2388 
2389 
2390 /************************************************************************
2391  *									*
2392  *		Commodity functions, cleanup needed ?			*
2393  *									*
2394  ************************************************************************/
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The table is a read-only lookup list: both the strings and the array
 * of pointers are const so nothing can modify it by accident.
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2409 
2410 /**
2411  * areBlanks:
2412  * @ctxt:  an HTML parser context
2413  * @str:  a xmlChar *
2414  * @len:  the size of @str
2415  *
2416  * Is this a sequence of blank chars that one can ignore ?
2417  *
2418  * Returns 1 if ignorable 0 otherwise.
2419  */
2420 
areBlanks(htmlParserCtxtPtr ctxt,const xmlChar * str,int len)2421 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422     unsigned int i;
2423     int j;
2424     xmlNodePtr lastChild;
2425     xmlDtdPtr dtd;
2426 
2427     for (j = 0;j < len;j++)
2428         if (!(IS_BLANK_CH(str[j]))) return(0);
2429 
2430     if (CUR == 0) return(1);
2431     if (CUR != '<') return(0);
2432     if (ctxt->name == NULL)
2433 	return(1);
2434     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435 	return(1);
2436     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437 	return(1);
2438 
2439     /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440     if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441         dtd = xmlGetIntSubset(ctxt->myDoc);
2442         if (dtd != NULL && dtd->ExternalID != NULL) {
2443             if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444                     !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445                 return(1);
2446         }
2447     }
2448 
2449     if (ctxt->node == NULL) return(0);
2450     lastChild = xmlGetLastChild(ctxt->node);
2451     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452 	lastChild = lastChild->prev;
2453     if (lastChild == NULL) {
2454         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455             (ctxt->node->content != NULL)) return(0);
2456 	/* keep ws in constructs like ...<b> </b>...
2457 	   for all tags "b" allowing PCDATA */
2458 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459 	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460 		return(0);
2461 	    }
2462 	}
2463     } else if (xmlNodeIsText(lastChild)) {
2464         return(0);
2465     } else {
2466 	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467 	   for all tags "p" allowing PCDATA */
2468 	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469 	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470 		return(0);
2471 	    }
2472 	}
2473     }
2474     return(1);
2475 }
2476 
2477 /**
2478  * htmlNewDocNoDtD:
2479  * @URI:  URI for the dtd, or NULL
2480  * @ExternalID:  the external ID of the DTD, or NULL
2481  *
2482  * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483  * are NULL
2484  *
2485  * Returns a new document, do not initialize the DTD if not provided
2486  */
2487 htmlDocPtr
htmlNewDocNoDtD(const xmlChar * URI,const xmlChar * ExternalID)2488 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489     xmlDocPtr cur;
2490 
2491     /*
2492      * Allocate a new document and fill the fields.
2493      */
2494     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495     if (cur == NULL) {
2496 	htmlErrMemory(NULL, "HTML document creation failed\n");
2497 	return(NULL);
2498     }
2499     memset(cur, 0, sizeof(xmlDoc));
2500 
2501     cur->type = XML_HTML_DOCUMENT_NODE;
2502     cur->version = NULL;
2503     cur->intSubset = NULL;
2504     cur->doc = cur;
2505     cur->name = NULL;
2506     cur->children = NULL;
2507     cur->extSubset = NULL;
2508     cur->oldNs = NULL;
2509     cur->encoding = NULL;
2510     cur->standalone = 1;
2511     cur->compression = 0;
2512     cur->ids = NULL;
2513     cur->refs = NULL;
2514     cur->_private = NULL;
2515     cur->charset = XML_CHAR_ENCODING_UTF8;
2516     cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517     if ((ExternalID != NULL) ||
2518 	(URI != NULL))
2519 	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520     if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2521 	xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2522     return(cur);
2523 }
2524 
2525 /**
2526  * htmlNewDoc:
2527  * @URI:  URI for the dtd, or NULL
2528  * @ExternalID:  the external ID of the DTD, or NULL
2529  *
2530  * Creates a new HTML document
2531  *
2532  * Returns a new document
2533  */
2534 htmlDocPtr
htmlNewDoc(const xmlChar * URI,const xmlChar * ExternalID)2535 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2536     if ((URI == NULL) && (ExternalID == NULL))
2537 	return(htmlNewDocNoDtD(
2538 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2539 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2540 
2541     return(htmlNewDocNoDtD(URI, ExternalID));
2542 }
2543 
2544 
2545 /************************************************************************
2546  *									*
2547  *			The parser itself				*
2548  *	Relates to http://www.w3.org/TR/html40				*
2549  *									*
2550  ************************************************************************/
2551 
2552 /************************************************************************
2553  *									*
2554  *			The parser itself				*
2555  *									*
2556  ************************************************************************/
2557 
2558 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2559 
2560 /**
2561  * htmlParseHTMLName:
2562  * @ctxt:  an HTML parser context
2563  *
2564  * parse an HTML tag or attribute name, note that we convert it to lowercase
2565  * since HTML names are not case-sensitive.
2566  *
2567  * Returns the Tag Name parsed or NULL
2568  */
2569 
2570 static const xmlChar *
htmlParseHTMLName(htmlParserCtxtPtr ctxt)2571 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2572     int i = 0;
2573     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574 
2575     if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2576         (CUR != ':') && (CUR != '.')) return(NULL);
2577 
2578     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579            ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2580 	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2581            (CUR == '.'))) {
2582 	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2583         else loc[i] = CUR;
2584 	i++;
2585 
2586 	NEXT;
2587     }
2588 
2589     return(xmlDictLookup(ctxt->dict, loc, i));
2590 }
2591 
2592 
2593 /**
2594  * htmlParseHTMLName_nonInvasive:
2595  * @ctxt:  an HTML parser context
2596  *
2597  * parse an HTML tag or attribute name, note that we convert it to lowercase
2598  * since HTML names are not case-sensitive, this doesn't consume the data
2599  * from the stream, it's a look-ahead
2600  *
2601  * Returns the Tag Name parsed or NULL
2602  */
2603 
2604 static const xmlChar *
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt)2605 htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2606     int i = 0;
2607     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2608 
2609     if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2610         (NXT(1) != ':')) return(NULL);
2611 
2612     while ((i < HTML_PARSER_BUFFER_SIZE) &&
2613            ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2614 	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2615 	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2616         else loc[i] = NXT(1+i);
2617 	i++;
2618     }
2619 
2620     return(xmlDictLookup(ctxt->dict, loc, i));
2621 }
2622 
2623 
2624 /**
2625  * htmlParseName:
2626  * @ctxt:  an HTML parser context
2627  *
2628  * parse an HTML name, this routine is case sensitive.
2629  *
2630  * Returns the Name parsed or NULL
2631  */
2632 
2633 static const xmlChar *
htmlParseName(htmlParserCtxtPtr ctxt)2634 htmlParseName(htmlParserCtxtPtr ctxt) {
2635     const xmlChar *in;
2636     const xmlChar *ret;
2637     int count = 0;
2638 
2639     GROW;
2640 
2641     /*
2642      * Accelerator for simple ASCII names
2643      */
2644     in = ctxt->input->cur;
2645     if (((*in >= 0x61) && (*in <= 0x7A)) ||
2646 	((*in >= 0x41) && (*in <= 0x5A)) ||
2647 	(*in == '_') || (*in == ':')) {
2648 	in++;
2649 	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2650 	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2651 	       ((*in >= 0x30) && (*in <= 0x39)) ||
2652 	       (*in == '_') || (*in == '-') ||
2653 	       (*in == ':') || (*in == '.'))
2654 	    in++;
2655 
2656 	if (in == ctxt->input->end)
2657 	    return(NULL);
2658 
2659 	if ((*in > 0) && (*in < 0x80)) {
2660 	    count = in - ctxt->input->cur;
2661 	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2662 	    ctxt->input->cur = in;
2663 	    ctxt->input->col += count;
2664 	    return(ret);
2665 	}
2666     }
2667     return(htmlParseNameComplex(ctxt));
2668 }
2669 
2670 static const xmlChar *
htmlParseNameComplex(xmlParserCtxtPtr ctxt)2671 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2672     int len = 0, l;
2673     int c;
2674     int count = 0;
2675     const xmlChar *base = ctxt->input->base;
2676 
2677     /*
2678      * Handler for more complex cases
2679      */
2680     GROW;
2681     c = CUR_CHAR(l);
2682     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2683 	(!IS_LETTER(c) && (c != '_') &&
2684          (c != ':'))) {
2685 	return(NULL);
2686     }
2687 
2688     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2689 	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2690             (c == '.') || (c == '-') ||
2691 	    (c == '_') || (c == ':') ||
2692 	    (IS_COMBINING(c)) ||
2693 	    (IS_EXTENDER(c)))) {
2694 	if (count++ > 100) {
2695 	    count = 0;
2696 	    GROW;
2697 	}
2698 	len += l;
2699 	NEXTL(l);
2700 	c = CUR_CHAR(l);
2701 	if (ctxt->input->base != base) {
2702 	    /*
2703 	     * We changed encoding from an unknown encoding
2704 	     * Input buffer changed location, so we better start again
2705 	     */
2706 	    return(htmlParseNameComplex(ctxt));
2707 	}
2708     }
2709 
2710     if (ctxt->input->cur - ctxt->input->base < len) {
2711         /* Sanity check */
2712 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2713                      "unexpected change of input buffer", NULL, NULL);
2714         return (NULL);
2715     }
2716 
2717     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2718 }
2719 
2720 
2721 /**
2722  * htmlParseHTMLAttribute:
2723  * @ctxt:  an HTML parser context
2724  * @stop:  a char stop value
2725  *
2726  * parse an HTML attribute value till the stop (quote), if
2727  * stop is 0 then it stops at the first space
2728  *
2729  * Returns the attribute parsed or NULL
2730  */
2731 
2732 static xmlChar *
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt,const xmlChar stop)2733 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2734     xmlChar *buffer = NULL;
2735     int buffer_size = 0;
2736     xmlChar *out = NULL;
2737     const xmlChar *name = NULL;
2738     const xmlChar *cur = NULL;
2739     const htmlEntityDesc * ent;
2740 
2741     /*
2742      * allocate a translation buffer.
2743      */
2744     buffer_size = HTML_PARSER_BUFFER_SIZE;
2745     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2746     if (buffer == NULL) {
2747 	htmlErrMemory(ctxt, "buffer allocation failed\n");
2748 	return(NULL);
2749     }
2750     out = buffer;
2751 
2752     /*
2753      * Ok loop until we reach one of the ending chars
2754      */
2755     while ((CUR != 0) && (CUR != stop)) {
2756 	if ((stop == 0) && (CUR == '>')) break;
2757 	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2758         if (CUR == '&') {
2759 	    if (NXT(1) == '#') {
2760 		unsigned int c;
2761 		int bits;
2762 
2763 		c = htmlParseCharRef(ctxt);
2764 		if      (c <    0x80)
2765 		        { *out++  = c;                bits= -6; }
2766 		else if (c <   0x800)
2767 		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2768 		else if (c < 0x10000)
2769 		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2770 		else
2771 		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2772 
2773 		for ( ; bits >= 0; bits-= 6) {
2774 		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2775 		}
2776 
2777 		if (out - buffer > buffer_size - 100) {
2778 			int indx = out - buffer;
2779 
2780 			growBuffer(buffer);
2781 			out = &buffer[indx];
2782 		}
2783 	    } else {
2784 		ent = htmlParseEntityRef(ctxt, &name);
2785 		if (name == NULL) {
2786 		    *out++ = '&';
2787 		    if (out - buffer > buffer_size - 100) {
2788 			int indx = out - buffer;
2789 
2790 			growBuffer(buffer);
2791 			out = &buffer[indx];
2792 		    }
2793 		} else if (ent == NULL) {
2794 		    *out++ = '&';
2795 		    cur = name;
2796 		    while (*cur != 0) {
2797 			if (out - buffer > buffer_size - 100) {
2798 			    int indx = out - buffer;
2799 
2800 			    growBuffer(buffer);
2801 			    out = &buffer[indx];
2802 			}
2803 			*out++ = *cur++;
2804 		    }
2805 		} else {
2806 		    unsigned int c;
2807 		    int bits;
2808 
2809 		    if (out - buffer > buffer_size - 100) {
2810 			int indx = out - buffer;
2811 
2812 			growBuffer(buffer);
2813 			out = &buffer[indx];
2814 		    }
2815 		    c = ent->value;
2816 		    if      (c <    0x80)
2817 			{ *out++  = c;                bits= -6; }
2818 		    else if (c <   0x800)
2819 			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2820 		    else if (c < 0x10000)
2821 			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2822 		    else
2823 			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2824 
2825 		    for ( ; bits >= 0; bits-= 6) {
2826 			*out++  = ((c >> bits) & 0x3F) | 0x80;
2827 		    }
2828 		}
2829 	    }
2830 	} else {
2831 	    unsigned int c;
2832 	    int bits, l;
2833 
2834 	    if (out - buffer > buffer_size - 100) {
2835 		int indx = out - buffer;
2836 
2837 		growBuffer(buffer);
2838 		out = &buffer[indx];
2839 	    }
2840 	    c = CUR_CHAR(l);
2841 	    if      (c <    0x80)
2842 		    { *out++  = c;                bits= -6; }
2843 	    else if (c <   0x800)
2844 		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2845 	    else if (c < 0x10000)
2846 		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2847 	    else
2848 		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2849 
2850 	    for ( ; bits >= 0; bits-= 6) {
2851 		*out++  = ((c >> bits) & 0x3F) | 0x80;
2852 	    }
2853 	    NEXT;
2854 	}
2855     }
2856     *out = 0;
2857     return(buffer);
2858 }
2859 
2860 /**
2861  * htmlParseEntityRef:
2862  * @ctxt:  an HTML parser context
2863  * @str:  location to store the entity name
2864  *
2865  * parse an HTML ENTITY references
2866  *
2867  * [68] EntityRef ::= '&' Name ';'
2868  *
2869  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2870  *         if non-NULL *str will have to be freed by the caller.
2871  */
2872 const htmlEntityDesc *
htmlParseEntityRef(htmlParserCtxtPtr ctxt,const xmlChar ** str)2873 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2874     const xmlChar *name;
2875     const htmlEntityDesc * ent = NULL;
2876 
2877     if (str != NULL) *str = NULL;
2878     if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2879 
2880     if (CUR == '&') {
2881         NEXT;
2882         name = htmlParseName(ctxt);
2883 	if (name == NULL) {
2884 	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2885 	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2886 	} else {
2887 	    GROW;
2888 	    if (CUR == ';') {
2889 	        if (str != NULL)
2890 		    *str = name;
2891 
2892 		/*
2893 		 * Lookup the entity in the table.
2894 		 */
2895 		ent = htmlEntityLookup(name);
2896 		if (ent != NULL) /* OK that's ugly !!! */
2897 		    NEXT;
2898 	    } else {
2899 		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2900 		             "htmlParseEntityRef: expecting ';'\n",
2901 			     NULL, NULL);
2902 	        if (str != NULL)
2903 		    *str = name;
2904 	    }
2905 	}
2906     }
2907     return(ent);
2908 }
2909 
2910 /**
2911  * htmlParseAttValue:
2912  * @ctxt:  an HTML parser context
2913  *
2914  * parse a value for an attribute
2915  * Note: the parser won't do substitution of entities here, this
2916  * will be handled later in xmlStringGetNodeList, unless it was
2917  * asked for ctxt->replaceEntities != 0
2918  *
2919  * Returns the AttValue parsed or NULL.
2920  */
2921 
2922 static xmlChar *
htmlParseAttValue(htmlParserCtxtPtr ctxt)2923 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2924     xmlChar *ret = NULL;
2925 
2926     if (CUR == '"') {
2927         NEXT;
2928 	ret = htmlParseHTMLAttribute(ctxt, '"');
2929         if (CUR != '"') {
2930 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2931 	                 "AttValue: \" expected\n", NULL, NULL);
2932 	} else
2933 	    NEXT;
2934     } else if (CUR == '\'') {
2935         NEXT;
2936 	ret = htmlParseHTMLAttribute(ctxt, '\'');
2937         if (CUR != '\'') {
2938 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2939 	                 "AttValue: ' expected\n", NULL, NULL);
2940 	} else
2941 	    NEXT;
2942     } else {
2943         /*
2944 	 * That's an HTMLism, the attribute value may not be quoted
2945 	 */
2946 	ret = htmlParseHTMLAttribute(ctxt, 0);
2947 	if (ret == NULL) {
2948 	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2949 	                 "AttValue: no value found\n", NULL, NULL);
2950 	}
2951     }
2952     return(ret);
2953 }
2954 
2955 /**
2956  * htmlParseSystemLiteral:
2957  * @ctxt:  an HTML parser context
2958  *
2959  * parse an HTML Literal
2960  *
2961  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2962  *
2963  * Returns the SystemLiteral parsed or NULL
2964  */
2965 
2966 static xmlChar *
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt)2967 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2968     size_t len = 0, startPosition = 0;
2969     int err = 0;
2970     int quote;
2971     xmlChar *ret = NULL;
2972 
2973     if ((CUR != '"') && (CUR != '\'')) {
2974 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2975 	             "SystemLiteral \" or ' expected\n", NULL, NULL);
2976         return(NULL);
2977     }
2978     quote = CUR;
2979     NEXT;
2980 
2981     if (CUR_PTR < BASE_PTR)
2982         return(ret);
2983     startPosition = CUR_PTR - BASE_PTR;
2984 
2985     while ((CUR != 0) && (CUR != quote)) {
2986         /* TODO: Handle UTF-8 */
2987         if (!IS_CHAR_CH(CUR)) {
2988             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2989                             "Invalid char in SystemLiteral 0x%X\n", CUR);
2990             err = 1;
2991         }
2992         NEXT;
2993         len++;
2994     }
2995     if (CUR != quote) {
2996         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2997                      "Unfinished SystemLiteral\n", NULL, NULL);
2998     } else {
2999         NEXT;
3000         if (err == 0)
3001             ret = xmlStrndup((BASE_PTR+startPosition), len);
3002     }
3003 
3004     return(ret);
3005 }
3006 
3007 /**
3008  * htmlParsePubidLiteral:
3009  * @ctxt:  an HTML parser context
3010  *
3011  * parse an HTML public literal
3012  *
3013  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3014  *
3015  * Returns the PubidLiteral parsed or NULL.
3016  */
3017 
3018 static xmlChar *
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt)3019 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3020     size_t len = 0, startPosition = 0;
3021     int err = 0;
3022     int quote;
3023     xmlChar *ret = NULL;
3024 
3025     if ((CUR != '"') && (CUR != '\'')) {
3026 	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3027 	             "PubidLiteral \" or ' expected\n", NULL, NULL);
3028         return(NULL);
3029     }
3030     quote = CUR;
3031     NEXT;
3032 
3033     /*
3034      * Name ::= (Letter | '_') (NameChar)*
3035      */
3036     if (CUR_PTR < BASE_PTR)
3037         return(ret);
3038     startPosition = CUR_PTR - BASE_PTR;
3039 
3040     while ((CUR != 0) && (CUR != quote)) {
3041         if (!IS_PUBIDCHAR_CH(CUR)) {
3042             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3043                             "Invalid char in PubidLiteral 0x%X\n", CUR);
3044             err = 1;
3045         }
3046         len++;
3047         NEXT;
3048     }
3049 
3050     if (CUR != quote) {
3051         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3052                      "Unfinished PubidLiteral\n", NULL, NULL);
3053     } else {
3054         NEXT;
3055         if (err == 0)
3056             ret = xmlStrndup((BASE_PTR + startPosition), len);
3057     }
3058 
3059     return(ret);
3060 }
3061 
3062 /**
3063  * htmlParseScript:
3064  * @ctxt:  an HTML parser context
3065  *
3066  * parse the content of an HTML SCRIPT or STYLE element
3067  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3068  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3069  * http://www.w3.org/TR/html4/types.html#type-script
3070  * http://www.w3.org/TR/html4/types.html#h-6.15
3071  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3072  *
3073  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3074  * element and the value of intrinsic event attributes. User agents must
3075  * not evaluate script data as HTML markup but instead must pass it on as
3076  * data to a script engine.
3077  * NOTES:
3078  * - The content is passed like CDATA
3079  * - the attributes for style and scripting "onXXX" are also described
3080  *   as CDATA but SGML allows entities references in attributes so their
3081  *   processing is identical as other attributes
3082  */
3083 static void
htmlParseScript(htmlParserCtxtPtr ctxt)3084 htmlParseScript(htmlParserCtxtPtr ctxt) {
3085     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3086     int nbchar = 0;
3087     int cur,l;
3088 
3089     SHRINK;
3090     cur = CUR_CHAR(l);
3091     while (cur != 0) {
3092 	if ((cur == '<') && (NXT(1) == '/')) {
3093             /*
3094              * One should break here, the specification is clear:
3095              * Authors should therefore escape "</" within the content.
3096              * Escape mechanisms are specific to each scripting or
3097              * style sheet language.
3098              *
3099              * In recovery mode, only break if end tag match the
3100              * current tag, effectively ignoring all tags inside the
3101              * script/style block and treating the entire block as
3102              * CDATA.
3103              */
3104             if (ctxt->recovery) {
3105                 if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3106 				   xmlStrlen(ctxt->name)) == 0)
3107                 {
3108                     break; /* while */
3109                 } else {
3110 		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3111 				 "Element %s embeds close tag\n",
3112 		                 ctxt->name, NULL);
3113 		}
3114             } else {
3115                 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3116                     ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3117                 {
3118                     break; /* while */
3119                 }
3120             }
3121 	}
3122         if (IS_CHAR(cur)) {
3123 	    COPY_BUF(l,buf,nbchar,cur);
3124         } else {
3125             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3126                             "Invalid char in CDATA 0x%X\n", cur);
3127         }
3128 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3129             buf[nbchar] = 0;
3130 	    if (ctxt->sax->cdataBlock!= NULL) {
3131 		/*
3132 		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3133 		 */
3134 		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3135 	    } else if (ctxt->sax->characters != NULL) {
3136 		ctxt->sax->characters(ctxt->userData, buf, nbchar);
3137 	    }
3138 	    nbchar = 0;
3139 	}
3140 	GROW;
3141 	NEXTL(l);
3142 	cur = CUR_CHAR(l);
3143     }
3144 
3145     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3146         buf[nbchar] = 0;
3147 	if (ctxt->sax->cdataBlock!= NULL) {
3148 	    /*
3149 	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3150 	     */
3151 	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3152 	} else if (ctxt->sax->characters != NULL) {
3153 	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3154 	}
3155     }
3156 }
3157 
3158 
3159 /**
3160  * htmlParseCharDataInternal:
3161  * @ctxt:  an HTML parser context
3162  * @readahead: optional read ahead character in ascii range
3163  *
3164  * parse a CharData section.
3165  * if we are within a CDATA section ']]>' marks an end of section.
3166  *
3167  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3168  */
3169 
3170 static void
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt,int readahead)3171 htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3172     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3173     int nbchar = 0;
3174     int cur, l;
3175     int chunk = 0;
3176 
3177     if (readahead)
3178         buf[nbchar++] = readahead;
3179 
3180     SHRINK;
3181     cur = CUR_CHAR(l);
3182     while (((cur != '<') || (ctxt->token == '<')) &&
3183            ((cur != '&') || (ctxt->token == '&')) &&
3184 	   (cur != 0)) {
3185 	if (!(IS_CHAR(cur))) {
3186 	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3187 	                "Invalid char in CDATA 0x%X\n", cur);
3188 	} else {
3189 	    COPY_BUF(l,buf,nbchar,cur);
3190 	}
3191 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3192             buf[nbchar] = 0;
3193 
3194 	    /*
3195 	     * Ok the segment is to be consumed as chars.
3196 	     */
3197 	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3198 		if (areBlanks(ctxt, buf, nbchar)) {
3199 		    if (ctxt->keepBlanks) {
3200 			if (ctxt->sax->characters != NULL)
3201 			    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3202 		    } else {
3203 			if (ctxt->sax->ignorableWhitespace != NULL)
3204 			    ctxt->sax->ignorableWhitespace(ctxt->userData,
3205 			                                   buf, nbchar);
3206 		    }
3207 		} else {
3208 		    htmlCheckParagraph(ctxt);
3209 		    if (ctxt->sax->characters != NULL)
3210 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3211 		}
3212 	    }
3213 	    nbchar = 0;
3214 	}
3215 	NEXTL(l);
3216         chunk++;
3217         if (chunk > HTML_PARSER_BUFFER_SIZE) {
3218             chunk = 0;
3219             SHRINK;
3220             GROW;
3221         }
3222 	cur = CUR_CHAR(l);
3223 	if (cur == 0) {
3224 	    SHRINK;
3225 	    GROW;
3226 	    cur = CUR_CHAR(l);
3227 	}
3228     }
3229     if (nbchar != 0) {
3230         buf[nbchar] = 0;
3231 
3232 	/*
3233 	 * Ok the segment is to be consumed as chars.
3234 	 */
3235 	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3236 	    if (areBlanks(ctxt, buf, nbchar)) {
3237 		if (ctxt->keepBlanks) {
3238 		    if (ctxt->sax->characters != NULL)
3239 			ctxt->sax->characters(ctxt->userData, buf, nbchar);
3240 		} else {
3241 		    if (ctxt->sax->ignorableWhitespace != NULL)
3242 			ctxt->sax->ignorableWhitespace(ctxt->userData,
3243 			                               buf, nbchar);
3244 		}
3245 	    } else {
3246 		htmlCheckParagraph(ctxt);
3247 		if (ctxt->sax->characters != NULL)
3248 		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3249 	    }
3250 	}
3251     } else {
3252 	/*
3253 	 * Loop detection
3254 	 */
3255 	if (cur == 0)
3256 	    ctxt->instate = XML_PARSER_EOF;
3257     }
3258 }
3259 
3260 /**
3261  * htmlParseCharData:
3262  * @ctxt:  an HTML parser context
3263  *
3264  * parse a CharData section.
3265  * if we are within a CDATA section ']]>' marks an end of section.
3266  *
3267  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3268  */
3269 
3270 static void
htmlParseCharData(htmlParserCtxtPtr ctxt)3271 htmlParseCharData(htmlParserCtxtPtr ctxt) {
3272     htmlParseCharDataInternal(ctxt, 0);
3273 }
3274 
3275 /**
3276  * htmlParseExternalID:
3277  * @ctxt:  an HTML parser context
3278  * @publicID:  a xmlChar** receiving PubidLiteral
3279  *
3280  * Parse an External ID or a Public ID
3281  *
3282  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3283  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3284  *
3285  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3286  *
3287  * Returns the function returns SystemLiteral and in the second
3288  *                case publicID receives PubidLiteral, is strict is off
3289  *                it is possible to return NULL and have publicID set.
3290  */
3291 
3292 static xmlChar *
htmlParseExternalID(htmlParserCtxtPtr ctxt,xmlChar ** publicID)3293 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3294     xmlChar *URI = NULL;
3295 
3296     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3297          (UPP(2) == 'S') && (UPP(3) == 'T') &&
3298 	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3299         SKIP(6);
3300 	if (!IS_BLANK_CH(CUR)) {
3301 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3302 	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3303 	}
3304         SKIP_BLANKS;
3305 	URI = htmlParseSystemLiteral(ctxt);
3306 	if (URI == NULL) {
3307 	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3308 	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3309         }
3310     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3311 	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3312 	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3313         SKIP(6);
3314 	if (!IS_BLANK_CH(CUR)) {
3315 	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3316 	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3317 	}
3318         SKIP_BLANKS;
3319 	*publicID = htmlParsePubidLiteral(ctxt);
3320 	if (*publicID == NULL) {
3321 	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3322 	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3323 			 NULL, NULL);
3324 	}
3325         SKIP_BLANKS;
3326         if ((CUR == '"') || (CUR == '\'')) {
3327 	    URI = htmlParseSystemLiteral(ctxt);
3328 	}
3329     }
3330     return(URI);
3331 }
3332 
/**
 * htmlParsePI:
 * @ctxt:  an HTML parser context
 *
 * parse a Processing Instruction.
 *
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
 */
3341 static void
htmlParsePI(htmlParserCtxtPtr ctxt)3342 htmlParsePI(htmlParserCtxtPtr ctxt) {
3343     xmlChar *buf = NULL;
3344     int len = 0;
3345     int size = HTML_PARSER_BUFFER_SIZE;
3346     int cur, l;
3347     const xmlChar *target;
3348     xmlParserInputState state;
3349     int count = 0;
3350 
3351     if ((RAW == '<') && (NXT(1) == '?')) {
3352 	state = ctxt->instate;
3353         ctxt->instate = XML_PARSER_PI;
3354 	/*
3355 	 * this is a Processing Instruction.
3356 	 */
3357 	SKIP(2);
3358 	SHRINK;
3359 
3360 	/*
3361 	 * Parse the target name and check for special support like
3362 	 * namespace.
3363 	 */
3364         target = htmlParseName(ctxt);
3365 	if (target != NULL) {
3366 	    if (RAW == '>') {
3367 		SKIP(1);
3368 
3369 		/*
3370 		 * SAX: PI detected.
3371 		 */
3372 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3373 		    (ctxt->sax->processingInstruction != NULL))
3374 		    ctxt->sax->processingInstruction(ctxt->userData,
3375 		                                     target, NULL);
3376 		ctxt->instate = state;
3377 		return;
3378 	    }
3379 	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3380 	    if (buf == NULL) {
3381 		htmlErrMemory(ctxt, NULL);
3382 		ctxt->instate = state;
3383 		return;
3384 	    }
3385 	    cur = CUR;
3386 	    if (!IS_BLANK(cur)) {
3387 		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3388 			  "ParsePI: PI %s space expected\n", target, NULL);
3389 	    }
3390             SKIP_BLANKS;
3391 	    cur = CUR_CHAR(l);
3392 	    while ((cur != 0) && (cur != '>')) {
3393 		if (len + 5 >= size) {
3394 		    xmlChar *tmp;
3395 
3396 		    size *= 2;
3397 		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3398 		    if (tmp == NULL) {
3399 			htmlErrMemory(ctxt, NULL);
3400 			xmlFree(buf);
3401 			ctxt->instate = state;
3402 			return;
3403 		    }
3404 		    buf = tmp;
3405 		}
3406 		count++;
3407 		if (count > 50) {
3408 		    GROW;
3409 		    count = 0;
3410 		}
3411                 if (IS_CHAR(cur)) {
3412 		    COPY_BUF(l,buf,len,cur);
3413                 } else {
3414                     htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3415                                     "Invalid char in processing instruction "
3416                                     "0x%X\n", cur);
3417                 }
3418 		NEXTL(l);
3419 		cur = CUR_CHAR(l);
3420 		if (cur == 0) {
3421 		    SHRINK;
3422 		    GROW;
3423 		    cur = CUR_CHAR(l);
3424 		}
3425 	    }
3426 	    buf[len] = 0;
3427 	    if (cur != '>') {
3428 		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3429 		      "ParsePI: PI %s never end ...\n", target, NULL);
3430 	    } else {
3431 		SKIP(1);
3432 
3433 		/*
3434 		 * SAX: PI detected.
3435 		 */
3436 		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3437 		    (ctxt->sax->processingInstruction != NULL))
3438 		    ctxt->sax->processingInstruction(ctxt->userData,
3439 		                                     target, buf);
3440 	    }
3441 	    xmlFree(buf);
3442 	} else {
3443 	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3444                          "PI is not started correctly", NULL, NULL);
3445 	}
3446 	ctxt->instate = state;
3447     }
3448 }
3449 
3450 /**
3451  * htmlParseComment:
3452  * @ctxt:  an HTML parser context
3453  *
3454  * Parse an XML (SGML) comment <!-- .... -->
3455  *
3456  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3457  */
3458 static void
htmlParseComment(htmlParserCtxtPtr ctxt)3459 htmlParseComment(htmlParserCtxtPtr ctxt) {
3460     xmlChar *buf = NULL;
3461     int len;
3462     int size = HTML_PARSER_BUFFER_SIZE;
3463     int q, ql;
3464     int r, rl;
3465     int cur, l;
3466     int next, nl;
3467     xmlParserInputState state;
3468 
3469     /*
3470      * Check that there is a comment right here.
3471      */
3472     if ((RAW != '<') || (NXT(1) != '!') ||
3473         (NXT(2) != '-') || (NXT(3) != '-')) return;
3474 
3475     state = ctxt->instate;
3476     ctxt->instate = XML_PARSER_COMMENT;
3477     SHRINK;
3478     SKIP(4);
3479     buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3480     if (buf == NULL) {
3481         htmlErrMemory(ctxt, "buffer allocation failed\n");
3482 	ctxt->instate = state;
3483 	return;
3484     }
3485     len = 0;
3486     buf[len] = 0;
3487     q = CUR_CHAR(ql);
3488     if (q == 0)
3489         goto unfinished;
3490     NEXTL(ql);
3491     r = CUR_CHAR(rl);
3492     if (r == 0)
3493         goto unfinished;
3494     NEXTL(rl);
3495     cur = CUR_CHAR(l);
3496     while ((cur != 0) &&
3497            ((cur != '>') ||
3498 	    (r != '-') || (q != '-'))) {
3499 	NEXTL(l);
3500 	next = CUR_CHAR(nl);
3501 	if (next == 0) {
3502 	    SHRINK;
3503 	    GROW;
3504 	    next = CUR_CHAR(nl);
3505 	}
3506 
3507 	if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3508 	  htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3509 		       "Comment incorrectly closed by '--!>'", NULL, NULL);
3510 	  cur = '>';
3511 	  break;
3512 	}
3513 
3514 	if (len + 5 >= size) {
3515 	    xmlChar *tmp;
3516 
3517 	    size *= 2;
3518 	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3519 	    if (tmp == NULL) {
3520 	        xmlFree(buf);
3521 	        htmlErrMemory(ctxt, "growing buffer failed\n");
3522 		ctxt->instate = state;
3523 		return;
3524 	    }
3525 	    buf = tmp;
3526 	}
3527         if (IS_CHAR(q)) {
3528 	    COPY_BUF(ql,buf,len,q);
3529         } else {
3530             htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3531                             "Invalid char in comment 0x%X\n", q);
3532         }
3533 
3534 	q = r;
3535 	ql = rl;
3536 	r = cur;
3537 	rl = l;
3538 	cur = next;
3539 	l = nl;
3540     }
3541     buf[len] = 0;
3542     if (cur == '>') {
3543         NEXT;
3544 	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3545 	    (!ctxt->disableSAX))
3546 	    ctxt->sax->comment(ctxt->userData, buf);
3547 	xmlFree(buf);
3548 	ctxt->instate = state;
3549 	return;
3550     }
3551 
3552 unfinished:
3553     htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3554 		 "Comment not terminated \n<!--%.50s\n", buf, NULL);
3555     xmlFree(buf);
3556 }
3557 
3558 /**
3559  * htmlParseCharRef:
3560  * @ctxt:  an HTML parser context
3561  *
3562  * parse Reference declarations
3563  *
3564  * [66] CharRef ::= '&#' [0-9]+ ';' |
3565  *                  '&#x' [0-9a-fA-F]+ ';'
3566  *
3567  * Returns the value parsed (as an int)
3568  */
3569 int
htmlParseCharRef(htmlParserCtxtPtr ctxt)3570 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3571     int val = 0;
3572 
3573     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3574 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3575 		     "htmlParseCharRef: context error\n",
3576 		     NULL, NULL);
3577         return(0);
3578     }
3579     if ((CUR == '&') && (NXT(1) == '#') &&
3580         ((NXT(2) == 'x') || NXT(2) == 'X')) {
3581 	SKIP(3);
3582 	while (CUR != ';') {
3583 	    if ((CUR >= '0') && (CUR <= '9')) {
3584                 if (val < 0x110000)
3585 	            val = val * 16 + (CUR - '0');
3586             } else if ((CUR >= 'a') && (CUR <= 'f')) {
3587                 if (val < 0x110000)
3588 	            val = val * 16 + (CUR - 'a') + 10;
3589             } else if ((CUR >= 'A') && (CUR <= 'F')) {
3590                 if (val < 0x110000)
3591 	            val = val * 16 + (CUR - 'A') + 10;
3592             } else {
3593 	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3594 		             "htmlParseCharRef: missing semicolon\n",
3595 			     NULL, NULL);
3596 		break;
3597 	    }
3598 	    NEXT;
3599 	}
3600 	if (CUR == ';')
3601 	    NEXT;
3602     } else if  ((CUR == '&') && (NXT(1) == '#')) {
3603 	SKIP(2);
3604 	while (CUR != ';') {
3605 	    if ((CUR >= '0') && (CUR <= '9')) {
3606                 if (val < 0x110000)
3607 	            val = val * 10 + (CUR - '0');
3608             } else {
3609 	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3610 		             "htmlParseCharRef: missing semicolon\n",
3611 			     NULL, NULL);
3612 		break;
3613 	    }
3614 	    NEXT;
3615 	}
3616 	if (CUR == ';')
3617 	    NEXT;
3618     } else {
3619 	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3620 	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3621     }
3622     /*
3623      * Check the value IS_CHAR ...
3624      */
3625     if (IS_CHAR(val)) {
3626         return(val);
3627     } else if (val >= 0x110000) {
3628 	htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3629 		     "htmlParseCharRef: value too large\n", NULL, NULL);
3630     } else {
3631 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3632 			"htmlParseCharRef: invalid xmlChar value %d\n",
3633 			val);
3634     }
3635     return(0);
3636 }
3637 
3638 
3639 /**
3640  * htmlParseDocTypeDecl:
3641  * @ctxt:  an HTML parser context
3642  *
3643  * parse a DOCTYPE declaration
3644  *
3645  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3646  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3647  */
3648 
3649 static void
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt)3650 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3651     const xmlChar *name;
3652     xmlChar *ExternalID = NULL;
3653     xmlChar *URI = NULL;
3654 
3655     /*
3656      * We know that '<!DOCTYPE' has been detected.
3657      */
3658     SKIP(9);
3659 
3660     SKIP_BLANKS;
3661 
3662     /*
3663      * Parse the DOCTYPE name.
3664      */
3665     name = htmlParseName(ctxt);
3666     if (name == NULL) {
3667 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3668 	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3669 		     NULL, NULL);
3670     }
3671     /*
3672      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3673      */
3674 
3675     SKIP_BLANKS;
3676 
3677     /*
3678      * Check for SystemID and ExternalID
3679      */
3680     URI = htmlParseExternalID(ctxt, &ExternalID);
3681     SKIP_BLANKS;
3682 
3683     /*
3684      * We should be at the end of the DOCTYPE declaration.
3685      */
3686     if (CUR != '>') {
3687 	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3688 	             "DOCTYPE improperly terminated\n", NULL, NULL);
3689         /* Ignore bogus content */
3690         while ((CUR != 0) && (CUR != '>'))
3691             NEXT;
3692     }
3693     if (CUR == '>')
3694         NEXT;
3695 
3696     /*
3697      * Create or update the document accordingly to the DOCTYPE
3698      */
3699     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3700 	(!ctxt->disableSAX))
3701 	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3702 
3703     /*
3704      * Cleanup, since we don't use all those identifiers
3705      */
3706     if (URI != NULL) xmlFree(URI);
3707     if (ExternalID != NULL) xmlFree(ExternalID);
3708 }
3709 
3710 /**
3711  * htmlParseAttribute:
3712  * @ctxt:  an HTML parser context
3713  * @value:  a xmlChar ** used to store the value of the attribute
3714  *
3715  * parse an attribute
3716  *
3717  * [41] Attribute ::= Name Eq AttValue
3718  *
3719  * [25] Eq ::= S? '=' S?
3720  *
3721  * With namespace:
3722  *
3723  * [NS 11] Attribute ::= QName Eq AttValue
3724  *
3725  * Also the case QName == xmlns:??? is handled independently as a namespace
3726  * definition.
3727  *
3728  * Returns the attribute name, and the value in *value.
3729  */
3730 
3731 static const xmlChar *
htmlParseAttribute(htmlParserCtxtPtr ctxt,xmlChar ** value)3732 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3733     const xmlChar *name;
3734     xmlChar *val = NULL;
3735 
3736     *value = NULL;
3737     name = htmlParseHTMLName(ctxt);
3738     if (name == NULL) {
3739 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3740 	             "error parsing attribute name\n", NULL, NULL);
3741         return(NULL);
3742     }
3743 
3744     /*
3745      * read the value
3746      */
3747     SKIP_BLANKS;
3748     if (CUR == '=') {
3749         NEXT;
3750 	SKIP_BLANKS;
3751 	val = htmlParseAttValue(ctxt);
3752     }
3753 
3754     *value = val;
3755     return(name);
3756 }
3757 
3758 /**
3759  * htmlCheckEncodingDirect:
3760  * @ctxt:  an HTML parser context
3761  * @attvalue: the attribute value
3762  *
3763  * Checks an attribute value to detect
3764  * the encoding
3765  * If a new encoding is detected the parser is switched to decode
3766  * it and pass UTF8
3767  */
3768 static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt,const xmlChar * encoding)3769 htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3770 
3771     if ((ctxt == NULL) || (encoding == NULL) ||
3772         (ctxt->options & HTML_PARSE_IGNORE_ENC))
3773 	return;
3774 
3775     /* do not change encoding */
3776     if (ctxt->input->encoding != NULL)
3777         return;
3778 
3779     if (encoding != NULL) {
3780 	xmlCharEncoding enc;
3781 	xmlCharEncodingHandlerPtr handler;
3782 
3783 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3784 
3785 	if (ctxt->input->encoding != NULL)
3786 	    xmlFree((xmlChar *) ctxt->input->encoding);
3787 	ctxt->input->encoding = xmlStrdup(encoding);
3788 
3789 	enc = xmlParseCharEncoding((const char *) encoding);
3790 	/*
3791 	 * registered set of known encodings
3792 	 */
3793 	if (enc != XML_CHAR_ENCODING_ERROR) {
3794 	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3795 	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3796 		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3797 		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3798 		(ctxt->input->buf != NULL) &&
3799 		(ctxt->input->buf->encoder == NULL)) {
3800 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3801 		             "htmlCheckEncoding: wrong encoding meta\n",
3802 			     NULL, NULL);
3803 	    } else {
3804 		xmlSwitchEncoding(ctxt, enc);
3805 	    }
3806 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3807 	} else {
3808 	    /*
3809 	     * fallback for unknown encodings
3810 	     */
3811 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3812 	    if (handler != NULL) {
3813 		xmlSwitchToEncoding(ctxt, handler);
3814 		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3815 	    } else {
3816 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3817 		             "htmlCheckEncoding: unknown encoding %s\n",
3818 			     encoding, NULL);
3819 	    }
3820 	}
3821 
3822 	if ((ctxt->input->buf != NULL) &&
3823 	    (ctxt->input->buf->encoder != NULL) &&
3824 	    (ctxt->input->buf->raw != NULL) &&
3825 	    (ctxt->input->buf->buffer != NULL)) {
3826 	    int nbchars;
3827 	    int processed;
3828 
3829 	    /*
3830 	     * convert as much as possible to the parser reading buffer.
3831 	     */
3832 	    processed = ctxt->input->cur - ctxt->input->base;
3833 	    xmlBufShrink(ctxt->input->buf->buffer, processed);
3834 	    nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3835             xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3836 	    if (nbchars < 0) {
3837 		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3838 		             "htmlCheckEncoding: encoder error\n",
3839 			     NULL, NULL);
3840 	    }
3841 	}
3842     }
3843 }
3844 
3845 /**
3846  * htmlCheckEncoding:
3847  * @ctxt:  an HTML parser context
3848  * @attvalue: the attribute value
3849  *
3850  * Checks an http-equiv attribute from a Meta tag to detect
3851  * the encoding
3852  * If a new encoding is detected the parser is switched to decode
3853  * it and pass UTF8
3854  */
3855 static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt,const xmlChar * attvalue)3856 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3857     const xmlChar *encoding;
3858 
3859     if (!attvalue)
3860 	return;
3861 
3862     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3863     if (encoding != NULL) {
3864 	encoding += 7;
3865     }
3866     /*
3867      * skip blank
3868      */
3869     if (encoding && IS_BLANK_CH(*encoding))
3870 	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3871     if (encoding && *encoding == '=') {
3872 	encoding ++;
3873 	htmlCheckEncodingDirect(ctxt, encoding);
3874     }
3875 }
3876 
3877 /**
3878  * htmlCheckMeta:
3879  * @ctxt:  an HTML parser context
3880  * @atts:  the attributes values
3881  *
3882  * Checks an attributes from a Meta tag
3883  */
3884 static void
htmlCheckMeta(htmlParserCtxtPtr ctxt,const xmlChar ** atts)3885 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3886     int i;
3887     const xmlChar *att, *value;
3888     int http = 0;
3889     const xmlChar *content = NULL;
3890 
3891     if ((ctxt == NULL) || (atts == NULL))
3892 	return;
3893 
3894     i = 0;
3895     att = atts[i++];
3896     while (att != NULL) {
3897 	value = atts[i++];
3898 	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3899 	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3900 	    http = 1;
3901 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3902 	    htmlCheckEncodingDirect(ctxt, value);
3903 	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3904 	    content = value;
3905 	att = atts[i++];
3906     }
3907     if ((http) && (content != NULL))
3908 	htmlCheckEncoding(ctxt, content);
3909 
3910 }
3911 
3912 /**
3913  * htmlParseStartTag:
3914  * @ctxt:  an HTML parser context
3915  *
3916  * parse a start of tag either for rule element or
3917  * EmptyElement. In both case we don't parse the tag closing chars.
3918  *
3919  * [40] STag ::= '<' Name (S Attribute)* S? '>'
3920  *
3921  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3922  *
3923  * With namespace:
3924  *
3925  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3926  *
3927  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3928  *
3929  * Returns 0 in case of success, -1 in case of error and 1 if discarded
3930  */
3931 
3932 static int
htmlParseStartTag(htmlParserCtxtPtr ctxt)3933 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3934     const xmlChar *name;
3935     const xmlChar *attname;
3936     xmlChar *attvalue;
3937     const xmlChar **atts;
3938     int nbatts = 0;
3939     int maxatts;
3940     int meta = 0;
3941     int i;
3942     int discardtag = 0;
3943 
3944     if ((ctxt == NULL) || (ctxt->input == NULL)) {
3945 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3946 		     "htmlParseStartTag: context error\n", NULL, NULL);
3947 	return -1;
3948     }
3949     if (ctxt->instate == XML_PARSER_EOF)
3950         return(-1);
3951     if (CUR != '<') return -1;
3952     NEXT;
3953 
3954     atts = ctxt->atts;
3955     maxatts = ctxt->maxatts;
3956 
3957     GROW;
3958     name = htmlParseHTMLName(ctxt);
3959     if (name == NULL) {
3960 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3961 	             "htmlParseStartTag: invalid element name\n",
3962 		     NULL, NULL);
3963 	/* Dump the bogus tag like browsers do */
3964 	while ((CUR != 0) && (CUR != '>') &&
3965                (ctxt->instate != XML_PARSER_EOF))
3966 	    NEXT;
3967         return -1;
3968     }
3969     if (xmlStrEqual(name, BAD_CAST"meta"))
3970 	meta = 1;
3971 
3972     /*
3973      * Check for auto-closure of HTML elements.
3974      */
3975     htmlAutoClose(ctxt, name);
3976 
3977     /*
3978      * Check for implied HTML elements.
3979      */
3980     htmlCheckImplied(ctxt, name);
3981 
3982     /*
3983      * Avoid html at any level > 0, head at any level != 1
3984      * or any attempt to recurse body
3985      */
3986     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3987 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3988 	             "htmlParseStartTag: misplaced <html> tag\n",
3989 		     name, NULL);
3990 	discardtag = 1;
3991 	ctxt->depth++;
3992     }
3993     if ((ctxt->nameNr != 1) &&
3994 	(xmlStrEqual(name, BAD_CAST"head"))) {
3995 	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3996 	             "htmlParseStartTag: misplaced <head> tag\n",
3997 		     name, NULL);
3998 	discardtag = 1;
3999 	ctxt->depth++;
4000     }
4001     if (xmlStrEqual(name, BAD_CAST"body")) {
4002 	int indx;
4003 	for (indx = 0;indx < ctxt->nameNr;indx++) {
4004 	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4005 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4006 		             "htmlParseStartTag: misplaced <body> tag\n",
4007 			     name, NULL);
4008 		discardtag = 1;
4009 		ctxt->depth++;
4010 	    }
4011 	}
4012     }
4013 
4014     /*
4015      * Now parse the attributes, it ends up with the ending
4016      *
4017      * (S Attribute)* S?
4018      */
4019     SKIP_BLANKS;
4020     while ((CUR != 0) &&
4021            (CUR != '>') &&
4022 	   ((CUR != '/') || (NXT(1) != '>'))) {
4023 	GROW;
4024 	attname = htmlParseAttribute(ctxt, &attvalue);
4025         if (attname != NULL) {
4026 
4027 	    /*
4028 	     * Well formedness requires at most one declaration of an attribute
4029 	     */
4030 	    for (i = 0; i < nbatts;i += 2) {
4031 	        if (xmlStrEqual(atts[i], attname)) {
4032 		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4033 		                 "Attribute %s redefined\n", attname, NULL);
4034 		    if (attvalue != NULL)
4035 			xmlFree(attvalue);
4036 		    goto failed;
4037 		}
4038 	    }
4039 
4040 	    /*
4041 	     * Add the pair to atts
4042 	     */
4043 	    if (atts == NULL) {
4044 	        maxatts = 22; /* allow for 10 attrs by default */
4045 	        atts = (const xmlChar **)
4046 		       xmlMalloc(maxatts * sizeof(xmlChar *));
4047 		if (atts == NULL) {
4048 		    htmlErrMemory(ctxt, NULL);
4049 		    if (attvalue != NULL)
4050 			xmlFree(attvalue);
4051 		    goto failed;
4052 		}
4053 		ctxt->atts = atts;
4054 		ctxt->maxatts = maxatts;
4055 	    } else if (nbatts + 4 > maxatts) {
4056 	        const xmlChar **n;
4057 
4058 	        maxatts *= 2;
4059 	        n = (const xmlChar **) xmlRealloc((void *) atts,
4060 					     maxatts * sizeof(const xmlChar *));
4061 		if (n == NULL) {
4062 		    htmlErrMemory(ctxt, NULL);
4063 		    if (attvalue != NULL)
4064 			xmlFree(attvalue);
4065 		    goto failed;
4066 		}
4067 		atts = n;
4068 		ctxt->atts = atts;
4069 		ctxt->maxatts = maxatts;
4070 	    }
4071 	    atts[nbatts++] = attname;
4072 	    atts[nbatts++] = attvalue;
4073 	    atts[nbatts] = NULL;
4074 	    atts[nbatts + 1] = NULL;
4075 	}
4076 	else {
4077 	    if (attvalue != NULL)
4078 	        xmlFree(attvalue);
4079 	    /* Dump the bogus attribute string up to the next blank or
4080 	     * the end of the tag. */
4081 	    while ((CUR != 0) &&
4082 	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4083 		   ((CUR != '/') || (NXT(1) != '>')))
4084 		NEXT;
4085 	}
4086 
4087 failed:
4088 	SKIP_BLANKS;
4089     }
4090 
4091     /*
4092      * Handle specific association to the META tag
4093      */
4094     if (meta && (nbatts != 0))
4095 	htmlCheckMeta(ctxt, atts);
4096 
4097     /*
4098      * SAX: Start of Element !
4099      */
4100     if (!discardtag) {
4101 	htmlnamePush(ctxt, name);
4102 	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4103 	    if (nbatts != 0)
4104 		ctxt->sax->startElement(ctxt->userData, name, atts);
4105 	    else
4106 		ctxt->sax->startElement(ctxt->userData, name, NULL);
4107 	}
4108     }
4109 
4110     if (atts != NULL) {
4111         for (i = 1;i < nbatts;i += 2) {
4112 	    if (atts[i] != NULL)
4113 		xmlFree((xmlChar *) atts[i]);
4114 	}
4115     }
4116 
4117     return(discardtag);
4118 }
4119 
4120 /**
4121  * htmlParseEndTag:
4122  * @ctxt:  an HTML parser context
4123  *
4124  * parse an end of tag
4125  *
4126  * [42] ETag ::= '</' Name S? '>'
4127  *
4128  * With namespace
4129  *
4130  * [NS 9] ETag ::= '</' QName S? '>'
4131  *
4132  * Returns 1 if the current level should be closed.
4133  */
4134 
4135 static int
htmlParseEndTag(htmlParserCtxtPtr ctxt)4136 htmlParseEndTag(htmlParserCtxtPtr ctxt)
4137 {
4138     const xmlChar *name;
4139     const xmlChar *oldname;
4140     int i, ret;
4141 
4142     if ((CUR != '<') || (NXT(1) != '/')) {
4143         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4144 	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
4145         return (0);
4146     }
4147     SKIP(2);
4148 
4149     name = htmlParseHTMLName(ctxt);
4150     if (name == NULL)
4151         return (0);
4152     /*
4153      * We should definitely be at the ending "S? '>'" part
4154      */
4155     SKIP_BLANKS;
4156     if (CUR != '>') {
4157         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4158 	             "End tag : expected '>'\n", NULL, NULL);
4159         /* Skip to next '>' */
4160         while ((CUR != 0) && (CUR != '>'))
4161             NEXT;
4162     }
4163     if (CUR == '>')
4164         NEXT;
4165 
4166     /*
4167      * if we ignored misplaced tags in htmlParseStartTag don't pop them
4168      * out now.
4169      */
4170     if ((ctxt->depth > 0) &&
4171         (xmlStrEqual(name, BAD_CAST "html") ||
4172          xmlStrEqual(name, BAD_CAST "body") ||
4173 	 xmlStrEqual(name, BAD_CAST "head"))) {
4174 	ctxt->depth--;
4175 	return (0);
4176     }
4177 
4178     /*
4179      * If the name read is not one of the element in the parsing stack
4180      * then return, it's just an error.
4181      */
4182     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4183         if (xmlStrEqual(name, ctxt->nameTab[i]))
4184             break;
4185     }
4186     if (i < 0) {
4187         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4188 	             "Unexpected end tag : %s\n", name, NULL);
4189         return (0);
4190     }
4191 
4192 
4193     /*
4194      * Check for auto-closure of HTML elements.
4195      */
4196 
4197     htmlAutoCloseOnClose(ctxt, name);
4198 
4199     /*
4200      * Well formedness constraints, opening and closing must match.
4201      * With the exception that the autoclose may have popped stuff out
4202      * of the stack.
4203      */
4204     if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4205         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4206                      "Opening and ending tag mismatch: %s and %s\n",
4207                      name, ctxt->name);
4208     }
4209 
4210     /*
4211      * SAX: End of Tag
4212      */
4213     oldname = ctxt->name;
4214     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4215         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4216             ctxt->sax->endElement(ctxt->userData, name);
4217 	htmlNodeInfoPop(ctxt);
4218         htmlnamePop(ctxt);
4219         ret = 1;
4220     } else {
4221         ret = 0;
4222     }
4223 
4224     return (ret);
4225 }
4226 
4227 
4228 /**
4229  * htmlParseReference:
4230  * @ctxt:  an HTML parser context
4231  *
4232  * parse and handle entity references in content,
4233  * this will end-up in a call to character() since this is either a
4234  * CharRef, or a predefined entity.
4235  */
4236 static void
htmlParseReference(htmlParserCtxtPtr ctxt)4237 htmlParseReference(htmlParserCtxtPtr ctxt) {
4238     const htmlEntityDesc * ent;
4239     xmlChar out[6];
4240     const xmlChar *name;
4241     if (CUR != '&') return;
4242 
4243     if (NXT(1) == '#') {
4244 	unsigned int c;
4245 	int bits, i = 0;
4246 
4247 	c = htmlParseCharRef(ctxt);
4248 	if (c == 0)
4249 	    return;
4250 
4251         if      (c <    0x80) { out[i++]= c;                bits= -6; }
4252         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4253         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4254         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4255 
4256         for ( ; bits >= 0; bits-= 6) {
4257             out[i++]= ((c >> bits) & 0x3F) | 0x80;
4258         }
4259 	out[i] = 0;
4260 
4261 	htmlCheckParagraph(ctxt);
4262 	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4263 	    ctxt->sax->characters(ctxt->userData, out, i);
4264     } else {
4265 	ent = htmlParseEntityRef(ctxt, &name);
4266 	if (name == NULL) {
4267 	    htmlCheckParagraph(ctxt);
4268 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4269 	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4270 	    return;
4271 	}
4272 	if ((ent == NULL) || !(ent->value > 0)) {
4273 	    htmlCheckParagraph(ctxt);
4274 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4275 		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4276 		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4277 		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4278 	    }
4279 	} else {
4280 	    unsigned int c;
4281 	    int bits, i = 0;
4282 
4283 	    c = ent->value;
4284 	    if      (c <    0x80)
4285 	            { out[i++]= c;                bits= -6; }
4286 	    else if (c <   0x800)
4287 	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4288 	    else if (c < 0x10000)
4289 	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4290 	    else
4291 	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4292 
4293 	    for ( ; bits >= 0; bits-= 6) {
4294 		out[i++]= ((c >> bits) & 0x3F) | 0x80;
4295 	    }
4296 	    out[i] = 0;
4297 
4298 	    htmlCheckParagraph(ctxt);
4299 	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4300 		ctxt->sax->characters(ctxt->userData, out, i);
4301 	}
4302     }
4303 }
4304 
4305 /**
4306  * htmlParseContent:
4307  * @ctxt:  an HTML parser context
4308  *
4309  * Parse a content: comment, sub-element, reference or text.
4310  * Kept for compatibility with old code
4311  */
4312 
4313 static void
htmlParseContent(htmlParserCtxtPtr ctxt)4314 htmlParseContent(htmlParserCtxtPtr ctxt) {
4315     xmlChar *currentNode;
4316     int depth;
4317     const xmlChar *name;
4318 
4319     currentNode = xmlStrdup(ctxt->name);
4320     depth = ctxt->nameNr;
4321     while (1) {
4322         GROW;
4323 
4324         if (ctxt->instate == XML_PARSER_EOF)
4325             break;
4326 
4327 	/*
4328 	 * Our tag or one of it's parent or children is ending.
4329 	 */
4330         if ((CUR == '<') && (NXT(1) == '/')) {
4331 	    if (htmlParseEndTag(ctxt) &&
4332 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4333 		if (currentNode != NULL)
4334 		    xmlFree(currentNode);
4335 		return;
4336 	    }
4337 	    continue; /* while */
4338         }
4339 
4340 	else if ((CUR == '<') &&
4341 	         ((IS_ASCII_LETTER(NXT(1))) ||
4342 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4343 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4344 	    if (name == NULL) {
4345 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4346 			 "htmlParseStartTag: invalid element name\n",
4347 			 NULL, NULL);
4348 	        /* Dump the bogus tag like browsers do */
4349                 while ((CUR != 0) && (CUR != '>'))
4350 	            NEXT;
4351 
4352 	        if (currentNode != NULL)
4353 	            xmlFree(currentNode);
4354 	        return;
4355 	    }
4356 
4357 	    if (ctxt->name != NULL) {
4358 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4359 	            htmlAutoClose(ctxt, name);
4360 	            continue;
4361 	        }
4362 	    }
4363 	}
4364 
4365 	/*
4366 	 * Has this node been popped out during parsing of
4367 	 * the next element
4368 	 */
4369         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4370 	    (!xmlStrEqual(currentNode, ctxt->name)))
4371 	     {
4372 	    if (currentNode != NULL) xmlFree(currentNode);
4373 	    return;
4374 	}
4375 
4376 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4377 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4378 	    /*
4379 	     * Handle SCRIPT/STYLE separately
4380 	     */
4381 	    htmlParseScript(ctxt);
4382 	} else {
4383 	    /*
4384 	     * Sometimes DOCTYPE arrives in the middle of the document
4385 	     */
4386 	    if ((CUR == '<') && (NXT(1) == '!') &&
4387 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4388 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4389 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4390 		(UPP(8) == 'E')) {
4391 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4392 		             "Misplaced DOCTYPE declaration\n",
4393 			     BAD_CAST "DOCTYPE" , NULL);
4394 		htmlParseDocTypeDecl(ctxt);
4395 	    }
4396 
4397 	    /*
4398 	     * First case :  a comment
4399 	     */
4400 	    if ((CUR == '<') && (NXT(1) == '!') &&
4401 		(NXT(2) == '-') && (NXT(3) == '-')) {
4402 		htmlParseComment(ctxt);
4403 	    }
4404 
4405 	    /*
4406 	     * Second case : a Processing Instruction.
4407 	     */
4408 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4409 		htmlParsePI(ctxt);
4410 	    }
4411 
4412 	    /*
4413 	     * Third case :  a sub-element.
4414 	     */
4415 	    else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4416 		htmlParseElement(ctxt);
4417 	    }
4418 	    else if (CUR == '<') {
4419                 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4420                     (ctxt->sax->characters != NULL))
4421                     ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4422                 NEXT;
4423 	    }
4424 
4425 	    /*
4426 	     * Fourth case : a reference. If if has not been resolved,
4427 	     *    parsing returns it's Name, create the node
4428 	     */
4429 	    else if (CUR == '&') {
4430 		htmlParseReference(ctxt);
4431 	    }
4432 
4433 	    /*
4434 	     * Fifth case : end of the resource
4435 	     */
4436 	    else if (CUR == 0) {
4437 		htmlAutoCloseOnEnd(ctxt);
4438 		break;
4439 	    }
4440 
4441 	    /*
4442 	     * Last case, text. Note that References are handled directly.
4443 	     */
4444 	    else {
4445 		htmlParseCharData(ctxt);
4446 	    }
4447 	}
4448         GROW;
4449     }
4450     if (currentNode != NULL) xmlFree(currentNode);
4451 }
4452 
4453 /**
4454  * htmlParseElement:
4455  * @ctxt:  an HTML parser context
4456  *
4457  * parse an HTML element, this is highly recursive
4458  * this is kept for compatibility with previous code versions
4459  *
4460  * [39] element ::= EmptyElemTag | STag content ETag
4461  *
4462  * [41] Attribute ::= Name Eq AttValue
4463  */
4464 
4465 void
htmlParseElement(htmlParserCtxtPtr ctxt)4466 htmlParseElement(htmlParserCtxtPtr ctxt) {
4467     const xmlChar *name;
4468     xmlChar *currentNode = NULL;
4469     const htmlElemDesc * info;
4470     htmlParserNodeInfo node_info;
4471     int failed;
4472     int depth;
4473     const xmlChar *oldptr;
4474 
4475     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4476 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4477 		     "htmlParseElement: context error\n", NULL, NULL);
4478 	return;
4479     }
4480 
4481     if (ctxt->instate == XML_PARSER_EOF)
4482         return;
4483 
4484     /* Capture start position */
4485     if (ctxt->record_info) {
4486         node_info.begin_pos = ctxt->input->consumed +
4487                           (CUR_PTR - ctxt->input->base);
4488 	node_info.begin_line = ctxt->input->line;
4489     }
4490 
4491     failed = htmlParseStartTag(ctxt);
4492     name = ctxt->name;
4493     if ((failed == -1) || (name == NULL)) {
4494 	if (CUR == '>')
4495 	    NEXT;
4496         return;
4497     }
4498 
4499     /*
4500      * Lookup the info for that element.
4501      */
4502     info = htmlTagLookup(name);
4503     if (info == NULL) {
4504 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4505 	             "Tag %s invalid\n", name, NULL);
4506     }
4507 
4508     /*
4509      * Check for an Empty Element labeled the XML/SGML way
4510      */
4511     if ((CUR == '/') && (NXT(1) == '>')) {
4512         SKIP(2);
4513 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4514 	    ctxt->sax->endElement(ctxt->userData, name);
4515 	htmlnamePop(ctxt);
4516 	return;
4517     }
4518 
4519     if (CUR == '>') {
4520         NEXT;
4521     } else {
4522 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4523 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4524 
4525 	/*
4526 	 * end of parsing of this node.
4527 	 */
4528 	if (xmlStrEqual(name, ctxt->name)) {
4529 	    nodePop(ctxt);
4530 	    htmlnamePop(ctxt);
4531 	}
4532 
4533 	/*
4534 	 * Capture end position and add node
4535 	 */
4536 	if (ctxt->record_info) {
4537 	   node_info.end_pos = ctxt->input->consumed +
4538 			      (CUR_PTR - ctxt->input->base);
4539 	   node_info.end_line = ctxt->input->line;
4540 	   node_info.node = ctxt->node;
4541 	   xmlParserAddNodeInfo(ctxt, &node_info);
4542 	}
4543 	return;
4544     }
4545 
4546     /*
4547      * Check for an Empty Element from DTD definition
4548      */
4549     if ((info != NULL) && (info->empty)) {
4550 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4551 	    ctxt->sax->endElement(ctxt->userData, name);
4552 	htmlnamePop(ctxt);
4553 	return;
4554     }
4555 
4556     /*
4557      * Parse the content of the element:
4558      */
4559     currentNode = xmlStrdup(ctxt->name);
4560     depth = ctxt->nameNr;
4561     while (CUR != 0) {
4562 	oldptr = ctxt->input->cur;
4563 	htmlParseContent(ctxt);
4564 	if (oldptr==ctxt->input->cur) break;
4565 	if (ctxt->nameNr < depth) break;
4566     }
4567 
4568     /*
4569      * Capture end position and add node
4570      */
4571     if ( currentNode != NULL && ctxt->record_info ) {
4572        node_info.end_pos = ctxt->input->consumed +
4573                           (CUR_PTR - ctxt->input->base);
4574        node_info.end_line = ctxt->input->line;
4575        node_info.node = ctxt->node;
4576        xmlParserAddNodeInfo(ctxt, &node_info);
4577     }
4578     if (CUR == 0) {
4579 	htmlAutoCloseOnEnd(ctxt);
4580     }
4581 
4582     if (currentNode != NULL)
4583 	xmlFree(currentNode);
4584 }
4585 
4586 static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt)4587 htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4588     /*
4589      * Capture end position and add node
4590      */
4591     if ( ctxt->node != NULL && ctxt->record_info ) {
4592        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4593                                 (CUR_PTR - ctxt->input->base);
4594        ctxt->nodeInfo->end_line = ctxt->input->line;
4595        ctxt->nodeInfo->node = ctxt->node;
4596        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4597        htmlNodeInfoPop(ctxt);
4598     }
4599     if (CUR == 0) {
4600        htmlAutoCloseOnEnd(ctxt);
4601     }
4602 }
4603 
4604 /**
4605  * htmlParseElementInternal:
4606  * @ctxt:  an HTML parser context
4607  *
4608  * parse an HTML element, new version, non recursive
4609  *
4610  * [39] element ::= EmptyElemTag | STag content ETag
4611  *
4612  * [41] Attribute ::= Name Eq AttValue
4613  */
4614 
4615 static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt)4616 htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4617     const xmlChar *name;
4618     const htmlElemDesc * info;
4619     htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4620     int failed;
4621 
4622     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4623 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4624 		     "htmlParseElementInternal: context error\n", NULL, NULL);
4625 	return;
4626     }
4627 
4628     if (ctxt->instate == XML_PARSER_EOF)
4629         return;
4630 
4631     /* Capture start position */
4632     if (ctxt->record_info) {
4633         node_info.begin_pos = ctxt->input->consumed +
4634                           (CUR_PTR - ctxt->input->base);
4635 	node_info.begin_line = ctxt->input->line;
4636     }
4637 
4638     failed = htmlParseStartTag(ctxt);
4639     name = ctxt->name;
4640     if ((failed == -1) || (name == NULL)) {
4641 	if (CUR == '>')
4642 	    NEXT;
4643         return;
4644     }
4645 
4646     /*
4647      * Lookup the info for that element.
4648      */
4649     info = htmlTagLookup(name);
4650     if (info == NULL) {
4651 	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4652 	             "Tag %s invalid\n", name, NULL);
4653     }
4654 
4655     /*
4656      * Check for an Empty Element labeled the XML/SGML way
4657      */
4658     if ((CUR == '/') && (NXT(1) == '>')) {
4659         SKIP(2);
4660 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4661 	    ctxt->sax->endElement(ctxt->userData, name);
4662 	htmlnamePop(ctxt);
4663 	return;
4664     }
4665 
4666     if (CUR == '>') {
4667         NEXT;
4668     } else {
4669 	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4670 	             "Couldn't find end of Start Tag %s\n", name, NULL);
4671 
4672 	/*
4673 	 * end of parsing of this node.
4674 	 */
4675 	if (xmlStrEqual(name, ctxt->name)) {
4676 	    nodePop(ctxt);
4677 	    htmlnamePop(ctxt);
4678 	}
4679 
4680         if (ctxt->record_info)
4681             htmlNodeInfoPush(ctxt, &node_info);
4682         htmlParserFinishElementParsing(ctxt);
4683 	return;
4684     }
4685 
4686     /*
4687      * Check for an Empty Element from DTD definition
4688      */
4689     if ((info != NULL) && (info->empty)) {
4690 	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4691 	    ctxt->sax->endElement(ctxt->userData, name);
4692 	htmlnamePop(ctxt);
4693 	return;
4694     }
4695 
4696     if (ctxt->record_info)
4697         htmlNodeInfoPush(ctxt, &node_info);
4698 }
4699 
4700 /**
4701  * htmlParseContentInternal:
4702  * @ctxt:  an HTML parser context
4703  *
4704  * Parse a content: comment, sub-element, reference or text.
4705  * New version for non recursive htmlParseElementInternal
4706  */
4707 
4708 static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt)4709 htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4710     xmlChar *currentNode;
4711     int depth;
4712     const xmlChar *name;
4713 
4714     currentNode = xmlStrdup(ctxt->name);
4715     depth = ctxt->nameNr;
4716     while (1) {
4717         GROW;
4718 
4719         if (ctxt->instate == XML_PARSER_EOF)
4720             break;
4721 
4722 	/*
4723 	 * Our tag or one of it's parent or children is ending.
4724 	 */
4725         if ((CUR == '<') && (NXT(1) == '/')) {
4726 	    if (htmlParseEndTag(ctxt) &&
4727 		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4728 		if (currentNode != NULL)
4729 		    xmlFree(currentNode);
4730 
4731 	        currentNode = xmlStrdup(ctxt->name);
4732 	        depth = ctxt->nameNr;
4733 	    }
4734 	    continue; /* while */
4735         }
4736 
4737 	else if ((CUR == '<') &&
4738 	         ((IS_ASCII_LETTER(NXT(1))) ||
4739 		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4740 	    name = htmlParseHTMLName_nonInvasive(ctxt);
4741 	    if (name == NULL) {
4742 	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4743 			 "htmlParseStartTag: invalid element name\n",
4744 			 NULL, NULL);
4745 	        /* Dump the bogus tag like browsers do */
4746 	        while ((CUR == 0) && (CUR != '>'))
4747 	            NEXT;
4748 
4749 	        htmlParserFinishElementParsing(ctxt);
4750 	        if (currentNode != NULL)
4751 	            xmlFree(currentNode);
4752 
4753 	        currentNode = xmlStrdup(ctxt->name);
4754 	        depth = ctxt->nameNr;
4755 	        continue;
4756 	    }
4757 
4758 	    if (ctxt->name != NULL) {
4759 	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4760 	            htmlAutoClose(ctxt, name);
4761 	            continue;
4762 	        }
4763 	    }
4764 	}
4765 
4766 	/*
4767 	 * Has this node been popped out during parsing of
4768 	 * the next element
4769 	 */
4770         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4771 	    (!xmlStrEqual(currentNode, ctxt->name)))
4772 	     {
4773 	    htmlParserFinishElementParsing(ctxt);
4774 	    if (currentNode != NULL) xmlFree(currentNode);
4775 
4776 	    currentNode = xmlStrdup(ctxt->name);
4777 	    depth = ctxt->nameNr;
4778 	    continue;
4779 	}
4780 
4781 	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4782 	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4783 	    /*
4784 	     * Handle SCRIPT/STYLE separately
4785 	     */
4786 	    htmlParseScript(ctxt);
4787 	} else {
4788 	    /*
4789 	     * Sometimes DOCTYPE arrives in the middle of the document
4790 	     */
4791 	    if ((CUR == '<') && (NXT(1) == '!') &&
4792 		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4793 		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4794 		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4795 		(UPP(8) == 'E')) {
4796 		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4797 		             "Misplaced DOCTYPE declaration\n",
4798 			     BAD_CAST "DOCTYPE" , NULL);
4799 		htmlParseDocTypeDecl(ctxt);
4800 	    }
4801 
4802 	    /*
4803 	     * First case :  a comment
4804 	     */
4805 	    if ((CUR == '<') && (NXT(1) == '!') &&
4806 		(NXT(2) == '-') && (NXT(3) == '-')) {
4807 		htmlParseComment(ctxt);
4808 	    }
4809 
4810 	    /*
4811 	     * Second case : a Processing Instruction.
4812 	     */
4813 	    else if ((CUR == '<') && (NXT(1) == '?')) {
4814 		htmlParsePI(ctxt);
4815 	    }
4816 
4817 	    /*
4818 	     * Third case :  a sub-element.
4819 	     */
4820 	    else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4821 		htmlParseElementInternal(ctxt);
4822 		if (currentNode != NULL) xmlFree(currentNode);
4823 
4824 		currentNode = xmlStrdup(ctxt->name);
4825 		depth = ctxt->nameNr;
4826 	    }
4827 	    else if (CUR == '<') {
4828                 if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4829                     (ctxt->sax->characters != NULL))
4830                     ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4831                 NEXT;
4832             }
4833 
4834 	    /*
4835 	     * Fourth case : a reference. If if has not been resolved,
4836 	     *    parsing returns it's Name, create the node
4837 	     */
4838 	    else if (CUR == '&') {
4839 		htmlParseReference(ctxt);
4840 	    }
4841 
4842 	    /*
4843 	     * Fifth case : end of the resource
4844 	     */
4845 	    else if (CUR == 0) {
4846 		htmlAutoCloseOnEnd(ctxt);
4847 		break;
4848 	    }
4849 
4850 	    /*
4851 	     * Last case, text. Note that References are handled directly.
4852 	     */
4853 	    else {
4854 		htmlParseCharData(ctxt);
4855 	    }
4856 	}
4857         GROW;
4858     }
4859     if (currentNode != NULL) xmlFree(currentNode);
4860 }
4861 
4862 /**
4863  * htmlParseContent:
4864  * @ctxt:  an HTML parser context
4865  *
4866  * Parse a content: comment, sub-element, reference or text.
4867  * This is the entry point when called from parser.c
4868  */
4869 
4870 void
__htmlParseContent(void * ctxt)4871 __htmlParseContent(void *ctxt) {
4872     if (ctxt != NULL)
4873 	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4874 }
4875 
4876 /**
4877  * htmlParseDocument:
4878  * @ctxt:  an HTML parser context
4879  *
4880  * parse an HTML document (and build a tree if using the standard SAX
4881  * interface).
4882  *
4883  * Returns 0, -1 in case of error. the parser context is augmented
4884  *                as a result of the parsing.
4885  */
4886 
4887 int
htmlParseDocument(htmlParserCtxtPtr ctxt)4888 htmlParseDocument(htmlParserCtxtPtr ctxt) {
4889     xmlChar start[4];
4890     xmlCharEncoding enc;
4891     xmlDtdPtr dtd;
4892 
4893     xmlInitParser();
4894 
4895     htmlDefaultSAXHandlerInit();
4896 
4897     if ((ctxt == NULL) || (ctxt->input == NULL)) {
4898 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4899 		     "htmlParseDocument: context error\n", NULL, NULL);
4900 	return(XML_ERR_INTERNAL_ERROR);
4901     }
4902     ctxt->html = 1;
4903     ctxt->linenumbers = 1;
4904     GROW;
4905     /*
4906      * SAX: beginning of the document processing.
4907      */
4908     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4909         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4910 
4911     if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4912         ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4913 	/*
4914 	 * Get the 4 first bytes and decode the charset
4915 	 * if enc != XML_CHAR_ENCODING_NONE
4916 	 * plug some encoding conversion routines.
4917 	 */
4918 	start[0] = RAW;
4919 	start[1] = NXT(1);
4920 	start[2] = NXT(2);
4921 	start[3] = NXT(3);
4922 	enc = xmlDetectCharEncoding(&start[0], 4);
4923 	if (enc != XML_CHAR_ENCODING_NONE) {
4924 	    xmlSwitchEncoding(ctxt, enc);
4925 	}
4926     }
4927 
4928     /*
4929      * Wipe out everything which is before the first '<'
4930      */
4931     SKIP_BLANKS;
4932     if (CUR == 0) {
4933 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4934 	             "Document is empty\n", NULL, NULL);
4935     }
4936 
4937     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4938 	ctxt->sax->startDocument(ctxt->userData);
4939 
4940 
4941     /*
4942      * Parse possible comments and PIs before any content
4943      */
4944     while (((CUR == '<') && (NXT(1) == '!') &&
4945             (NXT(2) == '-') && (NXT(3) == '-')) ||
4946 	   ((CUR == '<') && (NXT(1) == '?'))) {
4947         htmlParseComment(ctxt);
4948         htmlParsePI(ctxt);
4949 	SKIP_BLANKS;
4950     }
4951 
4952 
4953     /*
4954      * Then possibly doc type declaration(s) and more Misc
4955      * (doctypedecl Misc*)?
4956      */
4957     if ((CUR == '<') && (NXT(1) == '!') &&
4958 	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4959 	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4960 	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4961 	(UPP(8) == 'E')) {
4962 	htmlParseDocTypeDecl(ctxt);
4963     }
4964     SKIP_BLANKS;
4965 
4966     /*
4967      * Parse possible comments and PIs before any content
4968      */
4969     while (((CUR == '<') && (NXT(1) == '!') &&
4970             (NXT(2) == '-') && (NXT(3) == '-')) ||
4971 	   ((CUR == '<') && (NXT(1) == '?'))) {
4972         htmlParseComment(ctxt);
4973         htmlParsePI(ctxt);
4974 	SKIP_BLANKS;
4975     }
4976 
4977     /*
4978      * Time to start parsing the tree itself
4979      */
4980     htmlParseContentInternal(ctxt);
4981 
4982     /*
4983      * autoclose
4984      */
4985     if (CUR == 0)
4986 	htmlAutoCloseOnEnd(ctxt);
4987 
4988 
4989     /*
4990      * SAX: end of the document processing.
4991      */
4992     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4993         ctxt->sax->endDocument(ctxt->userData);
4994 
4995     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4996 	dtd = xmlGetIntSubset(ctxt->myDoc);
4997 	if (dtd == NULL)
4998 	    ctxt->myDoc->intSubset =
4999 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5000 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5001 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5002     }
5003     if (! ctxt->wellFormed) return(-1);
5004     return(0);
5005 }
5006 
5007 
5008 /************************************************************************
5009  *									*
5010  *			Parser contexts handling			*
5011  *									*
5012  ************************************************************************/
5013 
5014 /**
5015  * htmlInitParserCtxt:
5016  * @ctxt:  an HTML parser context
5017  *
5018  * Initialize a parser context
5019  *
5020  * Returns 0 in case of success and -1 in case of error
5021  */
5022 
5023 static int
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)5024 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5025 {
5026     htmlSAXHandler *sax;
5027 
5028     if (ctxt == NULL) return(-1);
5029     memset(ctxt, 0, sizeof(htmlParserCtxt));
5030 
5031     ctxt->dict = xmlDictCreate();
5032     if (ctxt->dict == NULL) {
5033         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5034 	return(-1);
5035     }
5036     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5037     if (sax == NULL) {
5038         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5039 	return(-1);
5040     }
5041     else
5042         memset(sax, 0, sizeof(htmlSAXHandler));
5043 
5044     /* Allocate the Input stack */
5045     ctxt->inputTab = (htmlParserInputPtr *)
5046                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
5047     if (ctxt->inputTab == NULL) {
5048         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5049 	ctxt->inputNr = 0;
5050 	ctxt->inputMax = 0;
5051 	ctxt->input = NULL;
5052 	return(-1);
5053     }
5054     ctxt->inputNr = 0;
5055     ctxt->inputMax = 5;
5056     ctxt->input = NULL;
5057     ctxt->version = NULL;
5058     ctxt->encoding = NULL;
5059     ctxt->standalone = -1;
5060     ctxt->instate = XML_PARSER_START;
5061 
5062     /* Allocate the Node stack */
5063     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5064     if (ctxt->nodeTab == NULL) {
5065         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5066 	ctxt->nodeNr = 0;
5067 	ctxt->nodeMax = 0;
5068 	ctxt->node = NULL;
5069 	ctxt->inputNr = 0;
5070 	ctxt->inputMax = 0;
5071 	ctxt->input = NULL;
5072 	return(-1);
5073     }
5074     ctxt->nodeNr = 0;
5075     ctxt->nodeMax = 10;
5076     ctxt->node = NULL;
5077 
5078     /* Allocate the Name stack */
5079     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5080     if (ctxt->nameTab == NULL) {
5081         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5082 	ctxt->nameNr = 0;
5083 	ctxt->nameMax = 0;
5084 	ctxt->name = NULL;
5085 	ctxt->nodeNr = 0;
5086 	ctxt->nodeMax = 0;
5087 	ctxt->node = NULL;
5088 	ctxt->inputNr = 0;
5089 	ctxt->inputMax = 0;
5090 	ctxt->input = NULL;
5091 	return(-1);
5092     }
5093     ctxt->nameNr = 0;
5094     ctxt->nameMax = 10;
5095     ctxt->name = NULL;
5096 
5097     ctxt->nodeInfoTab = NULL;
5098     ctxt->nodeInfoNr  = 0;
5099     ctxt->nodeInfoMax = 0;
5100 
5101     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5102     else {
5103         ctxt->sax = sax;
5104 	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5105     }
5106     ctxt->userData = ctxt;
5107     ctxt->myDoc = NULL;
5108     ctxt->wellFormed = 1;
5109     ctxt->replaceEntities = 0;
5110     ctxt->linenumbers = xmlLineNumbersDefaultValue;
5111     ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5112     ctxt->html = 1;
5113     ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5114     ctxt->vctxt.userData = ctxt;
5115     ctxt->vctxt.error = xmlParserValidityError;
5116     ctxt->vctxt.warning = xmlParserValidityWarning;
5117     ctxt->record_info = 0;
5118     ctxt->validate = 0;
5119     ctxt->checkIndex = 0;
5120     ctxt->catalogs = NULL;
5121     xmlInitNodeInfoSeq(&ctxt->node_seq);
5122     return(0);
5123 }
5124 
5125 /**
5126  * htmlFreeParserCtxt:
5127  * @ctxt:  an HTML parser context
5128  *
5129  * Free all the memory used by a parser context. However the parsed
5130  * document in ctxt->myDoc is not freed.
5131  */
5132 
5133 void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)5134 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5135 {
5136     xmlFreeParserCtxt(ctxt);
5137 }
5138 
5139 /**
5140  * htmlNewParserCtxt:
5141  *
5142  * Allocate and initialize a new parser context.
5143  *
5144  * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5145  */
5146 
5147 htmlParserCtxtPtr
htmlNewParserCtxt(void)5148 htmlNewParserCtxt(void)
5149 {
5150     xmlParserCtxtPtr ctxt;
5151 
5152     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5153     if (ctxt == NULL) {
5154         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5155 	return(NULL);
5156     }
5157     memset(ctxt, 0, sizeof(xmlParserCtxt));
5158     if (htmlInitParserCtxt(ctxt) < 0) {
5159         htmlFreeParserCtxt(ctxt);
5160 	return(NULL);
5161     }
5162     return(ctxt);
5163 }
5164 
5165 /**
5166  * htmlCreateMemoryParserCtxt:
5167  * @buffer:  a pointer to a char array
5168  * @size:  the size of the array
5169  *
5170  * Create a parser context for an HTML in-memory document.
5171  *
5172  * Returns the new parser context or NULL
5173  */
5174 htmlParserCtxtPtr
htmlCreateMemoryParserCtxt(const char * buffer,int size)5175 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5176     xmlParserCtxtPtr ctxt;
5177     xmlParserInputPtr input;
5178     xmlParserInputBufferPtr buf;
5179 
5180     if (buffer == NULL)
5181 	return(NULL);
5182     if (size <= 0)
5183 	return(NULL);
5184 
5185     ctxt = htmlNewParserCtxt();
5186     if (ctxt == NULL)
5187 	return(NULL);
5188 
5189     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5190     if (buf == NULL) return(NULL);
5191 
5192     input = xmlNewInputStream(ctxt);
5193     if (input == NULL) {
5194 	xmlFreeParserInputBuffer(buf);
5195 	xmlFreeParserCtxt(ctxt);
5196 	return(NULL);
5197     }
5198 
5199     input->filename = NULL;
5200     input->buf = buf;
5201     xmlBufResetInput(buf->buffer, input);
5202 
5203     inputPush(ctxt, input);
5204     return(ctxt);
5205 }
5206 
5207 /**
5208  * htmlCreateDocParserCtxt:
5209  * @cur:  a pointer to an array of xmlChar
5210  * @encoding:  a free form C string describing the HTML document encoding, or NULL
5211  *
5212  * Create a parser context for an HTML document.
5213  *
5214  * TODO: check the need to add encoding handling there
5215  *
5216  * Returns the new parser context or NULL
5217  */
5218 static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar * cur,const char * encoding)5219 htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5220     int len;
5221     htmlParserCtxtPtr ctxt;
5222 
5223     if (cur == NULL)
5224 	return(NULL);
5225     len = xmlStrlen(cur);
5226     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5227     if (ctxt == NULL)
5228 	return(NULL);
5229 
5230     if (encoding != NULL) {
5231 	xmlCharEncoding enc;
5232 	xmlCharEncodingHandlerPtr handler;
5233 
5234 	if (ctxt->input->encoding != NULL)
5235 	    xmlFree((xmlChar *) ctxt->input->encoding);
5236 	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5237 
5238 	enc = xmlParseCharEncoding(encoding);
5239 	/*
5240 	 * registered set of known encodings
5241 	 */
5242 	if (enc != XML_CHAR_ENCODING_ERROR) {
5243 	    xmlSwitchEncoding(ctxt, enc);
5244 	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5245 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5246 		             "Unsupported encoding %s\n",
5247 			     (const xmlChar *) encoding, NULL);
5248 	    }
5249 	} else {
5250 	    /*
5251 	     * fallback for unknown encodings
5252 	     */
5253 	    handler = xmlFindCharEncodingHandler((const char *) encoding);
5254 	    if (handler != NULL) {
5255 		xmlSwitchToEncoding(ctxt, handler);
5256 	    } else {
5257 		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5258 		             "Unsupported encoding %s\n",
5259 			     (const xmlChar *) encoding, NULL);
5260 	    }
5261 	}
5262     }
5263     return(ctxt);
5264 }
5265 
5266 #ifdef LIBXML_PUSH_ENABLED
5267 /************************************************************************
5268  *									*
5269  *	Progressive parsing interfaces				*
5270  *									*
5271  ************************************************************************/
5272 
5273 /**
5274  * htmlParseLookupSequence:
5275  * @ctxt:  an HTML parser context
5276  * @first:  the first char to lookup
5277  * @next:  the next char to lookup or zero
5278  * @third:  the next char to lookup or zero
5279  * @ignoreattrval: skip over attribute values
5280  *
5281  * Try to find if a sequence (first, next, third) or  just (first next) or
5282  * (first) is available in the input stream.
5283  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5284  * to avoid rescanning sequences of bytes, it DOES change the state of the
5285  * parser, do not use liberally.
5286  * This is basically similar to xmlParseLookupSequence()
5287  *
5288  * Returns the index to the current parsing point if the full sequence
5289  *      is available, -1 otherwise.
5290  */
5291 static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt,xmlChar first,xmlChar next,xmlChar third,int ignoreattrval)5292 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5293                         xmlChar next, xmlChar third, int ignoreattrval)
5294 {
5295     int base, len;
5296     htmlParserInputPtr in;
5297     const xmlChar *buf;
5298     int invalue = 0;
5299     char valdellim = 0x0;
5300 
5301     in = ctxt->input;
5302     if (in == NULL)
5303         return (-1);
5304 
5305     base = in->cur - in->base;
5306     if (base < 0)
5307         return (-1);
5308 
5309     if (ctxt->checkIndex > base) {
5310         base = ctxt->checkIndex;
5311         /* Abuse hasPErefs member to restore current state. */
5312         invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5313     }
5314 
5315     if (in->buf == NULL) {
5316         buf = in->base;
5317         len = in->length;
5318     } else {
5319         buf = xmlBufContent(in->buf->buffer);
5320         len = xmlBufUse(in->buf->buffer);
5321     }
5322 
5323     /* take into account the sequence length */
5324     if (third)
5325         len -= 2;
5326     else if (next)
5327         len--;
5328     for (; base < len; base++) {
5329         if (ignoreattrval) {
5330             if (buf[base] == '"' || buf[base] == '\'') {
5331                 if (invalue) {
5332                     if (buf[base] == valdellim) {
5333                         invalue = 0;
5334                         continue;
5335                     }
5336                 } else {
5337                     valdellim = buf[base];
5338                     invalue = 1;
5339                     continue;
5340                 }
5341             } else if (invalue) {
5342                 continue;
5343             }
5344         }
5345         if (buf[base] == first) {
5346             if (third != 0) {
5347                 if ((buf[base + 1] != next) || (buf[base + 2] != third))
5348                     continue;
5349             } else if (next != 0) {
5350                 if (buf[base + 1] != next)
5351                     continue;
5352             }
5353             ctxt->checkIndex = 0;
5354 #ifdef DEBUG_PUSH
5355             if (next == 0)
5356                 xmlGenericError(xmlGenericErrorContext,
5357                                 "HPP: lookup '%c' found at %d\n",
5358                                 first, base);
5359             else if (third == 0)
5360                 xmlGenericError(xmlGenericErrorContext,
5361                                 "HPP: lookup '%c%c' found at %d\n",
5362                                 first, next, base);
5363             else
5364                 xmlGenericError(xmlGenericErrorContext,
5365                                 "HPP: lookup '%c%c%c' found at %d\n",
5366                                 first, next, third, base);
5367 #endif
5368             return (base - (in->cur - in->base));
5369         }
5370     }
5371     ctxt->checkIndex = base;
5372     /* Abuse hasPErefs member to track current state. */
5373     if (invalue)
5374         ctxt->hasPErefs |= 1;
5375     else
5376         ctxt->hasPErefs &= ~1;
5377 #ifdef DEBUG_PUSH
5378     if (next == 0)
5379         xmlGenericError(xmlGenericErrorContext,
5380                         "HPP: lookup '%c' failed\n", first);
5381     else if (third == 0)
5382         xmlGenericError(xmlGenericErrorContext,
5383                         "HPP: lookup '%c%c' failed\n", first, next);
5384     else
5385         xmlGenericError(xmlGenericErrorContext,
5386                         "HPP: lookup '%c%c%c' failed\n", first, next,
5387                         third);
5388 #endif
5389     return (-1);
5390 }
5391 
5392 /**
5393  * htmlParseLookupCommentEnd:
5394  * @ctxt: an HTML parser context
5395  *
5396  * Try to find a comment end tag in the input stream
5397  * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5398  * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5399  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5400  * to avoid rescanning sequences of bytes, it DOES change the state of the
5401  * parser, do not use liberally.
5402  * This wraps to htmlParseLookupSequence()
5403  *
5404  * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5405  */
5406 static int
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)5407 htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5408 {
5409     int mark = 0;
5410     int cur = CUR_PTR - BASE_PTR;
5411 
5412     while (mark >= 0) {
5413 	mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5414 	if ((mark < 0) ||
5415 	    (NXT(mark+2) == '>') ||
5416 	    ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5417 	    return mark;
5418 	}
5419 	ctxt->checkIndex = cur + mark + 1;
5420     }
5421     return mark;
5422 }
5423 
5424 
5425 /**
5426  * htmlParseTryOrFinish:
5427  * @ctxt:  an HTML parser context
5428  * @terminate:  last chunk indicator
5429  *
5430  * Try to progress on parsing
5431  *
5432  * Returns zero if no parsing was possible
5433  */
5434 static int
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt,int terminate)5435 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5436     int ret = 0;
5437     htmlParserInputPtr in;
5438     ptrdiff_t avail = 0;
5439     xmlChar cur, next;
5440 
5441     htmlParserNodeInfo node_info;
5442 
5443 #ifdef DEBUG_PUSH
5444     switch (ctxt->instate) {
5445 	case XML_PARSER_EOF:
5446 	    xmlGenericError(xmlGenericErrorContext,
5447 		    "HPP: try EOF\n"); break;
5448 	case XML_PARSER_START:
5449 	    xmlGenericError(xmlGenericErrorContext,
5450 		    "HPP: try START\n"); break;
5451 	case XML_PARSER_MISC:
5452 	    xmlGenericError(xmlGenericErrorContext,
5453 		    "HPP: try MISC\n");break;
5454 	case XML_PARSER_COMMENT:
5455 	    xmlGenericError(xmlGenericErrorContext,
5456 		    "HPP: try COMMENT\n");break;
5457 	case XML_PARSER_PROLOG:
5458 	    xmlGenericError(xmlGenericErrorContext,
5459 		    "HPP: try PROLOG\n");break;
5460 	case XML_PARSER_START_TAG:
5461 	    xmlGenericError(xmlGenericErrorContext,
5462 		    "HPP: try START_TAG\n");break;
5463 	case XML_PARSER_CONTENT:
5464 	    xmlGenericError(xmlGenericErrorContext,
5465 		    "HPP: try CONTENT\n");break;
5466 	case XML_PARSER_CDATA_SECTION:
5467 	    xmlGenericError(xmlGenericErrorContext,
5468 		    "HPP: try CDATA_SECTION\n");break;
5469 	case XML_PARSER_END_TAG:
5470 	    xmlGenericError(xmlGenericErrorContext,
5471 		    "HPP: try END_TAG\n");break;
5472 	case XML_PARSER_ENTITY_DECL:
5473 	    xmlGenericError(xmlGenericErrorContext,
5474 		    "HPP: try ENTITY_DECL\n");break;
5475 	case XML_PARSER_ENTITY_VALUE:
5476 	    xmlGenericError(xmlGenericErrorContext,
5477 		    "HPP: try ENTITY_VALUE\n");break;
5478 	case XML_PARSER_ATTRIBUTE_VALUE:
5479 	    xmlGenericError(xmlGenericErrorContext,
5480 		    "HPP: try ATTRIBUTE_VALUE\n");break;
5481 	case XML_PARSER_DTD:
5482 	    xmlGenericError(xmlGenericErrorContext,
5483 		    "HPP: try DTD\n");break;
5484 	case XML_PARSER_EPILOG:
5485 	    xmlGenericError(xmlGenericErrorContext,
5486 		    "HPP: try EPILOG\n");break;
5487 	case XML_PARSER_PI:
5488 	    xmlGenericError(xmlGenericErrorContext,
5489 		    "HPP: try PI\n");break;
5490 	case XML_PARSER_SYSTEM_LITERAL:
5491 	    xmlGenericError(xmlGenericErrorContext,
5492 		    "HPP: try SYSTEM_LITERAL\n");break;
5493     }
5494 #endif
5495 
5496     while (1) {
5497 
5498 	in = ctxt->input;
5499 	if (in == NULL) break;
5500 	if (in->buf == NULL)
5501 	    avail = in->length - (in->cur - in->base);
5502 	else
5503 	    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5504                     (in->cur - in->base);
5505 	if ((avail == 0) && (terminate)) {
5506 	    htmlAutoCloseOnEnd(ctxt);
5507 	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5508 		/*
5509 		 * SAX: end of the document processing.
5510 		 */
5511 		ctxt->instate = XML_PARSER_EOF;
5512 		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5513 		    ctxt->sax->endDocument(ctxt->userData);
5514 	    }
5515 	}
5516         if (avail < 1)
5517 	    goto done;
5518         /*
5519          * This is done to make progress and avoid an infinite loop
5520          * if a parsing attempt was aborted by hitting a NUL byte. After
5521          * changing htmlCurrentChar, this probably isn't necessary anymore.
5522          * We should consider removing this check.
5523          */
5524 	cur = in->cur[0];
5525 	if (cur == 0) {
5526 	    SKIP(1);
5527 	    continue;
5528 	}
5529 
5530         switch (ctxt->instate) {
5531             case XML_PARSER_EOF:
5532 	        /*
5533 		 * Document parsing is done !
5534 		 */
5535 	        goto done;
5536             case XML_PARSER_START:
5537 	        /*
5538 		 * Very first chars read from the document flow.
5539 		 */
5540 		cur = in->cur[0];
5541 		if (IS_BLANK_CH(cur)) {
5542 		    SKIP_BLANKS;
5543 		    if (in->buf == NULL)
5544 			avail = in->length - (in->cur - in->base);
5545 		    else
5546 			avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5547                                 (in->cur - in->base);
5548 		}
5549 		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5550 		    ctxt->sax->setDocumentLocator(ctxt->userData,
5551 						  &xmlDefaultSAXLocator);
5552 		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5553 	            (!ctxt->disableSAX))
5554 		    ctxt->sax->startDocument(ctxt->userData);
5555 
5556 		cur = in->cur[0];
5557 		next = in->cur[1];
5558 		if ((cur == '<') && (next == '!') &&
5559 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5560 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5561 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5562 		    (UPP(8) == 'E')) {
5563 		    if ((!terminate) &&
5564 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5565 			goto done;
5566 #ifdef DEBUG_PUSH
5567 		    xmlGenericError(xmlGenericErrorContext,
5568 			    "HPP: Parsing internal subset\n");
5569 #endif
5570 		    htmlParseDocTypeDecl(ctxt);
5571 		    ctxt->instate = XML_PARSER_PROLOG;
5572 #ifdef DEBUG_PUSH
5573 		    xmlGenericError(xmlGenericErrorContext,
5574 			    "HPP: entering PROLOG\n");
5575 #endif
5576                 } else {
5577 		    ctxt->instate = XML_PARSER_MISC;
5578 #ifdef DEBUG_PUSH
5579 		    xmlGenericError(xmlGenericErrorContext,
5580 			    "HPP: entering MISC\n");
5581 #endif
5582 		}
5583 		break;
5584             case XML_PARSER_MISC:
5585 		SKIP_BLANKS;
5586 		if (in->buf == NULL)
5587 		    avail = in->length - (in->cur - in->base);
5588 		else
5589 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5590                             (in->cur - in->base);
5591 		/*
5592 		 * no chars in buffer
5593 		 */
5594 		if (avail < 1)
5595 		    goto done;
5596 		/*
5597 		 * not enough chars in buffer
5598 		 */
5599 		if (avail < 2) {
5600 		    if (!terminate)
5601 			goto done;
5602 		    else
5603 			next = ' ';
5604 		} else {
5605 		    next = in->cur[1];
5606 		}
5607 		cur = in->cur[0];
5608 	        if ((cur == '<') && (next == '!') &&
5609 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5610 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5611 			goto done;
5612 #ifdef DEBUG_PUSH
5613 		    xmlGenericError(xmlGenericErrorContext,
5614 			    "HPP: Parsing Comment\n");
5615 #endif
5616 		    htmlParseComment(ctxt);
5617 		    ctxt->instate = XML_PARSER_MISC;
5618 	        } else if ((cur == '<') && (next == '?')) {
5619 		    if ((!terminate) &&
5620 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5621 			goto done;
5622 #ifdef DEBUG_PUSH
5623 		    xmlGenericError(xmlGenericErrorContext,
5624 			    "HPP: Parsing PI\n");
5625 #endif
5626 		    htmlParsePI(ctxt);
5627 		    ctxt->instate = XML_PARSER_MISC;
5628 		} else if ((cur == '<') && (next == '!') &&
5629 		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5630 		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5631 		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5632 		    (UPP(8) == 'E')) {
5633 		    if ((!terminate) &&
5634 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5635 			goto done;
5636 #ifdef DEBUG_PUSH
5637 		    xmlGenericError(xmlGenericErrorContext,
5638 			    "HPP: Parsing internal subset\n");
5639 #endif
5640 		    htmlParseDocTypeDecl(ctxt);
5641 		    ctxt->instate = XML_PARSER_PROLOG;
5642 #ifdef DEBUG_PUSH
5643 		    xmlGenericError(xmlGenericErrorContext,
5644 			    "HPP: entering PROLOG\n");
5645 #endif
5646 		} else if ((cur == '<') && (next == '!') &&
5647 		           (avail < 9)) {
5648 		    goto done;
5649 		} else {
5650 		    ctxt->instate = XML_PARSER_CONTENT;
5651 #ifdef DEBUG_PUSH
5652 		    xmlGenericError(xmlGenericErrorContext,
5653 			    "HPP: entering START_TAG\n");
5654 #endif
5655 		}
5656 		break;
5657             case XML_PARSER_PROLOG:
5658 		SKIP_BLANKS;
5659 		if (in->buf == NULL)
5660 		    avail = in->length - (in->cur - in->base);
5661 		else
5662 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5663                             (in->cur - in->base);
5664 		if (avail < 2)
5665 		    goto done;
5666 		cur = in->cur[0];
5667 		next = in->cur[1];
5668 		if ((cur == '<') && (next == '!') &&
5669 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5670 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5671 			goto done;
5672 #ifdef DEBUG_PUSH
5673 		    xmlGenericError(xmlGenericErrorContext,
5674 			    "HPP: Parsing Comment\n");
5675 #endif
5676 		    htmlParseComment(ctxt);
5677 		    ctxt->instate = XML_PARSER_PROLOG;
5678 	        } else if ((cur == '<') && (next == '?')) {
5679 		    if ((!terminate) &&
5680 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5681 			goto done;
5682 #ifdef DEBUG_PUSH
5683 		    xmlGenericError(xmlGenericErrorContext,
5684 			    "HPP: Parsing PI\n");
5685 #endif
5686 		    htmlParsePI(ctxt);
5687 		    ctxt->instate = XML_PARSER_PROLOG;
5688 		} else if ((cur == '<') && (next == '!') &&
5689 		           (avail < 4)) {
5690 		    goto done;
5691 		} else {
5692 		    ctxt->instate = XML_PARSER_CONTENT;
5693 #ifdef DEBUG_PUSH
5694 		    xmlGenericError(xmlGenericErrorContext,
5695 			    "HPP: entering START_TAG\n");
5696 #endif
5697 		}
5698 		break;
5699             case XML_PARSER_EPILOG:
5700 		if (in->buf == NULL)
5701 		    avail = in->length - (in->cur - in->base);
5702 		else
5703 		    avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5704                             (in->cur - in->base);
5705 		if (avail < 1)
5706 		    goto done;
5707 		cur = in->cur[0];
5708 		if (IS_BLANK_CH(cur)) {
5709 		    htmlParseCharData(ctxt);
5710 		    goto done;
5711 		}
5712 		if (avail < 2)
5713 		    goto done;
5714 		next = in->cur[1];
5715 	        if ((cur == '<') && (next == '!') &&
5716 		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5717 		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5718 			goto done;
5719 #ifdef DEBUG_PUSH
5720 		    xmlGenericError(xmlGenericErrorContext,
5721 			    "HPP: Parsing Comment\n");
5722 #endif
5723 		    htmlParseComment(ctxt);
5724 		    ctxt->instate = XML_PARSER_EPILOG;
5725 	        } else if ((cur == '<') && (next == '?')) {
5726 		    if ((!terminate) &&
5727 		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5728 			goto done;
5729 #ifdef DEBUG_PUSH
5730 		    xmlGenericError(xmlGenericErrorContext,
5731 			    "HPP: Parsing PI\n");
5732 #endif
5733 		    htmlParsePI(ctxt);
5734 		    ctxt->instate = XML_PARSER_EPILOG;
5735 		} else if ((cur == '<') && (next == '!') &&
5736 		           (avail < 4)) {
5737 		    goto done;
5738 		} else {
5739 		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5740 		    ctxt->wellFormed = 0;
5741 		    ctxt->instate = XML_PARSER_EOF;
5742 #ifdef DEBUG_PUSH
5743 		    xmlGenericError(xmlGenericErrorContext,
5744 			    "HPP: entering EOF\n");
5745 #endif
5746 		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5747 			ctxt->sax->endDocument(ctxt->userData);
5748 		    goto done;
5749 		}
5750 		break;
5751             case XML_PARSER_START_TAG: {
5752 	        const xmlChar *name;
5753 		int failed;
5754 		const htmlElemDesc * info;
5755 
5756 		/*
5757 		 * no chars in buffer
5758 		 */
5759 		if (avail < 1)
5760 		    goto done;
5761 		/*
5762 		 * not enough chars in buffer
5763 		 */
5764 		if (avail < 2) {
5765 		    if (!terminate)
5766 			goto done;
5767 		    else
5768 			next = ' ';
5769 		} else {
5770 		    next = in->cur[1];
5771 		}
5772 		cur = in->cur[0];
5773 	        if (cur != '<') {
5774 		    ctxt->instate = XML_PARSER_CONTENT;
5775 #ifdef DEBUG_PUSH
5776 		    xmlGenericError(xmlGenericErrorContext,
5777 			    "HPP: entering CONTENT\n");
5778 #endif
5779 		    break;
5780 		}
5781 		if (next == '/') {
5782 		    ctxt->instate = XML_PARSER_END_TAG;
5783 		    ctxt->checkIndex = 0;
5784 #ifdef DEBUG_PUSH
5785 		    xmlGenericError(xmlGenericErrorContext,
5786 			    "HPP: entering END_TAG\n");
5787 #endif
5788 		    break;
5789 		}
5790 		if ((!terminate) &&
5791 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5792 		    goto done;
5793 
5794                 /* Capture start position */
5795 	        if (ctxt->record_info) {
5796 	             node_info.begin_pos = ctxt->input->consumed +
5797 	                                (CUR_PTR - ctxt->input->base);
5798 	             node_info.begin_line = ctxt->input->line;
5799 	        }
5800 
5801 
5802 		failed = htmlParseStartTag(ctxt);
5803 		name = ctxt->name;
5804 		if ((failed == -1) ||
5805 		    (name == NULL)) {
5806 		    if (CUR == '>')
5807 			NEXT;
5808 		    break;
5809 		}
5810 
5811 		/*
5812 		 * Lookup the info for that element.
5813 		 */
5814 		info = htmlTagLookup(name);
5815 		if (info == NULL) {
5816 		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5817 		                 "Tag %s invalid\n", name, NULL);
5818 		}
5819 
5820 		/*
5821 		 * Check for an Empty Element labeled the XML/SGML way
5822 		 */
5823 		if ((CUR == '/') && (NXT(1) == '>')) {
5824 		    SKIP(2);
5825 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5826 			ctxt->sax->endElement(ctxt->userData, name);
5827 		    htmlnamePop(ctxt);
5828 		    ctxt->instate = XML_PARSER_CONTENT;
5829 #ifdef DEBUG_PUSH
5830 		    xmlGenericError(xmlGenericErrorContext,
5831 			    "HPP: entering CONTENT\n");
5832 #endif
5833 		    break;
5834 		}
5835 
5836 		if (CUR == '>') {
5837 		    NEXT;
5838 		} else {
5839 		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5840 		                 "Couldn't find end of Start Tag %s\n",
5841 				 name, NULL);
5842 
5843 		    /*
5844 		     * end of parsing of this node.
5845 		     */
5846 		    if (xmlStrEqual(name, ctxt->name)) {
5847 			nodePop(ctxt);
5848 			htmlnamePop(ctxt);
5849 		    }
5850 
5851 		    if (ctxt->record_info)
5852 		        htmlNodeInfoPush(ctxt, &node_info);
5853 
5854 		    ctxt->instate = XML_PARSER_CONTENT;
5855 #ifdef DEBUG_PUSH
5856 		    xmlGenericError(xmlGenericErrorContext,
5857 			    "HPP: entering CONTENT\n");
5858 #endif
5859 		    break;
5860 		}
5861 
5862 		/*
5863 		 * Check for an Empty Element from DTD definition
5864 		 */
5865 		if ((info != NULL) && (info->empty)) {
5866 		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5867 			ctxt->sax->endElement(ctxt->userData, name);
5868 		    htmlnamePop(ctxt);
5869 		}
5870 
5871                 if (ctxt->record_info)
5872 	            htmlNodeInfoPush(ctxt, &node_info);
5873 
5874 		ctxt->instate = XML_PARSER_CONTENT;
5875 #ifdef DEBUG_PUSH
5876 		xmlGenericError(xmlGenericErrorContext,
5877 			"HPP: entering CONTENT\n");
5878 #endif
5879                 break;
5880 	    }
5881             case XML_PARSER_CONTENT: {
5882 		xmlChar chr[2] = { 0, 0 };
5883 
5884                 /*
5885 		 * Handle preparsed entities and charRef
5886 		 */
5887 		if (ctxt->token != 0) {
5888 		    chr[0] = (xmlChar) ctxt->token;
5889 		    htmlCheckParagraph(ctxt);
5890 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5891 			ctxt->sax->characters(ctxt->userData, chr, 1);
5892 		    ctxt->token = 0;
5893 		    ctxt->checkIndex = 0;
5894 		}
5895 		if ((avail == 1) && (terminate)) {
5896 		    cur = in->cur[0];
5897 		    if ((cur != '<') && (cur != '&')) {
5898 			if (ctxt->sax != NULL) {
5899                             chr[0] = cur;
5900 			    if (IS_BLANK_CH(cur)) {
5901 				if (ctxt->keepBlanks) {
5902 				    if (ctxt->sax->characters != NULL)
5903 					ctxt->sax->characters(
5904 						ctxt->userData, chr, 1);
5905 				} else {
5906 				    if (ctxt->sax->ignorableWhitespace != NULL)
5907 					ctxt->sax->ignorableWhitespace(
5908 						ctxt->userData, chr, 1);
5909 				}
5910 			    } else {
5911 				htmlCheckParagraph(ctxt);
5912 				if (ctxt->sax->characters != NULL)
5913 				    ctxt->sax->characters(
5914 					    ctxt->userData, chr, 1);
5915 			    }
5916 			}
5917 			ctxt->token = 0;
5918 			ctxt->checkIndex = 0;
5919 			in->cur++;
5920 			break;
5921 		    }
5922 		}
5923 		if (avail < 2)
5924 		    goto done;
5925 		cur = in->cur[0];
5926 		next = in->cur[1];
5927 		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5928 		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5929 		    /*
5930 		     * Handle SCRIPT/STYLE separately
5931 		     */
5932 		    if (!terminate) {
5933 		        int idx;
5934 			xmlChar val;
5935 
5936 			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5937 			if (idx < 0)
5938 			    goto done;
5939 		        val = in->cur[idx + 2];
5940 			if (val == 0) /* bad cut of input */
5941 			    goto done;
5942 		    }
5943 		    htmlParseScript(ctxt);
5944 		    if ((cur == '<') && (next == '/')) {
5945 			ctxt->instate = XML_PARSER_END_TAG;
5946 			ctxt->checkIndex = 0;
5947 #ifdef DEBUG_PUSH
5948 			xmlGenericError(xmlGenericErrorContext,
5949 				"HPP: entering END_TAG\n");
5950 #endif
5951 			break;
5952 		    }
5953 		} else {
5954 		    /*
5955 		     * Sometimes DOCTYPE arrives in the middle of the document
5956 		     */
5957 		    if ((cur == '<') && (next == '!') &&
5958 			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5959 			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5960 			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5961 			(UPP(8) == 'E')) {
5962 			if ((!terminate) &&
5963 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5964 			    goto done;
5965 			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5966 			             "Misplaced DOCTYPE declaration\n",
5967 				     BAD_CAST "DOCTYPE" , NULL);
5968 			htmlParseDocTypeDecl(ctxt);
5969 		    } else if ((cur == '<') && (next == '!') &&
5970 			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5971 			if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5972 			    goto done;
5973 #ifdef DEBUG_PUSH
5974 			xmlGenericError(xmlGenericErrorContext,
5975 				"HPP: Parsing Comment\n");
5976 #endif
5977 			htmlParseComment(ctxt);
5978 			ctxt->instate = XML_PARSER_CONTENT;
5979 		    } else if ((cur == '<') && (next == '?')) {
5980 			if ((!terminate) &&
5981 			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5982 			    goto done;
5983 #ifdef DEBUG_PUSH
5984 			xmlGenericError(xmlGenericErrorContext,
5985 				"HPP: Parsing PI\n");
5986 #endif
5987 			htmlParsePI(ctxt);
5988 			ctxt->instate = XML_PARSER_CONTENT;
5989 		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5990 			goto done;
5991 		    } else if ((cur == '<') && (next == '/')) {
5992 			ctxt->instate = XML_PARSER_END_TAG;
5993 			ctxt->checkIndex = 0;
5994 #ifdef DEBUG_PUSH
5995 			xmlGenericError(xmlGenericErrorContext,
5996 				"HPP: entering END_TAG\n");
5997 #endif
5998 			break;
5999 		    } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6000                         if ((!terminate) && (next == 0))
6001                             goto done;
6002                         ctxt->instate = XML_PARSER_START_TAG;
6003                         ctxt->checkIndex = 0;
6004 #ifdef DEBUG_PUSH
6005                         xmlGenericError(xmlGenericErrorContext,
6006                                 "HPP: entering START_TAG\n");
6007 #endif
6008 			break;
6009 		    } else if (cur == '<') {
6010                         if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6011                             (ctxt->sax->characters != NULL))
6012 			    ctxt->sax->characters(ctxt->userData,
6013 						  BAD_CAST "<", 1);
6014                         NEXT;
6015 		    } else {
6016 		        /*
6017 			 * check that the text sequence is complete
6018 			 * before handing out the data to the parser
6019 			 * to avoid problems with erroneous end of
6020 			 * data detection.
6021 			 */
6022 			if ((!terminate) &&
6023                             (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6024 			    goto done;
6025 			ctxt->checkIndex = 0;
6026 #ifdef DEBUG_PUSH
6027 			xmlGenericError(xmlGenericErrorContext,
6028 				"HPP: Parsing char data\n");
6029 #endif
6030                         while ((ctxt->instate != XML_PARSER_EOF) &&
6031                                (cur != '<') && (in->cur < in->end)) {
6032                             if (cur == '&') {
6033 			        htmlParseReference(ctxt);
6034                             } else {
6035 			        htmlParseCharData(ctxt);
6036                             }
6037                             cur = in->cur[0];
6038                         }
6039 		    }
6040 		}
6041 
6042 		break;
6043 	    }
6044             case XML_PARSER_END_TAG:
6045 		if (avail < 2)
6046 		    goto done;
6047 		if ((!terminate) &&
6048 		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6049 		    goto done;
6050 		htmlParseEndTag(ctxt);
6051 		if (ctxt->nameNr == 0) {
6052 		    ctxt->instate = XML_PARSER_EPILOG;
6053 		} else {
6054 		    ctxt->instate = XML_PARSER_CONTENT;
6055 		}
6056 		ctxt->checkIndex = 0;
6057 #ifdef DEBUG_PUSH
6058 		xmlGenericError(xmlGenericErrorContext,
6059 			"HPP: entering CONTENT\n");
6060 #endif
6061 	        break;
6062             case XML_PARSER_CDATA_SECTION:
6063 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6064 			"HPP: internal error, state == CDATA\n",
6065 			     NULL, NULL);
6066 		ctxt->instate = XML_PARSER_CONTENT;
6067 		ctxt->checkIndex = 0;
6068 #ifdef DEBUG_PUSH
6069 		xmlGenericError(xmlGenericErrorContext,
6070 			"HPP: entering CONTENT\n");
6071 #endif
6072 		break;
6073             case XML_PARSER_DTD:
6074 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6075 			"HPP: internal error, state == DTD\n",
6076 			     NULL, NULL);
6077 		ctxt->instate = XML_PARSER_CONTENT;
6078 		ctxt->checkIndex = 0;
6079 #ifdef DEBUG_PUSH
6080 		xmlGenericError(xmlGenericErrorContext,
6081 			"HPP: entering CONTENT\n");
6082 #endif
6083 		break;
6084             case XML_PARSER_COMMENT:
6085 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6086 			"HPP: internal error, state == COMMENT\n",
6087 			     NULL, NULL);
6088 		ctxt->instate = XML_PARSER_CONTENT;
6089 		ctxt->checkIndex = 0;
6090 #ifdef DEBUG_PUSH
6091 		xmlGenericError(xmlGenericErrorContext,
6092 			"HPP: entering CONTENT\n");
6093 #endif
6094 		break;
6095             case XML_PARSER_PI:
6096 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6097 			"HPP: internal error, state == PI\n",
6098 			     NULL, NULL);
6099 		ctxt->instate = XML_PARSER_CONTENT;
6100 		ctxt->checkIndex = 0;
6101 #ifdef DEBUG_PUSH
6102 		xmlGenericError(xmlGenericErrorContext,
6103 			"HPP: entering CONTENT\n");
6104 #endif
6105 		break;
6106             case XML_PARSER_ENTITY_DECL:
6107 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6108 			"HPP: internal error, state == ENTITY_DECL\n",
6109 			     NULL, NULL);
6110 		ctxt->instate = XML_PARSER_CONTENT;
6111 		ctxt->checkIndex = 0;
6112 #ifdef DEBUG_PUSH
6113 		xmlGenericError(xmlGenericErrorContext,
6114 			"HPP: entering CONTENT\n");
6115 #endif
6116 		break;
6117             case XML_PARSER_ENTITY_VALUE:
6118 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6119 			"HPP: internal error, state == ENTITY_VALUE\n",
6120 			     NULL, NULL);
6121 		ctxt->instate = XML_PARSER_CONTENT;
6122 		ctxt->checkIndex = 0;
6123 #ifdef DEBUG_PUSH
6124 		xmlGenericError(xmlGenericErrorContext,
6125 			"HPP: entering DTD\n");
6126 #endif
6127 		break;
6128             case XML_PARSER_ATTRIBUTE_VALUE:
6129 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6130 			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
6131 			     NULL, NULL);
6132 		ctxt->instate = XML_PARSER_START_TAG;
6133 		ctxt->checkIndex = 0;
6134 #ifdef DEBUG_PUSH
6135 		xmlGenericError(xmlGenericErrorContext,
6136 			"HPP: entering START_TAG\n");
6137 #endif
6138 		break;
6139 	    case XML_PARSER_SYSTEM_LITERAL:
6140 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6141 		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6142 			     NULL, NULL);
6143 		ctxt->instate = XML_PARSER_CONTENT;
6144 		ctxt->checkIndex = 0;
6145 #ifdef DEBUG_PUSH
6146 		xmlGenericError(xmlGenericErrorContext,
6147 			"HPP: entering CONTENT\n");
6148 #endif
6149 		break;
6150 	    case XML_PARSER_IGNORE:
6151 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6152 			"HPP: internal error, state == XML_PARSER_IGNORE\n",
6153 			     NULL, NULL);
6154 		ctxt->instate = XML_PARSER_CONTENT;
6155 		ctxt->checkIndex = 0;
6156 #ifdef DEBUG_PUSH
6157 		xmlGenericError(xmlGenericErrorContext,
6158 			"HPP: entering CONTENT\n");
6159 #endif
6160 		break;
6161 	    case XML_PARSER_PUBLIC_LITERAL:
6162 		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6163 			"HPP: internal error, state == XML_PARSER_LITERAL\n",
6164 			     NULL, NULL);
6165 		ctxt->instate = XML_PARSER_CONTENT;
6166 		ctxt->checkIndex = 0;
6167 #ifdef DEBUG_PUSH
6168 		xmlGenericError(xmlGenericErrorContext,
6169 			"HPP: entering CONTENT\n");
6170 #endif
6171 		break;
6172 
6173 	}
6174     }
6175 done:
6176     if ((avail == 0) && (terminate)) {
6177 	htmlAutoCloseOnEnd(ctxt);
6178 	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6179 	    /*
6180 	     * SAX: end of the document processing.
6181 	     */
6182 	    ctxt->instate = XML_PARSER_EOF;
6183 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6184 		ctxt->sax->endDocument(ctxt->userData);
6185 	}
6186     }
6187     if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6188 	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6189 	 (ctxt->instate == XML_PARSER_EPILOG))) {
6190 	xmlDtdPtr dtd;
6191 	dtd = xmlGetIntSubset(ctxt->myDoc);
6192 	if (dtd == NULL)
6193 	    ctxt->myDoc->intSubset =
6194 		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6195 		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6196 		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6197     }
6198 #ifdef DEBUG_PUSH
6199     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6200 #endif
6201     return(ret);
6202 }
6203 
6204 /**
6205  * htmlParseChunk:
6206  * @ctxt:  an HTML parser context
6207  * @chunk:  an char array
6208  * @size:  the size in byte of the chunk
6209  * @terminate:  last chunk indicator
6210  *
6211  * Parse a Chunk of memory
6212  *
6213  * Returns zero if no error, the xmlParserErrors otherwise.
6214  */
6215 int
htmlParseChunk(htmlParserCtxtPtr ctxt,const char * chunk,int size,int terminate)6216 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6217               int terminate) {
6218     if ((ctxt == NULL) || (ctxt->input == NULL)) {
6219 	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6220 		     "htmlParseChunk: context error\n", NULL, NULL);
6221 	return(XML_ERR_INTERNAL_ERROR);
6222     }
6223     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6224         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6225 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6226 	size_t cur = ctxt->input->cur - ctxt->input->base;
6227 	int res;
6228 
6229 	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6230         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6231 	if (res < 0) {
6232 	    ctxt->errNo = XML_PARSER_EOF;
6233 	    ctxt->disableSAX = 1;
6234 	    return (XML_PARSER_EOF);
6235 	}
6236 #ifdef DEBUG_PUSH
6237 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6238 #endif
6239 
6240 #if 0
6241 	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6242 	    htmlParseTryOrFinish(ctxt, terminate);
6243 #endif
6244     } else if (ctxt->instate != XML_PARSER_EOF) {
6245 	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6246 	    xmlParserInputBufferPtr in = ctxt->input->buf;
6247 	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
6248 		    (in->raw != NULL)) {
6249 		int nbchars;
6250 		size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6251 		size_t current = ctxt->input->cur - ctxt->input->base;
6252 
6253 		nbchars = xmlCharEncInput(in, terminate);
6254 		xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6255 		if (nbchars < 0) {
6256 		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6257 			         "encoder error\n", NULL, NULL);
6258 		    return(XML_ERR_INVALID_ENCODING);
6259 		}
6260 	    }
6261 	}
6262     }
6263     htmlParseTryOrFinish(ctxt, terminate);
6264     if (terminate) {
6265 	if ((ctxt->instate != XML_PARSER_EOF) &&
6266 	    (ctxt->instate != XML_PARSER_EPILOG) &&
6267 	    (ctxt->instate != XML_PARSER_MISC)) {
6268 	    ctxt->errNo = XML_ERR_DOCUMENT_END;
6269 	    ctxt->wellFormed = 0;
6270 	}
6271 	if (ctxt->instate != XML_PARSER_EOF) {
6272 	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6273 		ctxt->sax->endDocument(ctxt->userData);
6274 	}
6275 	ctxt->instate = XML_PARSER_EOF;
6276     }
6277     return((xmlParserErrors) ctxt->errNo);
6278 }
6279 
6280 /************************************************************************
6281  *									*
6282  *			User entry points				*
6283  *									*
6284  ************************************************************************/
6285 
6286 /**
6287  * htmlCreatePushParserCtxt:
6288  * @sax:  a SAX handler
6289  * @user_data:  The user data returned on SAX callbacks
6290  * @chunk:  a pointer to an array of chars
6291  * @size:  number of chars in the array
6292  * @filename:  an optional file name or URI
6293  * @enc:  an optional encoding
6294  *
6295  * Create a parser context for using the HTML parser in push mode
6296  * The value of @filename is used for fetching external entities
6297  * and error/warning reports.
6298  *
6299  * Returns the new parser context or NULL
6300  */
6301 htmlParserCtxtPtr
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,void * user_data,const char * chunk,int size,const char * filename,xmlCharEncoding enc)6302 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6303                          const char *chunk, int size, const char *filename,
6304 			 xmlCharEncoding enc) {
6305     htmlParserCtxtPtr ctxt;
6306     htmlParserInputPtr inputStream;
6307     xmlParserInputBufferPtr buf;
6308 
6309     xmlInitParser();
6310 
6311     buf = xmlAllocParserInputBuffer(enc);
6312     if (buf == NULL) return(NULL);
6313 
6314     ctxt = htmlNewParserCtxt();
6315     if (ctxt == NULL) {
6316 	xmlFreeParserInputBuffer(buf);
6317 	return(NULL);
6318     }
6319     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6320 	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6321     if (sax != NULL) {
6322 	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6323 	    xmlFree(ctxt->sax);
6324 	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6325 	if (ctxt->sax == NULL) {
6326 	    xmlFree(buf);
6327 	    xmlFree(ctxt);
6328 	    return(NULL);
6329 	}
6330 	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6331 	if (user_data != NULL)
6332 	    ctxt->userData = user_data;
6333     }
6334     if (filename == NULL) {
6335 	ctxt->directory = NULL;
6336     } else {
6337         ctxt->directory = xmlParserGetDirectory(filename);
6338     }
6339 
6340     inputStream = htmlNewInputStream(ctxt);
6341     if (inputStream == NULL) {
6342 	xmlFreeParserCtxt(ctxt);
6343 	xmlFree(buf);
6344 	return(NULL);
6345     }
6346 
6347     if (filename == NULL)
6348 	inputStream->filename = NULL;
6349     else
6350 	inputStream->filename = (char *)
6351 	    xmlCanonicPath((const xmlChar *) filename);
6352     inputStream->buf = buf;
6353     xmlBufResetInput(buf->buffer, inputStream);
6354 
6355     inputPush(ctxt, inputStream);
6356 
6357     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6358         (ctxt->input->buf != NULL))  {
6359 	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6360 	size_t cur = ctxt->input->cur - ctxt->input->base;
6361 
6362 	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6363 
6364         xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6365 #ifdef DEBUG_PUSH
6366 	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6367 #endif
6368     }
6369     ctxt->progressive = 1;
6370 
6371     return(ctxt);
6372 }
6373 #endif /* LIBXML_PUSH_ENABLED */
6374 
6375 /**
6376  * htmlSAXParseDoc:
6377  * @cur:  a pointer to an array of xmlChar
6378  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6379  * @sax:  the SAX handler block
6380  * @userData: if using SAX, this pointer will be provided on callbacks.
6381  *
6382  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6383  * to handle parse events. If sax is NULL, fallback to the default DOM
6384  * behavior and return a tree.
6385  *
6386  * Returns the resulting document tree unless SAX is NULL or the document is
6387  *     not well formed.
6388  */
6389 
6390 htmlDocPtr
htmlSAXParseDoc(const xmlChar * cur,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6391 htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6392                 htmlSAXHandlerPtr sax, void *userData) {
6393     htmlDocPtr ret;
6394     htmlParserCtxtPtr ctxt;
6395 
6396     xmlInitParser();
6397 
6398     if (cur == NULL) return(NULL);
6399 
6400 
6401     ctxt = htmlCreateDocParserCtxt(cur, encoding);
6402     if (ctxt == NULL) return(NULL);
6403     if (sax != NULL) {
6404         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6405         ctxt->sax = sax;
6406         ctxt->userData = userData;
6407     }
6408 
6409     htmlParseDocument(ctxt);
6410     ret = ctxt->myDoc;
6411     if (sax != NULL) {
6412 	ctxt->sax = NULL;
6413 	ctxt->userData = NULL;
6414     }
6415     htmlFreeParserCtxt(ctxt);
6416 
6417     return(ret);
6418 }
6419 
6420 /**
6421  * htmlParseDoc:
6422  * @cur:  a pointer to an array of xmlChar
6423  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6424  *
6425  * parse an HTML in-memory document and build a tree.
6426  *
6427  * Returns the resulting document tree
6428  */
6429 
6430 htmlDocPtr
htmlParseDoc(const xmlChar * cur,const char * encoding)6431 htmlParseDoc(const xmlChar *cur, const char *encoding) {
6432     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6433 }
6434 
6435 
6436 /**
6437  * htmlCreateFileParserCtxt:
6438  * @filename:  the filename
6439  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6440  *
6441  * Create a parser context for a file content.
6442  * Automatic support for ZLIB/Compress compressed document is provided
6443  * by default if found at compile-time.
6444  *
6445  * Returns the new parser context or NULL
6446  */
6447 htmlParserCtxtPtr
htmlCreateFileParserCtxt(const char * filename,const char * encoding)6448 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6449 {
6450     htmlParserCtxtPtr ctxt;
6451     htmlParserInputPtr inputStream;
6452     char *canonicFilename;
6453     /* htmlCharEncoding enc; */
6454     xmlChar *content, *content_line = (xmlChar *) "charset=";
6455 
6456     if (filename == NULL)
6457         return(NULL);
6458 
6459     ctxt = htmlNewParserCtxt();
6460     if (ctxt == NULL) {
6461 	return(NULL);
6462     }
6463     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6464     if (canonicFilename == NULL) {
6465 #ifdef LIBXML_SAX1_ENABLED
6466 	if (xmlDefaultSAXHandler.error != NULL) {
6467 	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6468 	}
6469 #endif
6470 	xmlFreeParserCtxt(ctxt);
6471 	return(NULL);
6472     }
6473 
6474     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6475     xmlFree(canonicFilename);
6476     if (inputStream == NULL) {
6477 	xmlFreeParserCtxt(ctxt);
6478 	return(NULL);
6479     }
6480 
6481     inputPush(ctxt, inputStream);
6482 
6483     /* set encoding */
6484     if (encoding) {
6485         size_t l = strlen(encoding);
6486 
6487 	if (l < 1000) {
6488 	    content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6489 	    if (content) {
6490 		strcpy ((char *)content, (char *)content_line);
6491 		strcat ((char *)content, (char *)encoding);
6492 		htmlCheckEncoding (ctxt, content);
6493 		xmlFree (content);
6494 	    }
6495 	}
6496     }
6497 
6498     return(ctxt);
6499 }
6500 
6501 /**
6502  * htmlSAXParseFile:
6503  * @filename:  the filename
6504  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6505  * @sax:  the SAX handler block
6506  * @userData: if using SAX, this pointer will be provided on callbacks.
6507  *
6508  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6509  * compressed document is provided by default if found at compile-time.
6510  * It use the given SAX function block to handle the parsing callback.
6511  * If sax is NULL, fallback to the default DOM tree building routines.
6512  *
6513  * Returns the resulting document tree unless SAX is NULL or the document is
6514  *     not well formed.
6515  */
6516 
6517 htmlDocPtr
htmlSAXParseFile(const char * filename,const char * encoding,htmlSAXHandlerPtr sax,void * userData)6518 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6519                  void *userData) {
6520     htmlDocPtr ret;
6521     htmlParserCtxtPtr ctxt;
6522     htmlSAXHandlerPtr oldsax = NULL;
6523 
6524     xmlInitParser();
6525 
6526     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6527     if (ctxt == NULL) return(NULL);
6528     if (sax != NULL) {
6529 	oldsax = ctxt->sax;
6530         ctxt->sax = sax;
6531         ctxt->userData = userData;
6532     }
6533 
6534     htmlParseDocument(ctxt);
6535 
6536     ret = ctxt->myDoc;
6537     if (sax != NULL) {
6538         ctxt->sax = oldsax;
6539         ctxt->userData = NULL;
6540     }
6541     htmlFreeParserCtxt(ctxt);
6542 
6543     return(ret);
6544 }
6545 
6546 /**
6547  * htmlParseFile:
6548  * @filename:  the filename
6549  * @encoding:  a free form C string describing the HTML document encoding, or NULL
6550  *
6551  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6552  * compressed document is provided by default if found at compile-time.
6553  *
6554  * Returns the resulting document tree
6555  */
6556 
6557 htmlDocPtr
htmlParseFile(const char * filename,const char * encoding)6558 htmlParseFile(const char *filename, const char *encoding) {
6559     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6560 }
6561 
6562 /**
6563  * htmlHandleOmittedElem:
6564  * @val:  int 0 or 1
6565  *
6566  * Set and return the previous value for handling HTML omitted tags.
6567  *
6568  * Returns the last value for 0 for no handling, 1 for auto insertion.
6569  */
6570 
6571 int
htmlHandleOmittedElem(int val)6572 htmlHandleOmittedElem(int val) {
6573     int old = htmlOmittedDefaultValue;
6574 
6575     htmlOmittedDefaultValue = val;
6576     return(old);
6577 }
6578 
6579 /**
6580  * htmlElementAllowedHere:
6581  * @parent: HTML parent element
6582  * @elt: HTML element
6583  *
6584  * Checks whether an HTML element may be a direct child of a parent element.
6585  * Note - doesn't check for deprecated elements
6586  *
6587  * Returns 1 if allowed; 0 otherwise.
6588  */
6589 int
htmlElementAllowedHere(const htmlElemDesc * parent,const xmlChar * elt)6590 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6591   const char** p ;
6592 
6593   if ( ! elt || ! parent || ! parent->subelts )
6594 	return 0 ;
6595 
6596   for ( p = parent->subelts; *p; ++p )
6597     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6598       return 1 ;
6599 
6600   return 0 ;
6601 }
6602 /**
6603  * htmlElementStatusHere:
6604  * @parent: HTML parent element
6605  * @elt: HTML element
6606  *
6607  * Checks whether an HTML element may be a direct child of a parent element.
6608  * and if so whether it is valid or deprecated.
6609  *
6610  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6611  */
6612 htmlStatus
htmlElementStatusHere(const htmlElemDesc * parent,const htmlElemDesc * elt)6613 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6614   if ( ! parent || ! elt )
6615     return HTML_INVALID ;
6616   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6617     return HTML_INVALID ;
6618 
6619   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6620 }
6621 /**
6622  * htmlAttrAllowed:
6623  * @elt: HTML element
6624  * @attr: HTML attribute
6625  * @legacy: whether to allow deprecated attributes
6626  *
6627  * Checks whether an attribute is valid for an element
6628  * Has full knowledge of Required and Deprecated attributes
6629  *
6630  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6631  */
6632 htmlStatus
htmlAttrAllowed(const htmlElemDesc * elt,const xmlChar * attr,int legacy)6633 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6634   const char** p ;
6635 
6636   if ( !elt || ! attr )
6637 	return HTML_INVALID ;
6638 
6639   if ( elt->attrs_req )
6640     for ( p = elt->attrs_req; *p; ++p)
6641       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6642         return HTML_REQUIRED ;
6643 
6644   if ( elt->attrs_opt )
6645     for ( p = elt->attrs_opt; *p; ++p)
6646       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6647         return HTML_VALID ;
6648 
6649   if ( legacy && elt->attrs_depr )
6650     for ( p = elt->attrs_depr; *p; ++p)
6651       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6652         return HTML_DEPRECATED ;
6653 
6654   return HTML_INVALID ;
6655 }
6656 /**
6657  * htmlNodeStatus:
6658  * @node: an htmlNodePtr in a tree
6659  * @legacy: whether to allow deprecated elements (YES is faster here
6660  *	for Element nodes)
6661  *
6662  * Checks whether the tree node is valid.  Experimental (the author
6663  *     only uses the HTML enhancements in a SAX parser)
6664  *
6665  * Return: for Element nodes, a return from htmlElementAllowedHere (if
6666  *	legacy allowed) or htmlElementStatusHere (otherwise).
6667  *	for Attribute nodes, a return from htmlAttrAllowed
6668  *	for other nodes, HTML_NA (no checks performed)
6669  */
6670 htmlStatus
htmlNodeStatus(const htmlNodePtr node,int legacy)6671 htmlNodeStatus(const htmlNodePtr node, int legacy) {
6672   if ( ! node )
6673     return HTML_INVALID ;
6674 
6675   switch ( node->type ) {
6676     case XML_ELEMENT_NODE:
6677       return legacy
6678 	? ( htmlElementAllowedHere (
6679 		htmlTagLookup(node->parent->name) , node->name
6680 		) ? HTML_VALID : HTML_INVALID )
6681 	: htmlElementStatusHere(
6682 		htmlTagLookup(node->parent->name) ,
6683 		htmlTagLookup(node->name) )
6684 	;
6685     case XML_ATTRIBUTE_NODE:
6686       return htmlAttrAllowed(
6687 	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6688     default: return HTML_NA ;
6689   }
6690 }
6691 /************************************************************************
6692  *									*
6693  *	New set (2.6.0) of simpler and more flexible APIs		*
6694  *									*
6695  ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement:
 * write a terminating semicolon at the call site, and the macro is safe
 * inside an unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6707 
6708 /**
6709  * htmlCtxtReset:
6710  * @ctxt: an HTML parser context
6711  *
6712  * Reset a parser context
6713  */
6714 void
htmlCtxtReset(htmlParserCtxtPtr ctxt)6715 htmlCtxtReset(htmlParserCtxtPtr ctxt)
6716 {
6717     xmlParserInputPtr input;
6718     xmlDictPtr dict;
6719 
6720     if (ctxt == NULL)
6721         return;
6722 
6723     xmlInitParser();
6724     dict = ctxt->dict;
6725 
6726     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6727         xmlFreeInputStream(input);
6728     }
6729     ctxt->inputNr = 0;
6730     ctxt->input = NULL;
6731 
6732     ctxt->spaceNr = 0;
6733     if (ctxt->spaceTab != NULL) {
6734 	ctxt->spaceTab[0] = -1;
6735 	ctxt->space = &ctxt->spaceTab[0];
6736     } else {
6737 	ctxt->space = NULL;
6738     }
6739 
6740 
6741     ctxt->nodeNr = 0;
6742     ctxt->node = NULL;
6743 
6744     ctxt->nameNr = 0;
6745     ctxt->name = NULL;
6746 
6747     DICT_FREE(ctxt->version);
6748     ctxt->version = NULL;
6749     DICT_FREE(ctxt->encoding);
6750     ctxt->encoding = NULL;
6751     DICT_FREE(ctxt->directory);
6752     ctxt->directory = NULL;
6753     DICT_FREE(ctxt->extSubURI);
6754     ctxt->extSubURI = NULL;
6755     DICT_FREE(ctxt->extSubSystem);
6756     ctxt->extSubSystem = NULL;
6757     if (ctxt->myDoc != NULL)
6758         xmlFreeDoc(ctxt->myDoc);
6759     ctxt->myDoc = NULL;
6760 
6761     ctxt->standalone = -1;
6762     ctxt->hasExternalSubset = 0;
6763     ctxt->hasPErefs = 0;
6764     ctxt->html = 1;
6765     ctxt->external = 0;
6766     ctxt->instate = XML_PARSER_START;
6767     ctxt->token = 0;
6768 
6769     ctxt->wellFormed = 1;
6770     ctxt->nsWellFormed = 1;
6771     ctxt->disableSAX = 0;
6772     ctxt->valid = 1;
6773     ctxt->vctxt.userData = ctxt;
6774     ctxt->vctxt.error = xmlParserValidityError;
6775     ctxt->vctxt.warning = xmlParserValidityWarning;
6776     ctxt->record_info = 0;
6777     ctxt->checkIndex = 0;
6778     ctxt->inSubset = 0;
6779     ctxt->errNo = XML_ERR_OK;
6780     ctxt->depth = 0;
6781     ctxt->charset = XML_CHAR_ENCODING_NONE;
6782     ctxt->catalogs = NULL;
6783     xmlInitNodeInfoSeq(&ctxt->node_seq);
6784 
6785     if (ctxt->attsDefault != NULL) {
6786         xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6787         ctxt->attsDefault = NULL;
6788     }
6789     if (ctxt->attsSpecial != NULL) {
6790         xmlHashFree(ctxt->attsSpecial, NULL);
6791         ctxt->attsSpecial = NULL;
6792     }
6793 }
6794 
6795 /**
6796  * htmlCtxtUseOptions:
6797  * @ctxt: an HTML parser context
6798  * @options:  a combination of htmlParserOption(s)
6799  *
6800  * Applies the options to the parser context
6801  *
6802  * Returns 0 in case of success, the set of unknown or unimplemented options
6803  *         in case of error.
6804  */
6805 int
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt,int options)6806 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6807 {
6808     if (ctxt == NULL)
6809         return(-1);
6810 
6811     if (options & HTML_PARSE_NOWARNING) {
6812         ctxt->sax->warning = NULL;
6813         ctxt->vctxt.warning = NULL;
6814         options -= XML_PARSE_NOWARNING;
6815 	ctxt->options |= XML_PARSE_NOWARNING;
6816     }
6817     if (options & HTML_PARSE_NOERROR) {
6818         ctxt->sax->error = NULL;
6819         ctxt->vctxt.error = NULL;
6820         ctxt->sax->fatalError = NULL;
6821         options -= XML_PARSE_NOERROR;
6822 	ctxt->options |= XML_PARSE_NOERROR;
6823     }
6824     if (options & HTML_PARSE_PEDANTIC) {
6825         ctxt->pedantic = 1;
6826         options -= XML_PARSE_PEDANTIC;
6827 	ctxt->options |= XML_PARSE_PEDANTIC;
6828     } else
6829         ctxt->pedantic = 0;
6830     if (options & XML_PARSE_NOBLANKS) {
6831         ctxt->keepBlanks = 0;
6832         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6833         options -= XML_PARSE_NOBLANKS;
6834 	ctxt->options |= XML_PARSE_NOBLANKS;
6835     } else
6836         ctxt->keepBlanks = 1;
6837     if (options & HTML_PARSE_RECOVER) {
6838         ctxt->recovery = 1;
6839 	options -= HTML_PARSE_RECOVER;
6840     } else
6841         ctxt->recovery = 0;
6842     if (options & HTML_PARSE_COMPACT) {
6843 	ctxt->options |= HTML_PARSE_COMPACT;
6844         options -= HTML_PARSE_COMPACT;
6845     }
6846     if (options & XML_PARSE_HUGE) {
6847 	ctxt->options |= XML_PARSE_HUGE;
6848         options -= XML_PARSE_HUGE;
6849     }
6850     if (options & HTML_PARSE_NODEFDTD) {
6851 	ctxt->options |= HTML_PARSE_NODEFDTD;
6852         options -= HTML_PARSE_NODEFDTD;
6853     }
6854     if (options & HTML_PARSE_IGNORE_ENC) {
6855 	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6856         options -= HTML_PARSE_IGNORE_ENC;
6857     }
6858     if (options & HTML_PARSE_NOIMPLIED) {
6859         ctxt->options |= HTML_PARSE_NOIMPLIED;
6860         options -= HTML_PARSE_NOIMPLIED;
6861     }
6862     ctxt->dictNames = 0;
6863     return (options);
6864 }
6865 
6866 /**
6867  * htmlDoRead:
6868  * @ctxt:  an HTML parser context
6869  * @URL:  the base URL to use for the document
6870  * @encoding:  the document encoding, or NULL
6871  * @options:  a combination of htmlParserOption(s)
6872  * @reuse:  keep the context for reuse
6873  *
6874  * Common front-end for the htmlRead functions
6875  *
6876  * Returns the resulting document tree or NULL
6877  */
6878 static htmlDocPtr
htmlDoRead(htmlParserCtxtPtr ctxt,const char * URL,const char * encoding,int options,int reuse)6879 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6880           int options, int reuse)
6881 {
6882     htmlDocPtr ret;
6883 
6884     htmlCtxtUseOptions(ctxt, options);
6885     ctxt->html = 1;
6886     if (encoding != NULL) {
6887         xmlCharEncodingHandlerPtr hdlr;
6888 
6889 	hdlr = xmlFindCharEncodingHandler(encoding);
6890 	if (hdlr != NULL) {
6891 	    xmlSwitchToEncoding(ctxt, hdlr);
6892 	    if (ctxt->input->encoding != NULL)
6893 	      xmlFree((xmlChar *) ctxt->input->encoding);
6894             ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6895         }
6896     }
6897     if ((URL != NULL) && (ctxt->input != NULL) &&
6898         (ctxt->input->filename == NULL))
6899         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6900     htmlParseDocument(ctxt);
6901     ret = ctxt->myDoc;
6902     ctxt->myDoc = NULL;
6903     if (!reuse) {
6904         if ((ctxt->dictNames) &&
6905 	    (ret != NULL) &&
6906 	    (ret->dict == ctxt->dict))
6907 	    ctxt->dict = NULL;
6908 	xmlFreeParserCtxt(ctxt);
6909     }
6910     return (ret);
6911 }
6912 
6913 /**
6914  * htmlReadDoc:
6915  * @cur:  a pointer to a zero terminated string
6916  * @URL:  the base URL to use for the document
6917  * @encoding:  the document encoding, or NULL
6918  * @options:  a combination of htmlParserOption(s)
6919  *
6920  * parse an XML in-memory document and build a tree.
6921  *
6922  * Returns the resulting document tree
6923  */
6924 htmlDocPtr
htmlReadDoc(const xmlChar * cur,const char * URL,const char * encoding,int options)6925 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6926 {
6927     htmlParserCtxtPtr ctxt;
6928 
6929     if (cur == NULL)
6930         return (NULL);
6931 
6932     xmlInitParser();
6933     ctxt = htmlCreateDocParserCtxt(cur, NULL);
6934     if (ctxt == NULL)
6935         return (NULL);
6936     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6937 }
6938 
6939 /**
6940  * htmlReadFile:
6941  * @filename:  a file or URL
6942  * @encoding:  the document encoding, or NULL
6943  * @options:  a combination of htmlParserOption(s)
6944  *
6945  * parse an XML file from the filesystem or the network.
6946  *
6947  * Returns the resulting document tree
6948  */
6949 htmlDocPtr
htmlReadFile(const char * filename,const char * encoding,int options)6950 htmlReadFile(const char *filename, const char *encoding, int options)
6951 {
6952     htmlParserCtxtPtr ctxt;
6953 
6954     xmlInitParser();
6955     ctxt = htmlCreateFileParserCtxt(filename, encoding);
6956     if (ctxt == NULL)
6957         return (NULL);
6958     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6959 }
6960 
6961 /**
6962  * htmlReadMemory:
6963  * @buffer:  a pointer to a char array
6964  * @size:  the size of the array
6965  * @URL:  the base URL to use for the document
6966  * @encoding:  the document encoding, or NULL
6967  * @options:  a combination of htmlParserOption(s)
6968  *
6969  * parse an XML in-memory document and build a tree.
6970  *
6971  * Returns the resulting document tree
6972  */
6973 htmlDocPtr
htmlReadMemory(const char * buffer,int size,const char * URL,const char * encoding,int options)6974 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6975 {
6976     htmlParserCtxtPtr ctxt;
6977 
6978     xmlInitParser();
6979     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6980     if (ctxt == NULL)
6981         return (NULL);
6982     htmlDefaultSAXHandlerInit();
6983     if (ctxt->sax != NULL)
6984         memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6985     return (htmlDoRead(ctxt, URL, encoding, options, 0));
6986 }
6987 
6988 /**
6989  * htmlReadFd:
6990  * @fd:  an open file descriptor
6991  * @URL:  the base URL to use for the document
6992  * @encoding:  the document encoding, or NULL
6993  * @options:  a combination of htmlParserOption(s)
6994  *
6995  * parse an HTML from a file descriptor and build a tree.
6996  * NOTE that the file descriptor will not be closed when the
6997  *      reader is closed or reset.
6998  *
6999  * Returns the resulting document tree
7000  */
7001 htmlDocPtr
htmlReadFd(int fd,const char * URL,const char * encoding,int options)7002 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7003 {
7004     htmlParserCtxtPtr ctxt;
7005     xmlParserInputBufferPtr input;
7006     htmlParserInputPtr stream;
7007 
7008     if (fd < 0)
7009         return (NULL);
7010 
7011     xmlInitParser();
7012     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7013     if (input == NULL)
7014         return (NULL);
7015     input->closecallback = NULL;
7016     ctxt = htmlNewParserCtxt();
7017     if (ctxt == NULL) {
7018         xmlFreeParserInputBuffer(input);
7019         return (NULL);
7020     }
7021     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7022     if (stream == NULL) {
7023         xmlFreeParserInputBuffer(input);
7024 	htmlFreeParserCtxt(ctxt);
7025         return (NULL);
7026     }
7027     inputPush(ctxt, stream);
7028     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7029 }
7030 
7031 /**
7032  * htmlReadIO:
7033  * @ioread:  an I/O read function
7034  * @ioclose:  an I/O close function
7035  * @ioctx:  an I/O handler
7036  * @URL:  the base URL to use for the document
7037  * @encoding:  the document encoding, or NULL
7038  * @options:  a combination of htmlParserOption(s)
7039  *
7040  * parse an HTML document from I/O functions and source and build a tree.
7041  *
7042  * Returns the resulting document tree
7043  */
7044 htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7045 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7046           void *ioctx, const char *URL, const char *encoding, int options)
7047 {
7048     htmlParserCtxtPtr ctxt;
7049     xmlParserInputBufferPtr input;
7050     xmlParserInputPtr stream;
7051 
7052     if (ioread == NULL)
7053         return (NULL);
7054     xmlInitParser();
7055 
7056     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7057                                          XML_CHAR_ENCODING_NONE);
7058     if (input == NULL) {
7059         if (ioclose != NULL)
7060             ioclose(ioctx);
7061         return (NULL);
7062     }
7063     ctxt = htmlNewParserCtxt();
7064     if (ctxt == NULL) {
7065         xmlFreeParserInputBuffer(input);
7066         return (NULL);
7067     }
7068     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7069     if (stream == NULL) {
7070         xmlFreeParserInputBuffer(input);
7071 	xmlFreeParserCtxt(ctxt);
7072         return (NULL);
7073     }
7074     inputPush(ctxt, stream);
7075     return (htmlDoRead(ctxt, URL, encoding, options, 0));
7076 }
7077 
7078 /**
7079  * htmlCtxtReadDoc:
7080  * @ctxt:  an HTML parser context
7081  * @cur:  a pointer to a zero terminated string
7082  * @URL:  the base URL to use for the document
7083  * @encoding:  the document encoding, or NULL
7084  * @options:  a combination of htmlParserOption(s)
7085  *
7086  * parse an XML in-memory document and build a tree.
7087  * This reuses the existing @ctxt parser context
7088  *
7089  * Returns the resulting document tree
7090  */
7091 htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt,const xmlChar * cur,const char * URL,const char * encoding,int options)7092 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7093                const char *URL, const char *encoding, int options)
7094 {
7095     xmlParserInputPtr stream;
7096 
7097     if (cur == NULL)
7098         return (NULL);
7099     if (ctxt == NULL)
7100         return (NULL);
7101     xmlInitParser();
7102 
7103     htmlCtxtReset(ctxt);
7104 
7105     stream = xmlNewStringInputStream(ctxt, cur);
7106     if (stream == NULL) {
7107         return (NULL);
7108     }
7109     inputPush(ctxt, stream);
7110     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7111 }
7112 
7113 /**
7114  * htmlCtxtReadFile:
7115  * @ctxt:  an HTML parser context
7116  * @filename:  a file or URL
7117  * @encoding:  the document encoding, or NULL
7118  * @options:  a combination of htmlParserOption(s)
7119  *
7120  * parse an XML file from the filesystem or the network.
7121  * This reuses the existing @ctxt parser context
7122  *
7123  * Returns the resulting document tree
7124  */
7125 htmlDocPtr
htmlCtxtReadFile(htmlParserCtxtPtr ctxt,const char * filename,const char * encoding,int options)7126 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7127                 const char *encoding, int options)
7128 {
7129     xmlParserInputPtr stream;
7130 
7131     if (filename == NULL)
7132         return (NULL);
7133     if (ctxt == NULL)
7134         return (NULL);
7135     xmlInitParser();
7136 
7137     htmlCtxtReset(ctxt);
7138 
7139     stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7140     if (stream == NULL) {
7141         return (NULL);
7142     }
7143     inputPush(ctxt, stream);
7144     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7145 }
7146 
7147 /**
7148  * htmlCtxtReadMemory:
7149  * @ctxt:  an HTML parser context
7150  * @buffer:  a pointer to a char array
7151  * @size:  the size of the array
7152  * @URL:  the base URL to use for the document
7153  * @encoding:  the document encoding, or NULL
7154  * @options:  a combination of htmlParserOption(s)
7155  *
7156  * parse an XML in-memory document and build a tree.
7157  * This reuses the existing @ctxt parser context
7158  *
7159  * Returns the resulting document tree
7160  */
7161 htmlDocPtr
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt,const char * buffer,int size,const char * URL,const char * encoding,int options)7162 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7163                   const char *URL, const char *encoding, int options)
7164 {
7165     xmlParserInputBufferPtr input;
7166     xmlParserInputPtr stream;
7167 
7168     if (ctxt == NULL)
7169         return (NULL);
7170     if (buffer == NULL)
7171         return (NULL);
7172     xmlInitParser();
7173 
7174     htmlCtxtReset(ctxt);
7175 
7176     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7177     if (input == NULL) {
7178 	return(NULL);
7179     }
7180 
7181     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7182     if (stream == NULL) {
7183 	xmlFreeParserInputBuffer(input);
7184 	return(NULL);
7185     }
7186 
7187     inputPush(ctxt, stream);
7188     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7189 }
7190 
7191 /**
7192  * htmlCtxtReadFd:
7193  * @ctxt:  an HTML parser context
7194  * @fd:  an open file descriptor
7195  * @URL:  the base URL to use for the document
7196  * @encoding:  the document encoding, or NULL
7197  * @options:  a combination of htmlParserOption(s)
7198  *
7199  * parse an XML from a file descriptor and build a tree.
7200  * This reuses the existing @ctxt parser context
7201  *
7202  * Returns the resulting document tree
7203  */
7204 htmlDocPtr
htmlCtxtReadFd(htmlParserCtxtPtr ctxt,int fd,const char * URL,const char * encoding,int options)7205 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7206               const char *URL, const char *encoding, int options)
7207 {
7208     xmlParserInputBufferPtr input;
7209     xmlParserInputPtr stream;
7210 
7211     if (fd < 0)
7212         return (NULL);
7213     if (ctxt == NULL)
7214         return (NULL);
7215     xmlInitParser();
7216 
7217     htmlCtxtReset(ctxt);
7218 
7219 
7220     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7221     if (input == NULL)
7222         return (NULL);
7223     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7224     if (stream == NULL) {
7225         xmlFreeParserInputBuffer(input);
7226         return (NULL);
7227     }
7228     inputPush(ctxt, stream);
7229     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7230 }
7231 
7232 /**
7233  * htmlCtxtReadIO:
7234  * @ctxt:  an HTML parser context
7235  * @ioread:  an I/O read function
7236  * @ioclose:  an I/O close function
7237  * @ioctx:  an I/O handler
7238  * @URL:  the base URL to use for the document
7239  * @encoding:  the document encoding, or NULL
7240  * @options:  a combination of htmlParserOption(s)
7241  *
7242  * parse an HTML document from I/O functions and source and build a tree.
7243  * This reuses the existing @ctxt parser context
7244  *
7245  * Returns the resulting document tree
7246  */
7247 htmlDocPtr
htmlCtxtReadIO(htmlParserCtxtPtr ctxt,xmlInputReadCallback ioread,xmlInputCloseCallback ioclose,void * ioctx,const char * URL,const char * encoding,int options)7248 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7249               xmlInputCloseCallback ioclose, void *ioctx,
7250 	      const char *URL,
7251               const char *encoding, int options)
7252 {
7253     xmlParserInputBufferPtr input;
7254     xmlParserInputPtr stream;
7255 
7256     if (ioread == NULL)
7257         return (NULL);
7258     if (ctxt == NULL)
7259         return (NULL);
7260     xmlInitParser();
7261 
7262     htmlCtxtReset(ctxt);
7263 
7264     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7265                                          XML_CHAR_ENCODING_NONE);
7266     if (input == NULL) {
7267         if (ioclose != NULL)
7268             ioclose(ioctx);
7269         return (NULL);
7270     }
7271     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7272     if (stream == NULL) {
7273         xmlFreeParserInputBuffer(input);
7274         return (NULL);
7275     }
7276     inputPush(ctxt, stream);
7277     return (htmlDoRead(ctxt, URL, encoding, options, 1));
7278 }
7279 
7280 #endif /* LIBXML_HTML_ENABLED */
7281